]> git.ipfire.org Git - people/pmueller/ipfire-2.x.git/blame - src/patches/coreutils/coreutils-8.32-i18n-1.patch
~/src/patches/: Clean up orphaned patches, duplicates and application patches outside...
[people/pmueller/ipfire-2.x.git] / src / patches / coreutils / coreutils-8.32-i18n-1.patch
CommitLineData
b3478306
PM
1Submitted by: Xi Ruoyao <xry111@mengyan1223.wang>
2Date: 2020-03-08
3Initial Package Version: 8.32
7cf3a80b 4Upstream Status: Rejected
e1fb4052 5Origin: Based on Fedora's i18n patches at
b3478306 6 https://src.fedoraproject.org/rpms/coreutils/
e1fb4052 7Description: Fixes i18n issues with various Coreutils programs
7cf3a80b 8
b3478306
PM
9diff -Naurp coreutils-8.32.orig/bootstrap.conf coreutils-8.32/bootstrap.conf
10--- coreutils-8.32.orig/bootstrap.conf 2020-02-25 22:25:43.000000000 +0800
11+++ coreutils-8.32/bootstrap.conf 2020-03-08 12:10:27.733236560 +0800
12@@ -154,6 +154,7 @@ gnulib_modules="
e1fb4052
MF
13 maintainer-makefile
14 malloc-gnu
15 manywarnings
16+ mbfile
17 mbrlen
18 mbrtowc
19 mbsalign
b3478306
PM
20diff -Naurp coreutils-8.32.orig/configure.ac coreutils-8.32/configure.ac
21--- coreutils-8.32.orig/configure.ac 2020-02-28 05:45:34.000000000 +0800
22+++ coreutils-8.32/configure.ac 2020-03-08 12:10:27.733236560 +0800
23@@ -446,6 +446,8 @@ fi
e1fb4052
MF
24 # I'm leaving it here for now. This whole thing needs to be modernized...
25 gl_WINSIZE_IN_PTEM
26
27+gl_MBFILE
28+
29 gl_HEADER_TIOCGWINSZ_IN_TERMIOS_H
30
31 if test $gl_cv_sys_tiocgwinsz_needs_termios_h = no && \
b3478306
PM
32diff -Naurp coreutils-8.32.orig/lib/linebuffer.h coreutils-8.32/lib/linebuffer.h
33--- coreutils-8.32.orig/lib/linebuffer.h 2020-01-01 22:14:23.000000000 +0800
34+++ coreutils-8.32/lib/linebuffer.h 2020-03-08 12:10:27.733236560 +0800
7cf3a80b
ML
35@@ -21,6 +21,11 @@
36
37 # include <stdio.h>
38
39+/* Get mbstate_t. */
40+# if HAVE_WCHAR_H
41+# include <wchar.h>
42+# endif
43+
44 /* A 'struct linebuffer' holds a line of text. */
45
46 struct linebuffer
47@@ -28,6 +33,9 @@ struct linebuffer
48 size_t size; /* Allocated. */
49 size_t length; /* Used. */
50 char *buffer;
51+# if HAVE_WCHAR_H
52+ mbstate_t state;
53+# endif
54 };
55
56 /* Initialize linebuffer LINEBUFFER for use. */
b3478306
PM
57diff -Naurp coreutils-8.32.orig/lib/mbfile.c coreutils-8.32/lib/mbfile.c
58--- coreutils-8.32.orig/lib/mbfile.c 1970-01-01 08:00:00.000000000 +0800
59+++ coreutils-8.32/lib/mbfile.c 2020-03-08 12:10:27.733236560 +0800
e1fb4052
MF
60@@ -0,0 +1,3 @@
61+#include <config.h>
62+#define MBFILE_INLINE _GL_EXTERN_INLINE
63+#include "mbfile.h"
b3478306
PM
64diff -Naurp coreutils-8.32.orig/lib/mbfile.h coreutils-8.32/lib/mbfile.h
65--- coreutils-8.32.orig/lib/mbfile.h 1970-01-01 08:00:00.000000000 +0800
66+++ coreutils-8.32/lib/mbfile.h 2020-03-08 12:10:27.734236560 +0800
e1fb4052
MF
67@@ -0,0 +1,255 @@
68+/* Multibyte character I/O: macros for multi-byte encodings.
b3478306 69+ Copyright (C) 2001, 2005, 2009-2015 Free Software Foundation, Inc.
e1fb4052
MF
70+
71+ This program is free software: you can redistribute it and/or modify
72+ it under the terms of the GNU General Public License as published by
73+ the Free Software Foundation; either version 3 of the License, or
74+ (at your option) any later version.
75+
76+ This program is distributed in the hope that it will be useful,
77+ but WITHOUT ANY WARRANTY; without even the implied warranty of
78+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
79+ GNU General Public License for more details.
80+
81+ You should have received a copy of the GNU General Public License
82+ along with this program. If not, see <http://www.gnu.org/licenses/>. */
83+
84+/* Written by Mitsuru Chinen <mchinen@yamato.ibm.com>
85+ and Bruno Haible <bruno@clisp.org>. */
86+
87+/* The macros in this file implement multi-byte character input from a
88+ stream.
89+
90+ mb_file_t
91+ is the type for multibyte character input stream, usable for variable
92+ declarations.
93+
94+ mbf_char_t
95+ is the type for multibyte character or EOF, usable for variable
96+ declarations.
97+
98+ mbf_init (mbf, stream)
99+ initializes the MB_FILE for reading from stream.
100+
101+ mbf_getc (mbc, mbf)
102+ reads the next multibyte character from mbf and stores it in mbc.
103+
104+ mb_iseof (mbc)
105+ returns true if mbc represents the EOF value.
106+
107+ Here are the function prototypes of the macros.
108+
109+ extern void mbf_init (mb_file_t mbf, FILE *stream);
110+ extern void mbf_getc (mbf_char_t mbc, mb_file_t mbf);
111+ extern bool mb_iseof (const mbf_char_t mbc);
112+ */
113+
114+#ifndef _MBFILE_H
115+#define _MBFILE_H 1
116+
117+#include <assert.h>
118+#include <stdbool.h>
119+#include <stdio.h>
120+#include <string.h>
121+
122+/* Tru64 with Desktop Toolkit C has a bug: <stdio.h> must be included before
123+ <wchar.h>.
124+ BSD/OS 4.1 has a bug: <stdio.h> and <time.h> must be included before
125+ <wchar.h>. */
126+#include <stdio.h>
127+#include <time.h>
128+#include <wchar.h>
129+
130+#include "mbchar.h"
131+
132+#ifndef _GL_INLINE_HEADER_BEGIN
133+ #error "Please include config.h first."
134+#endif
135+_GL_INLINE_HEADER_BEGIN
136+#ifndef MBFILE_INLINE
137+# define MBFILE_INLINE _GL_INLINE
138+#endif
139+
140+struct mbfile_multi {
141+ FILE *fp;
142+ bool eof_seen;
143+ bool have_pushback;
144+ mbstate_t state;
145+ unsigned int bufcount;
146+ char buf[MBCHAR_BUF_SIZE];
147+ struct mbchar pushback;
148+};
149+
150+MBFILE_INLINE void
151+mbfile_multi_getc (struct mbchar *mbc, struct mbfile_multi *mbf)
152+{
153+ size_t bytes;
154+
155+ /* If EOF has already been seen, don't use getc. This matters if
156+ mbf->fp is connected to an interactive tty. */
157+ if (mbf->eof_seen)
158+ goto eof;
159+
160+ /* Return character pushed back, if there is one. */
161+ if (mbf->have_pushback)
162+ {
163+ mb_copy (mbc, &mbf->pushback);
164+ mbf->have_pushback = false;
165+ return;
166+ }
167+
168+ /* Before using mbrtowc, we need at least one byte. */
169+ if (mbf->bufcount == 0)
170+ {
171+ int c = getc (mbf->fp);
172+ if (c == EOF)
173+ {
174+ mbf->eof_seen = true;
175+ goto eof;
176+ }
177+ mbf->buf[0] = (unsigned char) c;
178+ mbf->bufcount++;
179+ }
180+
181+ /* Handle most ASCII characters quickly, without calling mbrtowc(). */
182+ if (mbf->bufcount == 1 && mbsinit (&mbf->state) && is_basic (mbf->buf[0]))
183+ {
184+ /* These characters are part of the basic character set. ISO C 99
185+ guarantees that their wide character code is identical to their
186+ char code. */
187+ mbc->wc = mbc->buf[0] = mbf->buf[0];
188+ mbc->wc_valid = true;
189+ mbc->ptr = &mbc->buf[0];
190+ mbc->bytes = 1;
191+ mbf->bufcount = 0;
192+ return;
193+ }
194+
195+ /* Use mbrtowc on an increasing number of bytes. Read only as many bytes
196+ from mbf->fp as needed. This is needed to give reasonable interactive
197+ behaviour when mbf->fp is connected to an interactive tty. */
198+ for (;;)
199+ {
200+ /* We don't know whether the 'mbrtowc' function updates the state when
201+ it returns -2, - this is the ISO C 99 and glibc-2.2 behaviour - or
202+ not - amended ANSI C, glibc-2.1 and Solaris 2.7 behaviour. We
203+ don't have an autoconf test for this, yet.
204+ The new behaviour would allow us to feed the bytes one by one into
205+ mbrtowc. But the old behaviour forces us to feed all bytes since
206+ the end of the last character into mbrtowc. Since we want to retry
207+ with more bytes when mbrtowc returns -2, we must backup the state
208+ before calling mbrtowc, because implementations with the new
209+ behaviour will clobber it. */
210+ mbstate_t backup_state = mbf->state;
211+
212+ bytes = mbrtowc (&mbc->wc, &mbf->buf[0], mbf->bufcount, &mbf->state);
213+
214+ if (bytes == (size_t) -1)
215+ {
216+ /* An invalid multibyte sequence was encountered. */
217+ /* Return a single byte. */
218+ bytes = 1;
219+ mbc->wc_valid = false;
220+ break;
221+ }
222+ else if (bytes == (size_t) -2)
223+ {
224+ /* An incomplete multibyte character. */
225+ mbf->state = backup_state;
226+ if (mbf->bufcount == MBCHAR_BUF_SIZE)
227+ {
228+ /* An overlong incomplete multibyte sequence was encountered. */
229+ /* Return a single byte. */
230+ bytes = 1;
231+ mbc->wc_valid = false;
232+ break;
233+ }
234+ else
235+ {
236+ /* Read one more byte and retry mbrtowc. */
237+ int c = getc (mbf->fp);
238+ if (c == EOF)
239+ {
240+ /* An incomplete multibyte character at the end. */
241+ mbf->eof_seen = true;
242+ bytes = mbf->bufcount;
243+ mbc->wc_valid = false;
244+ break;
245+ }
246+ mbf->buf[mbf->bufcount] = (unsigned char) c;
247+ mbf->bufcount++;
248+ }
249+ }
250+ else
251+ {
252+ if (bytes == 0)
253+ {
254+ /* A null wide character was encountered. */
255+ bytes = 1;
256+ assert (mbf->buf[0] == '\0');
257+ assert (mbc->wc == 0);
258+ }
259+ mbc->wc_valid = true;
260+ break;
261+ }
262+ }
263+
264+ /* Return the multibyte sequence mbf->buf[0..bytes-1]. */
265+ mbc->ptr = &mbc->buf[0];
266+ memcpy (&mbc->buf[0], &mbf->buf[0], bytes);
267+ mbc->bytes = bytes;
268+
269+ mbf->bufcount -= bytes;
270+ if (mbf->bufcount > 0)
271+ {
272+ /* It's not worth calling memmove() for so few bytes. */
273+ unsigned int count = mbf->bufcount;
274+ char *p = &mbf->buf[0];
275+
276+ do
277+ {
278+ *p = *(p + bytes);
279+ p++;
280+ }
281+ while (--count > 0);
282+ }
283+ return;
284+
285+eof:
286+ /* An mbchar_t with bytes == 0 is used to indicate EOF. */
287+ mbc->ptr = NULL;
288+ mbc->bytes = 0;
289+ mbc->wc_valid = false;
290+ return;
291+}
292+
293+MBFILE_INLINE void
294+mbfile_multi_ungetc (const struct mbchar *mbc, struct mbfile_multi *mbf)
295+{
296+ mb_copy (&mbf->pushback, mbc);
297+ mbf->have_pushback = true;
298+}
299+
300+typedef struct mbfile_multi mb_file_t;
301+
302+typedef mbchar_t mbf_char_t;
303+
304+#define mbf_init(mbf, stream) \
305+ ((mbf).fp = (stream), \
306+ (mbf).eof_seen = false, \
307+ (mbf).have_pushback = false, \
308+ memset (&(mbf).state, '\0', sizeof (mbstate_t)), \
309+ (mbf).bufcount = 0)
310+
311+#define mbf_getc(mbc, mbf) mbfile_multi_getc (&(mbc), &(mbf))
312+
313+#define mbf_ungetc(mbc, mbf) mbfile_multi_ungetc (&(mbc), &(mbf))
314+
315+#define mb_iseof(mbc) ((mbc).bytes == 0)
316+
317+#ifndef _GL_INLINE_HEADER_BEGIN
318+ #error "Please include config.h first."
319+#endif
320+_GL_INLINE_HEADER_END
321+
322+#endif /* _MBFILE_H */
b3478306
PM
323diff -Naurp coreutils-8.32.orig/m4/mbfile.m4 coreutils-8.32/m4/mbfile.m4
324--- coreutils-8.32.orig/m4/mbfile.m4 1970-01-01 08:00:00.000000000 +0800
325+++ coreutils-8.32/m4/mbfile.m4 2020-03-08 12:10:27.734236560 +0800
e1fb4052
MF
326@@ -0,0 +1,14 @@
327+# mbfile.m4 serial 7
b3478306 328+dnl Copyright (C) 2005, 2008-2015 Free Software Foundation, Inc.
e1fb4052
MF
329+dnl This file is free software; the Free Software Foundation
330+dnl gives unlimited permission to copy and/or distribute it,
331+dnl with or without modifications, as long as this notice is preserved.
332+
333+dnl autoconf tests required for use of mbfile.h
334+dnl From Bruno Haible.
335+
336+AC_DEFUN([gl_MBFILE],
337+[
338+ AC_REQUIRE([AC_TYPE_MBSTATE_T])
339+ :
340+])
b3478306
PM
341diff -Naurp coreutils-8.32.orig/src/cut.c coreutils-8.32/src/cut.c
342--- coreutils-8.32.orig/src/cut.c 2020-01-01 22:13:12.000000000 +0800
343+++ coreutils-8.32/src/cut.c 2020-03-08 12:10:27.734236560 +0800
7cf3a80b
ML
344@@ -28,6 +28,11 @@
345 #include <assert.h>
346 #include <getopt.h>
347 #include <sys/types.h>
348+
349+/* Get mbstate_t, mbrtowc(). */
350+#if HAVE_WCHAR_H
351+# include <wchar.h>
352+#endif
353 #include "system.h"
354
355 #include "error.h"
356@@ -38,6 +43,18 @@
357
358 #include "set-fields.h"
359
360+/* MB_LEN_MAX is incorrectly defined to be 1 in at least one GCC
361+ installation; work around this configuration error. */
362+#if !defined MB_LEN_MAX || MB_LEN_MAX < 2
363+# undef MB_LEN_MAX
364+# define MB_LEN_MAX 16
365+#endif
366+
367+/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t. */
368+#if HAVE_MBRTOWC && defined mbstate_t
369+# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0)
370+#endif
371+
372 /* The official name of this program (e.g., no 'g' prefix). */
373 #define PROGRAM_NAME "cut"
374
375@@ -54,6 +71,52 @@
376 } \
377 while (0)
378
379+/* Refill the buffer BUF to get a multibyte character. */
380+#define REFILL_BUFFER(BUF, BUFPOS, BUFLEN, STREAM) \
381+ do \
382+ { \
383+ if (BUFLEN < MB_LEN_MAX && !feof (STREAM) && !ferror (STREAM)) \
384+ { \
385+ memmove (BUF, BUFPOS, BUFLEN); \
386+ BUFLEN += fread (BUF + BUFLEN, sizeof(char), BUFSIZ, STREAM); \
387+ BUFPOS = BUF; \
388+ } \
389+ } \
390+ while (0)
391+
392+/* Get wide character on BUFPOS. BUFPOS is not included after that.
393+ If byte sequence is not valid as a character, CONVFAIL is true. Otherwise false. */
394+#define GET_NEXT_WC_FROM_BUFFER(WC, BUFPOS, BUFLEN, MBLENGTH, STATE, CONVFAIL) \
395+ do \
396+ { \
397+ mbstate_t state_bak; \
398+ \
399+ if (BUFLEN < 1) \
400+ { \
401+ WC = WEOF; \
402+ break; \
403+ } \
404+ \
405+ /* Get a wide character. */ \
406+ CONVFAIL = false; \
407+ state_bak = STATE; \
408+ MBLENGTH = mbrtowc ((wchar_t *)&WC, BUFPOS, BUFLEN, &STATE); \
409+ \
410+ switch (MBLENGTH) \
411+ { \
412+ case (size_t)-1: \
413+ case (size_t)-2: \
414+ CONVFAIL = true; \
415+ STATE = state_bak; \
416+ /* Fall through. */ \
417+ \
418+ case 0: \
419+ MBLENGTH = 1; \
420+ break; \
421+ } \
422+ } \
423+ while (0)
424+
425
426 /* Pointer inside RP. When checking if a byte or field is selected
427 by a finite range, we check if it is between CURRENT_RP.LO
428@@ -61,6 +124,9 @@
429 CURRENT_RP.HI then we make CURRENT_RP to point to the next range pair. */
430 static struct field_range_pair *current_rp;
431
432+/* Length of the delimiter given as argument to -d. */
433+size_t delimlen;
434+
435 /* This buffer is used to support the semantics of the -s option
436 (or lack of same) when the specified field list includes (does
437 not include) the first field. In both of those cases, the entire
438@@ -77,15 +143,25 @@ enum operating_mode
439 {
440 undefined_mode,
441
442- /* Output characters that are in the given bytes. */
443+ /* Output bytes that are at the given positions. */
444 byte_mode,
445
446+ /* Output characters that are at the given positions. */
447+ character_mode,
448+
449 /* Output the given delimiter-separated fields. */
450 field_mode
451 };
452
453 static enum operating_mode operating_mode;
454
455+/* If nonzero, when in byte mode, don't split multibyte characters. */
456+static int byte_mode_character_aware;
457+
458+/* If nonzero, use the single-byte-locale functions even
459+ if this program runs in a multibyte locale. */
460+static int force_singlebyte_mode;
461+
462 /* If true do not output lines containing no delimiter characters.
463 Otherwise, all such lines are printed. This option is valid only
464 with field mode. */
465@@ -97,6 +173,9 @@ static bool complement;
466
467 /* The delimiter character for field mode. */
468 static unsigned char delim;
469+#if HAVE_WCHAR_H
470+static wchar_t wcdelim;
471+#endif
472
473 /* The delimiter for each line/record. */
474 static unsigned char line_delim = '\n';
475@@ -164,7 +243,7 @@ Print selected parts of lines from each
476 -f, --fields=LIST select only these fields; also print any line\n\
477 that contains no delimiter character, unless\n\
478 the -s option is specified\n\
479- -n (ignored)\n\
480+ -n with -b: don't split multibyte characters\n\
481 "), stdout);
482 fputs (_("\
483 --complement complement the set of selected bytes, characters\n\
484@@ -280,6 +359,82 @@ cut_bytes (FILE *stream)
485 }
486 }
487
488+#if HAVE_MBRTOWC
489+/* This function is used in the following cases.
490+
491+ 1. Read from the stream STREAM, printing to standard output any selected
492+ characters.
493+
494+ 2. Read from stream STREAM, printing to standard output any selected bytes,
495+ without splitting multibyte characters. */
496+
497+static void
498+cut_characters_or_cut_bytes_no_split (FILE *stream)
499+{
b3478306 500+ uintmax_t idx; /* number of bytes or characters in the line so far. */
7cf3a80b
ML
501+ char buf[MB_LEN_MAX + BUFSIZ]; /* For spooling a read byte sequence. */
502+ char *bufpos; /* Next read position of BUF. */
503+ size_t buflen; /* The length of the byte sequence in buf. */
504+ wint_t wc; /* A gotten wide character. */
505+ size_t mblength; /* The byte size of a multibyte character which shows
506+ as same character as WC. */
507+ mbstate_t state; /* State of the stream. */
508+ bool convfail = false; /* true, when conversion failed. Otherwise false. */
509+ /* Whether to begin printing delimiters between ranges for the current line.
510+ Set after we've begun printing data corresponding to the first range. */
511+ bool print_delimiter = false;
512+
513+ idx = 0;
514+ buflen = 0;
515+ bufpos = buf;
516+ memset (&state, '\0', sizeof(mbstate_t));
517+
518+ current_rp = frp;
519+
520+ while (1)
521+ {
522+ REFILL_BUFFER (buf, bufpos, buflen, stream);
523+
524+ GET_NEXT_WC_FROM_BUFFER (wc, bufpos, buflen, mblength, state, convfail);
525+ (void) convfail; /* ignore unused */
526+
527+ if (wc == WEOF)
528+ {
529+ if (idx > 0)
530+ putchar (line_delim);
531+ break;
532+ }
533+ else if (wc == line_delim)
534+ {
535+ putchar (line_delim);
536+ idx = 0;
537+ print_delimiter = false;
538+ current_rp = frp;
539+ }
540+ else
541+ {
542+ next_item (&idx);
543+ if (print_kth (idx))
544+ {
545+ if (output_delimiter_specified)
546+ {
547+ if (print_delimiter && is_range_start_index (idx))
548+ {
549+ fwrite (output_delimiter_string, sizeof (char),
550+ output_delimiter_length, stdout);
551+ }
552+ print_delimiter = true;
553+ }
554+ fwrite (bufpos, mblength, sizeof(char), stdout);
555+ }
556+ }
557+
558+ buflen -= mblength;
559+ bufpos += mblength;
560+ }
561+}
562+#endif
563+
564 /* Read from stream STREAM, printing to standard output any selected fields. */
565
566 static void
567@@ -425,13 +580,211 @@ cut_fields (FILE *stream)
568 }
569 }
570
571+#if HAVE_MBRTOWC
572+static void
573+cut_fields_mb (FILE *stream)
574+{
575+ int c;
b3478306 576+ uintmax_t field_idx;
7cf3a80b
ML
577+ int found_any_selected_field;
578+ int buffer_first_field;
579+ int empty_input;
580+ char buf[MB_LEN_MAX + BUFSIZ]; /* For spooling a read byte sequence. */
581+ char *bufpos; /* Next read position of BUF. */
582+ size_t buflen; /* The length of the byte sequence in buf. */
583+ wint_t wc = 0; /* A gotten wide character. */
584+ size_t mblength; /* The byte size of a multibyte character which shows
585+ as same character as WC. */
586+ mbstate_t state; /* State of the stream. */
587+ bool convfail = false; /* true, when conversion failed. Otherwise false. */
588+
589+ current_rp = frp;
590+
591+ found_any_selected_field = 0;
592+ field_idx = 1;
593+ bufpos = buf;
594+ buflen = 0;
595+ memset (&state, '\0', sizeof(mbstate_t));
596+
597+ c = getc (stream);
598+ empty_input = (c == EOF);
599+ if (c != EOF)
600+ {
601+ ungetc (c, stream);
602+ wc = 0;
603+ }
604+ else
605+ wc = WEOF;
606+
607+ /* To support the semantics of the -s flag, we may have to buffer
608+ all of the first field to determine whether it is `delimited.'
609+ But that is unnecessary if all non-delimited lines must be printed
610+ and the first field has been selected, or if non-delimited lines
611+ must be suppressed and the first field has *not* been selected.
612+ That is because a non-delimited line has exactly one field. */
613+ buffer_first_field = (suppress_non_delimited ^ !print_kth (1));
614+
615+ while (1)
616+ {
617+ if (field_idx == 1 && buffer_first_field)
618+ {
619+ int len = 0;
620+
621+ while (1)
622+ {
623+ REFILL_BUFFER (buf, bufpos, buflen, stream);
624+
625+ GET_NEXT_WC_FROM_BUFFER
626+ (wc, bufpos, buflen, mblength, state, convfail);
627+
628+ if (wc == WEOF)
629+ break;
630+
631+ field_1_buffer = xrealloc (field_1_buffer, len + mblength);
632+ memcpy (field_1_buffer + len, bufpos, mblength);
633+ len += mblength;
634+ buflen -= mblength;
635+ bufpos += mblength;
636+
637+ if (!convfail && (wc == line_delim || wc == wcdelim))
638+ break;
639+ }
640+
641+ if (len <= 0 && wc == WEOF)
642+ break;
643+
644+ /* If the first field extends to the end of line (it is not
645+ delimited) and we are printing all non-delimited lines,
646+ print this one. */
647+ if (convfail || (!convfail && wc != wcdelim))
648+ {
649+ if (suppress_non_delimited)
650+ {
651+ /* Empty. */
652+ }
653+ else
654+ {
655+ fwrite (field_1_buffer, sizeof (char), len, stdout);
656+ /* Make sure the output line is newline terminated. */
657+ if (convfail || (!convfail && wc != line_delim))
658+ putchar (line_delim);
659+ }
660+ continue;
661+ }
662+
663+ if (print_kth (1))
664+ {
665+ /* Print the field, but not the trailing delimiter. */
666+ fwrite (field_1_buffer, sizeof (char), len - 1, stdout);
667+ found_any_selected_field = 1;
668+ }
669+ next_item (&field_idx);
670+ }
671+
672+ if (wc != WEOF)
673+ {
674+ if (print_kth (field_idx))
675+ {
676+ if (found_any_selected_field)
677+ {
678+ fwrite (output_delimiter_string, sizeof (char),
679+ output_delimiter_length, stdout);
680+ }
681+ found_any_selected_field = 1;
682+ }
683+
684+ while (1)
685+ {
686+ REFILL_BUFFER (buf, bufpos, buflen, stream);
687+
688+ GET_NEXT_WC_FROM_BUFFER
689+ (wc, bufpos, buflen, mblength, state, convfail);
690+
691+ if (wc == WEOF)
692+ break;
693+ else if (!convfail && (wc == wcdelim || wc == line_delim))
694+ {
695+ buflen -= mblength;
696+ bufpos += mblength;
697+ break;
698+ }
699+
700+ if (print_kth (field_idx))
701+ fwrite (bufpos, mblength, sizeof(char), stdout);
702+
703+ buflen -= mblength;
704+ bufpos += mblength;
705+ }
706+ }
707+
708+ if ((!convfail || wc == line_delim) && buflen < 1)
709+ wc = WEOF;
710+
711+ if (!convfail && wc == wcdelim)
712+ next_item (&field_idx);
713+ else if (wc == WEOF || (!convfail && wc == line_delim))
714+ {
715+ if (found_any_selected_field
716+ || (!empty_input && !(suppress_non_delimited && field_idx == 1)))
717+ putchar (line_delim);
718+ if (wc == WEOF)
719+ break;
720+ field_idx = 1;
721+ current_rp = frp;
722+ found_any_selected_field = 0;
723+ }
724+ }
725+}
726+#endif
727+
728 static void
729 cut_stream (FILE *stream)
730 {
731- if (operating_mode == byte_mode)
732- cut_bytes (stream);
733+#if HAVE_MBRTOWC
734+ if (MB_CUR_MAX > 1 && !force_singlebyte_mode)
735+ {
736+ switch (operating_mode)
737+ {
738+ case byte_mode:
739+ if (byte_mode_character_aware)
740+ cut_characters_or_cut_bytes_no_split (stream);
741+ else
742+ cut_bytes (stream);
743+ break;
744+
745+ case character_mode:
746+ cut_characters_or_cut_bytes_no_split (stream);
747+ break;
748+
749+ case field_mode:
750+ if (delimlen == 1)
751+ {
752+ /* Check if we have utf8 multibyte locale, so we can use this
753+ optimization because of uniqueness of characters, which is
754+ not true for e.g. SJIS */
755+ char * loc = setlocale(LC_CTYPE, NULL);
756+ if (loc && (strstr (loc, "UTF-8") || strstr (loc, "utf-8") ||
757+ strstr (loc, "UTF8") || strstr (loc, "utf8")))
758+ {
759+ cut_fields (stream);
760+ break;
761+ }
762+ }
763+ cut_fields_mb (stream);
764+ break;
765+
766+ default:
767+ abort ();
768+ }
769+ }
770 else
771- cut_fields (stream);
772+#endif
773+ {
774+ if (operating_mode == field_mode)
775+ cut_fields (stream);
776+ else
777+ cut_bytes (stream);
778+ }
779 }
780
781 /* Process file FILE to standard output.
782@@ -483,6 +836,7 @@ main (int argc, char **argv)
783 bool ok;
784 bool delim_specified = false;
785 char *spec_list_string IF_LINT ( = NULL);
786+ char mbdelim[MB_LEN_MAX + 1];
787
788 initialize_main (&argc, &argv);
789 set_program_name (argv[0]);
790@@ -505,7 +859,6 @@ main (int argc, char **argv)
791 switch (optc)
792 {
793 case 'b':
794- case 'c':
795 /* Build the byte list. */
796 if (operating_mode != undefined_mode)
797 FATAL_ERROR (_("only one type of list may be specified"));
798@@ -513,6 +866,14 @@ main (int argc, char **argv)
799 spec_list_string = optarg;
800 break;
801
802+ case 'c':
803+ /* Build the character list. */
804+ if (operating_mode != undefined_mode)
805+ FATAL_ERROR (_("only one type of list may be specified"));
806+ operating_mode = character_mode;
807+ spec_list_string = optarg;
808+ break;
809+
810 case 'f':
811 /* Build the field list. */
812 if (operating_mode != undefined_mode)
813@@ -524,10 +885,38 @@ main (int argc, char **argv)
814 case 'd':
815 /* New delimiter. */
816 /* Interpret -d '' to mean 'use the NUL byte as the delimiter.' */
817- if (optarg[0] != '\0' && optarg[1] != '\0')
818- FATAL_ERROR (_("the delimiter must be a single character"));
819- delim = optarg[0];
820- delim_specified = true;
821+ {
822+#if HAVE_MBRTOWC
823+ if(MB_CUR_MAX > 1)
824+ {
825+ mbstate_t state;
826+
827+ memset (&state, '\0', sizeof(mbstate_t));
828+ delimlen = mbrtowc (&wcdelim, optarg, strnlen(optarg, MB_LEN_MAX), &state);
829+
830+ if (delimlen == (size_t)-1 || delimlen == (size_t)-2)
831+ ++force_singlebyte_mode;
832+ else
833+ {
834+ delimlen = (delimlen < 1) ? 1 : delimlen;
835+ if (wcdelim != L'\0' && *(optarg + delimlen) != '\0')
836+ FATAL_ERROR (_("the delimiter must be a single character"));
837+ memcpy (mbdelim, optarg, delimlen);
838+ mbdelim[delimlen] = '\0';
839+ if (delimlen == 1)
840+ delim = *optarg;
841+ }
842+ }
843+
844+ if (MB_CUR_MAX <= 1 || force_singlebyte_mode)
845+#endif
846+ {
847+ if (optarg[0] != '\0' && optarg[1] != '\0')
848+ FATAL_ERROR (_("the delimiter must be a single character"));
849+ delim = (unsigned char) optarg[0];
850+ }
851+ delim_specified = true;
852+ }
853 break;
854
855 case OUTPUT_DELIMITER_OPTION:
856@@ -540,6 +929,7 @@ main (int argc, char **argv)
857 break;
858
859 case 'n':
860+ byte_mode_character_aware = 1;
861 break;
862
863 case 's':
864@@ -579,15 +969,34 @@ main (int argc, char **argv)
865 | (complement ? SETFLD_COMPLEMENT : 0) );
866
867 if (!delim_specified)
868- delim = '\t';
869+ {
870+ delim = '\t';
871+#ifdef HAVE_MBRTOWC
872+ wcdelim = L'\t';
873+ mbdelim[0] = '\t';
874+ mbdelim[1] = '\0';
875+ delimlen = 1;
876+#endif
877+ }
878
879 if (output_delimiter_string == NULL)
880 {
881- static char dummy[2];
882- dummy[0] = delim;
883- dummy[1] = '\0';
884- output_delimiter_string = dummy;
885- output_delimiter_length = 1;
886+#ifdef HAVE_MBRTOWC
887+ if (MB_CUR_MAX > 1 && !force_singlebyte_mode)
888+ {
889+ output_delimiter_string = xstrdup(mbdelim);
890+ output_delimiter_length = delimlen;
891+ }
892+
893+ if (MB_CUR_MAX <= 1 || force_singlebyte_mode)
894+#endif
895+ {
896+ static char dummy[2];
897+ dummy[0] = delim;
898+ dummy[1] = '\0';
899+ output_delimiter_string = dummy;
900+ output_delimiter_length = 1;
901+ }
902 }
903
904 if (optind == argc)
b3478306
PM
905diff -Naurp coreutils-8.32.orig/src/expand.c coreutils-8.32/src/expand.c
906--- coreutils-8.32.orig/src/expand.c 2020-01-01 22:13:12.000000000 +0800
907+++ coreutils-8.32/src/expand.c 2020-03-08 12:10:27.735236560 +0800
e1fb4052 908@@ -37,6 +37,9 @@
7cf3a80b
ML
909 #include <stdio.h>
910 #include <getopt.h>
911 #include <sys/types.h>
912+
e1fb4052 913+#include <mbfile.h>
7cf3a80b
ML
914+
915 #include "system.h"
e1fb4052 916 #include "die.h"
7cf3a80b 917 #include "xstrndup.h"
b3478306 918@@ -98,19 +101,41 @@ expand (void)
e1fb4052
MF
919 {
920 /* Input stream. */
921 FILE *fp = next_file (NULL);
922+ mb_file_t mbf;
923+ mbf_char_t c;
924+ /* True if the starting locale is utf8. */
925+ bool using_utf_locale;
926+
927+ /* True if the first file contains BOM header. */
928+ bool found_bom;
929+ using_utf_locale=check_utf_locale();
930
931 if (!fp)
932 return;
933+ mbf_init (mbf, fp);
934+ found_bom=check_bom(fp,&mbf);
7cf3a80b 935
e1fb4052
MF
936- while (true)
937+ if (using_utf_locale == false && found_bom == true)
938+ {
939+ /*try using some predefined locale */
7cf3a80b 940+
e1fb4052
MF
941+ if (set_utf_locale () != 0)
942 {
943- /* Input character, or EOF. */
944- int c;
945+ error (EXIT_FAILURE, errno, _("cannot set UTF-8 locale"));
946+ }
947+ }
7cf3a80b 948+
7cf3a80b 949
e1fb4052
MF
950+ if (found_bom == true)
951+ {
952+ print_bom();
953+ }
954+
955+ while (true)
956+ {
957 /* If true, perform translations. */
958 bool convert = true;
959
960-
961 /* The following variables have valid values only when CONVERT
962 is true: */
963
b3478306 964@@ -120,17 +145,48 @@ expand (void)
e1fb4052
MF
965 /* Index in TAB_LIST of next tab stop to examine. */
966 size_t tab_index = 0;
967
968-
969 /* Convert a line of text. */
970
971 do
972 {
973- while ((c = getc (fp)) < 0 && (fp = next_file (fp)))
974- continue;
975+ while (true) {
976+ mbf_getc (c, mbf);
977+ if ((mb_iseof (c)) && (fp = next_file (fp)))
978+ {
979+ mbf_init (mbf, fp);
980+ if (fp!=NULL)
981+ {
982+ if (check_bom(fp,&mbf)==true)
983+ {
984+ /*Not the first file - check BOM header*/
985+ if (using_utf_locale==false && found_bom==false)
986+ {
987+ /*BOM header in subsequent file but not in the first one. */
988+ error (EXIT_FAILURE, errno, _("combination of files with and without BOM header"));
989+ }
990+ }
991+ else
992+ {
993+ if(using_utf_locale==false && found_bom==true)
994+ {
995+ /*First file contained BOM header - locale was switched to UTF
b3478306 996+ *all subsequent files should contain BOM. */
e1fb4052
MF
997+ error (EXIT_FAILURE, errno, _("combination of files with and without BOM header"));
998+ }
999+ }
1000+ }
1001+ continue;
1002+ }
1003+ else
1004+ {
1005+ break;
1006+ }
1007+ }
1008+
1009
1010 if (convert)
1011 {
1012- if (c == '\t')
1013+ if (mb_iseq (c, '\t'))
1014 {
1015 /* Column the next input tab stop is on. */
1016 uintmax_t next_tab_column;
b3478306 1017@@ -149,32 +205,34 @@ expand (void)
e1fb4052
MF
1018 if (putchar (' ') < 0)
1019 die (EXIT_FAILURE, errno, _("write error"));
1020
1021- c = ' ';
1022+ mb_setascii (&c, ' ');
1023 }
1024- else if (c == '\b')
1025+ else if (mb_iseq (c, '\b'))
1026 {
1027 /* Go back one column, and force recalculation of the
1028 next tab stop. */
1029 column -= !!column;
1030 tab_index -= !!tab_index;
1031 }
1032- else
1033+ /* A leading control character could make us trip over. */
1034+ else if (!mb_iscntrl (c))
1035 {
1036- column++;
1037+ column += mb_width (c);
1038 if (!column)
1039 die (EXIT_FAILURE, 0, _("input line is too long"));
1040 }
1041
1042- convert &= convert_entire_line || !! isblank (c);
1043+ convert &= convert_entire_line || mb_isblank (c);
1044 }
1045
1046- if (c < 0)
1047+ if (mb_iseof (c))
1048 return;
1049
1050- if (putchar (c) < 0)
1051+ mb_putc (c, stdout);
1052+ if (ferror (stdout))
1053 die (EXIT_FAILURE, errno, _("write error"));
1054 }
1055- while (c != '\n');
1056+ while (!mb_iseq (c, '\n'));
7cf3a80b
ML
1057 }
1058 }
1059
b3478306
PM
1060diff -Naurp coreutils-8.32.orig/src/expand-common.c coreutils-8.32/src/expand-common.c
1061--- coreutils-8.32.orig/src/expand-common.c 2020-01-01 22:13:12.000000000 +0800
1062+++ coreutils-8.32/src/expand-common.c 2020-03-08 12:10:27.735236560 +0800
1063@@ -19,6 +19,7 @@
1064 #include <assert.h>
e1fb4052
MF
1065 #include <stdio.h>
1066 #include <sys/types.h>
1067+#include <mbfile.h>
1068 #include "system.h"
1069 #include "die.h"
1070 #include "error.h"
b3478306 1071@@ -126,6 +127,119 @@ set_increment_size (uintmax_t tabval)
e1fb4052
MF
1072 return ok;
1073 }
1074
1075+extern int
1076+set_utf_locale (void)
7cf3a80b 1077+{
e1fb4052
MF
1078+ /*try using some predefined locale */
1079+ const char* predef_locales[] = {"C.UTF8","en_US.UTF8","en_GB.UTF8"};
7cf3a80b 1080+
e1fb4052
MF
1081+ const int predef_locales_count=3;
1082+ for (int i=0;i<predef_locales_count;i++)
1083+ {
1084+ if (setlocale(LC_ALL,predef_locales[i])!=NULL)
1085+ {
1086+ break;
1087+ }
1088+ else if (i==predef_locales_count-1)
1089+ {
1090+ return 1;
1091+ error (EXIT_FAILURE, errno, _("cannot set UTF-8 locale"));
1092+ }
1093+ }
1094+ return 0;
1095+}
7cf3a80b 1096+
e1fb4052
MF
1097+extern bool
1098+check_utf_locale(void)
1099+{
1100+ char* locale = setlocale (LC_CTYPE , NULL);
1101+ if (locale == NULL)
1102+ {
1103+ return false;
1104+ }
1105+ else if (strcasestr(locale, "utf8") == NULL && strcasestr(locale, "utf-8") == NULL)
1106+ {
1107+ return false;
1108+ }
1109+ return true;
1110+}
7cf3a80b 1111+
e1fb4052
MF
1112+extern bool
1113+check_bom(FILE* fp, mb_file_t *mbf)
1114+{
1115+ int c;
7cf3a80b 1116+
7cf3a80b 1117+
e1fb4052 1118+ c=fgetc(fp);
7cf3a80b 1119+
e1fb4052
MF
1120+ /* Test for the UTF-8 BOM header (EF BB BF) of the first file. */
1121+ mbf->bufcount=0;
1122+ if (c == 0xEF)
1123+ {
1124+ c=fgetc(fp);
1125+ }
1126+ else
1127+ {
1128+ if (c != EOF)
1129+ {
1130+ ungetc(c,fp);
1131+ }
1132+ return false;
1133+ }
7cf3a80b 1134+
e1fb4052
MF
1135+ if (c == 0xBB)
1136+ {
1137+ c=fgetc(fp);
1138+ }
1139+ else
1140+ {
1141+ if ( c!= EOF )
1142+ {
1143+ mbf->buf[0]=(unsigned char) 0xEF;
1144+ mbf->bufcount=1;
1145+ ungetc(c,fp);
1146+ return false;
1147+ }
1148+ else
1149+ {
1150+ ungetc(0xEF,fp);
1151+ return false;
7cf3a80b 1152+ }
e1fb4052
MF
1153+ }
1154+ if (c == 0xBF)
1155+ {
1156+ mbf->bufcount=0;
1157+ return true;
1158+ }
1159+ else
1160+ {
1161+ if (c != EOF)
1162+ {
1163+ mbf->buf[0]=(unsigned char) 0xEF;
1164+ mbf->buf[1]=(unsigned char) 0xBB;
1165+ mbf->bufcount=2;
1166+ ungetc(c,fp);
1167+ return false;
1168+ }
1169+ else
1170+ {
1171+ mbf->buf[0]=(unsigned char) 0xEF;
1172+ mbf->bufcount=1;
1173+ ungetc(0xBB,fp);
1174+ return false;
1175+ }
1176+ }
1177+ return false;
7cf3a80b 1178+}
7cf3a80b 1179+
e1fb4052
MF
1180+extern void
1181+print_bom(void)
1182+{
1183+ putc (0xEF, stdout);
1184+ putc (0xBB, stdout);
1185+ putc (0xBF, stdout);
1186+}
1187+
1188 /* Add the comma or blank separated list of tab stops STOPS
1189 to the list of tab stops. */
1190 extern void
b3478306
PM
1191diff -Naurp coreutils-8.32.orig/src/expand-common.h coreutils-8.32/src/expand-common.h
1192--- coreutils-8.32.orig/src/expand-common.h 2020-01-01 22:13:12.000000000 +0800
1193+++ coreutils-8.32/src/expand-common.h 2020-03-08 12:10:27.735236560 +0800
e1fb4052
MF
1194@@ -34,6 +34,18 @@ extern size_t max_column_width;
1195 /* The desired exit status. */
1196 extern int exit_status;
1197
1198+extern int
1199+set_utf_locale (void);
1200+
1201+extern bool
1202+check_utf_locale(void);
1203+
1204+extern bool
1205+check_bom(FILE* fp, mb_file_t *mbf);
1206+
1207+extern void
1208+print_bom(void);
1209+
1210 /* Add tab stop TABVAL to the end of 'tab_list'. */
1211 extern void
1212 add_tab_stop (uintmax_t tabval);
b3478306
PM
1213diff -Naurp coreutils-8.32.orig/src/fold.c coreutils-8.32/src/fold.c
1214--- coreutils-8.32.orig/src/fold.c 2020-01-01 22:13:12.000000000 +0800
1215+++ coreutils-8.32/src/fold.c 2020-03-08 12:10:27.736236560 +0800
e1fb4052 1216@@ -22,12 +22,34 @@
7cf3a80b
ML
1217 #include <getopt.h>
1218 #include <sys/types.h>
1219
1220+/* Get mbstate_t, mbrtowc(), wcwidth(). */
1221+#if HAVE_WCHAR_H
1222+# include <wchar.h>
1223+#endif
1224+
1225+/* Get iswprint(), iswblank(), wcwidth(). */
1226+#if HAVE_WCTYPE_H
1227+# include <wctype.h>
1228+#endif
1229+
1230 #include "system.h"
e1fb4052 1231 #include "die.h"
7cf3a80b
ML
1232 #include "error.h"
1233 #include "fadvise.h"
1234 #include "xdectoint.h"
1235
1236+/* MB_LEN_MAX is incorrectly defined to be 1 in at least one GCC
1237+ installation; work around this configuration error. */
1238+#if !defined MB_LEN_MAX || MB_LEN_MAX < 2
1239+# undef MB_LEN_MAX
1240+# define MB_LEN_MAX 16
1241+#endif
1242+
1243+/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t. */
1244+#if HAVE_MBRTOWC && defined mbstate_t
1245+# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0)
1246+#endif
1247+
1248 #define TAB_WIDTH 8
1249
1250 /* The official name of this program (e.g., no 'g' prefix). */
e1fb4052 1251@@ -35,20 +57,41 @@
7cf3a80b
ML
1252
1253 #define AUTHORS proper_name ("David MacKenzie")
1254
1255+#define FATAL_ERROR(Message) \
1256+ do \
1257+ { \
1258+ error (0, 0, (Message)); \
1259+ usage (2); \
1260+ } \
1261+ while (0)
1262+
1263+enum operating_mode
1264+{
1265+ /* Fold texts by columns that are at the given positions. */
1266+ column_mode,
1267+
1268+ /* Fold texts by bytes that are at the given positions. */
1269+ byte_mode,
1270+
1271+ /* Fold texts by characters that are at the given positions. */
1272+ character_mode,
1273+};
1274+
1275+/* The current operating mode. (Default: column_mode) */
1276+static enum operating_mode operating_mode;
1277+
1278 /* If nonzero, try to break on whitespace. */
1279 static bool break_spaces;
1280
1281-/* If nonzero, count bytes, not column positions. */
1282-static bool count_bytes;
1283-
1284 /* If nonzero, at least one of the files we read was standard input. */
1285 static bool have_read_stdin;
1286
1287-static char const shortopts[] = "bsw:0::1::2::3::4::5::6::7::8::9::";
1288+static char const shortopts[] = "bcsw:0::1::2::3::4::5::6::7::8::9::";
1289
1290 static struct option const longopts[] =
1291 {
1292 {"bytes", no_argument, NULL, 'b'},
1293+ {"characters", no_argument, NULL, 'c'},
1294 {"spaces", no_argument, NULL, 's'},
1295 {"width", required_argument, NULL, 'w'},
1296 {GETOPT_HELP_OPTION_DECL},
e1fb4052 1297@@ -76,6 +119,7 @@ Wrap input lines in each FILE, writing t
7cf3a80b
ML
1298
1299 fputs (_("\
1300 -b, --bytes count bytes rather than columns\n\
1301+ -c, --characters count characters rather than columns\n\
1302 -s, --spaces break at spaces\n\
1303 -w, --width=WIDTH use WIDTH columns instead of 80\n\
1304 "), stdout);
e1fb4052 1305@@ -93,7 +137,7 @@ Wrap input lines in each FILE, writing t
7cf3a80b
ML
1306 static size_t
1307 adjust_column (size_t column, char c)
1308 {
1309- if (!count_bytes)
1310+ if (operating_mode != byte_mode)
1311 {
1312 if (c == '\b')
1313 {
e1fb4052 1314@@ -116,30 +160,14 @@ adjust_column (size_t column, char c)
7cf3a80b
ML
1315 to stdout, with maximum line length WIDTH.
1316 Return true if successful. */
1317
1318-static bool
1319-fold_file (char const *filename, size_t width)
1320+static void
1321+fold_text (FILE *istream, size_t width, int *saved_errno)
1322 {
1323- FILE *istream;
1324 int c;
1325 size_t column = 0; /* Screen column where next char will go. */
1326 size_t offset_out = 0; /* Index in 'line_out' for next char. */
1327 static char *line_out = NULL;
1328 static size_t allocated_out = 0;
1329- int saved_errno;
1330-
1331- if (STREQ (filename, "-"))
1332- {
1333- istream = stdin;
1334- have_read_stdin = true;
1335- }
1336- else
1337- istream = fopen (filename, "r");
1338-
1339- if (istream == NULL)
1340- {
1341- error (0, errno, "%s", quotef (filename));
1342- return false;
1343- }
1344
1345 fadvise (istream, FADVISE_SEQUENTIAL);
1346
e1fb4052 1347@@ -169,6 +197,15 @@ fold_file (char const *filename, size_t
7cf3a80b
ML
1348 bool found_blank = false;
1349 size_t logical_end = offset_out;
1350
1351+ /* If LINE_OUT is still empty,
1352+ put the new character in LINE_OUT
1353+ even when the column is bigger than WIDTH. */
1354+ if (offset_out == 0)
1355+ {
1356+ line_out[offset_out++] = c;
1357+ continue;
1358+ }
1359+
1360 /* Look for the last blank. */
1361 while (logical_end)
1362 {
e1fb4052 1363@@ -215,11 +252,220 @@ fold_file (char const *filename, size_t
7cf3a80b
ML
1364 line_out[offset_out++] = c;
1365 }
1366
1367- saved_errno = errno;
1368+ *saved_errno = errno;
1369+
1370+ if (offset_out)
1371+ fwrite (line_out, sizeof (char), (size_t) offset_out, stdout);
1372+
1373+}
1374+
1375+#if HAVE_MBRTOWC
1376+static void
1377+fold_multibyte_text (FILE *istream, size_t width, int *saved_errno)
1378+{
1379+ char buf[MB_LEN_MAX + BUFSIZ]; /* For spooling a read byte sequence. */
1380+ size_t buflen = 0; /* The length of the byte sequence in buf. */
1381+ char *bufpos = buf; /* Next read position of BUF. */
1382+ wint_t wc; /* The wide character just converted. */
1383+ size_t mblength; /* The byte length of the multibyte character that
1384+ corresponds to WC. */
1385+ mbstate_t state, state_bak; /* State of the stream. */
1386+ int convfail = 0; /* Nonzero when a conversion has failed; otherwise 0. */
1387+
1388+ static char *line_out = NULL;
1389+ size_t offset_out = 0; /* Index in `line_out' for next char. */
1390+ static size_t allocated_out = 0;
1391+
1392+ int increment;
1393+ size_t column = 0;
1394+
1395+ size_t last_blank_pos;
1396+ size_t last_blank_column;
1397+ int is_blank_seen;
1398+ int last_blank_increment = 0;
1399+ int is_bs_following_last_blank;
1400+ size_t bs_following_last_blank_num;
1401+ int is_cr_after_last_blank;
1402+
1403+#define CLEAR_FLAGS \
1404+ do \
1405+ { \
1406+ last_blank_pos = 0; \
1407+ last_blank_column = 0; \
1408+ is_blank_seen = 0; \
1409+ is_bs_following_last_blank = 0; \
1410+ bs_following_last_blank_num = 0; \
1411+ is_cr_after_last_blank = 0; \
1412+ } \
1413+ while (0)
1414+
1415+#define START_NEW_LINE \
1416+ do \
1417+ { \
1418+ putchar ('\n'); \
1419+ column = 0; \
1420+ offset_out = 0; \
1421+ CLEAR_FLAGS; \
1422+ } \
1423+ while (0)
1424+
1425+ CLEAR_FLAGS;
1426+ memset (&state, '\0', sizeof(mbstate_t));
1427+
1428+ for (;; bufpos += mblength, buflen -= mblength)
1429+ {
1430+ if (buflen < MB_LEN_MAX && !feof (istream) && !ferror (istream))
1431+ {
1432+ memmove (buf, bufpos, buflen);
1433+ buflen += fread (buf + buflen, sizeof(char), BUFSIZ, istream);
1434+ bufpos = buf;
1435+ }
1436+
1437+ if (buflen < 1)
1438+ break;
1439+
1440+ /* Get a wide character. */
1441+ state_bak = state;
1442+ mblength = mbrtowc ((wchar_t *)&wc, bufpos, buflen, &state);
1443+
1444+ switch (mblength)
1445+ {
1446+ case (size_t)-1:
1447+ case (size_t)-2:
1448+ convfail++;
1449+ state = state_bak;
1450+ /* Fall through. */
1451+
1452+ case 0:
1453+ mblength = 1;
1454+ break;
1455+ }
1456+
1457+rescan:
e1fb4052
MF
1458+ if (convfail)
1459+ increment = 1;
1460+ else if (wc == L'\n')
1461+ {
1462+ /* preserve newline */
1463+ fwrite (line_out, sizeof(char), offset_out, stdout);
1464+ START_NEW_LINE;
1465+ continue;
1466+ }
1467+ else if (operating_mode == byte_mode) /* byte mode */
7cf3a80b
ML
1468+ increment = mblength;
1469+ else if (operating_mode == character_mode) /* character mode */
1470+ increment = 1;
e1fb4052 1471+ else /* column mode */
7cf3a80b 1472+ {
e1fb4052 1473+ switch (wc)
7cf3a80b 1474+ {
e1fb4052
MF
1475+ case L'\b':
1476+ increment = (column > 0) ? -1 : 0;
1477+ break;
7cf3a80b 1478+
e1fb4052
MF
1479+ case L'\r':
1480+ increment = -1 * column;
1481+ break;
7cf3a80b 1482+
e1fb4052
MF
1483+ case L'\t':
1484+ increment = 8 - column % 8;
1485+ break;
7cf3a80b 1486+
e1fb4052
MF
1487+ default:
1488+ increment = wcwidth (wc);
1489+ increment = (increment < 0) ? 0 : increment;
7cf3a80b
ML
1490+ }
1491+ }
1492+
1493+ if (column + increment > width && break_spaces && last_blank_pos)
1494+ {
1495+ fwrite (line_out, sizeof(char), last_blank_pos, stdout);
1496+ putchar ('\n');
1497+
1498+ offset_out = offset_out - last_blank_pos;
1499+ column = column - last_blank_column + ((is_cr_after_last_blank)
1500+ ? last_blank_increment : bs_following_last_blank_num);
1501+ memmove (line_out, line_out + last_blank_pos, offset_out);
1502+ CLEAR_FLAGS;
1503+ goto rescan;
1504+ }
1505+
1506+ if (column + increment > width && column != 0)
1507+ {
1508+ fwrite (line_out, sizeof(char), offset_out, stdout);
1509+ START_NEW_LINE;
1510+ goto rescan;
1511+ }
1512+
1513+ if (allocated_out < offset_out + mblength)
1514+ {
1515+ line_out = X2REALLOC (line_out, &allocated_out);
1516+ }
1517+
1518+ memcpy (line_out + offset_out, bufpos, mblength);
1519+ offset_out += mblength;
1520+ column += increment;
1521+
1522+ if (is_blank_seen && !convfail && wc == L'\r')
1523+ is_cr_after_last_blank = 1;
1524+
1525+ if (is_bs_following_last_blank && !convfail && wc == L'\b')
1526+ ++bs_following_last_blank_num;
1527+ else
1528+ is_bs_following_last_blank = 0;
1529+
1530+ if (break_spaces && !convfail && iswblank (wc))
1531+ {
1532+ last_blank_pos = offset_out;
1533+ last_blank_column = column;
1534+ is_blank_seen = 1;
1535+ last_blank_increment = increment;
1536+ is_bs_following_last_blank = 1;
1537+ bs_following_last_blank_num = 0;
1538+ is_cr_after_last_blank = 0;
1539+ }
1540+ }
1541+
1542+ *saved_errno = errno;
1543
1544 if (offset_out)
1545 fwrite (line_out, sizeof (char), (size_t) offset_out, stdout);
1546
1547+}
1548+#endif
1549+
1550+/* Fold file FILENAME, or standard input if FILENAME is "-",
1551+ to stdout, with maximum line length WIDTH.
1552+ Return 0 if successful, 1 if an error occurs. */
1553+
1554+static bool
1555+fold_file (char const *filename, size_t width)
1556+{
1557+ FILE *istream;
1558+ int saved_errno;
1559+
1560+ if (STREQ (filename, "-"))
1561+ {
1562+ istream = stdin;
1563+ have_read_stdin = 1;
1564+ }
1565+ else
1566+ istream = fopen (filename, "r");
1567+
1568+ if (istream == NULL)
1569+ {
e1fb4052 1570+ error (0, errno, "%s", filename);
7cf3a80b
ML
1571+ return 1;
1572+ }
1573+
1574+ /* Define how ISTREAM is being folded. */
1575+#if HAVE_MBRTOWC
1576+ if (MB_CUR_MAX > 1)
1577+ fold_multibyte_text (istream, width, &saved_errno);
1578+ else
1579+#endif
1580+ fold_text (istream, width, &saved_errno);
1581+
1582 if (ferror (istream))
1583 {
1584 error (0, saved_errno, "%s", quotef (filename));
e1fb4052 1585@@ -252,7 +498,8 @@ main (int argc, char **argv)
7cf3a80b
ML
1586
1587 atexit (close_stdout);
1588
1589- break_spaces = count_bytes = have_read_stdin = false;
1590+ operating_mode = column_mode;
1591+ break_spaces = have_read_stdin = false;
1592
1593 while ((optc = getopt_long (argc, argv, shortopts, longopts, NULL)) != -1)
1594 {
e1fb4052 1595@@ -261,7 +508,15 @@ main (int argc, char **argv)
7cf3a80b
ML
1596 switch (optc)
1597 {
1598 case 'b': /* Count bytes rather than columns. */
1599- count_bytes = true;
1600+ if (operating_mode != column_mode)
1601+ FATAL_ERROR (_("only one way of folding may be specified"));
1602+ operating_mode = byte_mode;
1603+ break;
1604+
1605+ case 'c':
1606+ if (operating_mode != column_mode)
1607+ FATAL_ERROR (_("only one way of folding may be specified"));
1608+ operating_mode = character_mode;
1609 break;
1610
1611 case 's': /* Break at word boundaries. */
b3478306
PM
1612diff -Naurp coreutils-8.32.orig/src/join.c coreutils-8.32/src/join.c
1613--- coreutils-8.32.orig/src/join.c 2020-01-01 22:13:12.000000000 +0800
1614+++ coreutils-8.32/src/join.c 2020-03-08 12:10:27.736236560 +0800
e1fb4052 1615@@ -22,19 +22,33 @@
7cf3a80b
ML
1616 #include <sys/types.h>
1617 #include <getopt.h>
1618
1619+/* Get mbstate_t, mbrtowc(), wcrtomb(), wcwidth(). */
1620+#if HAVE_WCHAR_H
1621+# include <wchar.h>
1622+#endif
1623+
1624+/* Get iswblank(), towupper. */
1625+#if HAVE_WCTYPE_H
1626+# include <wctype.h>
1627+#endif
1628+
1629 #include "system.h"
e1fb4052 1630 #include "die.h"
7cf3a80b
ML
1631 #include "error.h"
1632 #include "fadvise.h"
1633 #include "hard-locale.h"
1634 #include "linebuffer.h"
1635-#include "memcasecmp.h"
1636 #include "quote.h"
1637 #include "stdio--.h"
1638 #include "xmemcoll.h"
1639 #include "xstrtol.h"
1640 #include "argmatch.h"
1641
1642+/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t. */
1643+#if HAVE_MBRTOWC && defined mbstate_t
1644+# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0)
1645+#endif
1646+
1647 /* The official name of this program (e.g., no 'g' prefix). */
1648 #define PROGRAM_NAME "join"
1649
e1fb4052 1650@@ -136,10 +150,12 @@ static struct outlist outlist_head;
7cf3a80b
ML
1651 /* Last element in 'outlist', where a new element can be added. */
1652 static struct outlist *outlist_end = &outlist_head;
1653
1654-/* Tab character separating fields. If negative, fields are separated
1655- by any nonempty string of blanks, otherwise by exactly one
1656- tab character whose value (when cast to unsigned char) equals TAB. */
1657-static int tab = -1;
1658+/* Tab character separating fields. If NULL, fields are separated
1659+ by any nonempty string of blanks. */
1660+static char *tab = NULL;
1661+
1662+/* The number of bytes used for tab. */
1663+static size_t tablen = 0;
1664
1665 /* If nonzero, check that the input is correctly ordered. */
1666 static enum
e1fb4052 1667@@ -276,13 +292,14 @@ xfields (struct line *line)
7cf3a80b
ML
1668 if (ptr == lim)
1669 return;
1670
1671- if (0 <= tab && tab != '\n')
1672+ if (tab != NULL)
1673 {
1674+ unsigned char t = tab[0];
1675 char *sep;
1676- for (; (sep = memchr (ptr, tab, lim - ptr)) != NULL; ptr = sep + 1)
1677+ for (; (sep = memchr (ptr, t, lim - ptr)) != NULL; ptr = sep + 1)
1678 extract_field (line, ptr, sep - ptr);
1679 }
1680- else if (tab < 0)
1681+ else
1682 {
1683 /* Skip leading blanks before the first field. */
1684 while (field_sep (*ptr))
e1fb4052 1685@@ -306,6 +323,147 @@ xfields (struct line *line)
7cf3a80b
ML
1686 extract_field (line, ptr, lim - ptr);
1687 }
1688
1689+#if HAVE_MBRTOWC
1690+static void
1691+xfields_multibyte (struct line *line)
1692+{
1693+ char *ptr = line->buf.buffer;
1694+ char const *lim = ptr + line->buf.length - 1;
1695+ wchar_t wc = 0;
1696+ size_t mblength = 1;
1697+ mbstate_t state, state_bak;
1698+
1699+ memset (&state, 0, sizeof (mbstate_t));
1700+
1701+ if (ptr >= lim)
1702+ return;
1703+
1704+ if (tab != NULL)
1705+ {
1706+ char *sep = ptr;
1707+ for (; ptr < lim; ptr = sep + mblength)
1708+ {
1709+ sep = ptr;
1710+ while (sep < lim)
1711+ {
1712+ state_bak = state;
1713+ mblength = mbrtowc (&wc, sep, lim - sep + 1, &state);
1714+
1715+ if (mblength == (size_t)-1 || mblength == (size_t)-2)
1716+ {
1717+ mblength = 1;
1718+ state = state_bak;
1719+ }
1720+ mblength = (mblength < 1) ? 1 : mblength;
1721+
1722+ if (mblength == tablen && !memcmp (sep, tab, mblength))
1723+ break;
1724+ else
1725+ {
1726+ sep += mblength;
1727+ continue;
1728+ }
1729+ }
1730+
1731+ if (sep >= lim)
1732+ break;
1733+
1734+ extract_field (line, ptr, sep - ptr);
1735+ }
1736+ }
1737+ else
1738+ {
1739+ /* Skip leading blanks before the first field. */
1740+ while(ptr < lim)
1741+ {
1742+ state_bak = state;
1743+ mblength = mbrtowc (&wc, ptr, lim - ptr + 1, &state);
1744+
1745+ if (mblength == (size_t)-1 || mblength == (size_t)-2)
1746+ {
1747+ mblength = 1;
1748+ state = state_bak;
1749+ break;
1750+ }
1751+ mblength = (mblength < 1) ? 1 : mblength;
1752+
1753+ if (!iswblank(wc) && wc != '\n')
1754+ break;
1755+ ptr += mblength;
1756+ }
1757+
1758+ do
1759+ {
1760+ char *sep;
1761+ state_bak = state;
1762+ mblength = mbrtowc (&wc, ptr, lim - ptr + 1, &state);
1763+ if (mblength == (size_t)-1 || mblength == (size_t)-2)
1764+ {
1765+ mblength = 1;
1766+ state = state_bak;
1767+ break;
1768+ }
1769+ mblength = (mblength < 1) ? 1 : mblength;
1770+
1771+ sep = ptr + mblength;
1772+ while (sep < lim)
1773+ {
1774+ state_bak = state;
1775+ mblength = mbrtowc (&wc, sep, lim - sep + 1, &state);
1776+ if (mblength == (size_t)-1 || mblength == (size_t)-2)
1777+ {
1778+ mblength = 1;
1779+ state = state_bak;
1780+ break;
1781+ }
1782+ mblength = (mblength < 1) ? 1 : mblength;
1783+
1784+ if (iswblank (wc) || wc == '\n')
1785+ break;
1786+
1787+ sep += mblength;
1788+ }
1789+
1790+ extract_field (line, ptr, sep - ptr);
1791+ if (sep >= lim)
1792+ return;
1793+
1794+ state_bak = state;
1795+ mblength = mbrtowc (&wc, sep, lim - sep + 1, &state);
1796+ if (mblength == (size_t)-1 || mblength == (size_t)-2)
1797+ {
1798+ mblength = 1;
1799+ state = state_bak;
1800+ break;
1801+ }
1802+ mblength = (mblength < 1) ? 1 : mblength;
1803+
1804+ ptr = sep + mblength;
1805+ while (ptr < lim)
1806+ {
1807+ state_bak = state;
1808+ mblength = mbrtowc (&wc, ptr, lim - ptr + 1, &state);
1809+ if (mblength == (size_t)-1 || mblength == (size_t)-2)
1810+ {
1811+ mblength = 1;
1812+ state = state_bak;
1813+ break;
1814+ }
1815+ mblength = (mblength < 1) ? 1 : mblength;
1816+
1817+ if (!iswblank (wc) && wc != '\n')
1818+ break;
1819+
1820+ ptr += mblength;
1821+ }
1822+ }
1823+ while (ptr < lim);
1824+ }
1825+
1826+ extract_field (line, ptr, lim - ptr);
1827+}
1828+#endif
1829+
1830 static void
1831 freeline (struct line *line)
1832 {
e1fb4052 1833@@ -327,56 +485,133 @@ keycmp (struct line const *line1, struct
7cf3a80b
ML
1834 size_t jf_1, size_t jf_2)
1835 {
1836 /* Start of field to compare in each file. */
1837- char *beg1;
1838- char *beg2;
1839-
1840- size_t len1;
1841- size_t len2; /* Length of fields to compare. */
1842+ char *beg[2];
1843+ char *copy[2];
1844+ size_t len[2]; /* Length of fields to compare. */
1845 int diff;
1846+ int i, j;
1847+ int mallocd = 0;
1848
1849 if (jf_1 < line1->nfields)
1850 {
1851- beg1 = line1->fields[jf_1].beg;
1852- len1 = line1->fields[jf_1].len;
1853+ beg[0] = line1->fields[jf_1].beg;
1854+ len[0] = line1->fields[jf_1].len;
1855 }
1856 else
1857 {
1858- beg1 = NULL;
1859- len1 = 0;
1860+ beg[0] = NULL;
1861+ len[0] = 0;
1862 }
1863
1864 if (jf_2 < line2->nfields)
1865 {
1866- beg2 = line2->fields[jf_2].beg;
1867- len2 = line2->fields[jf_2].len;
1868+ beg[1] = line2->fields[jf_2].beg;
1869+ len[1] = line2->fields[jf_2].len;
1870 }
1871 else
1872 {
1873- beg2 = NULL;
1874- len2 = 0;
1875+ beg[1] = NULL;
1876+ len[1] = 0;
1877 }
1878
1879- if (len1 == 0)
1880- return len2 == 0 ? 0 : -1;
1881- if (len2 == 0)
1882+ if (len[0] == 0)
1883+ return len[1] == 0 ? 0 : -1;
1884+ if (len[1] == 0)
1885 return 1;
1886
1887 if (ignore_case)
1888 {
1889- /* FIXME: ignore_case does not work with NLS (in particular,
1890- with multibyte chars). */
1891- diff = memcasecmp (beg1, beg2, MIN (len1, len2));
1892+#ifdef HAVE_MBRTOWC
1893+ if (MB_CUR_MAX > 1)
1894+ {
1895+ size_t mblength;
1896+ wchar_t wc, uwc;
1897+ mbstate_t state, state_bak;
1898+
1899+ memset (&state, '\0', sizeof (mbstate_t));
1900+
1901+ for (i = 0; i < 2; i++)
1902+ {
1903+ mallocd = 1;
1904+ copy[i] = xmalloc (len[i] + 1);
1905+ memset (copy[i], '\0',len[i] + 1);
1906+
1907+ for (j = 0; j < MIN (len[0], len[1]);)
1908+ {
1909+ state_bak = state;
1910+ mblength = mbrtowc (&wc, beg[i] + j, len[i] - j, &state);
1911+
1912+ switch (mblength)
1913+ {
1914+ case (size_t) -1:
1915+ case (size_t) -2:
1916+ state = state_bak;
1917+ /* Fall through */
1918+ case 0:
1919+ mblength = 1;
1920+ break;
1921+
1922+ default:
1923+ uwc = towupper (wc);
1924+
1925+ if (uwc != wc)
1926+ {
1927+ mbstate_t state_wc;
1928+ size_t mblen;
1929+
1930+ memset (&state_wc, '\0', sizeof (mbstate_t));
1931+ mblen = wcrtomb (copy[i] + j, uwc, &state_wc);
1932+ assert (mblen != (size_t)-1);
1933+ }
1934+ else
1935+ memcpy (copy[i] + j, beg[i] + j, mblength);
1936+ }
1937+ j += mblength;
1938+ }
1939+ copy[i][j] = '\0';
1940+ }
1941+ }
1942+ else
1943+#endif
1944+ {
1945+ for (i = 0; i < 2; i++)
1946+ {
1947+ mallocd = 1;
1948+ copy[i] = xmalloc (len[i] + 1);
1949+
1950+ for (j = 0; j < MIN (len[0], len[1]); j++)
1951+ copy[i][j] = toupper (beg[i][j]);
1952+
1953+ copy[i][j] = '\0';
1954+ }
1955+ }
1956 }
1957 else
1958 {
1959- if (hard_LC_COLLATE)
1960- return xmemcoll (beg1, len1, beg2, len2);
1961- diff = memcmp (beg1, beg2, MIN (len1, len2));
1962+ copy[0] = beg[0];
1963+ copy[1] = beg[1];
e1fb4052
MF
1964 }
1965
7cf3a80b
ML
1966+ if (hard_LC_COLLATE)
1967+ {
1968+ diff = xmemcoll ((char *) copy[0], len[0], (char *) copy[1], len[1]);
1969+
1970+ if (mallocd)
1971+ for (i = 0; i < 2; i++)
1972+ free (copy[i]);
1973+
1974+ return diff;
e1fb4052 1975+ }
7cf3a80b
ML
1976+ diff = memcmp (copy[0], copy[1], MIN (len[0], len[1]));
1977+
1978+ if (mallocd)
1979+ for (i = 0; i < 2; i++)
1980+ free (copy[i]);
1981+
e1fb4052 1982+
7cf3a80b
ML
1983 if (diff)
1984 return diff;
1985- return len1 < len2 ? -1 : len1 != len2;
1986+ return len[0] - len[1];
1987 }
1988
1989 /* Check that successive input lines PREV and CURRENT from input file
e1fb4052 1990@@ -468,6 +703,11 @@ get_line (FILE *fp, struct line **linep,
7cf3a80b
ML
1991 }
1992 ++line_no[which - 1];
1993
1994+#if HAVE_MBRTOWC
1995+ if (MB_CUR_MAX > 1)
1996+ xfields_multibyte (line);
1997+ else
1998+#endif
1999 xfields (line);
2000
2001 if (prevline[which - 1])
b3478306 2002@@ -563,21 +803,28 @@ prfield (size_t n, struct line const *li
7cf3a80b
ML
2003
2004 /* Output all the fields in line, other than the join field. */
2005
2006+#define PUT_TAB_CHAR \
2007+ do \
2008+ { \
2009+ (tab != NULL) ? \
2010+ fwrite(tab, sizeof(char), tablen, stdout) : putchar (' '); \
2011+ } \
2012+ while (0)
2013+
2014 static void
2015 prfields (struct line const *line, size_t join_field, size_t autocount)
2016 {
2017 size_t i;
2018 size_t nfields = autoformat ? autocount : line->nfields;
2019- char output_separator = tab < 0 ? ' ' : tab;
2020
2021 for (i = 0; i < join_field && i < nfields; ++i)
2022 {
2023- putchar (output_separator);
2024+ PUT_TAB_CHAR;
2025 prfield (i, line);
2026 }
2027 for (i = join_field + 1; i < nfields; ++i)
2028 {
2029- putchar (output_separator);
2030+ PUT_TAB_CHAR;
2031 prfield (i, line);
2032 }
2033 }
b3478306 2034@@ -588,7 +835,6 @@ static void
7cf3a80b
ML
2035 prjoin (struct line const *line1, struct line const *line2)
2036 {
2037 const struct outlist *outlist;
2038- char output_separator = tab < 0 ? ' ' : tab;
2039 size_t field;
2040 struct line const *line;
2041
b3478306 2042@@ -622,7 +868,7 @@ prjoin (struct line const *line1, struct
7cf3a80b
ML
2043 o = o->next;
2044 if (o == NULL)
2045 break;
2046- putchar (output_separator);
2047+ PUT_TAB_CHAR;
2048 }
2049 putchar (eolchar);
2050 }
b3478306 2051@@ -1098,20 +1344,43 @@ main (int argc, char **argv)
7cf3a80b
ML
2052
2053 case 't':
2054 {
2055- unsigned char newtab = optarg[0];
2056+ char *newtab = NULL;
2057+ size_t newtablen;
2058+ newtab = xstrdup (optarg);
2059+#if HAVE_MBRTOWC
2060+ if (MB_CUR_MAX > 1)
2061+ {
2062+ mbstate_t state;
2063+
2064+ memset (&state, 0, sizeof (mbstate_t));
2065+ newtablen = mbrtowc (NULL, newtab,
2066+ strnlen (newtab, MB_LEN_MAX),
2067+ &state);
2068+ if (newtablen == (size_t) 0
2069+ || newtablen == (size_t) -1
2070+ || newtablen == (size_t) -2)
2071+ newtablen = 1;
2072+ }
2073+ else
2074+#endif
2075+ newtablen = 1;
2076 if (! newtab)
2077- newtab = '\n'; /* '' => process the whole line. */
7cf3a80b 2078+ newtab = (char*)"\n"; /* '' => process the whole line. */
7cf3a80b
ML
2079 else if (optarg[1])
2080 {
2081- if (STREQ (optarg, "\\0"))
2082- newtab = '\0';
2083- else
e1fb4052
MF
2084- die (EXIT_FAILURE, 0, _("multi-character tab %s"),
2085- quote (optarg));
7cf3a80b
ML
2086+ if (newtablen == 1 && newtab[1])
2087+ {
2088+ if (STREQ (newtab, "\\0"))
2089+ newtab[0] = '\0';
2090+ }
2091+ }
2092+ if (tab != NULL && strcmp (tab, newtab))
2093+ {
2094+ free (newtab);
e1fb4052 2095+ die (EXIT_FAILURE, 0, _("incompatible tabs"));
7cf3a80b
ML
2096 }
2097- if (0 <= tab && tab != newtab)
e1fb4052 2098- die (EXIT_FAILURE, 0, _("incompatible tabs"));
7cf3a80b 2099 tab = newtab;
7cf3a80b 2100+ tablen = newtablen;
e1fb4052 2101 }
7cf3a80b
ML
2102 break;
2103
b3478306
PM
2104diff -Naurp coreutils-8.32.orig/src/pr.c coreutils-8.32/src/pr.c
2105--- coreutils-8.32.orig/src/pr.c 2020-01-01 22:33:18.000000000 +0800
2106+++ coreutils-8.32/src/pr.c 2020-03-08 12:10:27.737236560 +0800
7cf3a80b
ML
2107@@ -311,6 +311,24 @@
2108
2109 #include <getopt.h>
2110 #include <sys/types.h>
2111+
2112+/* Get MB_LEN_MAX. */
2113+#include <limits.h>
2114+/* MB_LEN_MAX is incorrectly defined to be 1 in at least one GCC
2115+ installation; work around this configuration error. */
2116+#if !defined MB_LEN_MAX || MB_LEN_MAX == 1
2117+# define MB_LEN_MAX 16
2118+#endif
2119+
2120+/* Get MB_CUR_MAX. */
2121+#include <stdlib.h>
2122+
2123+/* Solaris 2.5 has a bug: <wchar.h> must be included before <wctype.h>. */
2124+/* Get mbstate_t, mbrtowc(), wcwidth(). */
2125+#if HAVE_WCHAR_H
2126+# include <wchar.h>
2127+#endif
2128+
2129 #include "system.h"
e1fb4052 2130 #include "die.h"
7cf3a80b 2131 #include "error.h"
b3478306
PM
2132@@ -325,6 +343,18 @@
2133 #include "xstrtol-error.h"
7cf3a80b
ML
2134 #include "xdectoint.h"
2135
2136+/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t. */
2137+#if HAVE_MBRTOWC && defined mbstate_t
2138+# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0)
2139+#endif
2140+
2141+#ifndef HAVE_DECL_WCWIDTH
2142+"this configure-time declaration test was not run"
2143+#endif
2144+#if !HAVE_DECL_WCWIDTH
2145+extern int wcwidth ();
2146+#endif
2147+
2148 /* The official name of this program (e.g., no 'g' prefix). */
2149 #define PROGRAM_NAME "pr"
2150
b3478306 2151@@ -417,7 +447,20 @@ struct COLUMN
7cf3a80b
ML
2152
2153 typedef struct COLUMN COLUMN;
2154
2155-static int char_to_clump (char c);
2156+/* Function pointers to switch functions for single byte locale or for
2157+ multibyte locale. If multibyte functions do not exist in your system,
2158+ these pointers always point to the functions for single byte locale. */
2159+static void (*print_char) (char c);
2160+static int (*char_to_clump) (char c);
2161+
2162+/* Functions for single byte locale. */
2163+static void print_char_single (char c);
2164+static int char_to_clump_single (char c);
2165+
2166+/* Functions for multibyte locale. */
2167+static void print_char_multi (char c);
2168+static int char_to_clump_multi (char c);
2169+
2170 static bool read_line (COLUMN *p);
2171 static bool print_page (void);
2172 static bool print_stored (COLUMN *p);
b3478306 2173@@ -429,6 +472,7 @@ static void add_line_number (COLUMN *p);
7cf3a80b
ML
2174 static void getoptnum (const char *n_str, int min, int *num,
2175 const char *errfmt);
2176 static void getoptarg (char *arg, char switch_char, char *character,
2177+ int *character_length, int *character_width,
2178 int *number);
2179 static void print_files (int number_of_files, char **av);
2180 static void init_parameters (int number_of_files);
b3478306 2181@@ -442,7 +486,6 @@ static void store_char (char c);
7cf3a80b
ML
2182 static void pad_down (unsigned int lines);
2183 static void read_rest_of_line (COLUMN *p);
2184 static void skip_read (COLUMN *p, int column_number);
2185-static void print_char (char c);
2186 static void cleanup (void);
2187 static void print_sep_string (void);
2188 static void separator_string (const char *optarg_S);
b3478306 2189@@ -454,7 +497,7 @@ static COLUMN *column_vector;
7cf3a80b
ML
2190 we store the leftmost columns contiguously in buff.
2191 To print a line from buff, get the index of the first character
2192 from line_vector[i], and print up to line_vector[i + 1]. */
2193-static char *buff;
2194+static unsigned char *buff;
2195
2196 /* Index of the position in buff where the next character
2197 will be stored. */
b3478306 2198@@ -558,7 +601,7 @@ static int chars_per_column;
7cf3a80b
ML
2199 static bool untabify_input = false;
2200
2201 /* (-e) The input tab character. */
2202-static char input_tab_char = '\t';
2203+static char input_tab_char[MB_LEN_MAX] = "\t";
2204
2205 /* (-e) Tabstops are at chars_per_tab, 2*chars_per_tab, 3*chars_per_tab, ...
2206 where the leftmost column is 1. */
b3478306 2207@@ -568,7 +611,10 @@ static int chars_per_input_tab = 8;
7cf3a80b
ML
2208 static bool tabify_output = false;
2209
2210 /* (-i) The output tab character. */
2211-static char output_tab_char = '\t';
2212+static char output_tab_char[MB_LEN_MAX] = "\t";
2213+
2214+/* (-i) The byte length of output tab character. */
2215+static int output_tab_char_length = 1;
2216
2217 /* (-i) The width of the output tab. */
2218 static int chars_per_output_tab = 8;
b3478306 2219@@ -638,7 +684,13 @@ static int line_number;
7cf3a80b
ML
2220 static bool numbered_lines = false;
2221
2222 /* (-n) Character which follows each line number. */
2223-static char number_separator = '\t';
2224+static char number_separator[MB_LEN_MAX] = "\t";
2225+
2226+/* (-n) The byte length of the character which follows each line number. */
2227+static int number_separator_length = 1;
2228+
2229+/* (-n) The character width of the character which follows each line number. */
2230+static int number_separator_width = 0;
2231
2232 /* (-n) line counting starts with 1st line of input file (not with 1st
2233 line of 1st page printed). */
b3478306 2234@@ -691,6 +743,7 @@ static bool use_col_separator = false;
7cf3a80b 2235 -a|COLUMN|-m is a 'space' and with the -J option a 'tab'. */
e1fb4052 2236 static char const *col_sep_string = "";
7cf3a80b
ML
2237 static int col_sep_length = 0;
2238+static int col_sep_width = 0;
2239 static char *column_separator = (char *) " ";
2240 static char *line_separator = (char *) "\t";
2241
b3478306 2242@@ -852,6 +905,13 @@ separator_string (const char *optarg_S)
e1fb4052
MF
2243 integer_overflow ();
2244 col_sep_length = len;
2245 col_sep_string = optarg_S;
7cf3a80b
ML
2246+
2247+#if HAVE_MBRTOWC
2248+ if (MB_CUR_MAX > 1)
2249+ col_sep_width = mbswidth (col_sep_string, 0);
2250+ else
2251+#endif
2252+ col_sep_width = col_sep_length;
2253 }
2254
2255 int
b3478306 2256@@ -876,6 +936,21 @@ main (int argc, char **argv)
7cf3a80b
ML
2257
2258 atexit (close_stdout);
2259
2260+/* Select the single-byte or the multibyte implementations of print_char and
2261+ char_to_clump, depending on whether the current locale is multibyte. */
2262+#if HAVE_MBRTOWC
2263+ if (MB_CUR_MAX > 1)
2264+ {
2265+ print_char = print_char_multi;
2266+ char_to_clump = char_to_clump_multi;
2267+ }
2268+ else
2269+#endif
2270+ {
2271+ print_char = print_char_single;
2272+ char_to_clump = char_to_clump_single;
2273+ }
2274+
2275 n_files = 0;
2276 file_names = (argc > 1
e1fb4052 2277 ? xnmalloc (argc - 1, sizeof (char *))
b3478306 2278@@ -952,8 +1027,12 @@ main (int argc, char **argv)
7cf3a80b
ML
2279 break;
2280 case 'e':
2281 if (optarg)
2282- getoptarg (optarg, 'e', &input_tab_char,
2283- &chars_per_input_tab);
2284+ {
2285+ int dummy_length, dummy_width;
2286+
2287+ getoptarg (optarg, 'e', input_tab_char, &dummy_length,
2288+ &dummy_width, &chars_per_input_tab);
2289+ }
2290 /* Could check tab width > 0. */
2291 untabify_input = true;
2292 break;
b3478306 2293@@ -966,8 +1045,12 @@ main (int argc, char **argv)
7cf3a80b
ML
2294 break;
2295 case 'i':
2296 if (optarg)
2297- getoptarg (optarg, 'i', &output_tab_char,
2298- &chars_per_output_tab);
2299+ {
2300+ int dummy_width;
2301+
2302+ getoptarg (optarg, 'i', output_tab_char, &output_tab_char_length,
2303+ &dummy_width, &chars_per_output_tab);
2304+ }
2305 /* Could check tab width > 0. */
2306 tabify_output = true;
2307 break;
b3478306 2308@@ -985,8 +1068,8 @@ main (int argc, char **argv)
7cf3a80b
ML
2309 case 'n':
2310 numbered_lines = true;
2311 if (optarg)
2312- getoptarg (optarg, 'n', &number_separator,
2313- &chars_per_number);
2314+ getoptarg (optarg, 'n', number_separator, &number_separator_length,
2315+ &number_separator_width, &chars_per_number);
2316 break;
2317 case 'N':
2318 skip_count = false;
b3478306 2319@@ -1011,6 +1094,7 @@ main (int argc, char **argv)
7cf3a80b 2320 /* Reset an additional input of -s, -S dominates -s */
e1fb4052
MF
2321 col_sep_string = "";
2322 col_sep_length = 0;
2323+ col_sep_width = 0;
7cf3a80b
ML
2324 use_col_separator = true;
2325 if (optarg)
2326 separator_string (optarg);
e1fb4052 2327@@ -1166,10 +1250,45 @@ getoptnum (const char *n_str, int min, i
7cf3a80b
ML
2328 a number. */
2329
2330 static void
2331-getoptarg (char *arg, char switch_char, char *character, int *number)
2332+getoptarg (char *arg, char switch_char, char *character, int *character_length,
2333+ int *character_width, int *number)
2334 {
2335 if (!ISDIGIT (*arg))
2336- *character = *arg++;
2337+ {
2338+#ifdef HAVE_MBRTOWC
2339+ if (MB_CUR_MAX > 1) /* for multibyte locale. */
2340+ {
2341+ wchar_t wc;
2342+ size_t mblength;
2343+ int width;
2344+ mbstate_t state = {'\0'};
2345+
2346+ mblength = mbrtowc (&wc, arg, strnlen(arg, MB_LEN_MAX), &state);
2347+
2348+ if (mblength == (size_t)-1 || mblength == (size_t)-2)
2349+ {
2350+ *character_length = 1;
2351+ *character_width = 1;
2352+ }
2353+ else
2354+ {
2355+ *character_length = (mblength < 1) ? 1 : mblength;
2356+ width = wcwidth (wc);
2357+ *character_width = (width < 0) ? 0 : width;
2358+ }
2359+
2360+ strncpy (character, arg, *character_length);
2361+ arg += *character_length;
2362+ }
2363+ else /* for single byte locale. */
2364+#endif
2365+ {
2366+ *character = *arg++;
2367+ *character_length = 1;
2368+ *character_width = 1;
2369+ }
2370+ }
2371+
2372 if (*arg)
2373 {
2374 long int tmp_long;
e1fb4052 2375@@ -1191,6 +1310,11 @@ static void
7cf3a80b
ML
2376 init_parameters (int number_of_files)
2377 {
2378 int chars_used_by_number = 0;
2379+ int mb_len = 1;
2380+#if HAVE_MBRTOWC
2381+ if (MB_CUR_MAX > 1)
2382+ mb_len = MB_LEN_MAX;
2383+#endif
2384
2385 lines_per_body = lines_per_page - lines_per_header - lines_per_footer;
2386 if (lines_per_body <= 0)
e1fb4052 2387@@ -1228,7 +1352,7 @@ init_parameters (int number_of_files)
7cf3a80b
ML
2388 else
2389 col_sep_string = column_separator;
2390
2391- col_sep_length = 1;
2392+ col_sep_length = col_sep_width = 1;
2393 use_col_separator = true;
2394 }
2395 /* It's rather pointless to define a TAB separator with column
e1fb4052 2396@@ -1258,11 +1382,11 @@ init_parameters (int number_of_files)
7cf3a80b
ML
2397 + TAB_WIDTH (chars_per_input_tab, chars_per_number); */
2398
2399 /* Estimate chars_per_text without any margin and keep it constant. */
2400- if (number_separator == '\t')
2401+ if (number_separator[0] == '\t')
2402 number_width = (chars_per_number
2403 + TAB_WIDTH (chars_per_default_tab, chars_per_number));
2404 else
2405- number_width = chars_per_number + 1;
2406+ number_width = chars_per_number + number_separator_width;
2407
2408 /* The number is part of the column width unless we are
2409 printing files in parallel. */
e1fb4052 2410@@ -1271,7 +1395,7 @@ init_parameters (int number_of_files)
7cf3a80b
ML
2411 }
2412
e1fb4052
MF
2413 int sep_chars, useful_chars;
2414- if (INT_MULTIPLY_WRAPV (columns - 1, col_sep_length, &sep_chars))
2415+ if (INT_MULTIPLY_WRAPV (columns - 1, col_sep_width, &sep_chars))
2416 sep_chars = INT_MAX;
2417 if (INT_SUBTRACT_WRAPV (chars_per_line - chars_used_by_number, sep_chars,
2418 &useful_chars))
2419@@ -1294,7 +1418,7 @@ init_parameters (int number_of_files)
7cf3a80b
ML
2420 We've to use 8 as the lower limit, if we use chars_per_default_tab = 8
2421 to expand a tab which is not an input_tab-char. */
2422 free (clump_buff);
2423- clump_buff = xmalloc (MAX (8, chars_per_input_tab));
2424+ clump_buff = xmalloc (mb_len * MAX (8, chars_per_input_tab));
2425 }
2426
2427 /* Open the necessary files,
b3478306 2428@@ -1400,7 +1524,7 @@ init_funcs (void)
7cf3a80b
ML
2429
2430 /* Enlarge p->start_position of first column to use the same form of
2431 padding_not_printed with all columns. */
2432- h = h + col_sep_length;
2433+ h = h + col_sep_width;
2434
2435 /* This loop takes care of all but the rightmost column. */
2436
b3478306 2437@@ -1434,7 +1558,7 @@ init_funcs (void)
7cf3a80b
ML
2438 }
2439 else
2440 {
2441- h = h_next + col_sep_length;
2442+ h = h_next + col_sep_width;
2443 h_next = h + chars_per_column;
2444 }
2445 }
b3478306 2446@@ -1725,9 +1849,9 @@ static void
7cf3a80b
ML
2447 align_column (COLUMN *p)
2448 {
2449 padding_not_printed = p->start_position;
e1fb4052
MF
2450- if (col_sep_length < padding_not_printed)
2451+ if (col_sep_width < padding_not_printed)
7cf3a80b
ML
2452 {
2453- pad_across_to (padding_not_printed - col_sep_length);
2454+ pad_across_to (padding_not_printed - col_sep_width);
2455 padding_not_printed = ANYWHERE;
2456 }
2457
b3478306 2458@@ -2002,13 +2126,13 @@ store_char (char c)
7cf3a80b
ML
2459 /* May be too generous. */
2460 buff = X2REALLOC (buff, &buff_allocated);
2461 }
2462- buff[buff_current++] = c;
2463+ buff[buff_current++] = (unsigned char) c;
2464 }
2465
2466 static void
2467 add_line_number (COLUMN *p)
2468 {
2469- int i;
2470+ int i, j;
2471 char *s;
2472 int num_width;
2473
b3478306 2474@@ -2025,22 +2149,24 @@ add_line_number (COLUMN *p)
7cf3a80b
ML
2475 /* Tabification is assumed for multiple columns, also for n-separators,
2476 but 'default n-separator = TAB' hasn't been given priority over
2477 equal column_width also specified by POSIX. */
2478- if (number_separator == '\t')
2479+ if (number_separator[0] == '\t')
2480 {
2481 i = number_width - chars_per_number;
2482 while (i-- > 0)
2483 (p->char_func) (' ');
2484 }
2485 else
2486- (p->char_func) (number_separator);
2487+ for (j = 0; j < number_separator_length; j++)
2488+ (p->char_func) (number_separator[j]);
2489 }
2490 else
2491 /* To comply with POSIX, we avoid any expansion of default TAB
2492 separator with a single column output. No column_width requirement
2493 has to be considered. */
2494 {
2495- (p->char_func) (number_separator);
2496- if (number_separator == '\t')
2497+ for (j = 0; j < number_separator_length; j++)
2498+ (p->char_func) (number_separator[j]);
2499+ if (number_separator[0] == '\t')
2500 output_position = POS_AFTER_TAB (chars_per_output_tab,
2501 output_position);
2502 }
b3478306 2503@@ -2199,7 +2325,7 @@ print_white_space (void)
7cf3a80b
ML
2504 while (goal - h_old > 1
2505 && (h_new = POS_AFTER_TAB (chars_per_output_tab, h_old)) <= goal)
2506 {
2507- putchar (output_tab_char);
2508+ fwrite (output_tab_char, sizeof(char), output_tab_char_length, stdout);
2509 h_old = h_new;
2510 }
2511 while (++h_old <= goal)
b3478306 2512@@ -2219,6 +2345,7 @@ print_sep_string (void)
7cf3a80b 2513 {
e1fb4052 2514 char const *s = col_sep_string;
7cf3a80b
ML
2515 int l = col_sep_length;
2516+ int not_space_flag;
2517
e1fb4052
MF
2518 if (separators_not_printed <= 0)
2519 {
b3478306 2520@@ -2230,6 +2357,7 @@ print_sep_string (void)
7cf3a80b
ML
2521 {
2522 for (; separators_not_printed > 0; --separators_not_printed)
2523 {
2524+ not_space_flag = 0;
2525 while (l-- > 0)
2526 {
2527 /* 3 types of sep_strings: spaces only, spaces and chars,
b3478306 2528@@ -2243,12 +2371,15 @@ print_sep_string (void)
7cf3a80b
ML
2529 }
2530 else
2531 {
2532+ not_space_flag = 1;
2533 if (spaces_not_printed > 0)
2534 print_white_space ();
2535 putchar (*s++);
2536- ++output_position;
2537 }
2538 }
2539+ if (not_space_flag)
2540+ output_position += col_sep_width;
2541+
2542 /* sep_string ends with some spaces */
2543 if (spaces_not_printed > 0)
2544 print_white_space ();
b3478306 2545@@ -2276,7 +2407,7 @@ print_clump (COLUMN *p, int n, char *clu
7cf3a80b
ML
2546 required number of tabs and spaces. */
2547
2548 static void
2549-print_char (char c)
2550+print_char_single (char c)
2551 {
2552 if (tabify_output)
2553 {
b3478306 2554@@ -2300,6 +2431,74 @@ print_char (char c)
7cf3a80b
ML
2555 putchar (c);
2556 }
2557
2558+#ifdef HAVE_MBRTOWC
2559+static void
2560+print_char_multi (char c)
2561+{
2562+ static size_t mbc_pos = 0;
2563+ static char mbc[MB_LEN_MAX] = {'\0'};
2564+ static mbstate_t state = {'\0'};
2565+ mbstate_t state_bak;
2566+ wchar_t wc;
2567+ size_t mblength;
2568+ int width;
2569+
2570+ if (tabify_output)
2571+ {
2572+ state_bak = state;
2573+ mbc[mbc_pos++] = c;
2574+ mblength = mbrtowc (&wc, mbc, mbc_pos, &state);
2575+
2576+ while (mbc_pos > 0)
2577+ {
2578+ switch (mblength)
2579+ {
2580+ case (size_t)-2:
2581+ state = state_bak;
2582+ return;
2583+
2584+ case (size_t)-1:
2585+ state = state_bak;
2586+ ++output_position;
2587+ putchar (mbc[0]);
2588+ memmove (mbc, mbc + 1, MB_CUR_MAX - 1);
2589+ --mbc_pos;
2590+ break;
2591+
2592+ case 0:
2593+ mblength = 1;
2594+
2595+ default:
2596+ if (wc == L' ')
2597+ {
2598+ memmove (mbc, mbc + mblength, MB_CUR_MAX - mblength);
2599+ --mbc_pos;
2600+ ++spaces_not_printed;
2601+ return;
2602+ }
2603+ else if (spaces_not_printed > 0)
2604+ print_white_space ();
2605+
2606+ /* Nonprintables are assumed to have width 0, except L'\b'. */
2607+ if ((width = wcwidth (wc)) < 1)
2608+ {
2609+ if (wc == L'\b')
2610+ --output_position;
2611+ }
2612+ else
2613+ output_position += width;
2614+
2615+ fwrite (mbc, sizeof(char), mblength, stdout);
2616+ memmove (mbc, mbc + mblength, MB_CUR_MAX - mblength);
2617+ mbc_pos -= mblength;
2618+ }
2619+ }
2620+ return;
2621+ }
2622+ putchar (c);
2623+}
2624+#endif
2625+
2626 /* Skip to page PAGE before printing.
2627 PAGE may be larger than total number of pages. */
2628
b3478306 2629@@ -2477,9 +2676,9 @@ read_line (COLUMN *p)
7cf3a80b
ML
2630 align_empty_cols = false;
2631 }
2632
e1fb4052
MF
2633- if (col_sep_length < padding_not_printed)
2634+ if (col_sep_width < padding_not_printed)
7cf3a80b
ML
2635 {
2636- pad_across_to (padding_not_printed - col_sep_length);
2637+ pad_across_to (padding_not_printed - col_sep_width);
2638 padding_not_printed = ANYWHERE;
2639 }
2640
b3478306
PM
2641@@ -2548,7 +2747,7 @@ print_stored (COLUMN *p)
2642 COLUMN *q;
7cf3a80b
ML
2643
2644 int line = p->current_line++;
2645- char *first = &buff[line_vector[line]];
2646+ unsigned char *first = &buff[line_vector[line]];
2647 /* FIXME
2648 UMR: Uninitialized memory read:
2649 * This is occurring while in:
b3478306 2650@@ -2560,7 +2759,7 @@ print_stored (COLUMN *p)
7cf3a80b
ML
2651 xmalloc [xmalloc.c:94]
2652 init_store_cols [pr.c:1648]
2653 */
2654- char *last = &buff[line_vector[line + 1]];
2655+ unsigned char *last = &buff[line_vector[line + 1]];
2656
2657 pad_vertically = true;
2658
b3478306 2659@@ -2580,9 +2779,9 @@ print_stored (COLUMN *p)
7cf3a80b
ML
2660 }
2661 }
2662
e1fb4052
MF
2663- if (col_sep_length < padding_not_printed)
2664+ if (col_sep_width < padding_not_printed)
7cf3a80b
ML
2665 {
2666- pad_across_to (padding_not_printed - col_sep_length);
2667+ pad_across_to (padding_not_printed - col_sep_width);
2668 padding_not_printed = ANYWHERE;
2669 }
2670
b3478306 2671@@ -2595,8 +2794,8 @@ print_stored (COLUMN *p)
7cf3a80b
ML
2672 if (spaces_not_printed == 0)
2673 {
2674 output_position = p->start_position + end_vector[line];
2675- if (p->start_position - col_sep_length == chars_per_margin)
2676- output_position -= col_sep_length;
2677+ if (p->start_position - col_sep_width == chars_per_margin)
2678+ output_position -= col_sep_width;
2679 }
2680
2681 return true;
b3478306 2682@@ -2615,7 +2814,7 @@ print_stored (COLUMN *p)
7cf3a80b
ML
2683 number of characters is 1.) */
2684
2685 static int
2686-char_to_clump (char c)
2687+char_to_clump_single (char c)
2688 {
2689 unsigned char uc = c;
2690 char *s = clump_buff;
b3478306 2691@@ -2625,10 +2824,10 @@ char_to_clump (char c)
7cf3a80b
ML
2692 int chars;
2693 int chars_per_c = 8;
2694
2695- if (c == input_tab_char)
2696+ if (c == input_tab_char[0])
2697 chars_per_c = chars_per_input_tab;
2698
2699- if (c == input_tab_char || c == '\t')
2700+ if (c == input_tab_char[0] || c == '\t')
2701 {
2702 width = TAB_WIDTH (chars_per_c, input_position);
2703
b3478306 2704@@ -2709,6 +2908,164 @@ char_to_clump (char c)
7cf3a80b
ML
2705 return chars;
2706 }
2707
2708+#ifdef HAVE_MBRTOWC
2709+static int
2710+char_to_clump_multi (char c)
2711+{
2712+ static size_t mbc_pos = 0;
2713+ static char mbc[MB_LEN_MAX] = {'\0'};
2714+ static mbstate_t state = {'\0'};
2715+ mbstate_t state_bak;
2716+ wchar_t wc;
2717+ size_t mblength;
2718+ int wc_width;
2719+ register char *s = clump_buff;
2720+ register int i, j;
2721+ char esc_buff[4];
2722+ int width;
2723+ int chars;
2724+ int chars_per_c = 8;
2725+
2726+ state_bak = state;
2727+ mbc[mbc_pos++] = c;
2728+ mblength = mbrtowc (&wc, mbc, mbc_pos, &state);
2729+
2730+ width = 0;
2731+ chars = 0;
2732+ while (mbc_pos > 0)
2733+ {
2734+ switch (mblength)
2735+ {
2736+ case (size_t)-2:
2737+ state = state_bak;
2738+ return 0;
2739+
2740+ case (size_t)-1:
2741+ state = state_bak;
2742+ mblength = 1;
2743+
2744+ if (use_esc_sequence || use_cntrl_prefix)
2745+ {
2746+ width = +4;
2747+ chars = +4;
2748+ *s++ = '\\';
2749+ sprintf (esc_buff, "%03o", (unsigned char) mbc[0]);
2750+ for (i = 0; i <= 2; ++i)
2751+ *s++ = (int) esc_buff[i];
2752+ }
2753+ else
2754+ {
2755+ width += 1;
2756+ chars += 1;
2757+ *s++ = mbc[0];
2758+ }
2759+ break;
2760+
2761+ case 0:
2762+ mblength = 1;
2763+ /* Fall through */
2764+
2765+ default:
2766+ if (memcmp (mbc, input_tab_char, mblength) == 0)
2767+ chars_per_c = chars_per_input_tab;
2768+
2769+ if (memcmp (mbc, input_tab_char, mblength) == 0 || c == '\t')
2770+ {
2771+ int width_inc;
2772+
2773+ width_inc = TAB_WIDTH (chars_per_c, input_position);
2774+ width += width_inc;
2775+
2776+ if (untabify_input)
2777+ {
2778+ for (i = width_inc; i; --i)
2779+ *s++ = ' ';
2780+ chars += width_inc;
2781+ }
2782+ else
2783+ {
2784+ for (i = 0; i < mblength; i++)
2785+ *s++ = mbc[i];
2786+ chars += mblength;
2787+ }
2788+ }
2789+ else if ((wc_width = wcwidth (wc)) < 1)
2790+ {
2791+ if (use_esc_sequence)
2792+ {
2793+ for (i = 0; i < mblength; i++)
2794+ {
2795+ width += 4;
2796+ chars += 4;
2797+ *s++ = '\\';
2798+ sprintf (esc_buff, "%03o", (unsigned char) mbc[i]);
2799+ for (j = 0; j <= 2; ++j)
2800+ *s++ = (int) esc_buff[j];
2801+ }
2802+ }
2803+ else if (use_cntrl_prefix)
2804+ {
2805+ if (wc < 0200)
2806+ {
2807+ width += 2;
2808+ chars += 2;
2809+ *s++ = '^';
2810+ *s++ = wc ^ 0100;
2811+ }
2812+ else
2813+ {
2814+ for (i = 0; i < mblength; i++)
2815+ {
2816+ width += 4;
2817+ chars += 4;
2818+ *s++ = '\\';
2819+ sprintf (esc_buff, "%03o", (unsigned char) mbc[i]);
2820+ for (j = 0; j <= 2; ++j)
2821+ *s++ = (int) esc_buff[j];
2822+ }
2823+ }
2824+ }
2825+ else if (wc == L'\b')
2826+ {
2827+ width += -1;
2828+ chars += 1;
2829+ *s++ = c;
2830+ }
2831+ else
2832+ {
2833+ width += 0;
2834+ chars += mblength;
2835+ for (i = 0; i < mblength; i++)
2836+ *s++ = mbc[i];
2837+ }
2838+ }
2839+ else
2840+ {
2841+ width += wc_width;
2842+ chars += mblength;
2843+ for (i = 0; i < mblength; i++)
2844+ *s++ = mbc[i];
2845+ }
2846+ }
2847+ memmove (mbc, mbc + mblength, MB_CUR_MAX - mblength);
2848+ mbc_pos -= mblength;
2849+ }
2850+
2851+ /* Too many backspaces must put us in position 0 -- never negative. */
2852+ if (width < 0 && input_position == 0)
2853+ {
2854+ chars = 0;
2855+ input_position = 0;
2856+ }
2857+ else if (width < 0 && input_position <= -width)
2858+ input_position = 0;
2859+ else
2860+ input_position += width;
2861+
2862+ return chars;
2863+}
2864+#endif
2865+
2866 /* We've just printed some files and need to clean up things before
2867 looking for more options and printing the next batch of files.
2868
b3478306
PM
2869diff -Naurp coreutils-8.32.orig/src/sort.c coreutils-8.32/src/sort.c
2870--- coreutils-8.32.orig/src/sort.c 2020-01-01 22:33:34.000000000 +0800
2871+++ coreutils-8.32/src/sort.c 2020-03-08 12:10:27.738236560 +0800
7cf3a80b
ML
2872@@ -29,6 +29,14 @@
2873 #include <sys/wait.h>
2874 #include <signal.h>
2875 #include <assert.h>
2876+#if HAVE_WCHAR_H
2877+# include <wchar.h>
2878+#endif
2879+/* Get isw* functions. */
2880+#if HAVE_WCTYPE_H
2881+# include <wctype.h>
2882+#endif
2883+
2884 #include "system.h"
2885 #include "argmatch.h"
e1fb4052 2886 #include "die.h"
b3478306 2887@@ -157,14 +165,39 @@ static int decimal_point;
7cf3a80b
ML
2888 /* Thousands separator; if -1, then there isn't one. */
2889 static int thousands_sep;
2890
2891+/* True if -f is specified. */
2892+static bool folding;
2893+
2894 /* Nonzero if the corresponding locales are hard. */
2895 static bool hard_LC_COLLATE;
2896-#if HAVE_NL_LANGINFO
2897+#if HAVE_LANGINFO_CODESET
2898 static bool hard_LC_TIME;
2899 #endif
2900
2901 #define NONZERO(x) ((x) != 0)
2902
2903+/* Set MBLENGTH to the byte length of the multibyte character at PTR (which ends before LIM), updating STATE; invalid, incomplete and null characters count as 1 byte. */
2904+#define GET_BYTELEN_OF_CHAR(LIM, PTR, MBLENGTH, STATE) \
2905+ do \
2906+ { \
2907+ wchar_t wc; \
2908+ mbstate_t state_bak; \
2909+ /* Keep a copy of STATE so that a failed conversion can be undone. */ \
2910+ state_bak = (STATE); \
2911+ (MBLENGTH) = mbrtowc (&wc, (PTR), (LIM) - (PTR), &(STATE)); \
2912+ /* On error restore STATE; errors and the null character yield 1. */ \
2913+ switch (MBLENGTH) \
2914+ { \
2915+ case (size_t)-1: \
2916+ case (size_t)-2: \
2917+ (STATE) = state_bak; \
2918+ /* Fall through. */ \
2919+ case 0: \
2920+ (MBLENGTH) = 1; \
2921+ } \
2922+ } \
2923+ while (0)
2924+
2925 /* The kind of blanks for '-b' to skip in various options. */
2926 enum blanktype { bl_start, bl_end, bl_both };
2927
b3478306 2928@@ -338,13 +371,11 @@ static bool reverse;
7cf3a80b
ML
2929 they were read if all keys compare equal. */
2930 static bool stable;
2931
2932-/* If TAB has this value, blanks separate fields. */
2933-enum { TAB_DEFAULT = CHAR_MAX + 1 };
2934-
2935-/* Tab character separating fields. If TAB_DEFAULT, then fields are
2936+/* Tab character separating fields. If tab_length is 0, then fields are
2937 separated by the empty string between a non-blank character and a blank
2938 character. */
2939-static int tab = TAB_DEFAULT;
2940+static char tab[MB_LEN_MAX + 1];
2941+static size_t tab_length = 0;
2942
2943 /* Flag to remove consecutive duplicate lines from the output.
2944 Only the last of a sequence of equal lines will be output. */
b3478306 2945@@ -802,6 +833,46 @@ reap_all (void)
7cf3a80b
ML
2946 reap (-1);
2947 }
2948
2949+/* Function pointers. */
2950+static void
2951+(*inittables) (void);
2952+static char *
2953+(*begfield) (const struct line*, const struct keyfield *);
2954+static char *
2955+(*limfield) (const struct line*, const struct keyfield *);
2956+static void
2957+(*skipblanks) (char **ptr, char *lim);
2958+static int
2959+(*getmonth) (char const *, size_t, char **);
2960+static int
2961+(*keycompare) (const struct line *, const struct line *);
2962+static int
2963+(*numcompare) (const char *, const char *);
2964+
2965+/* Test whether STR, of at most LEN bytes, begins with a blank or newline
2966+ multibyte character. Set *LENGTH to the byte length of the character examined. */
2967+#if HAVE_MBRTOWC
2968+static int
2969+ismbblank (const char *str, size_t len, size_t *length)
2970+{
2971+ size_t mblength;
2972+ wchar_t wc;
2973+ mbstate_t state;
2974+
2975+ memset (&state, '\0', sizeof(mbstate_t));
2976+ mblength = mbrtowc (&wc, str, len, &state);
2977+
2978+ if (mblength == (size_t)-1 || mblength == (size_t)-2)
2979+ {
2980+ *length = 1;
2981+ return 0;
2982+ }
2983+
2984+ *length = (mblength < 1) ? 1 : mblength;
2985+ return iswblank (wc) || wc == '\n';
2986+}
2987+#endif
2988+
2989 /* Clean up any remaining temporary files. */
2990
2991 static void
b3478306 2992@@ -1270,7 +1341,7 @@ zaptemp (char const *name)
7cf3a80b
ML
2993 free (node);
2994 }
2995
2996-#if HAVE_NL_LANGINFO
2997+#if HAVE_LANGINFO_CODESET
2998
2999 static int
3000 struct_month_cmp (void const *m1, void const *m2)
b3478306 3001@@ -1285,7 +1356,7 @@ struct_month_cmp (void const *m1, void c
7cf3a80b
ML
3002 /* Initialize the character class tables. */
3003
3004 static void
3005-inittables (void)
3006+inittables_uni (void)
3007 {
3008 size_t i;
3009
b3478306 3010@@ -1297,7 +1368,7 @@ inittables (void)
7cf3a80b
ML
3011 fold_toupper[i] = toupper (i);
3012 }
3013
3014-#if HAVE_NL_LANGINFO
3015+#if HAVE_LANGINFO_CODESET
3016 /* If we're not in the "C" locale, read different names for months. */
3017 if (hard_LC_TIME)
3018 {
b3478306 3019@@ -1379,6 +1450,84 @@ specify_nmerge (int oi, char c, char con
7cf3a80b
ML
3020 xstrtol_fatal (e, oi, c, long_options, s);
3021 }
3022
3023+#if HAVE_MBRTOWC
3024+static void
3025+inittables_mb (void)
3026+{
3027+ int i, j, k, l;
3028+ char *name, *s, *lc_time, *lc_ctype;
3029+ size_t s_len, mblength;
3030+ char mbc[MB_LEN_MAX];
3031+ wchar_t wc, pwc;
3032+ mbstate_t state_mb, state_wc;
3033+
3034+ lc_time = setlocale (LC_TIME, "");
3035+ if (lc_time)
3036+ lc_time = xstrdup (lc_time);
3037+
3038+ lc_ctype = setlocale (LC_CTYPE, "");
3039+ if (lc_ctype)
3040+ lc_ctype = xstrdup (lc_ctype);
3041+
3042+ if (lc_time && lc_ctype)
3043+ /* temporarily set LC_CTYPE to match LC_TIME, so that we can convert
3044+ * the names of months to upper case */
3045+ setlocale (LC_CTYPE, lc_time);
3046+
3047+ for (i = 0; i < MONTHS_PER_YEAR; i++)
3048+ {
3049+ s = (char *) nl_langinfo (ABMON_1 + i);
3050+ s_len = strlen (s);
3051+ monthtab[i].name = name = (char *) xmalloc (s_len + 1);
3052+ monthtab[i].val = i + 1;
3053+
3054+ memset (&state_mb, '\0', sizeof (mbstate_t));
3055+ memset (&state_wc, '\0', sizeof (mbstate_t));
3056+
3057+ for (j = 0; j < s_len;)
3058+ {
3059+ if (!ismbblank (s + j, s_len - j, &mblength))
3060+ break;
3061+ j += mblength;
3062+ }
3063+
3064+ for (k = 0; j < s_len;)
3065+ {
3066+ mblength = mbrtowc (&wc, (s + j), (s_len - j), &state_mb);
3067+ assert (mblength != (size_t)-1 && mblength != (size_t)-2);
3068+ if (mblength == 0)
3069+ break;
3070+
3071+ pwc = towupper (wc);
3072+ if (pwc == wc)
3073+ {
3074+ memcpy (mbc, s + j, mblength);
3075+ j += mblength;
3076+ }
3077+ else
3078+ {
3079+ j += mblength;
3080+ mblength = wcrtomb (mbc, pwc, &state_wc);
3081+ assert (mblength != (size_t)0 && mblength != (size_t)-1);
3082+ }
3083+
3084+ for (l = 0; l < mblength; l++)
3085+ name[k++] = mbc[l];
3086+ }
3087+ name[k] = '\0';
3088+ }
3089+ qsort ((void *) monthtab, MONTHS_PER_YEAR,
3090+ sizeof (struct month), struct_month_cmp);
3091+
3092+ if (lc_time && lc_ctype)
3093+ /* Restore the original LC_CTYPE locale. */
3094+ setlocale (LC_CTYPE, lc_ctype);
3095+
3096+ free (lc_ctype);
3097+ free (lc_time);
3098+}
3099+#endif
3100+
3101 /* Specify the amount of main memory to use when sorting. */
3102 static void
3103 specify_sort_size (int oi, char c, char const *s)
b3478306 3104@@ -1610,7 +1759,7 @@ buffer_linelim (struct buffer const *buf
7cf3a80b
ML
3105 by KEY in LINE. */
3106
3107 static char *
3108-begfield (struct line const *line, struct keyfield const *key)
3109+begfield_uni (const struct line *line, const struct keyfield *key)
3110 {
3111 char *ptr = line->text, *lim = ptr + line->length - 1;
3112 size_t sword = key->sword;
b3478306 3113@@ -1619,10 +1768,10 @@ begfield (struct line const *line, struc
7cf3a80b
ML
3114 /* The leading field separator itself is included in a field when -t
3115 is absent. */
3116
3117- if (tab != TAB_DEFAULT)
3118+ if (tab_length)
3119 while (ptr < lim && sword--)
3120 {
3121- while (ptr < lim && *ptr != tab)
3122+ while (ptr < lim && *ptr != tab[0])
3123 ++ptr;
3124 if (ptr < lim)
3125 ++ptr;
b3478306 3126@@ -1648,11 +1797,70 @@ begfield (struct line const *line, struc
7cf3a80b
ML
3127 return ptr;
3128 }
3129
3130+#if HAVE_MBRTOWC
3131+static char *
3132+begfield_mb (const struct line *line, const struct keyfield *key)
3133+{
3134+ int i;
3135+ char *ptr = line->text, *lim = ptr + line->length - 1;
3136+ size_t sword = key->sword;
3137+ size_t schar = key->schar;
3138+ size_t mblength;
3139+ mbstate_t state;
3140+
3141+ memset (&state, '\0', sizeof(mbstate_t));
3142+
3143+ if (tab_length)
3144+ while (ptr < lim && sword--)
3145+ {
3146+ while (ptr < lim && memcmp (ptr, tab, tab_length) != 0)
3147+ {
3148+ GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
3149+ ptr += mblength;
3150+ }
3151+ if (ptr < lim)
3152+ {
3153+ GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
3154+ ptr += mblength;
3155+ }
3156+ }
3157+ else
3158+ while (ptr < lim && sword--)
3159+ {
3160+ while (ptr < lim && ismbblank (ptr, lim - ptr, &mblength))
3161+ ptr += mblength;
3162+ if (ptr < lim)
3163+ {
3164+ GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
3165+ ptr += mblength;
3166+ }
3167+ while (ptr < lim && !ismbblank (ptr, lim - ptr, &mblength))
3168+ ptr += mblength;
3169+ }
3170+
3171+ if (key->skipsblanks)
3172+ while (ptr < lim && ismbblank (ptr, lim - ptr, &mblength))
3173+ ptr += mblength;
3174+
3175+ for (i = 0; i < schar; i++)
3176+ {
3177+ GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
3178+
3179+ if (ptr + mblength > lim)
3180+ break;
3181+ else
3182+ ptr += mblength;
3183+ }
3184+
3185+ return ptr;
3186+}
3187+#endif
3188+
3189 /* Return the limit of (a pointer to the first character after) the field
3190 in LINE specified by KEY. */
3191
3192 static char *
3193-limfield (struct line const *line, struct keyfield const *key)
3194+limfield_uni (const struct line *line, const struct keyfield *key)
3195 {
3196 char *ptr = line->text, *lim = ptr + line->length - 1;
3197 size_t eword = key->eword, echar = key->echar;
b3478306 3198@@ -1667,10 +1875,10 @@ limfield (struct line const *line, struc
7cf3a80b
ML
3199 'beginning' is the first character following the delimiting TAB.
3200 Otherwise, leave PTR pointing at the first 'blank' character after
3201 the preceding field. */
3202- if (tab != TAB_DEFAULT)
3203+ if (tab_length)
3204 while (ptr < lim && eword--)
3205 {
3206- while (ptr < lim && *ptr != tab)
3207+ while (ptr < lim && *ptr != tab[0])
3208 ++ptr;
3209 if (ptr < lim && (eword || echar))
3210 ++ptr;
b3478306 3211@@ -1716,10 +1924,10 @@ limfield (struct line const *line, struc
7cf3a80b
ML
3212 */
3213
3214 /* Make LIM point to the end of (one byte past) the current field. */
3215- if (tab != TAB_DEFAULT)
3216+ if (tab_length)
3217 {
3218 char *newlim;
3219- newlim = memchr (ptr, tab, lim - ptr);
3220+ newlim = memchr (ptr, tab[0], lim - ptr);
3221 if (newlim)
3222 lim = newlim;
3223 }
b3478306 3224@@ -1750,6 +1958,130 @@ limfield (struct line const *line, struc
7cf3a80b
ML
3225 return ptr;
3226 }
3227
3228+#if HAVE_MBRTOWC
3229+static char *
3230+limfield_mb (const struct line *line, const struct keyfield *key)
3231+{
3232+ char *ptr = line->text, *lim = ptr + line->length - 1;
3233+ size_t eword = key->eword, echar = key->echar;
3234+ int i;
3235+ size_t mblength;
3236+ mbstate_t state;
3237+
3238+ if (echar == 0)
3239+ eword++; /* skip all of end field. */
3240+
3241+ memset (&state, '\0', sizeof(mbstate_t));
3242+
3243+ if (tab_length)
3244+ while (ptr < lim && eword--)
3245+ {
3246+ while (ptr < lim && memcmp (ptr, tab, tab_length) != 0)
3247+ {
3248+ GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
3249+ ptr += mblength;
3250+ }
3251+ if (ptr < lim && (eword | echar))
3252+ {
3253+ GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
3254+ ptr += mblength;
3255+ }
3256+ }
3257+ else
3258+ while (ptr < lim && eword--)
3259+ {
3260+ while (ptr < lim && ismbblank (ptr, lim - ptr, &mblength))
3261+ ptr += mblength;
3262+ if (ptr < lim)
3263+ {
3264+ GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
3265+ ptr += mblength;
3266+ }
3267+ while (ptr < lim && !ismbblank (ptr, lim - ptr, &mblength))
3268+ ptr += mblength;
3269+ }
3270+
3271+
3272+# ifdef POSIX_UNSPECIFIED
3273+ /* Make LIM point to the end of (one byte past) the current field. */
3274+ if (tab_length)
3275+ {
3276+ char *newlim, *p;
3277+
3278+ newlim = NULL;
3279+ for (p = ptr; p < lim;)
3280+ {
3281+ if (memcmp (p, tab, tab_length) == 0)
3282+ {
3283+ newlim = p;
3284+ break;
3285+ }
3286+
3287+ GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
3288+ p += mblength;
3289+ }
3290+ }
3291+ else
3292+ {
3293+ char *newlim;
3294+ newlim = ptr;
3295+
3296+ while (newlim < lim && ismbblank (newlim, lim - newlim, &mblength))
3297+ newlim += mblength;
3298+ if (ptr < lim)
3299+ {
3300+ GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
3301+ ptr += mblength;
3302+ }
3303+ while (newlim < lim && !ismbblank (newlim, lim - newlim, &mblength))
3304+ newlim += mblength;
3305+ lim = newlim;
3306+ }
3307+# endif
3308+
3309+ if (echar != 0)
3310+ {
3311+ /* If we're skipping leading blanks, don't start counting characters
3312+ * until after skipping past any leading blanks. */
3313+ if (key->skipeblanks)
3314+ while (ptr < lim && ismbblank (ptr, lim - ptr, &mblength))
3315+ ptr += mblength;
3316+
3317+ memset (&state, '\0', sizeof(mbstate_t));
3318+
3319+ /* Advance PTR by ECHAR (if possible), but no further than LIM. */
3320+ for (i = 0; i < echar; i++)
3321+ {
3322+ GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
3323+
3324+ if (ptr + mblength > lim)
3325+ break;
3326+ else
3327+ ptr += mblength;
3328+ }
3329+ }
3330+
3331+ return ptr;
3332+}
3333+#endif
3334+
3335+static void
3336+skipblanks_uni (char **ptr, char *lim)
3337+{
3338+ while (*ptr < lim && blanks[to_uchar (**ptr)])
3339+ ++(*ptr);
3340+}
3341+
3342+#if HAVE_MBRTOWC
3343+static void
3344+skipblanks_mb (char **ptr, char *lim)
3345+{
3346+ size_t mblength;
3347+ while (*ptr < lim && ismbblank (*ptr, lim - *ptr, &mblength))
3348+ (*ptr) += mblength;
3349+}
3350+#endif
3351+
3352 /* Fill BUF reading from FP, moving buf->left bytes from the end
3353 of buf->buf to the beginning first. If EOF is reached and the
3354 file wasn't terminated by a newline, supply one. Set up BUF's line
b3478306 3355@@ -1836,8 +2168,22 @@ fillbuf (struct buffer *buf, FILE *fp, c
7cf3a80b
ML
3356 else
3357 {
3358 if (key->skipsblanks)
3359- while (blanks[to_uchar (*line_start)])
3360- line_start++;
3361+ {
3362+#if HAVE_MBRTOWC
3363+ if (MB_CUR_MAX > 1)
3364+ {
3365+ size_t mblength;
3366+ while (line_start < line->keylim &&
3367+ ismbblank (line_start,
3368+ line->keylim - line_start,
3369+ &mblength))
3370+ line_start += mblength;
3371+ }
3372+ else
3373+#endif
3374+ while (blanks[to_uchar (*line_start)])
3375+ line_start++;
3376+ }
3377 line->keybeg = line_start;
3378 }
3379 }
b3478306 3380@@ -1971,12 +2317,10 @@ find_unit_order (char const *number)
e1fb4052
MF
3381 <none/unknown> < K/k < M < G < T < P < E < Z < Y */
3382
3383 static int
3384-human_numcompare (char const *a, char const *b)
3385+human_numcompare (char *a, char *b)
3386 {
3387- while (blanks[to_uchar (*a)])
3388- a++;
3389- while (blanks[to_uchar (*b)])
3390- b++;
3391+ skipblanks(&a, a + strlen(a));
3392+ skipblanks(&b, b + strlen(b));
3393
3394 int diff = find_unit_order (a) - find_unit_order (b);
3395 return (diff ? diff : strnumcmp (a, b, decimal_point, thousands_sep));
b3478306 3396@@ -1987,7 +2331,7 @@ human_numcompare (char const *a, char co
7cf3a80b
ML
3397 hideously fast. */
3398
3399 static int
3400-numcompare (char const *a, char const *b)
3401+numcompare_uni (const char *a, const char *b)
3402 {
3403 while (blanks[to_uchar (*a)])
3404 a++;
b3478306 3405@@ -1997,6 +2341,25 @@ numcompare (char const *a, char const *b
7cf3a80b
ML
3406 return strnumcmp (a, b, decimal_point, thousands_sep);
3407 }
3408
3409+#if HAVE_MBRTOWC
3410+static int
3411+numcompare_mb (const char *a, const char *b)
3412+{
3413+ size_t mblength, len;
3414+ len = strlen (a); /* okay for UTF-8 */
3415+ while (*a && ismbblank (a, len > MB_CUR_MAX ? MB_CUR_MAX : len, &mblength))
3416+ {
3417+ a += mblength;
3418+ len -= mblength;
3419+ }
3420+ len = strlen (b); /* okay for UTF-8 */
3421+ while (*b && ismbblank (b, len > MB_CUR_MAX ? MB_CUR_MAX : len, &mblength))
3422+ b += mblength;
3423+
3424+ return strnumcmp (a, b, decimal_point, thousands_sep);
3425+}
3426+#endif /* HAVE_MBRTOWC */
3427+
3428 /* Work around a problem whereby the long double value returned by glibc's
3429 strtold ("NaN", ...) contains uninitialized bits: clear all bytes of
b3478306
PM
3430 A and B before calling strtold. FIXME: remove this function if
3431@@ -2047,7 +2410,7 @@ general_numcompare (char const *sa, char
7cf3a80b
ML
3432 Return 0 if the name in S is not recognized. */
3433
3434 static int
3435-getmonth (char const *month, char **ea)
3436+getmonth_uni (char const *month, size_t len, char **ea)
3437 {
3438 size_t lo = 0;
3439 size_t hi = MONTHS_PER_YEAR;
b3478306 3440@@ -2323,15 +2686,14 @@ debug_key (struct line const *line, stru
7cf3a80b
ML
3441 char saved = *lim;
3442 *lim = '\0';
3443
3444- while (blanks[to_uchar (*beg)])
3445- beg++;
3446+ skipblanks (&beg, lim);
3447
3448 char *tighter_lim = beg;
3449
3450 if (lim < beg)
3451 tighter_lim = lim;
3452 else if (key->month)
3453- getmonth (beg, &tighter_lim);
3454+ getmonth (beg, lim-beg, &tighter_lim);
3455 else if (key->general_numeric)
3456 ignore_value (strtold (beg, &tighter_lim));
3457 else if (key->numeric || key->human_numeric)
b3478306 3458@@ -2465,7 +2827,7 @@ key_warnings (struct keyfield const *gke
e1fb4052
MF
3459 /* Warn about significant leading blanks. */
3460 bool implicit_skip = key_numeric (key) || key->month;
7cf3a80b 3461 bool line_offset = key->eword == 0 && key->echar != 0; /* -k1.x,1.y */
e1fb4052
MF
3462- if (!zero_width && !gkey_only && tab == TAB_DEFAULT && !line_offset
3463+ if (!zero_width && !gkey_only && !tab_length && !line_offset
3464 && ((!key->skipsblanks && !implicit_skip)
7cf3a80b
ML
3465 || (!key->skipsblanks && key->schar)
3466 || (!key->skipeblanks && key->echar)))
b3478306 3467@@ -2523,11 +2885,87 @@ key_warnings (struct keyfield const *gke
7cf3a80b
ML
3468 error (0, 0, _("option '-r' only applies to last-resort comparison"));
3469 }
3470
3471+#if HAVE_MBRTOWC
3472+static int
3473+getmonth_mb (const char *s, size_t len, char **ea)
3474+{
3475+ char *month;
3476+ register size_t i;
3477+ register int lo = 0, hi = MONTHS_PER_YEAR, result;
3478+ char *tmp;
3479+ size_t wclength, mblength;
3480+ const char *pp;
3481+ const wchar_t *wpp;
3482+ wchar_t *month_wcs;
3483+ mbstate_t state;
3484+
3485+ while (len > 0 && ismbblank (s, len, &mblength))
3486+ {
3487+ s += mblength;
3488+ len -= mblength;
3489+ }
3490+
3491+ if (len == 0)
3492+ return 0;
3493+
3494+ if (SIZE_MAX - len < 1)
3495+ xalloc_die ();
3496+
3497+ month = (char *) xnmalloc (len + 1, MB_CUR_MAX);
3498+
3499+ pp = tmp = (char *) xnmalloc (len + 1, MB_CUR_MAX);
3500+ memcpy (tmp, s, len);
3501+ tmp[len] = '\0';
3502+ wpp = month_wcs = (wchar_t *) xnmalloc (len + 1, sizeof (wchar_t));
3503+ memset (&state, '\0', sizeof (mbstate_t));
3504+
3505+ wclength = mbsrtowcs (month_wcs, &pp, len + 1, &state);
3506+ if (wclength == (size_t)-1 || pp != NULL)
3507+ error (SORT_FAILURE, 0, _("Invalid multibyte input %s."), quote(s));
3508+
3509+ for (i = 0; i < wclength; i++)
3510+ {
3511+ month_wcs[i] = towupper(month_wcs[i]);
3512+ if (iswblank (month_wcs[i]))
3513+ {
3514+ month_wcs[i] = L'\0';
3515+ break;
3516+ }
3517+ }
3518+
3519+ mblength = wcsrtombs (month, &wpp, (len + 1) * MB_CUR_MAX, &state);
3520+ assert (mblength != (-1) && wpp == NULL);
3521+
3522+ do
3523+ {
3524+ int ix = (lo + hi) / 2;
3525+
3526+ if (strncmp (month, monthtab[ix].name, strlen (monthtab[ix].name)) < 0)
3527+ hi = ix;
3528+ else
3529+ lo = ix;
3530+ }
3531+ while (hi - lo > 1);
3532+
3533+ result = (!strncmp (month, monthtab[lo].name, strlen (monthtab[lo].name))
3534+ ? monthtab[lo].val : 0);
3535+
3536+ if (ea && result)
3537+ *ea = (char*) s + strlen (monthtab[lo].name);
3538+
3539+ free (month);
3540+ free (tmp);
3541+ free (month_wcs);
3542+
3543+ return result;
3544+}
3545+#endif
3546+
3547 /* Compare two lines A and B trying every key in sequence until there
3548 are no more keys or a difference is found. */
3549
3550 static int
3551-keycompare (struct line const *a, struct line const *b)
3552+keycompare_uni (const struct line *a, const struct line *b)
3553 {
3554 struct keyfield *key = keylist;
3555
b3478306 3556@@ -2612,7 +3050,7 @@ keycompare (struct line const *a, struct
7cf3a80b
ML
3557 else if (key->human_numeric)
3558 diff = human_numcompare (ta, tb);
3559 else if (key->month)
3560- diff = getmonth (ta, NULL) - getmonth (tb, NULL);
3561+ diff = getmonth (ta, tlena, NULL) - getmonth (tb, tlenb, NULL);
3562 else if (key->random)
3563 diff = compare_random (ta, tlena, tb, tlenb);
3564 else if (key->version)
b3478306 3565@@ -2728,6 +3166,211 @@ keycompare (struct line const *a, struct
7cf3a80b
ML
3566 return key->reverse ? -diff : diff;
3567 }
3568
3569+#if HAVE_MBRTOWC
3570+static int
3571+keycompare_mb (const struct line *a, const struct line *b)
3572+{
3573+ struct keyfield *key = keylist;
3574+
3575+ /* For the first iteration only, the key positions have been
3576+ precomputed for us. */
3577+ char *texta = a->keybeg;
3578+ char *textb = b->keybeg;
3579+ char *lima = a->keylim;
3580+ char *limb = b->keylim;
3581+
3582+ size_t mblength_a, mblength_b;
3583+ wchar_t wc_a, wc_b;
3584+ mbstate_t state_a, state_b;
3585+
3586+ int diff = 0;
3587+
3588+ memset (&state_a, '\0', sizeof(mbstate_t));
3589+ memset (&state_b, '\0', sizeof(mbstate_t));
3590+ /* Ignore keys with start after end. */
3591+ if (a->keybeg - a->keylim > 0)
3592+ return 0;
3593+
3594+
3595+ /* Ignore and/or translate chars before comparing. */
3596+# define IGNORE_CHARS(NEW_LEN, LEN, TEXT, COPY, WC, MBLENGTH, STATE) \
3597+ do \
3598+ { \
3599+ wchar_t uwc; \
3600+ char mbc[MB_LEN_MAX]; \
3601+ mbstate_t state_wc; \
3602+ \
3603+ for (NEW_LEN = i = 0; i < LEN;) \
3604+ { \
3605+ mbstate_t state_bak; \
3606+ \
3607+ state_bak = STATE; \
3608+ MBLENGTH = mbrtowc (&WC, TEXT + i, LEN - i, &STATE); \
3609+ \
3610+ if (MBLENGTH == (size_t)-2 || MBLENGTH == (size_t)-1 \
3611+ || MBLENGTH == 0) \
3612+ { \
3613+ if (MBLENGTH == (size_t)-2 || MBLENGTH == (size_t)-1) \
3614+ STATE = state_bak; \
3615+ if (!ignore) \
3616+ COPY[NEW_LEN++] = TEXT[i]; \
3617+ i++; \
3618+ continue; \
3619+ } \
3620+ \
3621+ if (ignore) \
3622+ { \
3623+ if ((ignore == nonprinting && !iswprint (WC)) \
3624+ || (ignore == nondictionary \
3625+ && !iswalnum (WC) && !iswblank (WC))) \
3626+ { \
3627+ i += MBLENGTH; \
3628+ continue; \
3629+ } \
3630+ } \
3631+ \
3632+ if (translate) \
3633+ { \
3634+ \
3635+ uwc = towupper(WC); \
3636+ if (WC == uwc) \
3637+ { \
3638+ memcpy (mbc, TEXT + i, MBLENGTH); \
3639+ i += MBLENGTH; \
3640+ } \
3641+ else \
3642+ { \
3643+ i += MBLENGTH; \
3644+ WC = uwc; \
3645+ memset (&state_wc, '\0', sizeof (mbstate_t)); \
3646+ \
3647+ MBLENGTH = wcrtomb (mbc, WC, &state_wc); \
3648+ assert (MBLENGTH != (size_t)-1 && MBLENGTH != 0); \
3649+ } \
3650+ \
3651+ for (j = 0; j < MBLENGTH; j++) \
3652+ COPY[NEW_LEN++] = mbc[j]; \
3653+ } \
3654+ else \
3655+ for (j = 0; j < MBLENGTH; j++) \
3656+ COPY[NEW_LEN++] = TEXT[i++]; \
3657+ } \
3658+ COPY[NEW_LEN] = '\0'; \
3659+ } \
3660+ while (0)
3661+
3662+ /* Actually compare the fields. */
3663+
3664+ for (;;)
3665+ {
3666+ /* Find the lengths. */
3667+ size_t lena = lima <= texta ? 0 : lima - texta;
3668+ size_t lenb = limb <= textb ? 0 : limb - textb;
3669+
3670+ char enda IF_LINT (= 0);
3671+ char endb IF_LINT (= 0);
3672+
3673+ char const *translate = key->translate;
3674+ bool const *ignore = key->ignore;
3675+
3676+ if (ignore || translate)
3677+ {
3678+ if (SIZE_MAX - lenb - 2 < lena)
3679+ xalloc_die ();
3680+ char *copy_a = (char *) xnmalloc (lena + lenb + 2, MB_CUR_MAX);
3681+ char *copy_b = copy_a + lena * MB_CUR_MAX + 1;
3682+ size_t new_len_a, new_len_b;
3683+ size_t i, j;
3684+
3685+ IGNORE_CHARS (new_len_a, lena, texta, copy_a,
3686+ wc_a, mblength_a, state_a);
3687+ IGNORE_CHARS (new_len_b, lenb, textb, copy_b,
3688+ wc_b, mblength_b, state_b);
3689+ texta = copy_a; textb = copy_b;
3690+ lena = new_len_a; lenb = new_len_b;
3691+ }
3692+ else
3693+ {
3694+ /* Use the keys in-place, temporarily null-terminated. */
3695+ enda = texta[lena]; texta[lena] = '\0';
3696+ endb = textb[lenb]; textb[lenb] = '\0';
3697+ }
3698+
3699+ if (key->random)
3700+ diff = compare_random (texta, lena, textb, lenb);
3701+ else if (key->numeric | key->general_numeric | key->human_numeric)
3702+ {
3703+ char savea = *lima, saveb = *limb;
3704+
3705+ *lima = *limb = '\0';
3706+ diff = (key->numeric ? numcompare (texta, textb)
3707+ : key->general_numeric ? general_numcompare (texta, textb)
3708+ : human_numcompare (texta, textb));
3709+ *lima = savea, *limb = saveb;
3710+ }
3711+ else if (key->version)
3712+ diff = filevercmp (texta, textb);
3713+ else if (key->month)
3714+ diff = getmonth (texta, lena, NULL) - getmonth (textb, lenb, NULL);
3715+ else if (lena == 0)
3716+ diff = - NONZERO (lenb);
3717+ else if (lenb == 0)
3718+ diff = 1;
3719+ else if (hard_LC_COLLATE && !folding)
3720+ {
3721+ diff = xmemcoll0 (texta, lena + 1, textb, lenb + 1);
3722+ }
3723+ else
3724+ {
3725+ diff = memcmp (texta, textb, MIN (lena, lenb));
3726+ if (diff == 0)
3727+ diff = lena < lenb ? -1 : lena != lenb;
3728+ }
3729+
3730+ if (ignore || translate)
3731+ free (texta);
3732+ else
3733+ {
3734+ texta[lena] = enda;
3735+ textb[lenb] = endb;
3736+ }
3737+
3738+ if (diff)
3739+ goto not_equal;
3740+
3741+ key = key->next;
3742+ if (! key)
3743+ break;
3744+
3745+ /* Find the beginning and limit of the next field. */
3746+ if (key->eword != -1)
3747+ lima = limfield (a, key), limb = limfield (b, key);
3748+ else
3749+ lima = a->text + a->length - 1, limb = b->text + b->length - 1;
3750+
3751+ if (key->sword != -1)
3752+ texta = begfield (a, key), textb = begfield (b, key);
3753+ else
3754+ {
3755+ texta = a->text, textb = b->text;
3756+ if (key->skipsblanks)
3757+ {
3758+ while (texta < lima && ismbblank (texta, lima - texta, &mblength_a))
3759+ texta += mblength_a;
3760+ while (textb < limb && ismbblank (textb, limb - textb, &mblength_b))
3761+ textb += mblength_b;
3762+ }
3763+ }
3764+ }
3765+
3766+not_equal:
3767+ if (key && key->reverse)
3768+ return -diff;
3769+ else
3770+ return diff;
3771+}
3772+#endif
3773+
3774 /* Compare two lines A and B, returning negative, zero, or positive
3775 depending on whether A compares less than, equal to, or greater than B. */
3776
b3478306 3777@@ -2755,7 +3398,7 @@ compare (struct line const *a, struct li
7cf3a80b
ML
3778 diff = - NONZERO (blen);
3779 else if (blen == 0)
3780 diff = 1;
3781- else if (hard_LC_COLLATE)
3782+ else if (hard_LC_COLLATE && !folding)
3783 {
b3478306 3784 /* xmemcoll0 is a performance enhancement as
7cf3a80b 3785 it will not unconditionally write '\0' after the
b3478306 3786@@ -4145,6 +4788,7 @@ set_ordering (char const *s, struct keyf
7cf3a80b
ML
3787 break;
3788 case 'f':
3789 key->translate = fold_toupper;
3790+ folding = true;
3791 break;
3792 case 'g':
3793 key->general_numeric = true;
b3478306 3794@@ -4224,7 +4868,7 @@ main (int argc, char **argv)
7cf3a80b
ML
3795 initialize_exit_failure (SORT_FAILURE);
3796
3797 hard_LC_COLLATE = hard_locale (LC_COLLATE);
3798-#if HAVE_NL_LANGINFO
3799+#if HAVE_LANGINFO_CODESET
3800 hard_LC_TIME = hard_locale (LC_TIME);
3801 #endif
3802
b3478306 3803@@ -4245,6 +4889,29 @@ main (int argc, char **argv)
7cf3a80b
ML
3804 thousands_sep = -1;
3805 }
3806
3807+#if HAVE_MBRTOWC
3808+ if (MB_CUR_MAX > 1)
3809+ {
3810+ inittables = inittables_mb;
3811+ begfield = begfield_mb;
3812+ limfield = limfield_mb;
3813+ skipblanks = skipblanks_mb;
3814+ getmonth = getmonth_mb;
3815+ keycompare = keycompare_mb;
3816+ numcompare = numcompare_mb;
3817+ }
3818+ else
3819+#endif
3820+ {
3821+ inittables = inittables_uni;
3822+ begfield = begfield_uni;
3823+ limfield = limfield_uni;
3824+ skipblanks = skipblanks_uni;
3825+ getmonth = getmonth_uni;
3826+ keycompare = keycompare_uni;
3827+ numcompare = numcompare_uni;
3828+ }
3829+
3830 have_read_stdin = false;
3831 inittables ();
3832
b3478306 3833@@ -4519,13 +5186,34 @@ main (int argc, char **argv)
7cf3a80b
ML
3834
3835 case 't':
3836 {
3837- char newtab = optarg[0];
3838- if (! newtab)
3839+ char newtab[MB_LEN_MAX + 1];
3840+ size_t newtab_length = 1;
3841+ strncpy (newtab, optarg, MB_LEN_MAX);
3842+ if (! newtab[0])
e1fb4052 3843 die (SORT_FAILURE, 0, _("empty tab"));
7cf3a80b
ML
3844- if (optarg[1])
3845+#if HAVE_MBRTOWC
3846+ if (MB_CUR_MAX > 1)
3847+ {
3848+ wchar_t wc;
3849+ mbstate_t state;
3850+
3851+ memset (&state, '\0', sizeof (mbstate_t));
3852+ newtab_length = mbrtowc (&wc, newtab, strnlen (newtab,
3853+ MB_LEN_MAX),
3854+ &state);
3855+ switch (newtab_length)
3856+ {
3857+ case (size_t) -1:
3858+ case (size_t) -2:
3859+ case 0:
3860+ newtab_length = 1;
3861+ }
3862+ }
3863+#endif
3864+ if (newtab_length == 1 && optarg[1])
3865 {
3866 if (STREQ (optarg, "\\0"))
3867- newtab = '\0';
3868+ newtab[0] = '\0';
3869 else
3870 {
3871 /* Provoke with 'sort -txx'. Complain about
b3478306 3872@@ -4536,9 +5224,11 @@ main (int argc, char **argv)
e1fb4052 3873 quote (optarg));
7cf3a80b
ML
3874 }
3875 }
3876- if (tab != TAB_DEFAULT && tab != newtab)
e1fb4052
MF
3877+ if (tab_length && (tab_length != newtab_length
3878+ || memcmp (tab, newtab, tab_length) != 0))
3879 die (SORT_FAILURE, 0, _("incompatible tabs"));
7cf3a80b
ML
3880- tab = newtab;
3881+ memcpy (tab, newtab, newtab_length);
3882+ tab_length = newtab_length;
3883 }
3884 break;
3885
b3478306 3886@@ -4767,12 +5457,10 @@ main (int argc, char **argv)
7cf3a80b
ML
3887 sort (files, nfiles, outfile, nthreads);
3888 }
3889
3890-#ifdef lint
3891 if (files_from)
3892 readtokens0_free (&tok);
3893 else
3894 free (files);
3895-#endif
3896
3897 if (have_read_stdin && fclose (stdin) == EOF)
e1fb4052 3898 sort_die (_("close failed"), "-");
b3478306
PM
3899diff -Naurp coreutils-8.32.orig/src/unexpand.c coreutils-8.32/src/unexpand.c
3900--- coreutils-8.32.orig/src/unexpand.c 2020-01-01 22:13:12.000000000 +0800
3901+++ coreutils-8.32/src/unexpand.c 2020-03-08 12:10:27.738236560 +0800
e1fb4052 3902@@ -38,6 +38,9 @@
7cf3a80b
ML
3903 #include <stdio.h>
3904 #include <getopt.h>
3905 #include <sys/types.h>
3906+
e1fb4052 3907+#include <mbfile.h>
7cf3a80b
ML
3908+
3909 #include "system.h"
e1fb4052 3910 #include "die.h"
7cf3a80b 3911 #include "xstrndup.h"
e1fb4052
MF
3912@@ -107,24 +110,47 @@ unexpand (void)
3913 {
3914 /* Input stream. */
3915 FILE *fp = next_file (NULL);
3916+ mb_file_t mbf;
3917
3918 /* The array of pending blanks. In non-POSIX locales, blanks can
3919 include characters other than spaces, so the blanks must be
3920 stored, not merely counted. */
3921- char *pending_blank;
3922+ mbf_char_t *pending_blank;
3923+ /* True if the starting locale is utf8. */
3924+ bool using_utf_locale;
3925+
3926+ /* True if the first file contains BOM header. */
3927+ bool found_bom;
3928+ using_utf_locale=check_utf_locale();
3929
3930 if (!fp)
3931 return;
3932+ mbf_init (mbf, fp);
3933+ found_bom=check_bom(fp,&mbf);
7cf3a80b 3934
e1fb4052
MF
3935+ if (using_utf_locale == false && found_bom == true)
3936+ {
3937+ /*try using some predefined locale */
7cf3a80b 3938+
e1fb4052 3939+ if (set_utf_locale () != 0)
7cf3a80b 3940+ {
e1fb4052 3941+ error (EXIT_FAILURE, errno, _("cannot set UTF-8 locale"));
7cf3a80b 3942+ }
e1fb4052
MF
3943+ }
3944 /* The worst case is a non-blank character, then one blank, then a
3945 tab stop, then MAX_COLUMN_WIDTH - 1 blanks, then a non-blank; so
3946 allocate MAX_COLUMN_WIDTH bytes to store the blanks. */
3947- pending_blank = xmalloc (max_column_width);
3948+ pending_blank = xmalloc (max_column_width * sizeof (mbf_char_t));
7cf3a80b 3949+
e1fb4052
MF
3950+ if (found_bom == true)
3951+ {
3952+ print_bom();
3953+ }
3954
3955 while (true)
3956 {
3957 /* Input character, or EOF. */
3958- int c;
3959+ mbf_char_t c;
3960
3961 /* If true, perform translations. */
3962 bool convert = true;
3963@@ -158,12 +184,44 @@ unexpand (void)
3964
3965 do
3966 {
3967- while ((c = getc (fp)) < 0 && (fp = next_file (fp)))
3968- continue;
3969+ while (true) {
3970+ mbf_getc (c, mbf);
3971+ if ((mb_iseof (c)) && (fp = next_file (fp)))
3972+ {
3973+ mbf_init (mbf, fp);
3974+ if (fp!=NULL)
3975+ {
3976+ if (check_bom(fp,&mbf)==true)
3977+ {
3978+ /*Not the first file - check BOM header*/
3979+ if (using_utf_locale==false && found_bom==false)
3980+ {
3981+ /*BOM header in subsequent file but not in the first one. */
3982+ error (EXIT_FAILURE, errno, _("combination of files with and without BOM header"));
3983+ }
3984+ }
3985+ else
3986+ {
3987+ if(using_utf_locale==false && found_bom==true)
3988+ {
3989+ /*First file contained BOM header - locale was switched to UTF
b3478306 3990+ *all subsequent files should contain BOM. */
e1fb4052
MF
3991+ error (EXIT_FAILURE, errno, _("combination of files with and without BOM header"));
3992+ }
3993+ }
3994+ }
3995+ continue;
3996+ }
3997+ else
3998+ {
3999+ break;
4000+ }
4001+ }
7cf3a80b 4002+
7cf3a80b 4003
e1fb4052
MF
4004 if (convert)
4005 {
4006- bool blank = !! isblank (c);
4007+ bool blank = mb_isblank (c);
7cf3a80b 4008
e1fb4052
MF
4009 if (blank)
4010 {
4011@@ -180,16 +238,16 @@ unexpand (void)
4012 if (next_tab_column < column)
4013 die (EXIT_FAILURE, 0, _("input line is too long"));
4014
4015- if (c == '\t')
4016+ if (mb_iseq (c, '\t'))
4017 {
4018 column = next_tab_column;
4019
4020 if (pending)
4021- pending_blank[0] = '\t';
4022+ mb_setascii (&pending_blank[0], '\t');
4023 }
4024 else
4025 {
4026- column++;
4027+ column += mb_width (c);
4028
4029 if (! (prev_blank && column == next_tab_column))
4030 {
4031@@ -197,13 +255,14 @@ unexpand (void)
4032 will be replaced by tabs. */
4033 if (column == next_tab_column)
4034 one_blank_before_tab_stop = true;
4035- pending_blank[pending++] = c;
4036+ mb_copy (&pending_blank[pending++], &c);
4037 prev_blank = true;
4038 continue;
4039 }
4040
4041 /* Replace the pending blanks by a tab or two. */
4042- pending_blank[0] = c = '\t';
4043+ mb_setascii (&c, '\t');
4044+ mb_setascii (&pending_blank[0], '\t');
4045 }
4046
4047 /* Discard pending blanks, unless it was a single
4048@@ -211,7 +270,7 @@ unexpand (void)
4049 pending = one_blank_before_tab_stop;
4050 }
4051 }
4052- else if (c == '\b')
4053+ else if (mb_iseq (c, '\b'))
4054 {
4055 /* Go back one column, and force recalculation of the
4056 next tab stop. */
4057@@ -219,9 +278,9 @@ unexpand (void)
4058 next_tab_column = column;
4059 tab_index -= !!tab_index;
4060 }
4061- else
4062+ else if (!mb_iseq (c, '\n'))
4063 {
4064- column++;
4065+ column += mb_width (c);
4066 if (!column)
4067 die (EXIT_FAILURE, 0, _("input line is too long"));
4068 }
4069@@ -229,8 +288,11 @@ unexpand (void)
4070 if (pending)
4071 {
4072 if (pending > 1 && one_blank_before_tab_stop)
4073- pending_blank[0] = '\t';
4074- if (fwrite (pending_blank, 1, pending, stdout) != pending)
4075+ mb_setascii (&pending_blank[0], '\t');
4076+
4077+ for (int n = 0; n < pending; ++n)
4078+ mb_putc (pending_blank[n], stdout);
4079+ if (ferror (stdout))
4080 die (EXIT_FAILURE, errno, _("write error"));
4081 pending = 0;
4082 one_blank_before_tab_stop = false;
4083@@ -240,16 +302,17 @@ unexpand (void)
4084 convert &= convert_entire_line || blank;
4085 }
4086
4087- if (c < 0)
4088+ if (mb_iseof (c))
4089 {
4090 free (pending_blank);
4091 return;
4092 }
4093
4094- if (putchar (c) < 0)
4095+ mb_putc (c, stdout);
4096+ if (ferror (stdout))
4097 die (EXIT_FAILURE, errno, _("write error"));
4098 }
4099- while (c != '\n');
4100+ while (!mb_iseq (c, '\n'));
4101 }
4102 }
7cf3a80b 4103
b3478306
PM
4104diff -Naurp coreutils-8.32.orig/src/uniq.c coreutils-8.32/src/uniq.c
4105--- coreutils-8.32.orig/src/uniq.c 2020-02-25 07:18:16.000000000 +0800
4106+++ coreutils-8.32/src/uniq.c 2020-03-08 12:10:35.436236531 +0800
7cf3a80b
ML
4107@@ -21,6 +21,17 @@
4108 #include <getopt.h>
4109 #include <sys/types.h>
4110
4111+/* Get mbstate_t, mbrtowc(). */
4112+#if HAVE_WCHAR_H
4113+# include <wchar.h>
4114+#endif
4115+
4116+/* Get isw* functions. */
4117+#if HAVE_WCTYPE_H
4118+# include <wctype.h>
4119+#endif
4120+#include <assert.h>
4121+
4122 #include "system.h"
4123 #include "argmatch.h"
4124 #include "linebuffer.h"
b3478306
PM
4125@@ -33,6 +44,18 @@
4126 #include "memcasecmp.h"
e1fb4052
MF
4127 #include "quote.h"
4128
7cf3a80b
ML
4129+/* MB_LEN_MAX is incorrectly defined to be 1 in at least one GCC
4130+ installation; work around this configuration error. */
4131+#if !defined MB_LEN_MAX || MB_LEN_MAX < 2
4132+# define MB_LEN_MAX 16
4133+#endif
4134+
4135+/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t. */
4136+#if HAVE_MBRTOWC && defined mbstate_t
4137+# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0)
4138+#endif
e1fb4052
MF
4139+
4140+
7cf3a80b
ML
4141 /* The official name of this program (e.g., no 'g' prefix). */
4142 #define PROGRAM_NAME "uniq"
e1fb4052 4143
b3478306 4144@@ -139,6 +162,10 @@ enum
7cf3a80b
ML
4145 GROUP_OPTION = CHAR_MAX + 1
4146 };
4147
4148+/* Function pointers. */
4149+static char *
4150+(*find_field) (struct linebuffer *line);
4151+
4152 static struct option const longopts[] =
4153 {
4154 {"count", no_argument, NULL, 'c'},
b3478306 4155@@ -253,7 +280,7 @@ size_opt (char const *opt, char const *m
7cf3a80b
ML
4156 return a pointer to the beginning of the line's field to be compared. */
4157
4158 static char * _GL_ATTRIBUTE_PURE
4159-find_field (struct linebuffer const *line)
4160+find_field_uni (struct linebuffer *line)
4161 {
4162 size_t count;
4163 char const *lp = line->buffer;
b3478306 4164@@ -273,6 +300,83 @@ find_field (struct linebuffer const *lin
7cf3a80b
ML
4165 return line->buffer + i;
4166 }
4167
4168+#if HAVE_MBRTOWC
4169+
4170+# define MBCHAR_TO_WCHAR(WC, MBLENGTH, LP, POS, SIZE, STATEP, CONVFAIL) \
4171+ do \
4172+ { \
4173+ mbstate_t state_bak; \
4174+ \
4175+ CONVFAIL = 0; \
4176+ state_bak = *STATEP; \
4177+ \
4178+ MBLENGTH = mbrtowc (&WC, LP + POS, SIZE - POS, STATEP); \
4179+ \
4180+ switch (MBLENGTH) \
4181+ { \
4182+ case (size_t)-2: \
4183+ case (size_t)-1: \
4184+ *STATEP = state_bak; \
4185+ CONVFAIL++; \
4186+ /* Fall through */ \
4187+ case 0: \
4188+ MBLENGTH = 1; \
4189+ } \
4190+ } \
4191+ while (0)
4192+
4193+static char *
4194+find_field_multi (struct linebuffer *line)
4195+{
4196+ size_t count;
4197+ char *lp = line->buffer;
4198+ size_t size = line->length - 1;
4199+ size_t pos;
4200+ size_t mblength;
4201+ wchar_t wc;
4202+ mbstate_t *statep;
4203+ int convfail = 0;
4204+
4205+ pos = 0;
4206+ statep = &(line->state);
4207+
4208+ /* skip fields. */
4209+ for (count = 0; count < skip_fields && pos < size; count++)
4210+ {
4211+ while (pos < size)
4212+ {
4213+ MBCHAR_TO_WCHAR (wc, mblength, lp, pos, size, statep, convfail);
4214+
4215+ if (convfail || !(iswblank (wc) || wc == '\n'))
4216+ {
4217+ pos += mblength;
4218+ break;
4219+ }
4220+ pos += mblength;
4221+ }
4222+
4223+ while (pos < size)
4224+ {
4225+ MBCHAR_TO_WCHAR (wc, mblength, lp, pos, size, statep, convfail);
4226+
4227+ if (!convfail && (iswblank (wc) || wc == '\n'))
4228+ break;
4229+
4230+ pos += mblength;
4231+ }
4232+ }
4233+
4234+ /* skip chars. */
4235+ for (count = 0; count < skip_chars && pos < size; count++)
4236+ {
4237+ MBCHAR_TO_WCHAR (wc, mblength, lp, pos, size, statep, convfail);
4238+ pos += mblength;
4239+ }
4240+
4241+ return lp + pos;
4242+}
4243+#endif
4244+
4245 /* Return false if two strings OLD and NEW match, true if not.
4246 OLD and NEW point not to the beginnings of the lines
4247 but rather to the beginnings of the fields to compare.
b3478306
PM
4248@@ -292,6 +396,79 @@ different (char *old, char *new, size_t
4249 return oldlen != newlen || memcmp (old, new, oldlen);
4250 }
7cf3a80b 4251
7cf3a80b
ML
4252+#if HAVE_MBRTOWC
4253+static int
4254+different_multi (const char *old, const char *new, size_t oldlen, size_t newlen, mbstate_t oldstate, mbstate_t newstate)
4255+{
4256+ size_t i, j, chars;
4257+ const char *str[2];
4258+ char *copy[2];
4259+ size_t len[2];
4260+ mbstate_t state[2];
4261+ size_t mblength;
4262+ wchar_t wc, uwc;
4263+ mbstate_t state_bak;
4264+
4265+ str[0] = old;
4266+ str[1] = new;
4267+ len[0] = oldlen;
4268+ len[1] = newlen;
4269+ state[0] = oldstate;
4270+ state[1] = newstate;
4271+
4272+ for (i = 0; i < 2; i++)
4273+ {
4274+ copy[i] = xmalloc (len[i] + 1);
4275+ memset (copy[i], '\0', len[i] + 1);
4276+
4277+ for (j = 0, chars = 0; j < len[i] && chars < check_chars; chars++)
4278+ {
4279+ state_bak = state[i];
4280+ mblength = mbrtowc (&wc, str[i] + j, len[i] - j, &(state[i]));
4281+
4282+ switch (mblength)
4283+ {
4284+ case (size_t)-1:
4285+ case (size_t)-2:
4286+ state[i] = state_bak;
4287+ /* Fall through */
4288+ case 0:
4289+ mblength = 1;
4290+ break;
4291+
4292+ default:
4293+ if (ignore_case)
4294+ {
4295+ uwc = towupper (wc);
4296+
4297+ if (uwc != wc)
4298+ {
4299+ mbstate_t state_wc;
4300+ size_t mblen;
4301+
4302+ memset (&state_wc, '\0', sizeof(mbstate_t));
4303+ mblen = wcrtomb (copy[i] + j, uwc, &state_wc);
4304+ assert (mblen != (size_t)-1);
4305+ }
4306+ else
4307+ memcpy (copy[i] + j, str[i] + j, mblength);
4308+ }
4309+ else
4310+ memcpy (copy[i] + j, str[i] + j, mblength);
4311+ }
4312+ j += mblength;
4313+ }
4314+ copy[i][j] = '\0';
4315+ len[i] = j;
4316+ }
b3478306 4317+ int rc = len[0] != len[1] || memcmp(copy[0], copy[1], len[0]);
7cf3a80b
ML
4318+ free (copy[0]);
4319+ free (copy[1]);
4320+ return rc;
4321+
b3478306 4322+}
7cf3a80b 4323+#endif
b3478306 4324+
7cf3a80b
ML
4325 /* Output the line in linebuffer LINE to standard output
4326 provided that the switches say it should be output.
b3478306
PM
4327 MATCH is true if the line matches the previous line.
4328@@ -355,19 +532,38 @@ check_file (const char *infile, const ch
7cf3a80b
ML
4329 char *prevfield IF_LINT ( = NULL);
4330 size_t prevlen IF_LINT ( = 0);
4331 bool first_group_printed = false;
4332+#if HAVE_MBRTOWC
4333+ mbstate_t prevstate;
4334+
4335+ memset (&prevstate, '\0', sizeof (mbstate_t));
4336+#endif
4337
4338 while (!feof (stdin))
4339 {
4340 char *thisfield;
4341 size_t thislen;
4342 bool new_group;
4343+#if HAVE_MBRTOWC
4344+ mbstate_t thisstate;
4345+#endif
4346
4347 if (readlinebuffer_delim (thisline, stdin, delimiter) == 0)
4348 break;
4349
4350 thisfield = find_field (thisline);
4351 thislen = thisline->length - 1 - (thisfield - thisline->buffer);
4352+#if HAVE_MBRTOWC
4353+ if (MB_CUR_MAX > 1)
4354+ {
4355+ thisstate = thisline->state;
4356
4357+ new_group = (prevline->length == 0
4358+ || different_multi (thisfield, prevfield,
4359+ thislen, prevlen,
4360+ thisstate, prevstate));
4361+ }
4362+ else
4363+#endif
4364 new_group = (prevline->length == 0
4365 || different (thisfield, prevfield, thislen, prevlen));
4366
b3478306 4367@@ -385,6 +581,10 @@ check_file (const char *infile, const ch
7cf3a80b
ML
4368 SWAP_LINES (prevline, thisline);
4369 prevfield = thisfield;
4370 prevlen = thislen;
4371+#if HAVE_MBRTOWC
4372+ if (MB_CUR_MAX > 1)
4373+ prevstate = thisstate;
4374+#endif
4375 first_group_printed = true;
4376 }
4377 }
b3478306 4378@@ -397,17 +597,26 @@ check_file (const char *infile, const ch
7cf3a80b
ML
4379 size_t prevlen;
4380 uintmax_t match_count = 0;
4381 bool first_delimiter = true;
4382+#if HAVE_MBRTOWC
4383+ mbstate_t prevstate;
4384+#endif
4385
4386 if (readlinebuffer_delim (prevline, stdin, delimiter) == 0)
4387 goto closefiles;
4388 prevfield = find_field (prevline);
4389 prevlen = prevline->length - 1 - (prevfield - prevline->buffer);
4390+#if HAVE_MBRTOWC
4391+ prevstate = prevline->state;
4392+#endif
4393
4394 while (!feof (stdin))
4395 {
4396 bool match;
4397 char *thisfield;
4398 size_t thislen;
4399+#if HAVE_MBRTOWC
4400+ mbstate_t thisstate = thisline->state;
4401+#endif
4402 if (readlinebuffer_delim (thisline, stdin, delimiter) == 0)
4403 {
4404 if (ferror (stdin))
b3478306 4405@@ -416,6 +625,14 @@ check_file (const char *infile, const ch
7cf3a80b
ML
4406 }
4407 thisfield = find_field (thisline);
4408 thislen = thisline->length - 1 - (thisfield - thisline->buffer);
4409+#if HAVE_MBRTOWC
4410+ if (MB_CUR_MAX > 1)
4411+ {
4412+ match = !different_multi (thisfield, prevfield,
4413+ thislen, prevlen, thisstate, prevstate);
4414+ }
4415+ else
4416+#endif
4417 match = !different (thisfield, prevfield, thislen, prevlen);
4418 match_count += match;
4419
b3478306 4420@@ -448,6 +665,9 @@ check_file (const char *infile, const ch
7cf3a80b
ML
4421 SWAP_LINES (prevline, thisline);
4422 prevfield = thisfield;
4423 prevlen = thislen;
4424+#if HAVE_MBRTOWC
4425+ prevstate = thisstate;
4426+#endif
4427 if (!match)
4428 match_count = 0;
4429 }
b3478306 4430@@ -493,6 +713,19 @@ main (int argc, char **argv)
7cf3a80b
ML
4431
4432 atexit (close_stdout);
4433
4434+#if HAVE_MBRTOWC
4435+ if (MB_CUR_MAX > 1)
4436+ {
4437+ find_field = find_field_multi;
4438+ }
4439+ else
4440+#endif
4441+ {
4442+ find_field = find_field_uni;
4443+ }
4444+
4445+
4446+
4447 skip_chars = 0;
4448 skip_fields = 0;
4449 check_chars = SIZE_MAX;
b3478306
PM
4450diff -Naurp coreutils-8.32.orig/tests/expand/mb.sh coreutils-8.32/tests/expand/mb.sh
4451--- coreutils-8.32.orig/tests/expand/mb.sh 1970-01-01 08:00:00.000000000 +0800
4452+++ coreutils-8.32/tests/expand/mb.sh 2020-03-08 12:10:27.738236560 +0800
e1fb4052 4453@@ -0,0 +1,183 @@
7cf3a80b 4454+#!/bin/sh
7cf3a80b 4455+
b3478306 4456+# Copyright (C) 2012-2015 Free Software Foundation, Inc.
7cf3a80b 4457+
e1fb4052
MF
4458+# This program is free software: you can redistribute it and/or modify
4459+# it under the terms of the GNU General Public License as published by
4460+# the Free Software Foundation, either version 3 of the License, or
4461+# (at your option) any later version.
4462+
4463+# This program is distributed in the hope that it will be useful,
4464+# but WITHOUT ANY WARRANTY; without even the implied warranty of
4465+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
4466+# GNU General Public License for more details.
4467+
4468+# You should have received a copy of the GNU General Public License
4469+# along with this program. If not, see <http://www.gnu.org/licenses/>.
4470+
4471+. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
4472+print_ver_ expand
7cf3a80b
ML
4473+
4474+export LC_ALL=en_US.UTF-8
7cf3a80b 4475+
e1fb4052
MF
4476+#input containing multibyte characters
4477+cat <<\EOF > in || framework_failure_
4478+1234567812345678123456781
4479+. . . .
4480+a b c d
4481+. . . .
4482+ä ö ü ß
4483+. . . .
4484+EOF
4485+env printf ' äöü\t. öüä. \tä xx\n' >> in || framework_failure_
4486+
4487+cat <<\EOF > exp || framework_failure_
4488+1234567812345678123456781
4489+. . . .
4490+a b c d
4491+. . . .
4492+ä ö ü ß
4493+. . . .
4494+ äöü . öüä. ä xx
7cf3a80b
ML
4495+EOF
4496+
e1fb4052
MF
4497+expand < in > out || fail=1
4498+compare exp out > /dev/null 2>&1 || fail=1
4499+
4500+#multiple files as an input
4501+cat <<\EOF >> exp || framework_failure_
4502+1234567812345678123456781
4503+. . . .
4504+a b c d
4505+. . . .
4506+ä ö ü ß
4507+. . . .
4508+ äöü . öüä. ä xx
4509+EOF
7cf3a80b 4510+
e1fb4052
MF
4511+expand ./in ./in > out || fail=1
4512+compare exp out > /dev/null 2>&1 || fail=1
4513+
4514+#test characters with display widths != 1
4515+env printf '12345678
4516+e\t|ascii(1)
4517+\u00E9\t|composed(1)
4518+e\u0301\t|decomposed(1)
4519+\u3000\t|ideo-space(2)
4520+\uFF0D\t|full-hypen(2)
4521+' > in || framework_failure_
4522+
4523+env printf '12345678
4524+e |ascii(1)
4525+\u00E9 |composed(1)
4526+e\u0301 |decomposed(1)
4527+\u3000 |ideo-space(2)
4528+\uFF0D |full-hypen(2)
4529+' > exp || framework_failure_
4530+
4531+expand < in > out || fail=1
4532+compare exp out > /dev/null 2>&1 || fail=1
4533+
4534+#shouldn't fail with "input line too long"
4535+#when a line starts with a control character
4536+env printf '\n' > in || framework_failure_
4537+
4538+expand < in > out || fail=1
4539+compare in out > /dev/null 2>&1 || fail=1
4540+
4541+#non-Unicode characters interspersed between Unicode ones
4542+env printf '12345678
4543+\t\xFF|
4544+\xFF\t|
4545+\t\xFFä|
4546+ä\xFF\t|
4547+\tä\xFF|
4548+\xFF\tä|
4549+äbcdef\xFF\t|
4550+' > in || framework_failure_
4551+
4552+env printf '12345678
4553+ \xFF|
4554+\xFF |
4555+ \xFFä|
4556+ä\xFF |
4557+ ä\xFF|
4558+\xFF ä|
4559+äbcdef\xFF |
4560+' > exp || framework_failure_
4561+
4562+expand < in > out || fail=1
4563+compare exp out > /dev/null 2>&1 || fail=1
4564+
4565+
4566+
4567+#BOM header test 1
4568+printf "\xEF\xBB\xBF" > in; cat <<\EOF >> in || framework_failure_
4569+1234567812345678123456781
4570+. . . .
4571+a b c d
4572+. . . .
4573+ä ö ü ß
4574+. . . .
4575+EOF
4576+env printf ' äöü\t. öüä. \tä xx\n' >> in || framework_failure_
4577+
4578+printf "\xEF\xBB\xBF" > exp; cat <<\EOF >> exp || framework_failure_
4579+1234567812345678123456781
4580+. . . .
4581+a b c d
4582+. . . .
4583+ä ö ü ß
4584+. . . .
4585+ äöü . öüä. ä xx
4586+EOF
7cf3a80b
ML
4587+
4588+
e1fb4052
MF
4589+expand < in > out || fail=1
4590+compare exp out > /dev/null 2>&1 || fail=1
4591+
4592+LANG=C expand < in > out || fail=1
4593+compare exp out > /dev/null 2>&1 || fail=1
4594+
4595+LC_ALL=C expand < in > out || fail=1
4596+compare exp out > /dev/null 2>&1 || fail=1
4597+
4598+
4599+printf '\xEF\xBB\xBF' > in1; cat <<\EOF >> in1 || framework_failure_
4600+1234567812345678123456781
4601+. . . .
4602+a b c d
4603+. . . .
4604+ä ö ü ß
4605+. . . .
4606+EOF
4607+env printf ' äöü\t. öüä. \tä xx\n' >> in1 || framework_failure_
4608+
4609+
4610+printf '\xEF\xBB\xBF' > exp; cat <<\EOF >> exp || framework_failure_
4611+1234567812345678123456781
4612+. . . .
4613+a b c d
4614+. . . .
4615+ä ö ü ß
4616+. . . .
4617+ äöü . öüä. ä xx
4618+1234567812345678123456781
4619+. . . .
4620+a b c d
4621+. . . .
4622+ä ö ü ß
4623+. . . .
4624+ äöü . öüä. ä xx
4625+EOF
4626+
4627+expand in1 in1 > out || fail=1
4628+compare exp out > /dev/null 2>&1 || fail=1
4629+
4630+LANG=C expand in1 in1 > out || fail=1
4631+compare exp out > /dev/null 2>&1 || fail=1
4632+
4633+LC_ALL=C expand in1 in1 > out || fail=1
4634+compare exp out > /dev/null 2>&1 || fail=1
4635+
4636+exit $fail
b3478306
PM
4637diff -Naurp coreutils-8.32.orig/tests/i18n/sort.sh coreutils-8.32/tests/i18n/sort.sh
4638--- coreutils-8.32.orig/tests/i18n/sort.sh 1970-01-01 08:00:00.000000000 +0800
4639+++ coreutils-8.32/tests/i18n/sort.sh 2020-03-08 12:10:27.738236560 +0800
7cf3a80b
ML
4640@@ -0,0 +1,29 @@
4641+#!/bin/sh
4642+# Verify sort's multi-byte support.
4643+
4644+. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
4645+print_ver_ sort
4646+
4647+export LC_ALL=en_US.UTF-8
4648+locale -k LC_CTYPE | grep -q "charmap.*UTF-8" \
4649+ || skip_ "No UTF-8 locale available"
4650+
4651+# Enable heap consistency checking on older systems
4652+export MALLOC_CHECK_=2
4653+
4654+
4655+# check buffer overflow issue due to
4656+# expanding multi-byte representation due to case conversion
4657+# https://bugzilla.suse.com/show_bug.cgi?id=928749
4658+cat <<EOF > exp
4659+.
4660
4661+EOF
4662+cat <<EOF | sort -f > out || fail=1
4663+.
4664
4665+EOF
4666+compare exp out || { fail=1; cat out; }
4667+
4668+
4669+Exit $fail
b3478306
PM
4670diff -Naurp coreutils-8.32.orig/tests/local.mk coreutils-8.32/tests/local.mk
4671--- coreutils-8.32.orig/tests/local.mk 2020-03-02 07:25:03.000000000 +0800
4672+++ coreutils-8.32/tests/local.mk 2020-03-08 12:10:27.738236560 +0800
4673@@ -369,6 +369,8 @@ all_tests = \
7cf3a80b
ML
4674 tests/misc/sort-discrim.sh \
4675 tests/misc/sort-files0-from.pl \
4676 tests/misc/sort-float.sh \
4677+ tests/misc/sort-mb-tests.sh \
4678+ tests/i18n/sort.sh \
e1fb4052 4679 tests/misc/sort-h-thousands-sep.sh \
7cf3a80b
ML
4680 tests/misc/sort-merge.pl \
4681 tests/misc/sort-merge-fdlimit.sh \
b3478306 4682@@ -567,6 +569,7 @@ all_tests = \
e1fb4052
MF
4683 tests/du/threshold.sh \
4684 tests/du/trailing-slash.sh \
4685 tests/du/two-args.sh \
4686+ tests/expand/mb.sh \
4687 tests/id/gnu-zero-uids.sh \
4688 tests/id/no-context.sh \
4689 tests/id/context.sh \
b3478306 4690@@ -714,6 +717,7 @@ all_tests = \
e1fb4052
MF
4691 tests/touch/read-only.sh \
4692 tests/touch/relative.sh \
4693 tests/touch/trailing-slash.sh \
4694+ tests/unexpand/mb.sh \
4695 $(all_root_tests)
4696
4697 # See tests/factor/create-test.sh.
b3478306
PM
4698diff -Naurp coreutils-8.32.orig/tests/misc/expand.pl coreutils-8.32/tests/misc/expand.pl
4699--- coreutils-8.32.orig/tests/misc/expand.pl 2020-01-01 22:13:13.000000000 +0800
4700+++ coreutils-8.32/tests/misc/expand.pl 2020-03-08 12:10:27.738236560 +0800
e1fb4052 4701@@ -27,6 +27,15 @@ my $prog = 'expand';
7cf3a80b
ML
4702 # Turn off localization of executable's output.
4703 @ENV{qw(LANGUAGE LANG LC_ALL)} = ('C') x 3;
4704
4705+#comment out next line to disable multibyte tests
4706+my $mb_locale = $ENV{LOCALE_FR_UTF8};
4707+! defined $mb_locale || $mb_locale eq 'none'
4708+ and $mb_locale = 'C';
4709+
4710+my $prog = 'expand';
4711+my $try = "Try \`$prog --help' for more information.\n";
4712+my $inval = "$prog: invalid byte, character or field list\n$try";
4713+
4714 my @Tests =
4715 (
4716 ['t1', '--tabs=3', {IN=>"a\tb"}, {OUT=>"a b"}],
b3478306
PM
4717@@ -168,6 +177,8 @@ my @Tests =
4718
e1fb4052
MF
4719
4720 # Test errors
4721+ # FIXME: The following tests contain ‘quoting’ specific to LC_MESSAGES
4722+ # So we force LC_MESSAGES=C to make them pass.
4723 ['e1', '--tabs="a"', {IN=>''}, {OUT=>''}, {EXIT=>1},
4724 {ERR => "$prog: tab size contains invalid character(s): 'a'\n"}],
4725 ['e2', "-t $UINTMAX_OFLOW", {IN=>''}, {OUT=>''}, {EXIT=>1},
b3478306 4726@@ -184,6 +195,37 @@ my @Tests =
e1fb4052 4727 {ERR => "$prog: '/' specifier not at start of number: '/'\n"}],
7cf3a80b
ML
4728 );
4729
4730+if ($mb_locale ne 'C')
4731+ {
4732+ # Duplicate each test vector, appending "-mb" to the test name and
4733+ # inserting {ENV => "LANG=$mb_locale LC_MESSAGES=C"} in the copy, so that we
4734+ # provide coverage for the distro-added multi-byte code paths.
4735+ my @new;
4736+ foreach my $t (@Tests)
4737+ {
4738+ my @new_t = @$t;
4739+ my $test_name = shift @new_t;
4740+
4741+ # Depending on whether expand is multi-byte-patched,
4742+ # it emits different diagnostics:
4743+ # non-MB: invalid byte or field list
4744+ # MB: invalid byte, character or field list
4745+ # Adjust the expected error output accordingly.
4746+ if (grep {ref $_ eq 'HASH' && exists $_->{ERR} && $_->{ERR} eq $inval}
4747+ (@new_t))
4748+ {
4749+ my $sub = {ERR_SUBST => 's/, character//'};
4750+ push @new_t, $sub;
4751+ push @$t, $sub;
4752+ }
e1fb4052 4753+ push @new, ["$test_name-mb", @new_t, {ENV => "LANG=$mb_locale LC_MESSAGES=C"}];
7cf3a80b
ML
4754+ }
4755+ push @Tests, @new;
4756+ }
4757+
4758+
4759+@Tests = triple_test \@Tests;
4760+
4761 my $save_temps = $ENV{DEBUG};
4762 my $verbose = $ENV{VERBOSE};
4763
b3478306
PM
4764diff -Naurp coreutils-8.32.orig/tests/misc/fold.pl coreutils-8.32/tests/misc/fold.pl
4765--- coreutils-8.32.orig/tests/misc/fold.pl 2020-01-01 22:13:13.000000000 +0800
4766+++ coreutils-8.32/tests/misc/fold.pl 2020-03-08 12:10:27.738236560 +0800
7cf3a80b
ML
4767@@ -20,9 +20,18 @@ use strict;
4768
4769 (my $program_name = $0) =~ s|.*/||;
4770
4771+my $prog = 'fold';
4772+my $try = "Try \`$prog --help' for more information.\n";
4773+my $inval = "$prog: invalid byte, character or field list\n$try";
4774+
4775 # Turn off localization of executable's output.
4776 @ENV{qw(LANGUAGE LANG LC_ALL)} = ('C') x 3;
4777
4778+# uncommented to enable multibyte paths
4779+my $mb_locale = $ENV{LOCALE_FR_UTF8};
4780+! defined $mb_locale || $mb_locale eq 'none'
4781+ and $mb_locale = 'C';
4782+
4783 my @Tests =
4784 (
4785 ['s1', '-w2 -s', {IN=>"a\t"}, {OUT=>"a\n\t"}],
4786@@ -31,9 +40,48 @@ my @Tests =
4787 ['s4', '-w4 -s', {IN=>"abc ef\n"}, {OUT=>"abc \nef\n"}],
4788 );
4789
4790+# When a multibyte locale is available, run every test a second time
4791+# under that locale.
4792+if ($mb_locale ne 'C')
4793+ {
4794+ # Duplicate each test vector, appending "-mb" to the test name and
4795+ # inserting {ENV => "LC_ALL=$mb_locale"} in the copy, so that we
4796+ # provide coverage for the distro-added multi-byte code paths.
4797+ my @new;
4798+ foreach my $t (@Tests)
4799+ {
4800+ my @new_t = @$t;
4801+ my $test_name = shift @new_t;
4802+
4803+ # Depending on whether fold is multi-byte-patched,
4804+ # it emits different diagnostics:
4805+ # non-MB: invalid byte or field list
4806+ # MB: invalid byte, character or field list
4807+ # Adjust the expected error output accordingly.
4808+ if (grep {ref $_ eq 'HASH' && exists $_->{ERR} && $_->{ERR} eq $inval}
4809+ (@new_t))
4810+ {
4811+ my $sub = {ERR_SUBST => 's/, character//'};
4812+ push @new_t, $sub;
4813+ push @$t, $sub;
4814+ }
4815+ push @new, ["$test_name-mb", @new_t, {ENV => "LC_ALL=$mb_locale"}];
4816+ }
4817+ push @Tests, @new;
4818+ }
4819+
4820+@Tests = triple_test \@Tests;
4821+
4822+# Remember that triple_test creates from each test with exactly one "IN"
4823+# file two more tests (.p and .r suffix on name) corresponding to reading
4824+# input from a file and from a pipe. The pipe-reading test would fail
4825+# due to a race condition about 1 in 20 times.
4826+# Remove the IN_PIPE version of the "output-is-input" test above.
4827+# The others aren't susceptible because they have three inputs each.
4828+@Tests = grep {$_->[0] ne 'output-is-input.p'} @Tests;
4829+
4830 my $save_temps = $ENV{DEBUG};
4831 my $verbose = $ENV{VERBOSE};
4832
4833-my $prog = 'fold';
4834 my $fail = run_tests ($program_name, $prog, \@Tests, $save_temps, $verbose);
4835 exit $fail;
b3478306
PM
4836diff -Naurp coreutils-8.32.orig/tests/misc/join.pl coreutils-8.32/tests/misc/join.pl
4837--- coreutils-8.32.orig/tests/misc/join.pl 2020-01-01 22:13:13.000000000 +0800
4838+++ coreutils-8.32/tests/misc/join.pl 2020-03-08 12:10:27.738236560 +0800
7cf3a80b
ML
4839@@ -25,6 +25,15 @@ my $limits = getlimits ();
4840
4841 my $prog = 'join';
4842
4843+my $try = "Try \`$prog --help' for more information.\n";
4844+my $inval = "$prog: invalid byte, character or field list\n$try";
4845+
4846+my $mb_locale;
4847+#Comment out next line to disable multibyte tests
4848+$mb_locale = $ENV{LOCALE_FR_UTF8};
4849+! defined $mb_locale || $mb_locale eq 'none'
4850+ and $mb_locale = 'C';
4851+
4852 my $delim = chr 0247;
4853 sub t_subst ($)
4854 {
b3478306 4855@@ -333,8 +342,49 @@ foreach my $t (@tv)
7cf3a80b
ML
4856 push @Tests, $new_ent;
4857 }
4858
4859+# When a multibyte locale is available, run every test a second time
4860+# under that locale.
4861+if ($mb_locale ne 'C')
4862+ {
4863+ # Duplicate each test vector, appending "-mb" to the test name and
4864+ # inserting {ENV => "LC_ALL=$mb_locale"} in the copy, so that we
4865+ # provide coverage for the distro-added multi-byte code paths.
4866+ my @new;
4867+ foreach my $t (@Tests)
4868+ {
4869+ my @new_t = @$t;
4870+ my $test_name = shift @new_t;
4871+
4872+ # Depending on whether join is multi-byte-patched,
4873+ # it emits different diagnostics:
4874+ # non-MB: invalid byte or field list
4875+ # MB: invalid byte, character or field list
4876+ # Adjust the expected error output accordingly.
4877+ if (grep {ref $_ eq 'HASH' && exists $_->{ERR} && $_->{ERR} eq $inval}
4878+ (@new_t))
4879+ {
4880+ my $sub = {ERR_SUBST => 's/, character//'};
4881+ push @new_t, $sub;
4882+ push @$t, $sub;
4883+ }
4884+ #Adjust the output of some error messages that include test_name for mb
4885+ if (grep {ref $_ eq 'HASH' && exists $_->{ERR}}
4886+ (@new_t))
4887+ {
4888+ my $sub2 = {ERR_SUBST => "s/$test_name-mb/$test_name/"};
4889+ push @new_t, $sub2;
4890+ push @$t, $sub2;
4891+ }
4892+ push @new, ["$test_name-mb", @new_t, {ENV => "LC_ALL=$mb_locale"}];
4893+ }
4894+ push @Tests, @new;
4895+ }
4896+
4897 @Tests = triple_test \@Tests;
4898
4899+#skip invalid-j-mb test, it is failing because of the format
4900+@Tests = grep {$_->[0] ne 'invalid-j-mb'} @Tests;
4901+
4902 my $save_temps = $ENV{DEBUG};
4903 my $verbose = $ENV{VERBOSE};
4904
b3478306
PM
4905diff -Naurp coreutils-8.32.orig/tests/misc/sort-mb-tests.sh coreutils-8.32/tests/misc/sort-mb-tests.sh
4906--- coreutils-8.32.orig/tests/misc/sort-mb-tests.sh 1970-01-01 08:00:00.000000000 +0800
4907+++ coreutils-8.32/tests/misc/sort-mb-tests.sh 2020-03-08 12:10:27.739236560 +0800
7cf3a80b
ML
4908@@ -0,0 +1,45 @@
4909+#!/bin/sh
4910+# Verify sort's multi-byte support.
4911+
4912+. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
4913+print_ver_ sort
4914+
4915+export LC_ALL=en_US.UTF-8
4916+locale -k LC_CTYPE | grep -q "charmap.*UTF-8" \
4917+ || skip_ "No UTF-8 locale available"
4918+
4919+
4920+cat <<EOF > exp
4921+Banana@5
4922+Apple@10
4923+Citrus@20
4924+Cherry@30
4925+EOF
4926+
4927+cat <<EOF | sort -t @ -k2 -n > out || fail=1
4928+Apple@10
4929+Banana@5
4930+Citrus@20
4931+Cherry@30
4932+EOF
4933+
4934+compare exp out || { fail=1; cat out; }
4935+
4936+
4937+cat <<EOF > exp
4938+Citrus@AA20@@5
4939+Cherry@AA30@@10
4940+Apple@AA10@@20
4941+Banana@AA5@@30
4942+EOF
4943+
4944+cat <<EOF | sort -t @ -k4 -n > out || fail=1
4945+Apple@AA10@@20
4946+Banana@AA5@@30
4947+Citrus@AA20@@5
4948+Cherry@AA30@@10
4949+EOF
4950+
4951+compare exp out || { fail=1; cat out; }
4952+
4953+Exit $fail
b3478306
PM
4954diff -Naurp coreutils-8.32.orig/tests/misc/sort-merge.pl coreutils-8.32/tests/misc/sort-merge.pl
4955--- coreutils-8.32.orig/tests/misc/sort-merge.pl 2020-01-01 22:13:13.000000000 +0800
4956+++ coreutils-8.32/tests/misc/sort-merge.pl 2020-03-08 12:10:27.739236560 +0800
7cf3a80b
ML
4957@@ -26,6 +26,15 @@ my $prog = 'sort';
4958 # Turn off localization of executable's output.
4959 @ENV{qw(LANGUAGE LANG LC_ALL)} = ('C') x 3;
4960
4961+my $mb_locale;
4962+# uncommented according to upstream commit enabling multibyte paths
4963+$mb_locale = $ENV{LOCALE_FR_UTF8};
4964+! defined $mb_locale || $mb_locale eq 'none'
4965+ and $mb_locale = 'C';
4966+
4967+my $try = "Try \`$prog --help' for more information.\n";
4968+my $inval = "$prog: invalid byte, character or field list\n$try";
4969+
4970 # three empty files and one that says 'foo'
4971 my @inputs = (+(map{{IN=> {"empty$_"=> ''}}}1..3), {IN=> {foo=> "foo\n"}});
4972
4973@@ -77,6 +86,39 @@ my @Tests =
4974 {OUT=>$big_input}],
4975 );
4976
4977+# When a multibyte locale is available, run every test a second time
4978+# under that locale (the nmerge-* tests are not duplicated).
4979+if ($mb_locale ne 'C')
4980+ {
4981+ # Duplicate each test vector, appending "-mb" to the test name and
4982+ # inserting {ENV => "LC_ALL=$mb_locale"} in the copy, so that we
4983+ # provide coverage for the distro-added multi-byte code paths.
4984+ my @new;
4985+ foreach my $t (@Tests)
4986+ {
4987+ my @new_t = @$t;
4988+ my $test_name = shift @new_t;
4989+
4990+ # Depending on whether sort is multi-byte-patched,
4991+ # it emits different diagnostics:
4992+ # non-MB: invalid byte or field list
4993+ # MB: invalid byte, character or field list
4994+ # Adjust the expected error output accordingly.
4995+ if (grep {ref $_ eq 'HASH' && exists $_->{ERR} && $_->{ERR} eq $inval}
4996+ (@new_t))
4997+ {
4998+ my $sub = {ERR_SUBST => 's/, character//'};
4999+ push @new_t, $sub;
5000+ push @$t, $sub;
5001+ }
5002+ next if ($test_name =~ "nmerge-.");
5003+ push @new, ["$test_name-mb", @new_t, {ENV => "LC_ALL=$mb_locale"}];
5004+ }
5005+ push @Tests, @new;
5006+ }
5007+
5008+@Tests = triple_test \@Tests;
5009+
5010 my $save_temps = $ENV{DEBUG};
5011 my $verbose = $ENV{VERBOSE};
5012
b3478306
PM
5013diff -Naurp coreutils-8.32.orig/tests/misc/sort.pl coreutils-8.32/tests/misc/sort.pl
5014--- coreutils-8.32.orig/tests/misc/sort.pl 2020-01-01 22:13:13.000000000 +0800
5015+++ coreutils-8.32/tests/misc/sort.pl 2020-03-08 12:10:27.739236560 +0800
7cf3a80b
ML
5016@@ -24,10 +24,15 @@ my $prog = 'sort';
5017 # Turn off localization of executable's output.
5018 @ENV{qw(LANGUAGE LANG LC_ALL)} = ('C') x 3;
5019
5020-my $mb_locale = $ENV{LOCALE_FR_UTF8};
5021+my $mb_locale;
5022+#Comment out next line to disable multibyte tests
5023+$mb_locale = $ENV{LOCALE_FR_UTF8};
5024 ! defined $mb_locale || $mb_locale eq 'none'
5025 and $mb_locale = 'C';
5026
5027+my $try = "Try \`$prog --help' for more information.\n";
5028+my $inval = "$prog: invalid byte, character or field list\n$try";
5029+
5030 # Since each test is run with a file name and with redirected stdin,
5031 # the name in the diagnostic is either the file name or "-".
5032 # Normalize each diagnostic to use '-'.
e1fb4052 5033@@ -423,6 +428,38 @@ foreach my $t (@Tests)
7cf3a80b
ML
5034 }
5035 }
5036
5037+if ($mb_locale ne 'C')
5038+ {
5039+ # Duplicate each test vector, appending "-mb" to the test name and
5040+ # inserting {ENV => "LC_ALL=$mb_locale"} in the copy, so that we
5041+ # provide coverage for the distro-added multi-byte code paths.
5042+ my @new;
5043+ foreach my $t (@Tests)
5044+ {
5045+ my @new_t = @$t;
5046+ my $test_name = shift @new_t;
5047+
5048+ # Depending on whether sort is multi-byte-patched,
5049+ # it emits different diagnostics:
5050+ # non-MB: invalid byte or field list
5051+ # MB: invalid byte, character or field list
5052+ # Adjust the expected error output accordingly.
5053+ if (grep {ref $_ eq 'HASH' && exists $_->{ERR} && $_->{ERR} eq $inval}
5054+ (@new_t))
5055+ {
5056+ my $sub = {ERR_SUBST => 's/, character//'};
5057+ push @new_t, $sub;
5058+ push @$t, $sub;
5059+ }
5060+ #disable several failing tests until investigation, disable all tests with envvars set
5061+ next if (grep {ref $_ eq 'HASH' && exists $_->{ENV}} (@new_t));
5062+ next if ($test_name =~ "18g" or $test_name =~ "sort-numeric" or $test_name =~ "08[ab]" or $test_name =~ "03[def]" or $test_name =~ "h4" or $test_name =~ "n1" or $test_name =~ "2[01]a");
5063+ next if ($test_name =~ "11[ab]"); # avoid FP: expected result differs to MB result due to collation rules.
5064+ push @new, ["$test_name-mb", @new_t, {ENV => "LC_ALL=$mb_locale"}];
5065+ }
5066+ push @Tests, @new;
5067+ }
5068+
5069 @Tests = triple_test \@Tests;
5070
5071 # Remember that triple_test creates from each test with exactly one "IN"
e1fb4052 5072@@ -432,6 +469,7 @@ foreach my $t (@Tests)
7cf3a80b
ML
5073 # Remove the IN_PIPE version of the "output-is-input" test above.
5074 # The others aren't susceptible because they have three inputs each.
5075 @Tests = grep {$_->[0] ne 'output-is-input.p'} @Tests;
5076+@Tests = grep {$_->[0] ne 'output-is-input-mb.p'} @Tests;
5077
5078 my $save_temps = $ENV{DEBUG};
5079 my $verbose = $ENV{VERBOSE};
b3478306
PM
5080diff -Naurp coreutils-8.32.orig/tests/misc/unexpand.pl coreutils-8.32/tests/misc/unexpand.pl
5081--- coreutils-8.32.orig/tests/misc/unexpand.pl 2020-01-01 22:13:13.000000000 +0800
5082+++ coreutils-8.32/tests/misc/unexpand.pl 2020-03-08 12:10:27.739236560 +0800
7cf3a80b
ML
5083@@ -27,6 +27,14 @@ my $limits = getlimits ();
5084
5085 my $prog = 'unexpand';
5086
5087+# comment out next line to disable multibyte tests
5088+my $mb_locale = $ENV{LOCALE_FR_UTF8};
5089+! defined $mb_locale || $mb_locale eq 'none'
5090+ and $mb_locale = 'C';
5091+
5092+my $try = "Try \`$prog --help' for more information.\n";
5093+my $inval = "$prog: invalid byte, character or field list\n$try";
5094+
5095 my @Tests =
5096 (
5097 ['a1', {IN=> ' 'x 1 ."y\n"}, {OUT=> ' 'x 1 ."y\n"}],
e1fb4052
MF
5098@@ -128,6 +136,37 @@ my @Tests =
5099 ['ts2', '-t5,8', {IN=>"x\t \t y\n"}, {OUT=>"x\t\t y\n"}],
7cf3a80b
ML
5100 );
5101
5102+if ($mb_locale ne 'C')
5103+ {
5104+ # Duplicate each test vector, appending "-mb" to the test name and
5105+ # inserting {ENV => "LC_ALL=$mb_locale"} in the copy, so that we
5106+ # provide coverage for the distro-added multi-byte code paths.
5107+ my @new;
5108+ foreach my $t (@Tests)
5109+ {
5110+ my @new_t = @$t;
5111+ my $test_name = shift @new_t;
5112+
5113+ # Depending on whether unexpand is multi-byte-patched,
5114+ # it emits different diagnostics:
5115+ # non-MB: invalid byte or field list
5116+ # MB: invalid byte, character or field list
5117+ # Adjust the expected error output accordingly.
5118+ if (grep {ref $_ eq 'HASH' && exists $_->{ERR} && $_->{ERR} eq $inval}
5119+ (@new_t))
5120+ {
5121+ my $sub = {ERR_SUBST => 's/, character//'};
5122+ push @new_t, $sub;
5123+ push @$t, $sub;
5124+ }
5125+ next if ($test_name =~ 'b-1');
5126+ push @new, ["$test_name-mb", @new_t, {ENV => "LC_ALL=$mb_locale"}];
5127+ }
5128+ push @Tests, @new;
5129+ }
5130+
5131+@Tests = triple_test \@Tests;
5132+
5133 my $save_temps = $ENV{DEBUG};
5134 my $verbose = $ENV{VERBOSE};
5135
b3478306
PM
5136diff -Naurp coreutils-8.32.orig/tests/misc/uniq.pl coreutils-8.32/tests/misc/uniq.pl
5137--- coreutils-8.32.orig/tests/misc/uniq.pl 2020-01-01 22:13:13.000000000 +0800
5138+++ coreutils-8.32/tests/misc/uniq.pl 2020-03-08 12:10:27.739236560 +0800
7cf3a80b
ML
5139@@ -23,9 +23,17 @@ my $limits = getlimits ();
5140 my $prog = 'uniq';
5141 my $try = "Try '$prog --help' for more information.\n";
5142
5143+my $inval = "$prog: invalid byte, character or field list\n$try";
5144+
5145 # Turn off localization of executable's output.
5146 @ENV{qw(LANGUAGE LANG LC_ALL)} = ('C') x 3;
5147
5148+my $mb_locale;
5149+#Comment out next line to disable multibyte tests
5150+$mb_locale = $ENV{LOCALE_FR_UTF8};
5151+! defined $mb_locale || $mb_locale eq 'none'
5152+ and $mb_locale = 'C';
5153+
5154 # When possible, create a "-z"-testing variant of each test.
5155 sub add_z_variants($)
5156 {
5157@@ -262,6 +270,53 @@ foreach my $t (@Tests)
5158 and push @$t, {ENV=>'_POSIX2_VERSION=199209'};
5159 }
5160
5161+if ($mb_locale ne 'C')
5162+ {
5163+ # Duplicate each test vector, appending "-mb" to the test name and
5164+ # inserting {ENV => "LC_ALL=$mb_locale"} in the copy, so that we
5165+ # provide coverage for the distro-added multi-byte code paths.
5166+ my @new;
5167+ foreach my $t (@Tests)
5168+ {
5169+ my @new_t = @$t;
5170+ my $test_name = shift @new_t;
5171+
5172+ # Depending on whether uniq is multi-byte-patched,
5173+ # it emits different diagnostics:
5174+ # non-MB: invalid byte or field list
5175+ # MB: invalid byte, character or field list
5176+ # Adjust the expected error output accordingly.
5177+ if (grep {ref $_ eq 'HASH' && exists $_->{ERR} && $_->{ERR} eq $inval}
5178+ (@new_t))
5179+ {
5180+ my $sub = {ERR_SUBST => 's/, character//'};
5181+ push @new_t, $sub;
5182+ push @$t, $sub;
5183+ }
5184+ # In test #145, replace each ‘...’ by '...'.
5185+ if ($test_name =~ "145")
5186+ {
5187+ my $sub = { ERR_SUBST => "s/‘([^’]+)’/'\$1'/g"};
5188+ push @new_t, $sub;
5189+ push @$t, $sub;
5190+ }
5191+ next if ( $test_name =~ "schar"
5192+ or $test_name =~ "^obs-plus"
5193+ or $test_name =~ "119");
5194+ push @new, ["$test_name-mb", @new_t, {ENV => "LC_ALL=$mb_locale"}];
5195+ }
5196+ push @Tests, @new;
5197+ }
5198+
5199+# Remember that triple_test creates from each test with exactly one "IN"
5200+# file two more tests (.p and .r suffix on name) corresponding to reading
5201+# input from a file and from a pipe. The pipe-reading test would fail
5202+# due to a race condition about 1 in 20 times.
5203+# Remove the IN_PIPE version of the "output-is-input" test above.
5204+# The others aren't susceptible because they have three inputs each.
5205+
5206+@Tests = grep {$_->[0] ne 'output-is-input.p'} @Tests;
5207+
5208 @Tests = add_z_variants \@Tests;
5209 @Tests = triple_test \@Tests;
5210
b3478306
PM
5211diff -Naurp coreutils-8.32.orig/tests/pr/pr-tests.pl coreutils-8.32/tests/pr/pr-tests.pl
5212--- coreutils-8.32.orig/tests/pr/pr-tests.pl 2020-01-01 22:13:13.000000000 +0800
5213+++ coreutils-8.32/tests/pr/pr-tests.pl 2020-03-08 12:10:27.739236560 +0800
7cf3a80b
ML
5214@@ -24,6 +24,15 @@ use strict;
5215 my $prog = 'pr';
5216 my $normalize_strerror = "s/': .*/'/";
5217
5218+my $mb_locale;
5219+#Comment out the following line to disable multibyte tests
5220+$mb_locale = $ENV{LOCALE_FR_UTF8};
5221+! defined $mb_locale || $mb_locale eq 'none'
5222+ and $mb_locale = 'C';
5223+
5224+my $try = "Try \`$prog --help' for more information.\n";
5225+my $inval = "$prog: invalid byte, character or field list\n$try";
5226+
5227 my @tv = (
5228
5229 # -b option is no longer an official option. But it's still working to
e1fb4052
MF
5230@@ -474,8 +483,48 @@ push @Tests,
5231 {IN=>{2=>"a\n"}},
5232 {OUT=>"a\t\t\t\t \t\t\ta\n"} ];
7cf3a80b
ML
5233
5234+# When a multibyte locale is available, run every test a second time
5235+# under that locale.
5236+if ($mb_locale ne 'C')
5237+ {
5238+ # Duplicate each test vector, appending "-mb" to the test name and
5239+ # inserting {ENV => "LC_ALL=$mb_locale"} in the copy, so that we
5240+ # provide coverage for the distro-added multi-byte code paths.
5241+ my @new;
5242+ foreach my $t (@Tests)
5243+ {
5244+ my @new_t = @$t;
5245+ my $test_name = shift @new_t;
5246+
5247+ # Depending on whether pr is multi-byte-patched,
5248+ # it emits different diagnostics:
5249+ # non-MB: invalid byte or field list
5250+ # MB: invalid byte, character or field list
5251+ # Adjust the expected error output accordingly.
5252+ if (grep {ref $_ eq 'HASH' && exists $_->{ERR} && $_->{ERR} eq $inval}
5253+ (@new_t))
5254+ {
5255+ my $sub = {ERR_SUBST => 's/, character//'};
5256+ push @new_t, $sub;
5257+ push @$t, $sub;
5258+ }
5259+ #temporarily skip some failing tests
e1fb4052 5260+ next if ($test_name =~ "col-0" or $test_name =~ "col-inval" or $test_name =~ "asan1");
7cf3a80b
ML
5261+ push @new, ["$test_name-mb", @new_t, {ENV => "LC_ALL=$mb_locale"}];
5262+ }
5263+ push @Tests, @new;
5264+ }
5265+
5266 @Tests = triple_test \@Tests;
5267
5268+# Remember that triple_test creates from each test with exactly one "IN"
5269+# file two more tests (.p and .r suffix on name) corresponding to reading
5270+# input from a file and from a pipe. The pipe-reading test would fail
5271+# due to a race condition about 1 in 20 times.
5272+# Remove the IN_PIPE version of the "output-is-input" test above.
5273+# The others aren't susceptible because they have three inputs each.
5274+@Tests = grep {$_->[0] ne 'output-is-input.p'} @Tests;
5275+
5276 my $save_temps = $ENV{DEBUG};
5277 my $verbose = $ENV{VERBOSE};
5278
b3478306
PM
5279diff -Naurp coreutils-8.32.orig/tests/unexpand/mb.sh coreutils-8.32/tests/unexpand/mb.sh
5280--- coreutils-8.32.orig/tests/unexpand/mb.sh 1970-01-01 08:00:00.000000000 +0800
5281+++ coreutils-8.32/tests/unexpand/mb.sh 2020-03-08 12:10:27.739236560 +0800
e1fb4052
MF
5282@@ -0,0 +1,172 @@
5283+#!/bin/sh
5284+
b3478306 5285+# Copyright (C) 2012-2015 Free Software Foundation, Inc.
e1fb4052
MF
5286+
5287+# This program is free software: you can redistribute it and/or modify
5288+# it under the terms of the GNU General Public License as published by
5289+# the Free Software Foundation, either version 3 of the License, or
5290+# (at your option) any later version.
5291+
5292+# This program is distributed in the hope that it will be useful,
5293+# but WITHOUT ANY WARRANTY; without even the implied warranty of
5294+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
5295+# GNU General Public License for more details.
5296+
5297+# You should have received a copy of the GNU General Public License
5298+# along with this program. If not, see <http://www.gnu.org/licenses/>.
5299+
5300+. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
5301+print_ver_ unexpand
5302+
5303+export LC_ALL=en_US.UTF-8
5304+
5305+#input containing multibyte characters
5306+cat > in <<\EOF
5307+1234567812345678123456781
5308+. . . .
5309+a b c d
5310+. . . .
5311+ä ö ü ß
5312+. . . .
5313+ äöü . öüä. ä xx
5314+EOF
5315+
5316+cat > exp <<\EOF
5317+1234567812345678123456781
5318+. . . .
5319+a b c d
5320+. . . .
5321+ä ö ü ß
5322+. . . .
5323+ äöü . öüä. ä xx
5324+EOF
5325+
5326+unexpand -a < in > out || fail=1
5327+compare exp out > /dev/null 2>&1 || fail=1
5328+
5329+
5330+#multiple files as an input
5331+cat >> exp <<\EOF
5332+1234567812345678123456781
5333+. . . .
5334+a b c d
5335+. . . .
5336+ä ö ü ß
5337+. . . .
5338+ äöü . öüä. ä xx
5339+EOF
5340+
5341+
5342+unexpand -a ./in ./in > out || fail=1
5343+compare exp out > /dev/null 2>&1 || fail=1
5344+
5345+#test characters with a display width larger than 1
5346+
5347+env printf '12345678
5348+e |ascii(1)
5349+\u00E9 |composed(1)
5350+e\u0301 |decomposed(1)
5351+\u3000 |ideo-space(2)
5352+\uFF0D |full-hypen(2)
5353+' > in || framework_failure_
5354+
5355+env printf '12345678
5356+e\t|ascii(1)
5357+\u00E9\t|composed(1)
5358+e\u0301\t|decomposed(1)
5359+\u3000\t|ideo-space(2)
5360+\uFF0D\t|full-hypen(2)
5361+' > exp || framework_failure_
5362+
5363+unexpand -a < in > out || fail=1
5364+compare exp out > /dev/null 2>&1 || fail=1
5365+
5366+#test input where a blank of width > 1 is not being substituted
5367+in="$(LC_ALL=en_US.UTF-8 printf ' \u3000 ö ü ß')"
5368+exp='   ö ü ß'
5369+
5370+unexpand -a < in > out || fail=1
5371+compare exp out > /dev/null 2>&1 || fail=1
5372+
5373+#non-Unicode characters interspersed between Unicode ones
5374+env printf '12345678
5375+ \xFF|
5376+\xFF |
5377+ \xFFä|
5378+ä\xFF |
5379+ ä\xFF|
5380+\xFF ä|
5381+äbcdef\xFF |
5382+' > in || framework_failure_
5383+
5384+env printf '12345678
5385+\t\xFF|
5386+\xFF\t|
5387+\t\xFFä|
5388+ä\xFF\t|
5389+\tä\xFF|
5390+\xFF\tä|
5391+äbcdef\xFF\t|
5392+' > exp || framework_failure_
5393+
5394+unexpand -a < in > out || fail=1
5395+compare exp out > /dev/null 2>&1 || fail=1
5396+
5397+#BOM header test 1
5398+printf "\xEF\xBB\xBF" > in; cat <<\EOF >> in || framework_failure_
5399+1234567812345678123456781
5400+. . . .
5401+a b c d
5402+. . . .
5403+ä ö ü ß
5404+. . . .
5405+ äöü . öüä. ä xx
5406+EOF
5407+env printf ' äöü\t. öüä. \tä xx\n' >> in || framework_failure_
5408+
5409+printf "\xEF\xBB\xBF" > exp; cat <<\EOF >> exp || framework_failure_
5410+1234567812345678123456781
5411+. . . .
5412+a b c d
5413+. . . .
5414+ä ö ü ß
5415+. . . .
5416+ äöü . öüä. ä xx
5417+EOF
5418+
5419+unexpand < in > out || fail=1
5420+compare exp out > /dev/null 2>&1 || fail=1
5421+
5422+LANG=C unexpand < in > out || fail=1
5423+compare exp out > /dev/null 2>&1 || fail=1
5424+
5425+LC_ALL=C unexpand < in > out || fail=1
5426+compare exp out > /dev/null 2>&1 || fail=1
5427+
5428+
5429+printf "\xEF\xBB\xBF" > exp; cat <<\EOF >> exp || framework_failure_
5430+1234567812345678123456781
5431+. . . .
5432+a b c d
5433+. . . .
5434+ä ö ü ß
5435+. . . .
5436+ äöü . öüä. ä xx
5437+1234567812345678123456781
5438+. . . .
5439+a b c d
5440+. . . .
5441+ä ö ü ß
5442+. . . .
5443+ äöü . öüä. ä xx
5444+EOF
5445+
5446+
5447+unexpand in in > out || fail=1
5448+compare exp out > /dev/null 2>&1 || fail=1
5449+
5450+LANG=C unexpand in in > out || fail=1
5451+compare exp out > /dev/null 2>&1 || fail=1
5452+
5453+LC_ALL=C unexpand in in > out || fail=1
5454+compare exp out > /dev/null 2>&1 || fail=1