]> git.ipfire.org Git - people/amarx/ipfire-3.x.git/blame - coreutils/patches/coreutils-i18n.patch
Merge branch 'master' of ssh://git.ipfire.org/pub/git/ipfire-3.x into samba4
[people/amarx/ipfire-3.x.git] / coreutils / patches / coreutils-i18n.patch
CommitLineData
fca5c2d6
SS
1Submitted by: DJ Lucas (dj_AT_linuxfromscratch_DOT_org)
2Date: 2016-02-09
3Initial Package Version: 8.25
fbb9790b 4Upstream Status: Rejected
fca5c2d6 5Origin: Based on Suse's i18n patches at https://build.opensuse.org/package/view_file/Base:System/coreutils/coreutils-i18n.patch
fbb9790b
SS
6Description: Fixes several i18n issues with various Coreutils programs
7
fca5c2d6
SS
8diff -Naurp coreutils-8.25-orig/lib/linebuffer.h coreutils-8.25/lib/linebuffer.h
9--- coreutils-8.25-orig/lib/linebuffer.h 2016-01-01 07:45:55.000000000 -0600
10+++ coreutils-8.25/lib/linebuffer.h 2016-02-08 19:07:10.298944609 -0600
56ae3f82
SS
11@@ -21,6 +21,11 @@
12
13 # include <stdio.h>
14
15+/* Get mbstate_t. */
16+# if HAVE_WCHAR_H
17+# include <wchar.h>
18+# endif
19+
fa4603be 20 /* A 'struct linebuffer' holds a line of text. */
56ae3f82
SS
21
22 struct linebuffer
fca5c2d6 23@@ -28,6 +33,9 @@ struct linebuffer
56ae3f82
SS
24 size_t size; /* Allocated. */
25 size_t length; /* Used. */
26 char *buffer;
27+# if HAVE_WCHAR_H
28+ mbstate_t state;
29+# endif
30 };
31
32 /* Initialize linebuffer LINEBUFFER for use. */
fca5c2d6
SS
33diff -Naurp coreutils-8.25-orig/src/cut.c coreutils-8.25/src/cut.c
34--- coreutils-8.25-orig/src/cut.c 2016-01-13 05:08:59.000000000 -0600
35+++ coreutils-8.25/src/cut.c 2016-02-08 19:07:10.300944616 -0600
56ae3f82
SS
36@@ -28,6 +28,11 @@
37 #include <assert.h>
38 #include <getopt.h>
39 #include <sys/types.h>
40+
41+/* Get mbstate_t, mbrtowc(). */
42+#if HAVE_WCHAR_H
43+# include <wchar.h>
44+#endif
45 #include "system.h"
46
47 #include "error.h"
fca5c2d6
SS
48@@ -38,6 +43,18 @@
49
50 #include "set-fields.h"
56ae3f82
SS
51
52+/* MB_LEN_MAX is incorrectly defined to be 1 in at least one GCC
53+ installation; work around this configuration error. */
54+#if !defined MB_LEN_MAX || MB_LEN_MAX < 2
55+# undef MB_LEN_MAX
56+# define MB_LEN_MAX 16
57+#endif
58+
59+/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t. */
60+#if HAVE_MBRTOWC && defined mbstate_t
61+# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0)
62+#endif
63+
6987acf5 64 /* The official name of this program (e.g., no 'g' prefix). */
56ae3f82
SS
65 #define PROGRAM_NAME "cut"
66
fca5c2d6 67@@ -54,6 +71,52 @@
fbb9790b 68 } \
56ae3f82
SS
69 while (0)
70
71+/* Refill the buffer BUF to get a multibyte character. */
72+#define REFILL_BUFFER(BUF, BUFPOS, BUFLEN, STREAM) \
73+ do \
74+ { \
75+ if (BUFLEN < MB_LEN_MAX && !feof (STREAM) && !ferror (STREAM)) \
76+ { \
77+ memmove (BUF, BUFPOS, BUFLEN); \
78+ BUFLEN += fread (BUF + BUFLEN, sizeof(char), BUFSIZ, STREAM); \
79+ BUFPOS = BUF; \
80+ } \
81+ } \
82+ while (0)
83+
84+/* Get the wide character at BUFPOS. BUFPOS itself is not advanced.
fbb9790b 85+ If the byte sequence is not a valid character, CONVFAIL is set to true; otherwise false. */
56ae3f82
SS
86+#define GET_NEXT_WC_FROM_BUFFER(WC, BUFPOS, BUFLEN, MBLENGTH, STATE, CONVFAIL) \
87+ do \
88+ { \
89+ mbstate_t state_bak; \
90+ \
91+ if (BUFLEN < 1) \
92+ { \
93+ WC = WEOF; \
94+ break; \
95+ } \
96+ \
97+ /* Get a wide character. */ \
fbb9790b 98+ CONVFAIL = false; \
56ae3f82
SS
99+ state_bak = STATE; \
100+ MBLENGTH = mbrtowc ((wchar_t *)&WC, BUFPOS, BUFLEN, &STATE); \
101+ \
102+ switch (MBLENGTH) \
103+ { \
104+ case (size_t)-1: \
105+ case (size_t)-2: \
fbb9790b 106+ CONVFAIL = true; \
56ae3f82
SS
107+ STATE = state_bak; \
108+ /* Fall through. */ \
109+ \
110+ case 0: \
111+ MBLENGTH = 1; \
112+ break; \
113+ } \
114+ } \
115+ while (0)
116+
fbb9790b 117
fca5c2d6
SS
118 /* Pointer inside RP. When checking if a byte or field is selected
119 by a finite range, we check if it is between CURRENT_RP.LO
120@@ -61,6 +124,9 @@
121 CURRENT_RP.HI then we make CURRENT_RP to point to the next range pair. */
122 static struct field_range_pair *current_rp;
fbb9790b
SS
123
124+/* Length of the delimiter given as argument to -d. */
125+size_t delimlen;
fca5c2d6
SS
126+
127 /* This buffer is used to support the semantics of the -s option
128 (or lack of same) when the specified field list includes (does
129 not include) the first field. In both of those cases, the entire
130@@ -77,15 +143,25 @@ enum operating_mode
56ae3f82
SS
131 {
132 undefined_mode,
133
134- /* Output characters that are in the given bytes. */
135+ /* Output bytes that are at the given positions. */
136 byte_mode,
137
138+ /* Output characters that are at the given positions. */
139+ character_mode,
140+
fbb9790b 141 /* Output the given delimiter-separated fields. */
56ae3f82
SS
142 field_mode
143 };
144
145 static enum operating_mode operating_mode;
146
147+/* If nonzero, when in byte mode, don't split multibyte characters. */
148+static int byte_mode_character_aware;
149+
150+/* If nonzero, use the single-byte functions even when
151+ this program runs in a multibyte locale. */
152+static int force_singlebyte_mode;
153+
fbb9790b 154 /* If true do not output lines containing no delimiter characters.
56ae3f82
SS
155 Otherwise, all such lines are printed. This option is valid only
156 with field mode. */
fca5c2d6 157@@ -97,6 +173,9 @@ static bool complement;
56ae3f82 158
fbb9790b 159 /* The delimiter character for field mode. */
56ae3f82
SS
160 static unsigned char delim;
161+#if HAVE_WCHAR_H
162+static wchar_t wcdelim;
163+#endif
164
fca5c2d6
SS
165 /* The delimiter for each line/record. */
166 static unsigned char line_delim = '\n';
167@@ -164,7 +243,7 @@ Print selected parts of lines from each
56ae3f82
SS
168 -f, --fields=LIST select only these fields; also print any line\n\
169 that contains no delimiter character, unless\n\
170 the -s option is specified\n\
171- -n (ignored)\n\
172+ -n with -b: don't split multibyte characters\n\
173 "), stdout);
174 fputs (_("\
175 --complement complement the set of selected bytes, characters\n\
fca5c2d6 176@@ -280,6 +359,82 @@ cut_bytes (FILE *stream)
56ae3f82
SS
177 }
178 }
179
180+#if HAVE_MBRTOWC
181+/* This function is used in the following cases.
182+
183+ 1. Read from the stream STREAM, printing to standard output any selected
e7f6ab54 184+ characters.
56ae3f82
SS
185+
186+ 2. Read from stream STREAM, printing to standard output any selected bytes,
187+ without splitting multibyte characters. */
e7f6ab54 188+
56ae3f82
SS
189+static void
190+cut_characters_or_cut_bytes_no_split (FILE *stream)
191+{
fbb9790b 192+ size_t idx; /* number of bytes or characters in the line so far. */
56ae3f82
SS
193+ char buf[MB_LEN_MAX + BUFSIZ]; /* For spooling a read byte sequence. */
194+ char *bufpos; /* Next read position of BUF. */
195+ size_t buflen; /* The length of the byte sequence in buf. */
196+ wint_t wc; /* The wide character just read. */
197+ size_t mblength; /* The byte length of the multibyte sequence that
198+ was converted to WC. */
199+ mbstate_t state; /* State of the stream. */
fbb9790b 200+ bool convfail = false; /* true, when conversion failed. Otherwise false. */
e7f6ab54
SS
201+ /* Whether to begin printing delimiters between ranges for the current line.
202+ Set after we've begun printing data corresponding to the first range. */
203+ bool print_delimiter = false;
56ae3f82
SS
204+
205+ idx = 0;
206+ buflen = 0;
207+ bufpos = buf;
208+ memset (&state, '\0', sizeof(mbstate_t));
209+
fca5c2d6 210+ current_rp = frp;
fbb9790b 211+
56ae3f82
SS
212+ while (1)
213+ {
214+ REFILL_BUFFER (buf, bufpos, buflen, stream);
215+
216+ GET_NEXT_WC_FROM_BUFFER (wc, bufpos, buflen, mblength, state, convfail);
fbb9790b 217+ (void) convfail; /* ignore unused */
56ae3f82
SS
218+
219+ if (wc == WEOF)
220+ {
221+ if (idx > 0)
fca5c2d6 222+ putchar (line_delim);
56ae3f82
SS
223+ break;
224+ }
fca5c2d6 225+ else if (wc == line_delim)
56ae3f82 226+ {
fca5c2d6 227+ putchar (line_delim);
56ae3f82 228+ idx = 0;
e7f6ab54 229+ print_delimiter = false;
fca5c2d6 230+ current_rp = frp;
56ae3f82
SS
231+ }
232+ else
233+ {
fbb9790b
SS
234+ next_item (&idx);
235+ if (print_kth (idx))
e7f6ab54 236+ {
fbb9790b 237+ if (output_delimiter_specified)
e7f6ab54 238+ {
fbb9790b
SS
239+ if (print_delimiter && is_range_start_index (idx))
240+ {
241+ fwrite (output_delimiter_string, sizeof (char),
242+ output_delimiter_length, stdout);
243+ }
244+ print_delimiter = true;
245+ }
e7f6ab54
SS
246+ fwrite (bufpos, mblength, sizeof(char), stdout);
247+ }
56ae3f82
SS
248+ }
249+
250+ buflen -= mblength;
251+ bufpos += mblength;
252+ }
253+}
254+#endif
e7f6ab54 255+
56ae3f82
SS
256 /* Read from stream STREAM, printing to standard output any selected fields. */
257
258 static void
fca5c2d6 259@@ -425,13 +580,211 @@ cut_fields (FILE *stream)
56ae3f82
SS
260 }
261 }
262
263+#if HAVE_MBRTOWC
264+static void
265+cut_fields_mb (FILE *stream)
266+{
267+ int c;
fbb9790b 268+ size_t field_idx;
56ae3f82
SS
269+ int found_any_selected_field;
270+ int buffer_first_field;
271+ int empty_input;
272+ char buf[MB_LEN_MAX + BUFSIZ]; /* For spooling a read byte sequence. */
273+ char *bufpos; /* Next read position of BUF. */
274+ size_t buflen; /* The length of the byte sequence in buf. */
275+ wint_t wc = 0; /* The wide character just read. */
276+ size_t mblength; /* The byte length of the multibyte sequence that
277+ was converted to WC. */
278+ mbstate_t state; /* State of the stream. */
fbb9790b
SS
279+ bool convfail = false; /* true, when conversion failed. Otherwise false. */
280+
fca5c2d6 281+ current_rp = frp;
56ae3f82
SS
282+
283+ found_any_selected_field = 0;
284+ field_idx = 1;
285+ bufpos = buf;
286+ buflen = 0;
287+ memset (&state, '\0', sizeof(mbstate_t));
288+
289+ c = getc (stream);
290+ empty_input = (c == EOF);
291+ if (c != EOF)
e7f6ab54 292+ {
56ae3f82 293+ ungetc (c, stream);
e7f6ab54
SS
294+ wc = 0;
295+ }
56ae3f82
SS
296+ else
297+ wc = WEOF;
298+
299+ /* To support the semantics of the -s flag, we may have to buffer
300+ all of the first field to determine whether it is `delimited.'
301+ But that is unnecessary if all non-delimited lines must be printed
302+ and the first field has been selected, or if non-delimited lines
303+ must be suppressed and the first field has *not* been selected.
304+ That is because a non-delimited line has exactly one field. */
fbb9790b 305+ buffer_first_field = (suppress_non_delimited ^ !print_kth (1));
56ae3f82
SS
306+
307+ while (1)
308+ {
309+ if (field_idx == 1 && buffer_first_field)
310+ {
311+ int len = 0;
312+
313+ while (1)
314+ {
315+ REFILL_BUFFER (buf, bufpos, buflen, stream);
316+
317+ GET_NEXT_WC_FROM_BUFFER
318+ (wc, bufpos, buflen, mblength, state, convfail);
319+
320+ if (wc == WEOF)
321+ break;
322+
323+ field_1_buffer = xrealloc (field_1_buffer, len + mblength);
324+ memcpy (field_1_buffer + len, bufpos, mblength);
325+ len += mblength;
326+ buflen -= mblength;
327+ bufpos += mblength;
328+
fca5c2d6 329+ if (!convfail && (wc == line_delim || wc == wcdelim))
56ae3f82
SS
330+ break;
331+ }
332+
effd5ec1 333+ if (len <= 0 && wc == WEOF)
56ae3f82
SS
334+ break;
335+
336+ /* If the first field extends to the end of line (it is not
337+ delimited) and we are printing all non-delimited lines,
338+ print this one. */
339+ if (convfail || (!convfail && wc != wcdelim))
340+ {
341+ if (suppress_non_delimited)
342+ {
343+ /* Empty. */
344+ }
345+ else
346+ {
347+ fwrite (field_1_buffer, sizeof (char), len, stdout);
348+ /* Make sure the output line is newline terminated. */
fca5c2d6
SS
349+ if (convfail || (!convfail && wc != line_delim))
350+ putchar (line_delim);
56ae3f82
SS
351+ }
352+ continue;
353+ }
354+
fbb9790b 355+ if (print_kth (1))
56ae3f82
SS
356+ {
357+ /* Print the field, but not the trailing delimiter. */
358+ fwrite (field_1_buffer, sizeof (char), len - 1, stdout);
359+ found_any_selected_field = 1;
360+ }
fbb9790b 361+ next_item (&field_idx);
56ae3f82
SS
362+ }
363+
364+ if (wc != WEOF)
365+ {
fbb9790b 366+ if (print_kth (field_idx))
56ae3f82
SS
367+ {
368+ if (found_any_selected_field)
369+ {
370+ fwrite (output_delimiter_string, sizeof (char),
371+ output_delimiter_length, stdout);
372+ }
373+ found_any_selected_field = 1;
374+ }
375+
376+ while (1)
377+ {
378+ REFILL_BUFFER (buf, bufpos, buflen, stream);
379+
380+ GET_NEXT_WC_FROM_BUFFER
381+ (wc, bufpos, buflen, mblength, state, convfail);
382+
383+ if (wc == WEOF)
384+ break;
fca5c2d6 385+ else if (!convfail && (wc == wcdelim || wc == line_delim))
56ae3f82
SS
386+ {
387+ buflen -= mblength;
388+ bufpos += mblength;
389+ break;
390+ }
391+
fbb9790b 392+ if (print_kth (field_idx))
56ae3f82
SS
393+ fwrite (bufpos, mblength, sizeof(char), stdout);
394+
395+ buflen -= mblength;
396+ bufpos += mblength;
397+ }
398+ }
399+
fca5c2d6 400+ if ((!convfail || wc == line_delim) && buflen < 1)
56ae3f82
SS
401+ wc = WEOF;
402+
403+ if (!convfail && wc == wcdelim)
fbb9790b 404+ next_item (&field_idx);
fca5c2d6 405+ else if (wc == WEOF || (!convfail && wc == line_delim))
56ae3f82
SS
406+ {
407+ if (found_any_selected_field
408+ || (!empty_input && !(suppress_non_delimited && field_idx == 1)))
fca5c2d6 409+ putchar (line_delim);
56ae3f82
SS
410+ if (wc == WEOF)
411+ break;
412+ field_idx = 1;
fca5c2d6 413+ current_rp = frp;
56ae3f82
SS
414+ found_any_selected_field = 0;
415+ }
416+ }
417+}
418+#endif
419+
420 static void
421 cut_stream (FILE *stream)
422 {
423- if (operating_mode == byte_mode)
424- cut_bytes (stream);
425+#if HAVE_MBRTOWC
426+ if (MB_CUR_MAX > 1 && !force_singlebyte_mode)
427+ {
428+ switch (operating_mode)
429+ {
430+ case byte_mode:
431+ if (byte_mode_character_aware)
432+ cut_characters_or_cut_bytes_no_split (stream);
433+ else
434+ cut_bytes (stream);
435+ break;
436+
437+ case character_mode:
438+ cut_characters_or_cut_bytes_no_split (stream);
439+ break;
440+
441+ case field_mode:
fbb9790b
SS
442+ if (delimlen == 1)
443+ {
444+ /* Check if we have utf8 multibyte locale, so we can use this
445+ optimization because of uniqueness of characters, which is
446+ not true for e.g. SJIS */
447+ char * loc = setlocale(LC_CTYPE, NULL);
448+ if (loc && (strstr (loc, "UTF-8") || strstr (loc, "utf-8") ||
449+ strstr (loc, "UTF8") || strstr (loc, "utf8")))
450+ {
451+ cut_fields (stream);
452+ break;
453+ }
454+ }
56ae3f82
SS
455+ cut_fields_mb (stream);
456+ break;
457+
458+ default:
459+ abort ();
460+ }
461+ }
462 else
463- cut_fields (stream);
464+#endif
465+ {
466+ if (operating_mode == field_mode)
467+ cut_fields (stream);
468+ else
469+ cut_bytes (stream);
470+ }
471 }
472
473 /* Process file FILE to standard output.
fca5c2d6 474@@ -483,6 +836,7 @@ main (int argc, char **argv)
56ae3f82
SS
475 bool ok;
476 bool delim_specified = false;
1555d43c 477 char *spec_list_string IF_LINT ( = NULL);
56ae3f82 478+ char mbdelim[MB_LEN_MAX + 1];
56ae3f82
SS
479
480 initialize_main (&argc, &argv);
481 set_program_name (argv[0]);
fca5c2d6 482@@ -505,7 +859,6 @@ main (int argc, char **argv)
56ae3f82
SS
483 switch (optc)
484 {
485 case 'b':
486- case 'c':
487 /* Build the byte list. */
488 if (operating_mode != undefined_mode)
489 FATAL_ERROR (_("only one type of list may be specified"));
fca5c2d6 490@@ -513,6 +866,14 @@ main (int argc, char **argv)
56ae3f82
SS
491 spec_list_string = optarg;
492 break;
493
494+ case 'c':
495+ /* Build the character list. */
496+ if (operating_mode != undefined_mode)
497+ FATAL_ERROR (_("only one type of list may be specified"));
498+ operating_mode = character_mode;
499+ spec_list_string = optarg;
500+ break;
501+
502 case 'f':
503 /* Build the field list. */
504 if (operating_mode != undefined_mode)
fca5c2d6 505@@ -524,10 +885,38 @@ main (int argc, char **argv)
56ae3f82
SS
506 case 'd':
507 /* New delimiter. */
6987acf5 508 /* Interpret -d '' to mean 'use the NUL byte as the delimiter.' */
56ae3f82
SS
509- if (optarg[0] != '\0' && optarg[1] != '\0')
510- FATAL_ERROR (_("the delimiter must be a single character"));
511- delim = optarg[0];
512- delim_specified = true;
513+ {
514+#if HAVE_MBRTOWC
515+ if(MB_CUR_MAX > 1)
516+ {
517+ mbstate_t state;
518+
519+ memset (&state, '\0', sizeof(mbstate_t));
520+ delimlen = mbrtowc (&wcdelim, optarg, strnlen(optarg, MB_LEN_MAX), &state);
521+
522+ if (delimlen == (size_t)-1 || delimlen == (size_t)-2)
523+ ++force_singlebyte_mode;
524+ else
525+ {
526+ delimlen = (delimlen < 1) ? 1 : delimlen;
527+ if (wcdelim != L'\0' && *(optarg + delimlen) != '\0')
528+ FATAL_ERROR (_("the delimiter must be a single character"));
529+ memcpy (mbdelim, optarg, delimlen);
e5317bd9 530+ mbdelim[delimlen] = '\0';
fbb9790b
SS
531+ if (delimlen == 1)
532+ delim = *optarg;
56ae3f82
SS
533+ }
534+ }
535+
536+ if (MB_CUR_MAX <= 1 || force_singlebyte_mode)
537+#endif
538+ {
539+ if (optarg[0] != '\0' && optarg[1] != '\0')
540+ FATAL_ERROR (_("the delimiter must be a single character"));
541+ delim = (unsigned char) optarg[0];
542+ }
543+ delim_specified = true;
544+ }
545 break;
546
547 case OUTPUT_DELIMITER_OPTION:
fca5c2d6 548@@ -540,6 +929,7 @@ main (int argc, char **argv)
56ae3f82
SS
549 break;
550
551 case 'n':
552+ byte_mode_character_aware = 1;
553 break;
554
555 case 's':
fca5c2d6
SS
556@@ -579,15 +969,34 @@ main (int argc, char **argv)
557 | (complement ? SETFLD_COMPLEMENT : 0) );
56ae3f82
SS
558
559 if (!delim_specified)
560- delim = '\t';
561+ {
562+ delim = '\t';
563+#ifdef HAVE_MBRTOWC
564+ wcdelim = L'\t';
565+ mbdelim[0] = '\t';
566+ mbdelim[1] = '\0';
567+ delimlen = 1;
568+#endif
569+ }
570
571 if (output_delimiter_string == NULL)
572 {
573- static char dummy[2];
574- dummy[0] = delim;
575- dummy[1] = '\0';
576- output_delimiter_string = dummy;
577- output_delimiter_length = 1;
578+#ifdef HAVE_MBRTOWC
579+ if (MB_CUR_MAX > 1 && !force_singlebyte_mode)
580+ {
581+ output_delimiter_string = xstrdup(mbdelim);
582+ output_delimiter_length = delimlen;
583+ }
584+
585+ if (MB_CUR_MAX <= 1 || force_singlebyte_mode)
586+#endif
587+ {
e7f6ab54 588+ static char dummy[2];
56ae3f82
SS
589+ dummy[0] = delim;
590+ dummy[1] = '\0';
591+ output_delimiter_string = dummy;
592+ output_delimiter_length = 1;
593+ }
594 }
595
596 if (optind == argc)
fca5c2d6
SS
597diff -Naurp coreutils-8.25-orig/src/expand.c coreutils-8.25/src/expand.c
598--- coreutils-8.25-orig/src/expand.c 2016-01-01 07:48:50.000000000 -0600
599+++ coreutils-8.25/src/expand.c 2016-02-08 19:07:10.301944619 -0600
fbb9790b 600@@ -37,12 +37,34 @@
56ae3f82
SS
601 #include <stdio.h>
602 #include <getopt.h>
603 #include <sys/types.h>
604+
605+/* Get mbstate_t, mbrtowc(), wcwidth(). */
606+#if HAVE_WCHAR_H
607+# include <wchar.h>
608+#endif
fbb9790b
SS
609+
610+/* Get iswblank(). */
611+#if HAVE_WCTYPE_H
612+# include <wctype.h>
613+#endif
56ae3f82
SS
614+
615 #include "system.h"
616 #include "error.h"
1555d43c 617 #include "fadvise.h"
56ae3f82
SS
618 #include "quote.h"
619 #include "xstrndup.h"
620
621+/* MB_LEN_MAX is incorrectly defined to be 1 in at least one GCC
622+ installation; work around this configuration error. */
623+#if !defined MB_LEN_MAX || MB_LEN_MAX < 2
624+# define MB_LEN_MAX 16
625+#endif
626+
627+/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t. */
628+#if HAVE_MBRTOWC && defined mbstate_t
629+# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0)
630+#endif
631+
6987acf5 632 /* The official name of this program (e.g., no 'g' prefix). */
56ae3f82
SS
633 #define PROGRAM_NAME "expand"
634
fca5c2d6 635@@ -357,6 +379,142 @@ expand (void)
56ae3f82
SS
636 }
637 }
638
639+#if HAVE_MBRTOWC
640+static void
641+expand_multibyte (void)
642+{
643+ FILE *fp; /* Input stream. */
644+ mbstate_t i_state; /* Current shift state of the input stream. */
645+ mbstate_t i_state_bak; /* Back up the I_STATE. */
646+ mbstate_t o_state; /* Current shift state of the output stream. */
647+ char buf[MB_LEN_MAX + BUFSIZ]; /* For spooling a read byte sequence. */
3badd2da 648+ char *bufpos = buf; /* Next read position of BUF. */
56ae3f82
SS
649+ size_t buflen = 0; /* The length of the byte sequence in buf. */
650+ wchar_t wc; /* The wide character just read. */
651+ size_t mblength; /* The byte length of the multibyte sequence
652+ that was converted to WC. */
653+ int tab_index = 0; /* Index in `tab_list' of next tabstop. */
654+ int column = 0; /* Column on screen of the next char. */
655+ int next_tab_column; /* Column the next tab stop is on. */
656+ int convert = 1; /* If nonzero, perform translations. */
657+
658+ fp = next_file ((FILE *) NULL);
659+ if (fp == NULL)
660+ return;
661+
662+ memset (&o_state, '\0', sizeof(mbstate_t));
663+ memset (&i_state, '\0', sizeof(mbstate_t));
664+
665+ for (;;)
666+ {
667+ /* Refill the buffer BUF. */
668+ if (buflen < MB_LEN_MAX && !feof(fp) && !ferror(fp))
669+ {
670+ memmove (buf, bufpos, buflen);
671+ buflen += fread (buf + buflen, sizeof(char), BUFSIZ, fp);
672+ bufpos = buf;
673+ }
674+
675+ /* No character is left in BUF. */
676+ if (buflen < 1)
677+ {
678+ fp = next_file (fp);
679+
680+ if (fp == NULL)
681+ break; /* No more files. */
682+ else
683+ {
684+ memset (&i_state, '\0', sizeof(mbstate_t));
685+ continue;
686+ }
687+ }
688+
689+ /* Get a wide character. */
690+ i_state_bak = i_state;
691+ mblength = mbrtowc (&wc, bufpos, buflen, &i_state);
692+
693+ switch (mblength)
694+ {
695+ case (size_t)-1: /* illegal byte sequence. */
696+ case (size_t)-2:
697+ mblength = 1;
698+ i_state = i_state_bak;
699+ if (convert)
700+ {
701+ ++column;
effd5ec1 702+ if (convert_entire_line == 0 && !isblank(*bufpos))
56ae3f82
SS
703+ convert = 0;
704+ }
705+ putchar (*bufpos);
706+ break;
707+
708+ case 0: /* null. */
709+ mblength = 1;
710+ if (convert && convert_entire_line == 0)
711+ convert = 0;
712+ putchar ('\0');
713+ break;
714+
715+ default:
716+ if (wc == L'\n') /* LF. */
717+ {
718+ tab_index = 0;
719+ column = 0;
720+ convert = 1;
721+ putchar ('\n');
722+ }
723+ else if (wc == L'\t' && convert) /* Tab. */
724+ {
725+ if (tab_size == 0)
726+ {
727+ /* Do not let tab_index == first_free_tab;
728+ stop when it is 1 less. */
729+ while (tab_index < first_free_tab - 1
730+ && column >= tab_list[tab_index])
731+ tab_index++;
732+ next_tab_column = tab_list[tab_index];
733+ if (tab_index < first_free_tab - 1)
734+ tab_index++;
735+ if (column >= next_tab_column)
736+ next_tab_column = column + 1;
737+ }
738+ else
739+ next_tab_column = column + tab_size - column % tab_size;
740+
741+ while (column < next_tab_column)
742+ {
743+ putchar (' ');
744+ ++column;
745+ }
746+ }
747+ else /* Others. */
748+ {
749+ if (convert)
750+ {
751+ if (wc == L'\b')
752+ {
753+ if (column > 0)
754+ --column;
755+ }
756+ else
757+ {
758+ int width; /* The width of WC. */
759+
760+ width = wcwidth (wc);
761+ column += (width > 0) ? width : 0;
effd5ec1 762+ if (convert_entire_line == 0 && !iswblank(wc))
56ae3f82
SS
763+ convert = 0;
764+ }
765+ }
766+ fwrite (bufpos, sizeof(char), mblength, stdout);
767+ }
768+ }
769+ buflen -= mblength;
770+ bufpos += mblength;
771+ }
772+}
773+#endif
774+
775 int
776 main (int argc, char **argv)
777 {
fca5c2d6 778@@ -421,7 +579,12 @@ main (int argc, char **argv)
56ae3f82
SS
779
780 file_list = (optind < argc ? &argv[optind] : stdin_argv);
781
782- expand ();
783+#if HAVE_MBRTOWC
784+ if (MB_CUR_MAX > 1)
785+ expand_multibyte ();
786+ else
787+#endif
788+ expand ();
789
790 if (have_read_stdin && fclose (stdin) != 0)
791 error (EXIT_FAILURE, errno, "-");
fca5c2d6
SS
792diff -Naurp coreutils-8.25-orig/src/fold.c coreutils-8.25/src/fold.c
793--- coreutils-8.25-orig/src/fold.c 2016-01-01 07:48:50.000000000 -0600
794+++ coreutils-8.25/src/fold.c 2016-02-08 19:07:10.302944622 -0600
795@@ -22,11 +22,33 @@
56ae3f82
SS
796 #include <getopt.h>
797 #include <sys/types.h>
798
799+/* Get mbstate_t, mbrtowc(), wcwidth(). */
800+#if HAVE_WCHAR_H
801+# include <wchar.h>
802+#endif
803+
804+/* Get iswprint(), iswblank(), wcwidth(). */
805+#if HAVE_WCTYPE_H
806+# include <wctype.h>
807+#endif
808+
809 #include "system.h"
810 #include "error.h"
1555d43c 811 #include "fadvise.h"
fca5c2d6 812 #include "xdectoint.h"
56ae3f82
SS
813
814+/* MB_LEN_MAX is incorrectly defined to be 1 in at least one GCC
815+ installation; work around this configuration error. */
816+#if !defined MB_LEN_MAX || MB_LEN_MAX < 2
817+# undef MB_LEN_MAX
818+# define MB_LEN_MAX 16
819+#endif
820+
821+/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t. */
822+#if HAVE_MBRTOWC && defined mbstate_t
823+# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0)
824+#endif
825+
826 #define TAB_WIDTH 8
827
6987acf5 828 /* The official name of this program (e.g., no 'g' prefix). */
fca5c2d6 829@@ -34,20 +56,41 @@
56ae3f82
SS
830
831 #define AUTHORS proper_name ("David MacKenzie")
832
833+#define FATAL_ERROR(Message) \
834+ do \
835+ { \
836+ error (0, 0, (Message)); \
837+ usage (2); \
838+ } \
839+ while (0)
840+
841+enum operating_mode
842+{
843+ /* Fold texts by columns that are at the given positions. */
844+ column_mode,
845+
846+ /* Fold texts by bytes that are at the given positions. */
847+ byte_mode,
848+
849+ /* Fold texts by characters that are at the given positions. */
850+ character_mode,
851+};
852+
853+/* The current operating mode. (Default: column_mode) */
854+static enum operating_mode operating_mode;
855+
856 /* If nonzero, try to break on whitespace. */
857 static bool break_spaces;
858
859-/* If nonzero, count bytes, not column positions. */
860-static bool count_bytes;
861-
862 /* If nonzero, at least one of the files we read was standard input. */
863 static bool have_read_stdin;
864
865-static char const shortopts[] = "bsw:0::1::2::3::4::5::6::7::8::9::";
866+static char const shortopts[] = "bcsw:0::1::2::3::4::5::6::7::8::9::";
867
868 static struct option const longopts[] =
869 {
870 {"bytes", no_argument, NULL, 'b'},
871+ {"characters", no_argument, NULL, 'c'},
872 {"spaces", no_argument, NULL, 's'},
873 {"width", required_argument, NULL, 'w'},
874 {GETOPT_HELP_OPTION_DECL},
fca5c2d6 875@@ -75,6 +118,7 @@ Wrap input lines in each FILE, writing t
e5317bd9 876
56ae3f82
SS
877 fputs (_("\
878 -b, --bytes count bytes rather than columns\n\
879+ -c, --characters count characters rather than columns\n\
880 -s, --spaces break at spaces\n\
881 -w, --width=WIDTH use WIDTH columns instead of 80\n\
882 "), stdout);
fca5c2d6 883@@ -92,7 +136,7 @@ Wrap input lines in each FILE, writing t
56ae3f82
SS
884 static size_t
885 adjust_column (size_t column, char c)
886 {
887- if (!count_bytes)
888+ if (operating_mode != byte_mode)
889 {
890 if (c == '\b')
891 {
fca5c2d6 892@@ -115,30 +159,14 @@ adjust_column (size_t column, char c)
56ae3f82
SS
893 to stdout, with maximum line length WIDTH.
894 Return true if successful. */
895
896-static bool
897-fold_file (char const *filename, size_t width)
898+static void
899+fold_text (FILE *istream, size_t width, int *saved_errno)
900 {
901- FILE *istream;
902 int c;
903 size_t column = 0; /* Screen column where next char will go. */
6987acf5 904 size_t offset_out = 0; /* Index in 'line_out' for next char. */
56ae3f82
SS
905 static char *line_out = NULL;
906 static size_t allocated_out = 0;
907- int saved_errno;
908-
909- if (STREQ (filename, "-"))
910- {
911- istream = stdin;
912- have_read_stdin = true;
913- }
914- else
915- istream = fopen (filename, "r");
916-
917- if (istream == NULL)
918- {
fca5c2d6 919- error (0, errno, "%s", quotef (filename));
56ae3f82
SS
920- return false;
921- }
922
1555d43c
SS
923 fadvise (istream, FADVISE_SEQUENTIAL);
924
fca5c2d6 925@@ -168,6 +196,15 @@ fold_file (char const *filename, size_t
56ae3f82
SS
926 bool found_blank = false;
927 size_t logical_end = offset_out;
928
929+ /* If LINE_OUT is still empty,
930+ store the new character in LINE_OUT anyway,
931+ even though column is bigger than width. */
932+ if (offset_out == 0)
933+ {
934+ line_out[offset_out++] = c;
935+ continue;
936+ }
937+
938 /* Look for the last blank. */
939 while (logical_end)
940 {
fca5c2d6 941@@ -214,11 +251,221 @@ fold_file (char const *filename, size_t
56ae3f82
SS
942 line_out[offset_out++] = c;
943 }
944
945- saved_errno = errno;
946+ *saved_errno = errno;
fbb9790b
SS
947+
948+ if (offset_out)
949+ fwrite (line_out, sizeof (char), (size_t) offset_out, stdout);
950+
56ae3f82
SS
951+}
952+
953+#if HAVE_MBRTOWC
954+static void
955+fold_multibyte_text (FILE *istream, size_t width, int *saved_errno)
956+{
957+ char buf[MB_LEN_MAX + BUFSIZ]; /* For spooling a read byte sequence. */
958+ size_t buflen = 0; /* The length of the byte sequence in buf. */
3badd2da 959+ char *bufpos = buf; /* Next read position of BUF. */
56ae3f82
SS
960+ wint_t wc; /* The wide character just read. */
961+ size_t mblength; /* The byte length of the multibyte sequence that
962+ was converted to WC. */
963+ mbstate_t state, state_bak; /* State of the stream. */
3badd2da 964+ int convfail = 0; /* Nonzero when a conversion has failed. Otherwise 0. */
56ae3f82
SS
965+
966+ static char *line_out = NULL;
967+ size_t offset_out = 0; /* Index in `line_out' for next char. */
968+ static size_t allocated_out = 0;
969+
970+ int increment;
971+ size_t column = 0;
972+
973+ size_t last_blank_pos;
974+ size_t last_blank_column;
975+ int is_blank_seen;
976+ int last_blank_increment = 0;
977+ int is_bs_following_last_blank;
978+ size_t bs_following_last_blank_num;
979+ int is_cr_after_last_blank;
980+
981+#define CLEAR_FLAGS \
982+ do \
983+ { \
984+ last_blank_pos = 0; \
985+ last_blank_column = 0; \
986+ is_blank_seen = 0; \
987+ is_bs_following_last_blank = 0; \
988+ bs_following_last_blank_num = 0; \
989+ is_cr_after_last_blank = 0; \
990+ } \
991+ while (0)
992+
993+#define START_NEW_LINE \
994+ do \
995+ { \
996+ putchar ('\n'); \
997+ column = 0; \
998+ offset_out = 0; \
999+ CLEAR_FLAGS; \
1000+ } \
1001+ while (0)
1002+
1003+ CLEAR_FLAGS;
1004+ memset (&state, '\0', sizeof(mbstate_t));
1005+
1006+ for (;; bufpos += mblength, buflen -= mblength)
1007+ {
1008+ if (buflen < MB_LEN_MAX && !feof (istream) && !ferror (istream))
1009+ {
1010+ memmove (buf, bufpos, buflen);
1011+ buflen += fread (buf + buflen, sizeof(char), BUFSIZ, istream);
1012+ bufpos = buf;
1013+ }
1014+
1015+ if (buflen < 1)
1016+ break;
1017+
1018+ /* Get a wide character. */
56ae3f82
SS
1019+ state_bak = state;
1020+ mblength = mbrtowc ((wchar_t *)&wc, bufpos, buflen, &state);
1021+
1022+ switch (mblength)
1023+ {
1024+ case (size_t)-1:
1025+ case (size_t)-2:
1026+ convfail++;
1027+ state = state_bak;
1028+ /* Fall through. */
1029+
1030+ case 0:
1031+ mblength = 1;
1032+ break;
1033+ }
1034+
1035+rescan:
1036+ if (operating_mode == byte_mode) /* byte mode */
1037+ increment = mblength;
1038+ else if (operating_mode == character_mode) /* character mode */
1039+ increment = 1;
1040+ else /* column mode */
1041+ {
1042+ if (convfail)
1043+ increment = 1;
1044+ else
1045+ {
1046+ switch (wc)
1047+ {
1048+ case L'\n':
1049+ fwrite (line_out, sizeof(char), offset_out, stdout);
1050+ START_NEW_LINE;
1051+ continue;
fca5c2d6 1052+
56ae3f82
SS
1053+ case L'\b':
1054+ increment = (column > 0) ? -1 : 0;
1055+ break;
1056+
1057+ case L'\r':
1058+ increment = -1 * column;
1059+ break;
1060+
1061+ case L'\t':
1062+ increment = 8 - column % 8;
1063+ break;
1064+
1065+ default:
1066+ increment = wcwidth (wc);
1067+ increment = (increment < 0) ? 0 : increment;
1068+ }
1069+ }
1070+ }
1071+
1072+ if (column + increment > width && break_spaces && last_blank_pos)
1073+ {
1074+ fwrite (line_out, sizeof(char), last_blank_pos, stdout);
1075+ putchar ('\n');
1076+
1077+ offset_out = offset_out - last_blank_pos;
1078+ column = column - last_blank_column + ((is_cr_after_last_blank)
1079+ ? last_blank_increment : bs_following_last_blank_num);
1080+ memmove (line_out, line_out + last_blank_pos, offset_out);
1081+ CLEAR_FLAGS;
1082+ goto rescan;
1083+ }
1084+
1085+ if (column + increment > width && column != 0)
1086+ {
1087+ fwrite (line_out, sizeof(char), offset_out, stdout);
1088+ START_NEW_LINE;
1089+ goto rescan;
1090+ }
1091+
1092+ if (allocated_out < offset_out + mblength)
1093+ {
1094+ line_out = X2REALLOC (line_out, &allocated_out);
1095+ }
1096+
1097+ memcpy (line_out + offset_out, bufpos, mblength);
1098+ offset_out += mblength;
1099+ column += increment;
1100+
1101+ if (is_blank_seen && !convfail && wc == L'\r')
1102+ is_cr_after_last_blank = 1;
1103+
1104+ if (is_bs_following_last_blank && !convfail && wc == L'\b')
1105+ ++bs_following_last_blank_num;
1106+ else
1107+ is_bs_following_last_blank = 0;
1108+
1109+ if (break_spaces && !convfail && iswblank (wc))
1110+ {
1111+ last_blank_pos = offset_out;
1112+ last_blank_column = column;
1113+ is_blank_seen = 1;
1114+ last_blank_increment = increment;
1115+ is_bs_following_last_blank = 1;
1116+ bs_following_last_blank_num = 0;
1117+ is_cr_after_last_blank = 0;
1118+ }
1119+ }
1120+
1121+ *saved_errno = errno;
fbb9790b
SS
1122
1123 if (offset_out)
1124 fwrite (line_out, sizeof (char), (size_t) offset_out, stdout);
1125
56ae3f82
SS
1126+}
1127+#endif
1128+
1129+/* Fold file FILENAME, or standard input if FILENAME is "-",
1130+ to stdout, with maximum line length WIDTH.
1131+ Return 0 if successful, 1 if an error occurs. */
1132+
1133+static bool
fbb9790b 1134+fold_file (char const *filename, size_t width)
56ae3f82
SS
1135+{
1136+ FILE *istream;
1137+ int saved_errno;
1138+
1139+ if (STREQ (filename, "-"))
1140+ {
1141+ istream = stdin;
1142+ have_read_stdin = 1;
1143+ }
1144+ else
1145+ istream = fopen (filename, "r");
1146+
1147+ if (istream == NULL)
1148+ {
fca5c2d6 1149+ error (0, errno, "%s", quotef (filename));
56ae3f82
SS
1150+ return 1;
1151+ }
1152+
1153+ /* Define how ISTREAM is being folded. */
1154+#if HAVE_MBRTOWC
1155+ if (MB_CUR_MAX > 1)
1156+ fold_multibyte_text (istream, width, &saved_errno);
1157+ else
1158+#endif
1159+ fold_text (istream, width, &saved_errno);
1160+
1161 if (ferror (istream))
1162 {
fca5c2d6
SS
1163 error (0, saved_errno, "%s", quotef (filename));
1164@@ -251,7 +498,8 @@ main (int argc, char **argv)
56ae3f82
SS
1165
1166 atexit (close_stdout);
1167
1168- break_spaces = count_bytes = have_read_stdin = false;
1169+ operating_mode = column_mode;
1170+ break_spaces = have_read_stdin = false;
1171
1172 while ((optc = getopt_long (argc, argv, shortopts, longopts, NULL)) != -1)
1173 {
fca5c2d6 1174@@ -260,7 +508,15 @@ main (int argc, char **argv)
56ae3f82
SS
1175 switch (optc)
1176 {
1177 case 'b': /* Count bytes rather than columns. */
1178- count_bytes = true;
1179+ if (operating_mode != column_mode)
1180+ FATAL_ERROR (_("only one way of folding may be specified"));
1181+ operating_mode = byte_mode;
1182+ break;
1183+
1184+ case 'c':
1185+ if (operating_mode != column_mode)
1186+ FATAL_ERROR (_("only one way of folding may be specified"));
1187+ operating_mode = character_mode;
1188 break;
1189
1190 case 's': /* Break at word boundaries. */
fca5c2d6
SS
1191diff -Naurp coreutils-8.25-orig/src/join.c coreutils-8.25/src/join.c
1192--- coreutils-8.25-orig/src/join.c 2016-01-13 05:08:59.000000000 -0600
1193+++ coreutils-8.25/src/join.c 2016-02-08 19:07:10.303944625 -0600
1555d43c 1194@@ -22,18 +22,32 @@
56ae3f82
SS
1195 #include <sys/types.h>
1196 #include <getopt.h>
1197
1198+/* Get mbstate_t, mbrtowc(), wcwidth(). */
1199+#if HAVE_WCHAR_H
1200+# include <wchar.h>
1201+#endif
1202+
1203+/* Get iswblank(), towupper. */
1204+#if HAVE_WCTYPE_H
1205+# include <wctype.h>
1206+#endif
1207+
1208 #include "system.h"
1209 #include "error.h"
1555d43c 1210 #include "fadvise.h"
56ae3f82
SS
1211 #include "hard-locale.h"
1212 #include "linebuffer.h"
1213-#include "memcasecmp.h"
1214 #include "quote.h"
1215 #include "stdio--.h"
1216 #include "xmemcoll.h"
1217 #include "xstrtol.h"
1218 #include "argmatch.h"
1219
1220+/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t. */
1221+#if HAVE_MBRTOWC && defined mbstate_t
1222+# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0)
1223+#endif
1224+
6987acf5 1225 /* The official name of this program (e.g., no 'g' prefix). */
56ae3f82
SS
1226 #define PROGRAM_NAME "join"
1227
fca5c2d6 1228@@ -135,10 +149,12 @@ static struct outlist outlist_head;
6987acf5 1229 /* Last element in 'outlist', where a new element can be added. */
56ae3f82
SS
1230 static struct outlist *outlist_end = &outlist_head;
1231
1232-/* Tab character separating fields. If negative, fields are separated
1233- by any nonempty string of blanks, otherwise by exactly one
1234- tab character whose value (when cast to unsigned char) equals TAB. */
1235-static int tab = -1;
1236+/* Tab character separating fields. If NULL, fields are separated
1237+ by any nonempty string of blanks. */
1238+static char *tab = NULL;
1239+
1240+/* The number of bytes used for tab. */
1241+static size_t tablen = 0;
1242
1243 /* If nonzero, check that the input is correctly ordered. */
1244 static enum
fca5c2d6 1245@@ -275,13 +291,14 @@ xfields (struct line *line)
56ae3f82
SS
1246 if (ptr == lim)
1247 return;
1248
1555d43c 1249- if (0 <= tab && tab != '\n')
56ae3f82
SS
1250+ if (tab != NULL)
1251 {
1252+ unsigned char t = tab[0];
1253 char *sep;
1254- for (; (sep = memchr (ptr, tab, lim - ptr)) != NULL; ptr = sep + 1)
1255+ for (; (sep = memchr (ptr, t, lim - ptr)) != NULL; ptr = sep + 1)
1256 extract_field (line, ptr, sep - ptr);
1257 }
1555d43c
SS
1258- else if (tab < 0)
1259+ else
1260 {
1261 /* Skip leading blanks before the first field. */
fca5c2d6
SS
1262 while (field_sep (*ptr))
1263@@ -305,6 +322,147 @@ xfields (struct line *line)
56ae3f82
SS
1264 extract_field (line, ptr, lim - ptr);
1265 }
1266
1267+#if HAVE_MBRTOWC
1268+static void
1269+xfields_multibyte (struct line *line)
1270+{
1271+ char *ptr = line->buf.buffer;
1272+ char const *lim = ptr + line->buf.length - 1;
1273+ wchar_t wc = 0;
1274+ size_t mblength = 1;
1275+ mbstate_t state, state_bak;
1276+
1277+ memset (&state, 0, sizeof (mbstate_t));
1278+
1279+ if (ptr >= lim)
1280+ return;
1281+
1282+ if (tab != NULL)
1283+ {
56ae3f82
SS
1284+ char *sep = ptr;
1285+ for (; ptr < lim; ptr = sep + mblength)
1286+ {
1287+ sep = ptr;
1288+ while (sep < lim)
1289+ {
1290+ state_bak = state;
1291+ mblength = mbrtowc (&wc, sep, lim - sep + 1, &state);
1292+
1293+ if (mblength == (size_t)-1 || mblength == (size_t)-2)
1294+ {
1295+ mblength = 1;
1296+ state = state_bak;
1297+ }
1298+ mblength = (mblength < 1) ? 1 : mblength;
1299+
1300+ if (mblength == tablen && !memcmp (sep, tab, mblength))
1301+ break;
1302+ else
1303+ {
1304+ sep += mblength;
1305+ continue;
1306+ }
1307+ }
1308+
1309+ if (sep >= lim)
1310+ break;
1311+
1312+ extract_field (line, ptr, sep - ptr);
1313+ }
1314+ }
1315+ else
1316+ {
1317+ /* Skip leading blanks before the first field. */
1318+ while(ptr < lim)
1319+ {
1320+ state_bak = state;
1321+ mblength = mbrtowc (&wc, ptr, lim - ptr + 1, &state);
1322+
1323+ if (mblength == (size_t)-1 || mblength == (size_t)-2)
1324+ {
1325+ mblength = 1;
1326+ state = state_bak;
1327+ break;
1328+ }
1329+ mblength = (mblength < 1) ? 1 : mblength;
1330+
fca5c2d6 1331+ if (!iswblank(wc) && wc != '\n')
56ae3f82
SS
1332+ break;
1333+ ptr += mblength;
1334+ }
1335+
1336+ do
1337+ {
1338+ char *sep;
1339+ state_bak = state;
1340+ mblength = mbrtowc (&wc, ptr, lim - ptr + 1, &state);
1341+ if (mblength == (size_t)-1 || mblength == (size_t)-2)
1342+ {
1343+ mblength = 1;
1344+ state = state_bak;
1345+ break;
1346+ }
1347+ mblength = (mblength < 1) ? 1 : mblength;
1348+
1349+ sep = ptr + mblength;
1350+ while (sep < lim)
1351+ {
1352+ state_bak = state;
1353+ mblength = mbrtowc (&wc, sep, lim - sep + 1, &state);
1354+ if (mblength == (size_t)-1 || mblength == (size_t)-2)
1355+ {
1356+ mblength = 1;
1357+ state = state_bak;
1358+ break;
1359+ }
1360+ mblength = (mblength < 1) ? 1 : mblength;
1361+
fca5c2d6 1362+ if (iswblank (wc) || wc == '\n')
56ae3f82
SS
1363+ break;
1364+
1365+ sep += mblength;
1366+ }
1367+
1368+ extract_field (line, ptr, sep - ptr);
1369+ if (sep >= lim)
1370+ return;
1371+
1372+ state_bak = state;
1373+ mblength = mbrtowc (&wc, sep, lim - sep + 1, &state);
1374+ if (mblength == (size_t)-1 || mblength == (size_t)-2)
1375+ {
1376+ mblength = 1;
1377+ state = state_bak;
1378+ break;
1379+ }
1380+ mblength = (mblength < 1) ? 1 : mblength;
1381+
1382+ ptr = sep + mblength;
1383+ while (ptr < lim)
1384+ {
1385+ state_bak = state;
1386+ mblength = mbrtowc (&wc, ptr, lim - ptr + 1, &state);
1387+ if (mblength == (size_t)-1 || mblength == (size_t)-2)
1388+ {
1389+ mblength = 1;
1390+ state = state_bak;
1391+ break;
1392+ }
1393+ mblength = (mblength < 1) ? 1 : mblength;
1394+
fca5c2d6 1395+ if (!iswblank (wc) && wc != '\n')
56ae3f82
SS
1396+ break;
1397+
1398+ ptr += mblength;
1399+ }
1400+ }
1401+ while (ptr < lim);
1402+ }
1403+
1404+ extract_field (line, ptr, lim - ptr);
1405+}
1406+#endif
fca5c2d6 1407+
56ae3f82
SS
1408 static void
1409 freeline (struct line *line)
1410 {
fca5c2d6 1411@@ -326,56 +484,133 @@ keycmp (struct line const *line1, struct
56ae3f82
SS
1412 size_t jf_1, size_t jf_2)
1413 {
1414 /* Start of field to compare in each file. */
1415- char *beg1;
1416- char *beg2;
1417-
1418- size_t len1;
1419- size_t len2; /* Length of fields to compare. */
1420+ char *beg[2];
1421+ char *copy[2];
1422+ size_t len[2]; /* Length of fields to compare. */
1423 int diff;
1424+ int i, j;
e5317bd9 1425+ int mallocd = 0;
56ae3f82
SS
1426
1427 if (jf_1 < line1->nfields)
1428 {
1429- beg1 = line1->fields[jf_1].beg;
1430- len1 = line1->fields[jf_1].len;
1431+ beg[0] = line1->fields[jf_1].beg;
1432+ len[0] = line1->fields[jf_1].len;
1433 }
1434 else
1435 {
1436- beg1 = NULL;
1437- len1 = 0;
1438+ beg[0] = NULL;
1439+ len[0] = 0;
1440 }
1441
1442 if (jf_2 < line2->nfields)
1443 {
1444- beg2 = line2->fields[jf_2].beg;
1445- len2 = line2->fields[jf_2].len;
1446+ beg[1] = line2->fields[jf_2].beg;
1447+ len[1] = line2->fields[jf_2].len;
1448 }
1449 else
1450 {
1451- beg2 = NULL;
1452- len2 = 0;
1453+ beg[1] = NULL;
1454+ len[1] = 0;
1455 }
1456
1457- if (len1 == 0)
1458- return len2 == 0 ? 0 : -1;
1459- if (len2 == 0)
1460+ if (len[0] == 0)
1461+ return len[1] == 0 ? 0 : -1;
1462+ if (len[1] == 0)
1463 return 1;
1464
1465 if (ignore_case)
1466 {
1467- /* FIXME: ignore_case does not work with NLS (in particular,
1468- with multibyte chars). */
1469- diff = memcasecmp (beg1, beg2, MIN (len1, len2));
1470+#ifdef HAVE_MBRTOWC
1471+ if (MB_CUR_MAX > 1)
1472+ {
1473+ size_t mblength;
1474+ wchar_t wc, uwc;
1475+ mbstate_t state, state_bak;
1476+
1477+ memset (&state, '\0', sizeof (mbstate_t));
1478+
1479+ for (i = 0; i < 2; i++)
1480+ {
e5317bd9
SS
1481+ mallocd = 1;
1482+ copy[i] = xmalloc (len[i] + 1);
fbb9790b 1483+ memset (copy[i], '\0',len[i] + 1);
56ae3f82
SS
1484+
1485+ for (j = 0; j < MIN (len[0], len[1]);)
1486+ {
1487+ state_bak = state;
1488+ mblength = mbrtowc (&wc, beg[i] + j, len[i] - j, &state);
1489+
1490+ switch (mblength)
1491+ {
1492+ case (size_t) -1:
1493+ case (size_t) -2:
1494+ state = state_bak;
1495+ /* Fall through */
1496+ case 0:
1497+ mblength = 1;
1498+ break;
1499+
1500+ default:
1501+ uwc = towupper (wc);
1502+
1503+ if (uwc != wc)
1504+ {
1505+ mbstate_t state_wc;
fbb9790b 1506+ size_t mblen;
56ae3f82
SS
1507+
1508+ memset (&state_wc, '\0', sizeof (mbstate_t));
fbb9790b
SS
1509+ mblen = wcrtomb (copy[i] + j, uwc, &state_wc);
1510+ assert (mblen != (size_t)-1);
56ae3f82
SS
1511+ }
1512+ else
1513+ memcpy (copy[i] + j, beg[i] + j, mblength);
1514+ }
1515+ j += mblength;
1516+ }
1517+ copy[i][j] = '\0';
1518+ }
1519+ }
1520+ else
1521+#endif
1522+ {
1523+ for (i = 0; i < 2; i++)
1524+ {
e5317bd9
SS
1525+ mallocd = 1;
1526+ copy[i] = xmalloc (len[i] + 1);
56ae3f82
SS
1527+
1528+ for (j = 0; j < MIN (len[0], len[1]); j++)
1529+ copy[i][j] = toupper (beg[i][j]);
1530+
1531+ copy[i][j] = '\0';
1532+ }
1533+ }
1534 }
1535 else
1536 {
1537- if (hard_LC_COLLATE)
1538- return xmemcoll (beg1, len1, beg2, len2);
1539- diff = memcmp (beg1, beg2, MIN (len1, len2));
fbb9790b
SS
1540+ copy[0] = beg[0];
1541+ copy[1] = beg[1];
1542+ }
1543+
56ae3f82 1544+ if (hard_LC_COLLATE)
e5317bd9
SS
1545+ {
1546+ diff = xmemcoll ((char *) copy[0], len[0], (char *) copy[1], len[1]);
1547+
1548+ if (mallocd)
1549+ for (i = 0; i < 2; i++)
1550+ free (copy[i]);
1551+
1552+ return diff;
fbb9790b 1553 }
56ae3f82
SS
1554+ diff = memcmp (copy[0], copy[1], MIN (len[0], len[1]));
1555+
e5317bd9
SS
1556+ if (mallocd)
1557+ for (i = 0; i < 2; i++)
1558+ free (copy[i]);
1559+
fbb9790b 1560
56ae3f82
SS
1561 if (diff)
1562 return diff;
1563- return len1 < len2 ? -1 : len1 != len2;
1564+ return len[0] - len[1];
1565 }
1566
1567 /* Check that successive input lines PREV and CURRENT from input file
fca5c2d6 1568@@ -467,6 +702,11 @@ get_line (FILE *fp, struct line **linep,
56ae3f82 1569 }
e7f6ab54 1570 ++line_no[which - 1];
56ae3f82
SS
1571
1572+#if HAVE_MBRTOWC
1573+ if (MB_CUR_MAX > 1)
1574+ xfields_multibyte (line);
1575+ else
1576+#endif
1577 xfields (line);
1578
1579 if (prevline[which - 1])
fca5c2d6 1580@@ -566,21 +806,28 @@ prfield (size_t n, struct line const *li
56ae3f82 1581
3badd2da 1582 /* Output all the fields in line, other than the join field. */
56ae3f82
SS
1583
1584+#define PUT_TAB_CHAR \
1585+ do \
1586+ { \
1587+ (tab != NULL) ? \
1588+ fwrite(tab, sizeof(char), tablen, stdout) : putchar (' '); \
1589+ } \
3badd2da 1590+ while (0)
56ae3f82
SS
1591+
1592 static void
3badd2da
SS
1593 prfields (struct line const *line, size_t join_field, size_t autocount)
1594 {
1595 size_t i;
1596 size_t nfields = autoformat ? autocount : line->nfields;
1597- char output_separator = tab < 0 ? ' ' : tab;
1598
1599 for (i = 0; i < join_field && i < nfields; ++i)
1600 {
1601- putchar (output_separator);
1602+ PUT_TAB_CHAR;
1603 prfield (i, line);
1604 }
1605 for (i = join_field + 1; i < nfields; ++i)
1606 {
1607- putchar (output_separator);
1608+ PUT_TAB_CHAR;
1609 prfield (i, line);
1610 }
1611 }
fca5c2d6 1612@@ -591,7 +838,6 @@ static void
56ae3f82
SS
1613 prjoin (struct line const *line1, struct line const *line2)
1614 {
1615 const struct outlist *outlist;
1616- char output_separator = tab < 0 ? ' ' : tab;
3badd2da
SS
1617 size_t field;
1618 struct line const *line;
56ae3f82 1619
fca5c2d6 1620@@ -625,7 +871,7 @@ prjoin (struct line const *line1, struct
56ae3f82
SS
1621 o = o->next;
1622 if (o == NULL)
1623 break;
1624- putchar (output_separator);
1625+ PUT_TAB_CHAR;
1626 }
fbb9790b 1627 putchar (eolchar);
56ae3f82 1628 }
fca5c2d6 1629@@ -1103,21 +1349,46 @@ main (int argc, char **argv)
56ae3f82
SS
1630
1631 case 't':
1632 {
1633- unsigned char newtab = optarg[0];
e7f6ab54 1634+ char *newtab = NULL;
56ae3f82
SS
1635+ size_t newtablen;
1636+ newtab = xstrdup (optarg);
1637+#if HAVE_MBRTOWC
1638+ if (MB_CUR_MAX > 1)
1639+ {
1640+ mbstate_t state;
1641+
1642+ memset (&state, 0, sizeof (mbstate_t));
1643+ newtablen = mbrtowc (NULL, newtab,
1644+ strnlen (newtab, MB_LEN_MAX),
1645+ &state);
1646+ if (newtablen == (size_t) 0
1647+ || newtablen == (size_t) -1
1648+ || newtablen == (size_t) -2)
1649+ newtablen = 1;
1650+ }
1651+ else
1652+#endif
1653+ newtablen = 1;
1654 if (! newtab)
e7f6ab54 1655- newtab = '\n'; /* '' => process the whole line. */
6987acf5 1656+ {
fbb9790b 1657+ newtab = (char*)"\n"; /* '' => process the whole line. */
56ae3f82
SS
1658+ }
1659 else if (optarg[1])
1660 {
1661- if (STREQ (optarg, "\\0"))
1662- newtab = '\0';
1663- else
1664- error (EXIT_FAILURE, 0, _("multi-character tab %s"),
1665- quote (optarg));
1666+ if (newtablen == 1 && newtab[1])
1667+ {
1668+ if (STREQ (newtab, "\\0"))
1669+ newtab[0] = '\0';
1670+ }
1671+ }
1672+ if (tab != NULL && strcmp (tab, newtab))
1673+ {
1674+ free (newtab);
1675+ error (EXIT_FAILURE, 0, _("incompatible tabs"));
1676 }
1677- if (0 <= tab && tab != newtab)
1678- error (EXIT_FAILURE, 0, _("incompatible tabs"));
1679 tab = newtab;
1680- }
1681+ tablen = newtablen;
1682+ }
1683 break;
1684
fbb9790b 1685 case 'z':
fca5c2d6
SS
1686diff -Naurp coreutils-8.25-orig/src/pr.c coreutils-8.25/src/pr.c
1687--- coreutils-8.25-orig/src/pr.c 2016-01-01 07:48:50.000000000 -0600
1688+++ coreutils-8.25/src/pr.c 2016-02-08 19:07:10.306944635 -0600
1689@@ -311,6 +311,24 @@
56ae3f82
SS
1690
1691 #include <getopt.h>
1692 #include <sys/types.h>
1693+
1694+/* Get MB_LEN_MAX. */
1695+#include <limits.h>
1696+/* MB_LEN_MAX is incorrectly defined to be 1 in at least one GCC
1697+ installation; work around this configuration error. */
1698+#if !defined MB_LEN_MAX || MB_LEN_MAX == 1
1699+# define MB_LEN_MAX 16
1700+#endif
1701+
1702+/* Get MB_CUR_MAX. */
1703+#include <stdlib.h>
1704+
1705+/* Solaris 2.5 has a bug: <wchar.h> must be included before <wctype.h>. */
1706+/* Get mbstate_t, mbrtowc(), wcwidth(). */
1707+#if HAVE_WCHAR_H
1708+# include <wchar.h>
1709+#endif
56ae3f82
SS
1710+
1711 #include "system.h"
1712 #include "error.h"
1555d43c 1713 #include "fadvise.h"
fbb9790b 1714@@ -323,6 +341,18 @@
56ae3f82 1715 #include "xstrtol.h"
fca5c2d6 1716 #include "xdectoint.h"
56ae3f82
SS
1717
1718+/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t. */
1719+#if HAVE_MBRTOWC && defined mbstate_t
1720+# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0)
1721+#endif
1722+
1723+#ifndef HAVE_DECL_WCWIDTH
1724+"this configure-time declaration test was not run"
1725+#endif
1726+#if !HAVE_DECL_WCWIDTH
1727+extern int wcwidth ();
1728+#endif
1729+
6987acf5 1730 /* The official name of this program (e.g., no 'g' prefix). */
56ae3f82
SS
1731 #define PROGRAM_NAME "pr"
1732
fca5c2d6 1733@@ -415,7 +445,20 @@ struct COLUMN
56ae3f82
SS
1734
1735 typedef struct COLUMN COLUMN;
1736
1737-static int char_to_clump (char c);
1738+/* Function pointers to switch functions for single byte locale or for
1739+ multibyte locale. If multibyte functions do not exist in your system,
1740+ these pointers always point to the functions for single byte locale. */
1741+static void (*print_char) (char c);
1742+static int (*char_to_clump) (char c);
1743+
1744+/* Functions for single byte locale. */
1745+static void print_char_single (char c);
1746+static int char_to_clump_single (char c);
1747+
1748+/* Functions for multibyte locale. */
1749+static void print_char_multi (char c);
1750+static int char_to_clump_multi (char c);
1751+
1752 static bool read_line (COLUMN *p);
1753 static bool print_page (void);
1754 static bool print_stored (COLUMN *p);
fca5c2d6
SS
1755@@ -427,6 +470,7 @@ static void add_line_number (COLUMN *p);
1756 static void getoptnum (const char *n_str, int min, int *num,
1757 const char *errfmt);
56ae3f82
SS
1758 static void getoptarg (char *arg, char switch_char, char *character,
1759+ int *character_length, int *character_width,
1760 int *number);
56ae3f82 1761 static void print_files (int number_of_files, char **av);
fa4603be 1762 static void init_parameters (int number_of_files);
fca5c2d6
SS
1763@@ -440,7 +484,6 @@ static void store_char (char c);
1764 static void pad_down (unsigned int lines);
56ae3f82
SS
1765 static void read_rest_of_line (COLUMN *p);
1766 static void skip_read (COLUMN *p, int column_number);
1767-static void print_char (char c);
1768 static void cleanup (void);
1769 static void print_sep_string (void);
1770 static void separator_string (const char *optarg_S);
fca5c2d6 1771@@ -452,7 +495,7 @@ static COLUMN *column_vector;
56ae3f82
SS
1772 we store the leftmost columns contiguously in buff.
1773 To print a line from buff, get the index of the first character
1774 from line_vector[i], and print up to line_vector[i + 1]. */
1775-static char *buff;
1776+static unsigned char *buff;
1777
1778 /* Index of the position in buff where the next character
1779 will be stored. */
fca5c2d6 1780@@ -556,7 +599,7 @@ static int chars_per_column;
56ae3f82
SS
1781 static bool untabify_input = false;
1782
1783 /* (-e) The input tab character. */
1784-static char input_tab_char = '\t';
1785+static char input_tab_char[MB_LEN_MAX] = "\t";
1786
1787 /* (-e) Tabstops are at chars_per_tab, 2*chars_per_tab, 3*chars_per_tab, ...
1788 where the leftmost column is 1. */
fca5c2d6 1789@@ -566,7 +609,10 @@ static int chars_per_input_tab = 8;
56ae3f82
SS
1790 static bool tabify_output = false;
1791
1792 /* (-i) The output tab character. */
1793-static char output_tab_char = '\t';
1794+static char output_tab_char[MB_LEN_MAX] = "\t";
1795+
1796+/* (-i) The byte length of output tab character. */
1797+static int output_tab_char_length = 1;
1798
1799 /* (-i) The width of the output tab. */
1800 static int chars_per_output_tab = 8;
fca5c2d6 1801@@ -636,7 +682,13 @@ static int line_number;
56ae3f82
SS
1802 static bool numbered_lines = false;
1803
1804 /* (-n) Character which follows each line number. */
1805-static char number_separator = '\t';
1806+static char number_separator[MB_LEN_MAX] = "\t";
1807+
1808+/* (-n) The byte length of the character which follows each line number. */
1809+static int number_separator_length = 1;
1810+
1811+/* (-n) The character width of the character which follows each line number. */
1812+static int number_separator_width = 0;
1813
1814 /* (-n) line counting starts with 1st line of input file (not with 1st
1815 line of 1st page printed). */
fca5c2d6 1816@@ -689,6 +741,7 @@ static bool use_col_separator = false;
6987acf5 1817 -a|COLUMN|-m is a 'space' and with the -J option a 'tab'. */
56ae3f82
SS
1818 static char *col_sep_string = (char *) "";
1819 static int col_sep_length = 0;
1820+static int col_sep_width = 0;
1821 static char *column_separator = (char *) " ";
1822 static char *line_separator = (char *) "\t";
1823
fca5c2d6 1824@@ -839,6 +892,13 @@ separator_string (const char *optarg_S)
56ae3f82
SS
1825 col_sep_length = (int) strlen (optarg_S);
1826 col_sep_string = xmalloc (col_sep_length + 1);
1827 strcpy (col_sep_string, optarg_S);
1828+
1829+#if HAVE_MBRTOWC
1830+ if (MB_CUR_MAX > 1)
1831+ col_sep_width = mbswidth (col_sep_string, 0);
1832+ else
1833+#endif
1834+ col_sep_width = col_sep_length;
1835 }
1836
1837 int
fca5c2d6 1838@@ -863,6 +923,21 @@ main (int argc, char **argv)
56ae3f82
SS
1839
1840 atexit (close_stdout);
1841
1842+/* Define which functions are used, the ones for single byte locale or the ones
1843+ for multibyte locale. */
1844+#if HAVE_MBRTOWC
1845+ if (MB_CUR_MAX > 1)
1846+ {
1847+ print_char = print_char_multi;
1848+ char_to_clump = char_to_clump_multi;
1849+ }
1850+ else
1851+#endif
1852+ {
1853+ print_char = print_char_single;
1854+ char_to_clump = char_to_clump_single;
1855+ }
1856+
1857 n_files = 0;
1858 file_names = (argc > 1
1859 ? xmalloc ((argc - 1) * sizeof (char *))
fca5c2d6 1860@@ -939,8 +1014,12 @@ main (int argc, char **argv)
56ae3f82
SS
1861 break;
1862 case 'e':
1863 if (optarg)
1864- getoptarg (optarg, 'e', &input_tab_char,
1865- &chars_per_input_tab);
1866+ {
1867+ int dummy_length, dummy_width;
1868+
1869+ getoptarg (optarg, 'e', input_tab_char, &dummy_length,
1870+ &dummy_width, &chars_per_input_tab);
1871+ }
1872 /* Could check tab width > 0. */
1873 untabify_input = true;
1874 break;
fca5c2d6 1875@@ -953,8 +1032,12 @@ main (int argc, char **argv)
56ae3f82
SS
1876 break;
1877 case 'i':
1878 if (optarg)
1879- getoptarg (optarg, 'i', &output_tab_char,
1880- &chars_per_output_tab);
1881+ {
1882+ int dummy_width;
1883+
1884+ getoptarg (optarg, 'i', output_tab_char, &output_tab_char_length,
1885+ &dummy_width, &chars_per_output_tab);
1886+ }
1887 /* Could check tab width > 0. */
1888 tabify_output = true;
1889 break;
fca5c2d6 1890@@ -972,8 +1055,8 @@ main (int argc, char **argv)
56ae3f82
SS
1891 case 'n':
1892 numbered_lines = true;
1893 if (optarg)
1894- getoptarg (optarg, 'n', &number_separator,
1895- &chars_per_number);
1896+ getoptarg (optarg, 'n', number_separator, &number_separator_length,
1897+ &number_separator_width, &chars_per_number);
1898 break;
1899 case 'N':
1900 skip_count = false;
fca5c2d6 1901@@ -997,7 +1080,7 @@ main (int argc, char **argv)
56ae3f82
SS
1902 old_s = false;
1903 /* Reset an additional input of -s, -S dominates -s */
1904 col_sep_string = bad_cast ("");
1905- col_sep_length = 0;
1906+ col_sep_length = col_sep_width = 0;
1907 use_col_separator = true;
1908 if (optarg)
1909 separator_string (optarg);
fca5c2d6 1910@@ -1152,10 +1235,45 @@ getoptnum (const char *n_str, int min, i
56ae3f82
SS
1911 a number. */
1912
1913 static void
1914-getoptarg (char *arg, char switch_char, char *character, int *number)
1915+getoptarg (char *arg, char switch_char, char *character, int *character_length,
1916+ int *character_width, int *number)
1917 {
1918 if (!ISDIGIT (*arg))
1919- *character = *arg++;
1920+ {
1921+#ifdef HAVE_MBRTOWC
1922+ if (MB_CUR_MAX > 1) /* for multibyte locale. */
1923+ {
1924+ wchar_t wc;
1925+ size_t mblength;
1926+ int width;
1927+ mbstate_t state = {'\0'};
1928+
1929+ mblength = mbrtowc (&wc, arg, strnlen(arg, MB_LEN_MAX), &state);
1930+
1931+ if (mblength == (size_t)-1 || mblength == (size_t)-2)
1932+ {
1933+ *character_length = 1;
1934+ *character_width = 1;
1935+ }
1936+ else
1937+ {
1938+ *character_length = (mblength < 1) ? 1 : mblength;
1939+ width = wcwidth (wc);
1940+ *character_width = (width < 0) ? 0 : width;
1941+ }
1942+
1943+ strncpy (character, arg, *character_length);
1944+ arg += *character_length;
1945+ }
1946+ else /* for single byte locale. */
1947+#endif
1948+ {
1949+ *character = *arg++;
1950+ *character_length = 1;
1951+ *character_width = 1;
1952+ }
1953+ }
1954+
1955 if (*arg)
1956 {
1957 long int tmp_long;
fca5c2d6 1958@@ -1177,6 +1295,11 @@ static void
6987acf5
MT
1959 init_parameters (int number_of_files)
1960 {
1961 int chars_used_by_number = 0;
1962+ int mb_len = 1;
1963+#if HAVE_MBRTOWC
1964+ if (MB_CUR_MAX > 1)
1965+ mb_len = MB_LEN_MAX;
1966+#endif
1967
1968 lines_per_body = lines_per_page - lines_per_header - lines_per_footer;
1969 if (lines_per_body <= 0)
fca5c2d6 1970@@ -1214,7 +1337,7 @@ init_parameters (int number_of_files)
56ae3f82
SS
1971 else
1972 col_sep_string = column_separator;
1973
1974- col_sep_length = 1;
1975+ col_sep_length = col_sep_width = 1;
1976 use_col_separator = true;
1977 }
1978 /* It's rather pointless to define a TAB separator with column
fca5c2d6 1979@@ -1244,11 +1367,11 @@ init_parameters (int number_of_files)
764f5877 1980 + TAB_WIDTH (chars_per_input_tab, chars_per_number); */
56ae3f82
SS
1981
1982 /* Estimate chars_per_text without any margin and keep it constant. */
1983- if (number_separator == '\t')
1984+ if (number_separator[0] == '\t')
764f5877
SS
1985 number_width = (chars_per_number
1986 + TAB_WIDTH (chars_per_default_tab, chars_per_number));
56ae3f82
SS
1987 else
1988- number_width = chars_per_number + 1;
1989+ number_width = chars_per_number + number_separator_width;
1990
1991 /* The number is part of the column width unless we are
1992 printing files in parallel. */
fca5c2d6 1993@@ -1257,7 +1380,7 @@ init_parameters (int number_of_files)
56ae3f82
SS
1994 }
1995
764f5877
SS
1996 chars_per_column = (chars_per_line - chars_used_by_number
1997- - (columns - 1) * col_sep_length) / columns;
1998+ - (columns - 1) * col_sep_width) / columns;
56ae3f82
SS
1999
2000 if (chars_per_column < 1)
2001 error (EXIT_FAILURE, 0, _("page width too narrow"));
fca5c2d6 2002@@ -1275,7 +1398,7 @@ init_parameters (int number_of_files)
6987acf5
MT
2003 We've to use 8 as the lower limit, if we use chars_per_default_tab = 8
2004 to expand a tab which is not an input_tab-char. */
2005 free (clump_buff);
2006- clump_buff = xmalloc (MAX (8, chars_per_input_tab));
2007+ clump_buff = xmalloc (mb_len * MAX (8, chars_per_input_tab));
2008 }
fca5c2d6 2009
6987acf5 2010 /* Open the necessary files,
fca5c2d6 2011@@ -1383,7 +1506,7 @@ init_funcs (void)
56ae3f82
SS
2012
2013 /* Enlarge p->start_position of first column to use the same form of
2014 padding_not_printed with all columns. */
2015- h = h + col_sep_length;
2016+ h = h + col_sep_width;
2017
2018 /* This loop takes care of all but the rightmost column. */
2019
fca5c2d6 2020@@ -1417,7 +1540,7 @@ init_funcs (void)
56ae3f82
SS
2021 }
2022 else
2023 {
2024- h = h_next + col_sep_length;
2025+ h = h_next + col_sep_width;
2026 h_next = h + chars_per_column;
2027 }
2028 }
fca5c2d6 2029@@ -1708,9 +1831,9 @@ static void
56ae3f82
SS
2030 align_column (COLUMN *p)
2031 {
2032 padding_not_printed = p->start_position;
2033- if (padding_not_printed - col_sep_length > 0)
2034+ if (padding_not_printed - col_sep_width > 0)
2035 {
2036- pad_across_to (padding_not_printed - col_sep_length);
2037+ pad_across_to (padding_not_printed - col_sep_width);
2038 padding_not_printed = ANYWHERE;
2039 }
2040
fca5c2d6 2041@@ -1981,13 +2104,13 @@ store_char (char c)
56ae3f82
SS
2042 /* May be too generous. */
2043 buff = X2REALLOC (buff, &buff_allocated);
2044 }
2045- buff[buff_current++] = c;
2046+ buff[buff_current++] = (unsigned char) c;
2047 }
2048
2049 static void
2050 add_line_number (COLUMN *p)
2051 {
2052- int i;
2053+ int i, j;
2054 char *s;
e5317bd9 2055 int num_width;
56ae3f82 2056
fca5c2d6 2057@@ -2004,22 +2127,24 @@ add_line_number (COLUMN *p)
56ae3f82 2058 /* Tabification is assumed for multiple columns, also for n-separators,
6987acf5 2059 but 'default n-separator = TAB' hasn't been given priority over
56ae3f82
SS
2060 equal column_width also specified by POSIX. */
2061- if (number_separator == '\t')
2062+ if (number_separator[0] == '\t')
2063 {
2064 i = number_width - chars_per_number;
2065 while (i-- > 0)
2066 (p->char_func) (' ');
2067 }
2068 else
2069- (p->char_func) (number_separator);
2070+ for (j = 0; j < number_separator_length; j++)
2071+ (p->char_func) (number_separator[j]);
2072 }
2073 else
2074 /* To comply with POSIX, we avoid any expansion of default TAB
2075 separator with a single column output. No column_width requirement
2076 has to be considered. */
2077 {
2078- (p->char_func) (number_separator);
2079- if (number_separator == '\t')
2080+ for (j = 0; j < number_separator_length; j++)
2081+ (p->char_func) (number_separator[j]);
2082+ if (number_separator[0] == '\t')
2083 output_position = POS_AFTER_TAB (chars_per_output_tab,
2084 output_position);
2085 }
fca5c2d6 2086@@ -2180,7 +2305,7 @@ print_white_space (void)
56ae3f82
SS
2087 while (goal - h_old > 1
2088 && (h_new = POS_AFTER_TAB (chars_per_output_tab, h_old)) <= goal)
2089 {
2090- putchar (output_tab_char);
2091+ fwrite (output_tab_char, sizeof(char), output_tab_char_length, stdout);
2092 h_old = h_new;
2093 }
2094 while (++h_old <= goal)
fca5c2d6 2095@@ -2200,6 +2325,7 @@ print_sep_string (void)
56ae3f82
SS
2096 {
2097 char *s;
2098 int l = col_sep_length;
2099+ int not_space_flag;
2100
2101 s = col_sep_string;
2102
fca5c2d6 2103@@ -2213,6 +2339,7 @@ print_sep_string (void)
56ae3f82
SS
2104 {
2105 for (; separators_not_printed > 0; --separators_not_printed)
2106 {
2107+ not_space_flag = 0;
2108 while (l-- > 0)
2109 {
2110 /* 3 types of sep_strings: spaces only, spaces and chars,
fca5c2d6 2111@@ -2226,12 +2353,15 @@ print_sep_string (void)
56ae3f82
SS
2112 }
2113 else
2114 {
2115+ not_space_flag = 1;
2116 if (spaces_not_printed > 0)
2117 print_white_space ();
2118 putchar (*s++);
2119- ++output_position;
2120 }
2121 }
2122+ if (not_space_flag)
2123+ output_position += col_sep_width;
2124+
2125 /* sep_string ends with some spaces */
2126 if (spaces_not_printed > 0)
2127 print_white_space ();
fca5c2d6 2128@@ -2259,7 +2389,7 @@ print_clump (COLUMN *p, int n, char *clu
56ae3f82
SS
2129 required number of tabs and spaces. */
2130
2131 static void
2132-print_char (char c)
2133+print_char_single (char c)
2134 {
2135 if (tabify_output)
2136 {
fca5c2d6 2137@@ -2283,6 +2413,74 @@ print_char (char c)
56ae3f82
SS
2138 putchar (c);
2139 }
2140
2141+#ifdef HAVE_MBRTOWC
2142+static void
2143+print_char_multi (char c)
2144+{
2145+ static size_t mbc_pos = 0;
2146+ static char mbc[MB_LEN_MAX] = {'\0'};
2147+ static mbstate_t state = {'\0'};
2148+ mbstate_t state_bak;
2149+ wchar_t wc;
2150+ size_t mblength;
2151+ int width;
2152+
2153+ if (tabify_output)
2154+ {
2155+ state_bak = state;
2156+ mbc[mbc_pos++] = c;
2157+ mblength = mbrtowc (&wc, mbc, mbc_pos, &state);
2158+
2159+ while (mbc_pos > 0)
2160+ {
2161+ switch (mblength)
2162+ {
2163+ case (size_t)-2:
2164+ state = state_bak;
2165+ return;
2166+
2167+ case (size_t)-1:
2168+ state = state_bak;
2169+ ++output_position;
2170+ putchar (mbc[0]);
2171+ memmove (mbc, mbc + 1, MB_CUR_MAX - 1);
2172+ --mbc_pos;
2173+ break;
2174+
2175+ case 0:
2176+ mblength = 1;
2177+
2178+ default:
2179+ if (wc == L' ')
2180+ {
2181+ memmove (mbc, mbc + mblength, MB_CUR_MAX - mblength);
2182+ --mbc_pos;
2183+ ++spaces_not_printed;
2184+ return;
2185+ }
2186+ else if (spaces_not_printed > 0)
2187+ print_white_space ();
2188+
2189+ /* Nonprintables are assumed to have width 0, except L'\b'. */
2190+ if ((width = wcwidth (wc)) < 1)
2191+ {
2192+ if (wc == L'\b')
2193+ --output_position;
2194+ }
2195+ else
2196+ output_position += width;
2197+
2198+ fwrite (mbc, sizeof(char), mblength, stdout);
2199+ memmove (mbc, mbc + mblength, MB_CUR_MAX - mblength);
2200+ mbc_pos -= mblength;
2201+ }
2202+ }
2203+ return;
2204+ }
2205+ putchar (c);
2206+}
2207+#endif
2208+
2209 /* Skip to page PAGE before printing.
2210 PAGE may be larger than total number of pages. */
2211
fca5c2d6 2212@@ -2462,9 +2660,9 @@ read_line (COLUMN *p)
56ae3f82
SS
2213 align_empty_cols = false;
2214 }
2215
2216- if (padding_not_printed - col_sep_length > 0)
2217+ if (padding_not_printed - col_sep_width > 0)
2218 {
2219- pad_across_to (padding_not_printed - col_sep_length);
2220+ pad_across_to (padding_not_printed - col_sep_width);
2221 padding_not_printed = ANYWHERE;
2222 }
2223
fca5c2d6 2224@@ -2534,7 +2732,7 @@ print_stored (COLUMN *p)
fbb9790b
SS
2225 int i;
2226
2227 int line = p->current_line++;
2228- char *first = &buff[line_vector[line]];
2229+ unsigned char *first = &buff[line_vector[line]];
2230 /* FIXME
2231 UMR: Uninitialized memory read:
2232 * This is occurring while in:
fca5c2d6 2233@@ -2546,7 +2744,7 @@ print_stored (COLUMN *p)
fbb9790b
SS
2234 xmalloc [xmalloc.c:94]
2235 init_store_cols [pr.c:1648]
2236 */
2237- char *last = &buff[line_vector[line + 1]];
2238+ unsigned char *last = &buff[line_vector[line + 1]];
2239
2240 pad_vertically = true;
2241
fca5c2d6 2242@@ -2565,9 +2763,9 @@ print_stored (COLUMN *p)
56ae3f82
SS
2243 }
2244 }
2245
2246- if (padding_not_printed - col_sep_length > 0)
2247+ if (padding_not_printed - col_sep_width > 0)
2248 {
2249- pad_across_to (padding_not_printed - col_sep_length);
2250+ pad_across_to (padding_not_printed - col_sep_width);
2251 padding_not_printed = ANYWHERE;
2252 }
2253
fca5c2d6 2254@@ -2580,8 +2778,8 @@ print_stored (COLUMN *p)
56ae3f82
SS
2255 if (spaces_not_printed == 0)
2256 {
2257 output_position = p->start_position + end_vector[line];
2258- if (p->start_position - col_sep_length == chars_per_margin)
2259- output_position -= col_sep_length;
2260+ if (p->start_position - col_sep_width == chars_per_margin)
2261+ output_position -= col_sep_width;
2262 }
2263
2264 return true;
fca5c2d6 2265@@ -2600,7 +2798,7 @@ print_stored (COLUMN *p)
56ae3f82
SS
2266 number of characters is 1.) */
2267
2268 static int
2269-char_to_clump (char c)
2270+char_to_clump_single (char c)
2271 {
2272 unsigned char uc = c;
2273 char *s = clump_buff;
fca5c2d6 2274@@ -2610,10 +2808,10 @@ char_to_clump (char c)
56ae3f82
SS
2275 int chars;
2276 int chars_per_c = 8;
2277
2278- if (c == input_tab_char)
2279+ if (c == input_tab_char[0])
2280 chars_per_c = chars_per_input_tab;
2281
2282- if (c == input_tab_char || c == '\t')
2283+ if (c == input_tab_char[0] || c == '\t')
2284 {
2285 width = TAB_WIDTH (chars_per_c, input_position);
2286
fca5c2d6 2287@@ -2694,6 +2892,164 @@ char_to_clump (char c)
56ae3f82
SS
2288 return chars;
2289 }
2290
2291+#ifdef HAVE_MBRTOWC
2292+static int
2293+char_to_clump_multi (char c)
2294+{
2295+ static size_t mbc_pos = 0;
2296+ static char mbc[MB_LEN_MAX] = {'\0'};
2297+ static mbstate_t state = {'\0'};
2298+ mbstate_t state_bak;
2299+ wchar_t wc;
2300+ size_t mblength;
2301+ int wc_width;
2302+ register char *s = clump_buff;
2303+ register int i, j;
2304+ char esc_buff[4];
2305+ int width;
2306+ int chars;
2307+ int chars_per_c = 8;
2308+
2309+ state_bak = state;
2310+ mbc[mbc_pos++] = c;
2311+ mblength = mbrtowc (&wc, mbc, mbc_pos, &state);
2312+
2313+ width = 0;
2314+ chars = 0;
2315+ while (mbc_pos > 0)
2316+ {
2317+ switch (mblength)
2318+ {
2319+ case (size_t)-2:
2320+ state = state_bak;
2321+ return 0;
2322+
2323+ case (size_t)-1:
2324+ state = state_bak;
2325+ mblength = 1;
2326+
2327+ if (use_esc_sequence || use_cntrl_prefix)
2328+ {
2329+ width = +4;
2330+ chars = +4;
2331+ *s++ = '\\';
6987acf5 2332+ sprintf (esc_buff, "%03o", (unsigned char) mbc[0]);
56ae3f82
SS
2333+ for (i = 0; i <= 2; ++i)
2334+ *s++ = (int) esc_buff[i];
2335+ }
2336+ else
2337+ {
2338+ width += 1;
2339+ chars += 1;
2340+ *s++ = mbc[0];
2341+ }
2342+ break;
2343+
2344+ case 0:
2345+ mblength = 1;
2346+ /* Fall through */
2347+
2348+ default:
2349+ if (memcmp (mbc, input_tab_char, mblength) == 0)
2350+ chars_per_c = chars_per_input_tab;
2351+
2352+ if (memcmp (mbc, input_tab_char, mblength) == 0 || c == '\t')
2353+ {
2354+ int width_inc;
2355+
2356+ width_inc = TAB_WIDTH (chars_per_c, input_position);
2357+ width += width_inc;
2358+
2359+ if (untabify_input)
2360+ {
2361+ for (i = width_inc; i; --i)
2362+ *s++ = ' ';
2363+ chars += width_inc;
2364+ }
2365+ else
2366+ {
2367+ for (i = 0; i < mblength; i++)
2368+ *s++ = mbc[i];
2369+ chars += mblength;
2370+ }
2371+ }
2372+ else if ((wc_width = wcwidth (wc)) < 1)
2373+ {
2374+ if (use_esc_sequence)
2375+ {
2376+ for (i = 0; i < mblength; i++)
2377+ {
2378+ width += 4;
2379+ chars += 4;
2380+ *s++ = '\\';
6987acf5 2381+ sprintf (esc_buff, "%03o", (unsigned char) mbc[i]);
56ae3f82
SS
2382+ for (j = 0; j <= 2; ++j)
2383+ *s++ = (int) esc_buff[j];
2384+ }
2385+ }
2386+ else if (use_cntrl_prefix)
2387+ {
2388+ if (wc < 0200)
2389+ {
2390+ width += 2;
2391+ chars += 2;
2392+ *s++ = '^';
2393+ *s++ = wc ^ 0100;
2394+ }
2395+ else
2396+ {
2397+ for (i = 0; i < mblength; i++)
2398+ {
2399+ width += 4;
2400+ chars += 4;
2401+ *s++ = '\\';
6987acf5 2402+ sprintf (esc_buff, "%03o", (unsigned char) mbc[i]);
56ae3f82
SS
2403+ for (j = 0; j <= 2; ++j)
2404+ *s++ = (int) esc_buff[j];
2405+ }
2406+ }
2407+ }
2408+ else if (wc == L'\b')
2409+ {
2410+ width += -1;
2411+ chars += 1;
2412+ *s++ = c;
2413+ }
2414+ else
2415+ {
2416+ width += 0;
2417+ chars += mblength;
2418+ for (i = 0; i < mblength; i++)
2419+ *s++ = mbc[i];
2420+ }
2421+ }
2422+ else
2423+ {
2424+ width += wc_width;
2425+ chars += mblength;
2426+ for (i = 0; i < mblength; i++)
2427+ *s++ = mbc[i];
2428+ }
2429+ }
2430+ memmove (mbc, mbc + mblength, MB_CUR_MAX - mblength);
2431+ mbc_pos -= mblength;
2432+ }
2433+
fbb9790b
SS
2434+ /* Too many backspaces must put us in position 0 -- never negative. */
2435+ if (width < 0 && input_position == 0)
2436+ {
2437+ chars = 0;
2438+ input_position = 0;
2439+ }
2440+ else if (width < 0 && input_position <= -width)
2441+ input_position = 0;
2442+ else
2443+ input_position += width;
2444+
56ae3f82
SS
2445+ return chars;
2446+}
2447+#endif
2448+
2449 /* We've just printed some files and need to clean up things before
2450 looking for more options and printing the next batch of files.
2451
fca5c2d6
SS
2452diff -Naurp coreutils-8.25-orig/src/sort.c coreutils-8.25/src/sort.c
2453--- coreutils-8.25-orig/src/sort.c 2016-01-16 13:09:33.000000000 -0600
2454+++ coreutils-8.25/src/sort.c 2016-02-08 19:07:10.310944648 -0600
effd5ec1 2455@@ -29,6 +29,14 @@
56ae3f82
SS
2456 #include <sys/wait.h>
2457 #include <signal.h>
effd5ec1 2458 #include <assert.h>
56ae3f82
SS
2459+#if HAVE_WCHAR_H
2460+# include <wchar.h>
2461+#endif
2462+/* Get isw* functions. */
2463+#if HAVE_WCTYPE_H
2464+# include <wctype.h>
2465+#endif
2466+
2467 #include "system.h"
2468 #include "argmatch.h"
2469 #include "error.h"
fca5c2d6 2470@@ -163,14 +171,39 @@ static int decimal_point;
fbb9790b
SS
2471 /* Thousands separator; if -1, then there isn't one. */
2472 static int thousands_sep;
56ae3f82 2473
fbb9790b
SS
2474+/* True if -f is specified. */
2475+static bool folding;
2476+
56ae3f82
SS
2477 /* Nonzero if the corresponding locales are hard. */
2478 static bool hard_LC_COLLATE;
2479-#if HAVE_NL_LANGINFO
2480+#if HAVE_LANGINFO_CODESET
2481 static bool hard_LC_TIME;
2482 #endif
2483
2484 #define NONZERO(x) ((x) != 0)
2485
2486+/* Set MBLENGTH to the byte length of the multibyte character at PTR (LIM is the end of input); invalid or incomplete sequences restore STATE and, like NUL, count as 1 byte. */
2487+#define GET_BYTELEN_OF_CHAR(LIM, PTR, MBLENGTH, STATE) \
2488+ do \
2489+ { \
2490+ wchar_t wc; \
2491+ mbstate_t state_bak; \
2492+ \
2493+ state_bak = STATE; \
2494+ MBLENGTH = mbrtowc (&wc, PTR, (LIM) - (PTR), &STATE); \
2495+ \
2496+ switch (MBLENGTH) \
2497+ { \
2498+ case (size_t)-1: \
2499+ case (size_t)-2: \
2500+ STATE = state_bak; \
2501+ /* Fall through. */ \
2502+ case 0: \
2503+ MBLENGTH = 1; \
2504+ } \
2505+ } \
2506+ while (0)
2507+
2508 /* The kind of blanks for '-b' to skip in various options. */
2509 enum blanktype { bl_start, bl_end, bl_both };
2510
fca5c2d6 2511@@ -344,13 +377,11 @@ static bool reverse;
56ae3f82
SS
2512 they were read if all keys compare equal. */
2513 static bool stable;
2514
2515-/* If TAB has this value, blanks separate fields. */
2516-enum { TAB_DEFAULT = CHAR_MAX + 1 };
2517-
2518-/* Tab character separating fields. If TAB_DEFAULT, then fields are
2519+/* Tab character separating fields. If tab_length is 0, then fields are
2520 separated by the empty string between a non-blank character and a blank
2521 character. */
2522-static int tab = TAB_DEFAULT;
2523+static char tab[MB_LEN_MAX + 1];
2524+static size_t tab_length = 0;
2525
2526 /* Flag to remove consecutive duplicate lines from the output.
2527 Only the last of a sequence of equal lines will be output. */
fca5c2d6 2528@@ -810,6 +841,46 @@ reap_all (void)
407c5be3 2529 reap (-1);
56ae3f82
SS
2530 }
2531
2532+/* Function pointers. */
2533+static void
2534+(*inittables) (void);
2535+static char *
2536+(*begfield) (const struct line*, const struct keyfield *);
2537+static char *
2538+(*limfield) (const struct line*, const struct keyfield *);
1555d43c
SS
2539+static void
2540+(*skipblanks) (char **ptr, char *lim);
56ae3f82 2541+static int
1555d43c 2542+(*getmonth) (char const *, size_t, char **);
56ae3f82
SS
2543+static int
2544+(*keycompare) (const struct line *, const struct line *);
2545+static int
2546+(*numcompare) (const char *, const char *);
2547+
2548+/* Test whether the multibyte character at STR is a blank (or a newline).
2549+ Set *LENGTH to the byte length of the examined multibyte character. */
2550+#if HAVE_MBRTOWC
2551+static int
2552+ismbblank (const char *str, size_t len, size_t *length)
2553+{
2554+ size_t mblength;
2555+ wchar_t wc;
2556+ mbstate_t state;
2557+
2558+ memset (&state, '\0', sizeof(mbstate_t));
2559+ mblength = mbrtowc (&wc, str, len, &state);
2560+
2561+ if (mblength == (size_t)-1 || mblength == (size_t)-2)
2562+ {
2563+ *length = 1;
2564+ return 0;
2565+ }
2566+
2567+ *length = (mblength < 1) ? 1 : mblength;
fca5c2d6 2568+ return iswblank (wc) || wc == '\n';
56ae3f82
SS
2569+}
2570+#endif
2571+
2572 /* Clean up any remaining temporary files. */
2573
2574 static void
fca5c2d6 2575@@ -1254,7 +1325,7 @@ zaptemp (char const *name)
56ae3f82
SS
2576 free (node);
2577 }
2578
2579-#if HAVE_NL_LANGINFO
2580+#if HAVE_LANGINFO_CODESET
2581
2582 static int
1555d43c 2583 struct_month_cmp (void const *m1, void const *m2)
fca5c2d6 2584@@ -1269,7 +1340,7 @@ struct_month_cmp (void const *m1, void c
56ae3f82
SS
2585 /* Initialize the character class tables. */
2586
2587 static void
2588-inittables (void)
2589+inittables_uni (void)
2590 {
2591 size_t i;
2592
fca5c2d6 2593@@ -1281,7 +1352,7 @@ inittables (void)
56ae3f82
SS
2594 fold_toupper[i] = toupper (i);
2595 }
2596
2597-#if HAVE_NL_LANGINFO
2598+#if HAVE_LANGINFO_CODESET
2599 /* If we're not in the "C" locale, read different names for months. */
2600 if (hard_LC_TIME)
2601 {
fca5c2d6 2602@@ -1363,6 +1434,84 @@ specify_nmerge (int oi, char c, char con
56ae3f82
SS
2603 xstrtol_fatal (e, oi, c, long_options, s);
2604 }
2605
2606+#if HAVE_MBRTOWC
2607+static void
2608+inittables_mb (void)
2609+{
2610+ int i, j, k, l;
1555d43c 2611+ char *name, *s, *lc_time, *lc_ctype;
56ae3f82
SS
2612+ size_t s_len, mblength;
2613+ char mbc[MB_LEN_MAX];
2614+ wchar_t wc, pwc;
2615+ mbstate_t state_mb, state_wc;
2616+ /* Fill monthtab with the locale's abbreviated month names (leading blanks dropped, upper-cased), then sort it with struct_month_cmp. */
1555d43c
SS
2617+ lc_time = setlocale (LC_TIME, "");
2618+ if (lc_time)
2619+ lc_time = xstrdup (lc_time);
2620+
2621+ lc_ctype = setlocale (LC_CTYPE, "");
2622+ if (lc_ctype)
2623+ lc_ctype = xstrdup (lc_ctype);
2624+
2625+ if (lc_time && lc_ctype)
2626+ /* temporarily set LC_CTYPE to match LC_TIME, so that we can convert
2627+ * the names of months to upper case */
2628+ setlocale (LC_CTYPE, lc_time);
2629+
56ae3f82
SS
2630+ for (i = 0; i < MONTHS_PER_YEAR; i++)
2631+ {
2632+ s = (char *) nl_langinfo (ABMON_1 + i);
2633+ s_len = strlen (s);
2634+ monthtab[i].val = i + 1;
2635+ /* Allow MB_CUR_MAX bytes per input byte: an upper-cased character may need more bytes than the original. */
2636+ monthtab[i].name = name = (char *) xnmalloc (s_len + 1, MB_CUR_MAX);
2637+ memset (&state_mb, '\0', sizeof (mbstate_t));
2638+ memset (&state_wc, '\0', sizeof (mbstate_t));
2639+
2640+ for (j = 0; j < s_len;)
2641+ {
2642+ if (!ismbblank (s + j, s_len - j, &mblength))
2643+ break;
2644+ j += mblength;
2645+ }
2646+
2647+ for (k = 0; j < s_len;)
2648+ {
2649+ mblength = mbrtowc (&wc, (s + j), (s_len - j), &state_mb);
2650+ assert (mblength != (size_t)-1 && mblength != (size_t)-2);
2651+ if (mblength == 0)
2652+ break;
2653+
2654+ pwc = towupper (wc);
2655+ if (pwc == wc)
2656+ {
2657+ memcpy (mbc, s + j, mblength);
2658+ j += mblength;
2659+ }
2660+ else
2661+ {
2662+ j += mblength;
2663+ mblength = wcrtomb (mbc, pwc, &state_wc);
2664+ assert (mblength != (size_t)0 && mblength != (size_t)-1);
2665+ }
2666+
2667+ for (l = 0; l < mblength; l++)
2668+ name[k++] = mbc[l];
2669+ }
2670+ name[k] = '\0';
2671+ }
2672+ qsort ((void *) monthtab, MONTHS_PER_YEAR,
2673+ sizeof (struct month), struct_month_cmp);
1555d43c
SS
2674+
2675+ if (lc_time && lc_ctype)
2676+ /* restore the original locales */
2677+ setlocale (LC_CTYPE, lc_ctype);
2678+
2679+ free (lc_ctype);
2680+ free (lc_time);
56ae3f82
SS
2681+}
2682+#endif
2683+
2684 /* Specify the amount of main memory to use when sorting. */
2685 static void
2686 specify_sort_size (int oi, char c, char const *s)
fca5c2d6 2687@@ -1596,7 +1745,7 @@ buffer_linelim (struct buffer const *buf
56ae3f82
SS
2688 by KEY in LINE. */
2689
2690 static char *
1555d43c 2691-begfield (struct line const *line, struct keyfield const *key)
56ae3f82
SS
2692+begfield_uni (const struct line *line, const struct keyfield *key)
2693 {
2694 char *ptr = line->text, *lim = ptr + line->length - 1;
2695 size_t sword = key->sword;
fca5c2d6 2696@@ -1605,10 +1754,10 @@ begfield (struct line const *line, struc
56ae3f82
SS
2697 /* The leading field separator itself is included in a field when -t
2698 is absent. */
2699
2700- if (tab != TAB_DEFAULT)
2701+ if (tab_length)
2702 while (ptr < lim && sword--)
2703 {
2704- while (ptr < lim && *ptr != tab)
2705+ while (ptr < lim && *ptr != tab[0])
2706 ++ptr;
2707 if (ptr < lim)
2708 ++ptr;
fca5c2d6 2709@@ -1634,11 +1783,70 @@ begfield (struct line const *line, struc
56ae3f82
SS
2710 return ptr;
2711 }
2712
2713+#if HAVE_MBRTOWC
2714+static char *
2715+begfield_mb (const struct line *line, const struct keyfield *key)
2716+{
2717+ int i;
2718+ char *ptr = line->text, *lim = ptr + line->length - 1;
2719+ size_t sword = key->sword;
2720+ size_t schar = key->schar;
2721+ size_t mblength;
2722+ mbstate_t state;
2723+
2724+ memset (&state, '\0', sizeof(mbstate_t));
2725+
2726+ if (tab_length)
2727+ while (ptr < lim && sword--)
2728+ {
2729+ while (ptr < lim && memcmp (ptr, tab, tab_length) != 0)
2730+ {
2731+ GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
2732+ ptr += mblength;
2733+ }
2734+ if (ptr < lim)
2735+ {
2736+ GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
2737+ ptr += mblength;
2738+ }
2739+ }
2740+ else
2741+ while (ptr < lim && sword--)
2742+ {
2743+ while (ptr < lim && ismbblank (ptr, lim - ptr, &mblength))
2744+ ptr += mblength;
2745+ if (ptr < lim)
2746+ {
2747+ GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
2748+ ptr += mblength;
2749+ }
2750+ while (ptr < lim && !ismbblank (ptr, lim - ptr, &mblength))
2751+ ptr += mblength;
2752+ }
2753+
2754+ if (key->skipsblanks)
2755+ while (ptr < lim && ismbblank (ptr, lim - ptr, &mblength))
2756+ ptr += mblength;
2757+
2758+ for (i = 0; i < schar; i++)
2759+ {
2760+ GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
2761+
2762+ if (ptr + mblength > lim)
2763+ break;
2764+ else
2765+ ptr += mblength;
2766+ }
2767+
2768+ return ptr;
2769+}
2770+#endif
2771+
2772 /* Return the limit of (a pointer to the first character after) the field
2773 in LINE specified by KEY. */
2774
2775 static char *
1555d43c 2776-limfield (struct line const *line, struct keyfield const *key)
56ae3f82
SS
2777+limfield_uni (const struct line *line, const struct keyfield *key)
2778 {
2779 char *ptr = line->text, *lim = ptr + line->length - 1;
2780 size_t eword = key->eword, echar = key->echar;
fca5c2d6 2781@@ -1653,10 +1861,10 @@ limfield (struct line const *line, struc
6987acf5
MT
2782 'beginning' is the first character following the delimiting TAB.
2783 Otherwise, leave PTR pointing at the first 'blank' character after
56ae3f82
SS
2784 the preceding field. */
2785- if (tab != TAB_DEFAULT)
2786+ if (tab_length)
2787 while (ptr < lim && eword--)
2788 {
2789- while (ptr < lim && *ptr != tab)
2790+ while (ptr < lim && *ptr != tab[0])
2791 ++ptr;
2792 if (ptr < lim && (eword || echar))
2793 ++ptr;
fca5c2d6 2794@@ -1702,10 +1910,10 @@ limfield (struct line const *line, struc
56ae3f82
SS
2795 */
2796
2797 /* Make LIM point to the end of (one byte past) the current field. */
2798- if (tab != TAB_DEFAULT)
2799+ if (tab_length)
2800 {
2801 char *newlim;
2802- newlim = memchr (ptr, tab, lim - ptr);
2803+ newlim = memchr (ptr, tab[0], lim - ptr);
2804 if (newlim)
2805 lim = newlim;
2806 }
fca5c2d6 2807@@ -1736,6 +1944,130 @@ limfield (struct line const *line, struc
56ae3f82
SS
2808 return ptr;
2809 }
2810
2811+#if HAVE_MBRTOWC
2812+static char *
2813+limfield_mb (const struct line *line, const struct keyfield *key)
2814+{
2815+ char *ptr = line->text, *lim = ptr + line->length - 1;
2816+ size_t eword = key->eword, echar = key->echar;
2817+ int i;
2818+ size_t mblength;
2819+ mbstate_t state;
2820+ /* Multibyte variant of limfield: return a pointer just past the end of the field of LINE selected by KEY. */
2821+ if (echar == 0)
2822+ eword++; /* skip all of end field. */
2823+
2824+ memset (&state, '\0', sizeof(mbstate_t));
2825+
2826+ if (tab_length)
2827+ while (ptr < lim && eword--)
2828+ {
2829+ while (ptr < lim && memcmp (ptr, tab, tab_length) != 0)
2830+ {
2831+ GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
2832+ ptr += mblength;
2833+ }
2834+ if (ptr < lim && (eword | echar))
2835+ {
2836+ GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
2837+ ptr += mblength;
2838+ }
2839+ }
2840+ else
2841+ while (ptr < lim && eword--)
2842+ {
2843+ while (ptr < lim && ismbblank (ptr, lim - ptr, &mblength))
2844+ ptr += mblength;
2845+ if (ptr < lim)
2846+ {
2847+ GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
2848+ ptr += mblength;
2849+ }
2850+ while (ptr < lim && !ismbblank (ptr, lim - ptr, &mblength))
2851+ ptr += mblength;
2852+ }
2853+
2854+
2855+# ifdef POSIX_UNSPECIFIED
2856+ /* Make LIM point to the end of (one byte past) the current field. */
2857+ if (tab_length)
2858+ {
2859+ char *newlim = NULL, *p;
2860+
2861+ for (p = ptr; p < lim; p += mblength)
2862+ {
2863+ if (memcmp (p, tab, tab_length) == 0)
2864+ {
2865+ newlim = p;
2866+ break;
2867+ }
2868+ GET_BYTELEN_OF_CHAR (lim, p, mblength, state);
2869+ }
2870+
2871+ if (newlim)
2872+ lim = newlim;
2873+ }
2874+ else
2875+ {
2876+ char *newlim;
2877+ newlim = ptr;
2878+
2879+ while (newlim < lim && ismbblank (newlim, lim - newlim, &mblength))
2880+ newlim += mblength;
2881+ if (newlim < lim)
2882+ {
2883+ GET_BYTELEN_OF_CHAR (lim, newlim, mblength, state);
2884+ newlim += mblength;
2885+ }
2886+ while (newlim < lim && !ismbblank (newlim, lim - newlim, &mblength))
2887+ newlim += mblength;
2888+ lim = newlim;
2889+ }
2890+# endif
2891+
2892+ if (echar != 0)
2893+ {
2894+ /* If we're skipping leading blanks, don't start counting characters
2895+ * until after skipping past any leading blanks. */
fbb9790b 2896+ if (key->skipeblanks)
56ae3f82
SS
2897+ while (ptr < lim && ismbblank (ptr, lim - ptr, &mblength))
2898+ ptr += mblength;
2899+
2900+ memset (&state, '\0', sizeof(mbstate_t));
2901+
2902+ /* Advance PTR by ECHAR (if possible), but no further than LIM. */
2903+ for (i = 0; i < echar; i++)
2904+ {
2905+ GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
2906+
2907+ if (ptr + mblength > lim)
2908+ break;
2909+ else
2910+ ptr += mblength;
2911+ }
2912+ }
2913+
2914+ return ptr;
2915+}
2916+#endif
1555d43c
SS
2917+
2918+static void
2919+skipblanks_uni (char **ptr, char *lim)
2920+{
2921+ while (*ptr < lim && blanks[to_uchar (**ptr)])
2922+ ++(*ptr);
2923+}
2924+
2925+#if HAVE_MBRTOWC
2926+static void
2927+skipblanks_mb (char **ptr, char *lim)
2928+{
2929+ size_t mblength;
2930+ while (*ptr < lim && ismbblank (*ptr, lim - *ptr, &mblength))
2931+ (*ptr) += mblength;
2932+}
2933+#endif
56ae3f82
SS
2934+
2935 /* Fill BUF reading from FP, moving buf->left bytes from the end
2936 of buf->buf to the beginning first. If EOF is reached and the
2937 file wasn't terminated by a newline, supply one. Set up BUF's line
fca5c2d6 2938@@ -1822,8 +2154,22 @@ fillbuf (struct buffer *buf, FILE *fp, c
56ae3f82
SS
2939 else
2940 {
2941 if (key->skipsblanks)
2942- while (blanks[to_uchar (*line_start)])
2943- line_start++;
2944+ {
2945+#if HAVE_MBRTOWC
2946+ if (MB_CUR_MAX > 1)
2947+ {
2948+ size_t mblength;
56ae3f82
SS
2949+ while (line_start < line->keylim &&
2950+ ismbblank (line_start,
2951+ line->keylim - line_start,
2952+ &mblength))
2953+ line_start += mblength;
2954+ }
2955+ else
2956+#endif
2957+ while (blanks[to_uchar (*line_start)])
2958+ line_start++;
2959+ }
2960 line->keybeg = line_start;
2961 }
2962 }
fca5c2d6 2963@@ -1944,7 +2290,7 @@ human_numcompare (char const *a, char co
56ae3f82
SS
2964 hideously fast. */
2965
2966 static int
1555d43c 2967-numcompare (char const *a, char const *b)
56ae3f82
SS
2968+numcompare_uni (const char *a, const char *b)
2969 {
2970 while (blanks[to_uchar (*a)])
2971 a++;
fca5c2d6 2972@@ -1954,6 +2300,25 @@ numcompare (char const *a, char const *b
1555d43c 2973 return strnumcmp (a, b, decimal_point, thousands_sep);
56ae3f82
SS
2974 }
2975
2976+#if HAVE_MBRTOWC
2977+static int
2978+numcompare_mb (const char *a, const char *b)
2979+{
2980+ size_t mblength, len;
2981+ /* Skip leading blanks of A, then of B (strlen is okay for UTF-8). */
2982+ for (len = strlen (a);
2983+ *a && ismbblank (a, len > MB_CUR_MAX ? MB_CUR_MAX : len, &mblength);
2984+ len -= mblength)
2985+ a += mblength;
2986+ /* LEN must track the bytes left in B too, so ismbblank never gets a stale bound. */
2987+ for (len = strlen (b);
2988+ *b && ismbblank (b, len > MB_CUR_MAX ? MB_CUR_MAX : len, &mblength);
2989+ len -= mblength)
2990+ b += mblength;
2991+ return strnumcmp (a, b, decimal_point, thousands_sep);
2992+}
2993+#endif /* HAVE_MBRTOWC */
2994+
fa4603be
SS
2995 /* Work around a problem whereby the long double value returned by glibc's
2996 strtold ("NaN", ...) contains uninitialized bits: clear all bytes of
2997 A and B before calling strtold. FIXME: remove this function once
fca5c2d6 2998@@ -2004,7 +2369,7 @@ general_numcompare (char const *sa, char
56ae3f82
SS
2999 Return 0 if the name in S is not recognized. */
3000
3001 static int
1555d43c
SS
3002-getmonth (char const *month, char **ea)
3003+getmonth_uni (char const *month, size_t len, char **ea)
56ae3f82
SS
3004 {
3005 size_t lo = 0;
3006 size_t hi = MONTHS_PER_YEAR;
fca5c2d6 3007@@ -2280,15 +2645,14 @@ debug_key (struct line const *line, stru
407c5be3
SS
3008 char saved = *lim;
3009 *lim = '\0';
1555d43c
SS
3010
3011- while (blanks[to_uchar (*beg)])
3012- beg++;
3013+ skipblanks (&beg, lim);
3014
3015 char *tighter_lim = beg;
3016
3badd2da
SS
3017 if (lim < beg)
3018 tighter_lim = lim;
3019 else if (key->month)
1555d43c
SS
3020- getmonth (beg, &tighter_lim);
3021+ getmonth (beg, lim-beg, &tighter_lim);
3022 else if (key->general_numeric)
3023 ignore_value (strtold (beg, &tighter_lim));
3024 else if (key->numeric || key->human_numeric)
fca5c2d6 3025@@ -2432,7 +2796,7 @@ key_warnings (struct keyfield const *gke
1555d43c
SS
3026 bool maybe_space_aligned = !hard_LC_COLLATE && default_key_compare (key)
3027 && !(key->schar || key->echar);
3028 bool line_offset = key->eword == 0 && key->echar != 0; /* -k1.x,1.y */
3029- if (!gkey_only && tab == TAB_DEFAULT && !line_offset
3030+ if (!gkey_only && !tab_length && !line_offset
3031 && ((!key->skipsblanks && !(implicit_skip || maybe_space_aligned))
3032 || (!key->skipsblanks && key->schar)
3033 || (!key->skipeblanks && key->echar)))
fca5c2d6 3034@@ -2490,11 +2854,87 @@ key_warnings (struct keyfield const *gke
6987acf5 3035 error (0, 0, _("option '-r' only applies to last-resort comparison"));
56ae3f82
SS
3036 }
3037
3038+#if HAVE_MBRTOWC
3039+static int
1555d43c 3040+getmonth_mb (const char *s, size_t len, char **ea)
56ae3f82
SS
3041+{
3042+ char *month;
3043+ register size_t i;
3044+ register int lo = 0, hi = MONTHS_PER_YEAR, result;
3045+ char *tmp;
3046+ size_t wclength, mblength;
fca5c2d6
SS
3047+ const char *pp;
3048+ const wchar_t *wpp;
56ae3f82
SS
3049+ wchar_t *month_wcs;
3050+ mbstate_t state;
3051+
3052+ while (len > 0 && ismbblank (s, len, &mblength))
3053+ {
3054+ s += mblength;
3055+ len -= mblength;
3056+ }
3057+
3058+ if (len == 0)
3059+ return 0;
3060+
fca5c2d6
SS
3061+ if (SIZE_MAX - len < 1)
3062+ xalloc_die ();
3063+
3064+ month = (char *) xnmalloc (len + 1, MB_CUR_MAX);
56ae3f82 3065+
fca5c2d6 3066+ pp = tmp = (char *) xnmalloc (len + 1, MB_CUR_MAX);
56ae3f82
SS
3067+ memcpy (tmp, s, len);
3068+ tmp[len] = '\0';
fca5c2d6
SS
3069+ wpp = month_wcs = (wchar_t *) xnmalloc (len + 1, sizeof (wchar_t));
3070+ memset (&state, '\0', sizeof (mbstate_t));
56ae3f82 3071+
fca5c2d6
SS
3072+ wclength = mbsrtowcs (month_wcs, &pp, len + 1, &state);
3073+ if (wclength == (size_t)-1 || pp != NULL)
1555d43c 3074+ error (SORT_FAILURE, 0, _("Invalid multibyte input %s."), quote(s));
56ae3f82
SS
3075+
3076+ for (i = 0; i < wclength; i++)
3077+ {
3078+ month_wcs[i] = towupper(month_wcs[i]);
3079+ if (iswblank (month_wcs[i]))
3080+ {
3081+ month_wcs[i] = L'\0';
3082+ break;
3083+ }
3084+ }
3085+
fca5c2d6
SS
3086+ mblength = wcsrtombs (month, &wpp, (len + 1) * MB_CUR_MAX, &state);
3087+ assert (mblength != (-1) && wpp == NULL);
56ae3f82
SS
3088+
3089+ do
3090+ {
3091+ int ix = (lo + hi) / 2;
3092+
3093+ if (strncmp (month, monthtab[ix].name, strlen (monthtab[ix].name)) < 0)
3094+ hi = ix;
3095+ else
3096+ lo = ix;
3097+ }
3098+ while (hi - lo > 1);
3099+
3100+ result = (!strncmp (month, monthtab[lo].name, strlen (monthtab[lo].name))
3101+ ? monthtab[lo].val : 0);
3102+
6987acf5 3103+ if (ea && result)
fbb9790b 3104+ *ea = (char*) s + strlen (monthtab[lo].name);
6987acf5 3105+
e5317bd9
SS
3106+ free (month);
3107+ free (tmp);
3108+ free (month_wcs);
3109+
56ae3f82
SS
3110+ return result;
3111+}
3112+#endif
3113+
3114 /* Compare two lines A and B trying every key in sequence until there
3115 are no more keys or a difference is found. */
3116
3117 static int
1555d43c 3118-keycompare (struct line const *a, struct line const *b)
56ae3f82
SS
3119+keycompare_uni (const struct line *a, const struct line *b)
3120 {
3121 struct keyfield *key = keylist;
3122
fca5c2d6 3123@@ -2579,7 +3019,7 @@ keycompare (struct line const *a, struct
1555d43c
SS
3124 else if (key->human_numeric)
3125 diff = human_numcompare (ta, tb);
3126 else if (key->month)
3127- diff = getmonth (ta, NULL) - getmonth (tb, NULL);
3128+ diff = getmonth (ta, tlena, NULL) - getmonth (tb, tlenb, NULL);
3129 else if (key->random)
3130 diff = compare_random (ta, tlena, tb, tlenb);
3131 else if (key->version)
fca5c2d6 3132@@ -2695,6 +3135,211 @@ keycompare (struct line const *a, struct
56ae3f82
SS
3133 return key->reverse ? -diff : diff;
3134 }
3135
3136+#if HAVE_MBRTOWC
3137+static int
3138+keycompare_mb (const struct line *a, const struct line *b)
3139+{
3140+ struct keyfield *key = keylist;
3141+
3142+ /* For the first iteration only, the key positions have been
3143+ precomputed for us. */
3144+ char *texta = a->keybeg;
3145+ char *textb = b->keybeg;
3146+ char *lima = a->keylim;
3147+ char *limb = b->keylim;
3148+
3149+ size_t mblength_a, mblength_b;
3150+ wchar_t wc_a, wc_b;
3151+ mbstate_t state_a, state_b;
3152+
fbb9790b 3153+ int diff = 0;
56ae3f82
SS
3154+
3155+ memset (&state_a, '\0', sizeof(mbstate_t));
3156+ memset (&state_b, '\0', sizeof(mbstate_t));
fbb9790b
SS
3157+ /* Ignore keys with start after end. */
3158+ if (a->keybeg - a->keylim > 0)
3159+ return 0;
56ae3f82 3160+
56ae3f82
SS
3161+
3162+ /* Ignore and/or translate chars before comparing. */
3163+# define IGNORE_CHARS(NEW_LEN, LEN, TEXT, COPY, WC, MBLENGTH, STATE) \
3164+ do \
3165+ { \
3166+ wchar_t uwc; \
3167+ char mbc[MB_LEN_MAX]; \
3168+ mbstate_t state_wc; \
3169+ \
3170+ for (NEW_LEN = i = 0; i < LEN;) \
3171+ { \
3172+ mbstate_t state_bak; \
3173+ \
3174+ state_bak = STATE; \
3175+ MBLENGTH = mbrtowc (&WC, TEXT + i, LEN - i, &STATE); \
3176+ \
3177+ if (MBLENGTH == (size_t)-2 || MBLENGTH == (size_t)-1 \
3178+ || MBLENGTH == 0) \
3179+ { \
3180+ if (MBLENGTH == (size_t)-2 || MBLENGTH == (size_t)-1) \
3181+ STATE = state_bak; \
3182+ if (!ignore) \
e7f6ab54
SS
3183+ COPY[NEW_LEN++] = TEXT[i]; \
3184+ i++; \
56ae3f82
SS
3185+ continue; \
3186+ } \
3187+ \
3188+ if (ignore) \
3189+ { \
3190+ if ((ignore == nonprinting && !iswprint (WC)) \
3191+ || (ignore == nondictionary \
3192+ && !iswalnum (WC) && !iswblank (WC))) \
3193+ { \
3194+ i += MBLENGTH; \
3195+ continue; \
3196+ } \
3197+ } \
3198+ \
3199+ if (translate) \
3200+ { \
3201+ \
3202+ uwc = towupper(WC); \
3203+ if (WC == uwc) \
3204+ { \
3205+ memcpy (mbc, TEXT + i, MBLENGTH); \
3206+ i += MBLENGTH; \
3207+ } \
3208+ else \
3209+ { \
3210+ i += MBLENGTH; \
3211+ WC = uwc; \
3212+ memset (&state_wc, '\0', sizeof (mbstate_t)); \
3213+ \
3214+ MBLENGTH = wcrtomb (mbc, WC, &state_wc); \
3215+ assert (MBLENGTH != (size_t)-1 && MBLENGTH != 0); \
3216+ } \
3217+ \
3218+ for (j = 0; j < MBLENGTH; j++) \
3219+ COPY[NEW_LEN++] = mbc[j]; \
3220+ } \
3221+ else \
3222+ for (j = 0; j < MBLENGTH; j++) \
3223+ COPY[NEW_LEN++] = TEXT[i++]; \
3224+ } \
3225+ COPY[NEW_LEN] = '\0'; \
3226+ } \
3227+ while (0)
fbb9790b
SS
3228+
3229+ /* Actually compare the fields. */
3230+
3231+ for (;;)
3232+ {
3233+ /* Find the lengths. */
3234+ size_t lena = lima <= texta ? 0 : lima - texta;
3235+ size_t lenb = limb <= textb ? 0 : limb - textb;
3236+
fca5c2d6
SS
3237+ char enda IF_LINT (= 0);
3238+ char endb IF_LINT (= 0);
3239+
fbb9790b
SS
3240+ char const *translate = key->translate;
3241+ bool const *ignore = key->ignore;
3242+
3243+ if (ignore || translate)
3244+ {
fca5c2d6
SS
3245+ if (SIZE_MAX - lenb - 2 < lena)
3246+ xalloc_die ();
3247+ char *copy_a = (char *) xnmalloc (lena + lenb + 2, MB_CUR_MAX);
3248+ char *copy_b = copy_a + lena * MB_CUR_MAX + 1;
fbb9790b
SS
3249+ size_t new_len_a, new_len_b;
3250+ size_t i, j;
3251+
3252+ IGNORE_CHARS (new_len_a, lena, texta, copy_a,
3253+ wc_a, mblength_a, state_a);
3254+ IGNORE_CHARS (new_len_b, lenb, textb, copy_b,
3255+ wc_b, mblength_b, state_b);
3256+ texta = copy_a; textb = copy_b;
3257+ lena = new_len_a; lenb = new_len_b;
56ae3f82 3258+ }
fca5c2d6
SS
3259+ else
3260+ {
3261+ /* Use the keys in-place, temporarily null-terminated. */
3262+ enda = texta[lena]; texta[lena] = '\0';
3263+ endb = textb[lenb]; textb[lenb] = '\0';
3264+ }
56ae3f82 3265+
fbb9790b
SS
3266+ if (key->random)
3267+ diff = compare_random (texta, lena, textb, lenb);
3268+ else if (key->numeric | key->general_numeric | key->human_numeric)
3269+ {
3270+ char savea = *lima, saveb = *limb;
3271+
3272+ *lima = *limb = '\0';
3273+ diff = (key->numeric ? numcompare (texta, textb)
3274+ : key->general_numeric ? general_numcompare (texta, textb)
3275+ : human_numcompare (texta, textb));
3276+ *lima = savea, *limb = saveb;
3277+ }
3278+ else if (key->version)
3279+ diff = filevercmp (texta, textb);
3280+ else if (key->month)
3281+ diff = getmonth (texta, lena, NULL) - getmonth (textb, lenb, NULL);
3282+ else if (lena == 0)
3283+ diff = - NONZERO (lenb);
3284+ else if (lenb == 0)
3285+ diff = 1;
3286+ else if (hard_LC_COLLATE && !folding)
3287+ {
fca5c2d6 3288+ diff = xmemcoll0 (texta, lena + 1, textb, lenb + 1);
fbb9790b
SS
3289+ }
3290+ else
fca5c2d6
SS
3291+ {
3292+ diff = memcmp (texta, textb, MIN (lena, lenb));
3293+ if (diff == 0)
3294+ diff = lena < lenb ? -1 : lena != lenb;
3295+ }
fbb9790b
SS
3296+
3297+ if (ignore || translate)
3298+ free (texta);
fca5c2d6
SS
3299+ else
3300+ {
3301+ texta[lena] = enda;
3302+ textb[lenb] = endb;
3303+ }
fbb9790b 3304+
56ae3f82
SS
3305+ if (diff)
3306+ goto not_equal;
3307+
3308+ key = key->next;
3309+ if (! key)
3310+ break;
3311+
3312+ /* Find the beginning and limit of the next field. */
3313+ if (key->eword != -1)
3314+ lima = limfield (a, key), limb = limfield (b, key);
3315+ else
3316+ lima = a->text + a->length - 1, limb = b->text + b->length - 1;
3317+
3318+ if (key->sword != -1)
3319+ texta = begfield (a, key), textb = begfield (b, key);
3320+ else
3321+ {
3322+ texta = a->text, textb = b->text;
3323+ if (key->skipsblanks)
3324+ {
3325+ while (texta < lima && ismbblank (texta, lima - texta, &mblength_a))
3326+ texta += mblength_a;
3327+ while (textb < limb && ismbblank (textb, limb - textb, &mblength_b))
3328+ textb += mblength_b;
3329+ }
3330+ }
3331+ }
3332+
56ae3f82 3333+not_equal:
fbb9790b
SS
3334+ if (key && key->reverse)
3335+ return -diff;
3336+ else
3337+ return diff;
56ae3f82
SS
3338+}
3339+#endif
3340+
3341 /* Compare two lines A and B, returning negative, zero, or positive
3342 depending on whether A compares less than, equal to, or greater than B. */
3343
fca5c2d6 3344@@ -2722,7 +3367,7 @@ compare (struct line const *a, struct li
fbb9790b
SS
3345 diff = - NONZERO (blen);
3346 else if (blen == 0)
3347 diff = 1;
3348- else if (hard_LC_COLLATE)
3349+ else if (hard_LC_COLLATE && !folding)
3350 {
3351 /* Note xmemcoll0 is a performance enhancement as
3352 it will not unconditionally write '\0' after the
fca5c2d6 3353@@ -4121,6 +4766,7 @@ set_ordering (char const *s, struct keyf
fbb9790b
SS
3354 break;
3355 case 'f':
3356 key->translate = fold_toupper;
3357+ folding = true;
3358 break;
3359 case 'g':
3360 key->general_numeric = true;
fca5c2d6 3361@@ -4199,7 +4845,7 @@ main (int argc, char **argv)
56ae3f82
SS
3362 initialize_exit_failure (SORT_FAILURE);
3363
3364 hard_LC_COLLATE = hard_locale (LC_COLLATE);
3365-#if HAVE_NL_LANGINFO
3366+#if HAVE_LANGINFO_CODESET
3367 hard_LC_TIME = hard_locale (LC_TIME);
3368 #endif
3369
fca5c2d6 3370@@ -4220,6 +4866,29 @@ main (int argc, char **argv)
56ae3f82
SS
3371 thousands_sep = -1;
3372 }
3373
3374+#if HAVE_MBRTOWC
3375+ if (MB_CUR_MAX > 1)
3376+ {
3377+ inittables = inittables_mb;
3378+ begfield = begfield_mb;
3379+ limfield = limfield_mb;
1555d43c 3380+ skipblanks = skipblanks_mb;
56ae3f82
SS
3381+ getmonth = getmonth_mb;
3382+ keycompare = keycompare_mb;
3383+ numcompare = numcompare_mb;
3384+ }
3385+ else
3386+#endif
3387+ {
3388+ inittables = inittables_uni;
3389+ begfield = begfield_uni;
3390+ limfield = limfield_uni;
1555d43c 3391+ skipblanks = skipblanks_uni;
56ae3f82
SS
3392+ getmonth = getmonth_uni;
3393+ keycompare = keycompare_uni;
3394+ numcompare = numcompare_uni;
3395+ }
3396+
3397 have_read_stdin = false;
3398 inittables ();
3399
fca5c2d6 3400@@ -4494,13 +5163,34 @@ main (int argc, char **argv)
56ae3f82
SS
3401
3402 case 't':
3403 {
3404- char newtab = optarg[0];
3405- if (! newtab)
3406+ char newtab[MB_LEN_MAX + 1];
3407+ size_t newtab_length = 1;
3408+ strncpy (newtab, optarg, MB_LEN_MAX);
3409+ if (! newtab[0])
3410 error (SORT_FAILURE, 0, _("empty tab"));
3411- if (optarg[1])
3412+#if HAVE_MBRTOWC
3413+ if (MB_CUR_MAX > 1)
3414+ {
3415+ wchar_t wc;
3416+ mbstate_t state;
56ae3f82
SS
3417+
3418+ memset (&state, '\0', sizeof (mbstate_t));
3419+ newtab_length = mbrtowc (&wc, newtab, strnlen (newtab,
3420+ MB_LEN_MAX),
3421+ &state);
3422+ switch (newtab_length)
3423+ {
3424+ case (size_t) -1:
3425+ case (size_t) -2:
3426+ case 0:
3427+ newtab_length = 1;
3428+ }
3429+ }
3430+#endif
3431+ if (newtab_length == 1 && optarg[1])
3432 {
3433 if (STREQ (optarg, "\\0"))
3434- newtab = '\0';
3435+ newtab[0] = '\0';
3436 else
3437 {
6987acf5 3438 /* Provoke with 'sort -txx'. Complain about
fca5c2d6 3439@@ -4511,9 +5201,12 @@ main (int argc, char **argv)
56ae3f82
SS
3440 quote (optarg));
3441 }
3442 }
3443- if (tab != TAB_DEFAULT && tab != newtab)
3444+ if (tab_length
3445+ && (tab_length != newtab_length
3446+ || memcmp (tab, newtab, tab_length) != 0))
3447 error (SORT_FAILURE, 0, _("incompatible tabs"));
3448- tab = newtab;
3449+ memcpy (tab, newtab, newtab_length);
3450+ tab_length = newtab_length;
3451 }
3452 break;
3453
fca5c2d6
SS
3454@@ -4751,12 +5444,10 @@ main (int argc, char **argv)
3455 sort (files, nfiles, outfile, nthreads);
3456 }
3457
3458-#ifdef lint
3459 if (files_from)
3460 readtokens0_free (&tok);
3461 else
3462 free (files);
3463-#endif
3464
3465 if (have_read_stdin && fclose (stdin) == EOF)
3466 die (_("close failed"), "-");
3467diff -Naurp coreutils-8.25-orig/src/unexpand.c coreutils-8.25/src/unexpand.c
3468--- coreutils-8.25-orig/src/unexpand.c 2016-01-01 07:48:50.000000000 -0600
3469+++ coreutils-8.25/src/unexpand.c 2016-02-08 19:07:10.311944651 -0600
6987acf5 3470@@ -38,12 +38,29 @@
56ae3f82
SS
3471 #include <stdio.h>
3472 #include <getopt.h>
3473 #include <sys/types.h>
3474+
3475+/* Get mbstate_t, mbrtowc(), wcwidth(). */
3476+#if HAVE_WCHAR_H
3477+# include <wchar.h>
3478+#endif
3479+
3480 #include "system.h"
3481 #include "error.h"
1555d43c 3482 #include "fadvise.h"
56ae3f82
SS
3483 #include "quote.h"
3484 #include "xstrndup.h"
3485
3486+/* MB_LEN_MAX is incorrectly defined to be 1 in at least one GCC
3487+ installation; work around this configuration error. */
3488+#if !defined MB_LEN_MAX || MB_LEN_MAX < 2
3489+# define MB_LEN_MAX 16
3490+#endif
3491+
3492+/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t. */
3493+#if HAVE_MBRTOWC && defined mbstate_t
3494+# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0)
3495+#endif
3496+
6987acf5 3497 /* The official name of this program (e.g., no 'g' prefix). */
56ae3f82
SS
3498 #define PROGRAM_NAME "unexpand"
3499
fca5c2d6 3500@@ -103,6 +120,210 @@ static struct option const longopts[] =
56ae3f82
SS
3501 {NULL, 0, NULL, 0}
3502 };
3503
3504+static FILE *next_file (FILE *fp);
3505+
3506+#if HAVE_MBRTOWC
3507+static void
3508+unexpand_multibyte (void)
3509+{
3510+ FILE *fp; /* Input stream. */
3511+ mbstate_t i_state; /* Current shift state of the input stream. */
3512+ mbstate_t i_state_bak; /* Back up the I_STATE. */
3513+ mbstate_t o_state; /* Current shift state of the output stream. */
3514+ char buf[MB_LEN_MAX + BUFSIZ]; /* For spooling a read byte sequence. */
3badd2da 3515+ char *bufpos = buf; /* Next read position of BUF. */
56ae3f82
SS
3516+ size_t buflen = 0; /* The length of the byte sequence in buf. */
3517+ wint_t wc; /* A gotten wide character. */
3518+ size_t mblength; /* The byte size of a multibyte character
3519+ which shows as same character as WC. */
e5317bd9 3520+ bool prev_tab = false;
56ae3f82
SS
3521+
3522+ /* Index in `tab_list' of next tabstop: */
3523+ int tab_index = 0; /* For calculating width of pending tabs. */
3524+ int print_tab_index = 0; /* For printing as many tabs as possible. */
3525+ unsigned int column = 0; /* Column on screen of next char. */
3526+ int next_tab_column; /* Column the next tab stop is on. */
3527+ int convert = 1; /* If nonzero, perform translations. */
3528+ unsigned int pending = 0; /* Pending columns of blanks. */
3529+
3530+ fp = next_file ((FILE *) NULL);
3531+ if (fp == NULL)
3532+ return;
3533+
3534+ memset (&o_state, '\0', sizeof(mbstate_t));
3535+ memset (&i_state, '\0', sizeof(mbstate_t));
3536+
3537+ for (;;)
3538+ {
3539+ if (buflen < MB_LEN_MAX && !feof(fp) && !ferror(fp))
3540+ {
3541+ memmove (buf, bufpos, buflen);
3542+ buflen += fread (buf + buflen, sizeof(char), BUFSIZ, fp);
3543+ bufpos = buf;
3544+ }
3545+
3546+ /* Get a wide character. */
3547+ if (buflen < 1)
3548+ {
3549+ mblength = 1;
3550+ wc = WEOF;
3551+ }
3552+ else
3553+ {
3554+ i_state_bak = i_state;
3555+ mblength = mbrtowc ((wchar_t *)&wc, bufpos, buflen, &i_state);
3556+ }
3557+
3558+ if (mblength == (size_t)-1 || mblength == (size_t)-2)
3559+ {
3560+ i_state = i_state_bak;
3561+ wc = L'\0';
3562+ }
3563+
3564+ if (wc == L' ' && convert && column < INT_MAX)
3565+ {
3566+ ++pending;
3567+ ++column;
3568+ }
3569+ else if (wc == L'\t' && convert)
3570+ {
3571+ if (tab_size == 0)
3572+ {
3573+ /* Do not let tab_index == first_free_tab;
3574+ stop when it is 1 less. */
3575+ while (tab_index < first_free_tab - 1
3576+ && column >= tab_list[tab_index])
3577+ tab_index++;
3578+ next_tab_column = tab_list[tab_index];
3579+ if (tab_index < first_free_tab - 1)
3580+ tab_index++;
3581+ if (column >= next_tab_column)
3582+ {
3583+ convert = 0; /* Ran out of tab stops. */
3584+ goto flush_pend_mb;
3585+ }
3586+ }
3587+ else
3588+ {
3589+ next_tab_column = column + tab_size - column % tab_size;
3590+ }
3591+ pending += next_tab_column - column;
3592+ column = next_tab_column;
3593+ }
3594+ else
3595+ {
3596+flush_pend_mb:
3597+ /* Flush pending spaces. Print as many tabs as possible,
3598+ then print the rest as spaces. */
e5317bd9 3599+ if (pending == 1 && column != 1 && !prev_tab)
56ae3f82
SS
3600+ {
3601+ putchar (' ');
3602+ pending = 0;
3603+ }
3604+ column -= pending;
3605+ while (pending > 0)
3606+ {
3607+ if (tab_size == 0)
3608+ {
3609+ /* Do not let print_tab_index == first_free_tab;
3610+ stop when it is 1 less. */
3611+ while (print_tab_index < first_free_tab - 1
3612+ && column >= tab_list[print_tab_index])
3613+ print_tab_index++;
3614+ next_tab_column = tab_list[print_tab_index];
3615+ if (print_tab_index < first_free_tab - 1)
3616+ print_tab_index++;
3617+ }
3618+ else
3619+ {
3620+ next_tab_column =
3621+ column + tab_size - column % tab_size;
3622+ }
3623+ if (next_tab_column - column <= pending)
3624+ {
3625+ putchar ('\t');
3626+ pending -= next_tab_column - column;
3627+ column = next_tab_column;
3628+ }
3629+ else
3630+ {
3631+ --print_tab_index;
3632+ column += pending;
3633+ while (pending != 0)
3634+ {
3635+ putchar (' ');
3636+ pending--;
3637+ }
3638+ }
3639+ }
3640+
3641+ if (wc == WEOF)
3642+ {
3643+ fp = next_file (fp);
3644+ if (fp == NULL)
3645+ break; /* No more files. */
3646+ else
3647+ {
3648+ memset (&i_state, '\0', sizeof(mbstate_t));
3649+ continue;
3650+ }
3651+ }
3652+
3653+ if (mblength == (size_t)-1 || mblength == (size_t)-2)
3654+ {
3655+ if (convert)
3656+ {
3657+ ++column;
3658+ if (convert_entire_line == 0)
3659+ convert = 0;
3660+ }
3661+ mblength = 1;
3662+ putchar (buf[0]);
3663+ }
3664+ else if (mblength == 0)
3665+ {
3666+ if (convert && convert_entire_line == 0)
3667+ convert = 0;
3668+ mblength = 1;
3669+ putchar ('\0');
3670+ }
3671+ else
3672+ {
3673+ if (convert)
3674+ {
3675+ if (wc == L'\b')
3676+ {
3677+ if (column > 0)
3678+ --column;
3679+ }
3680+ else
3681+ {
3682+ int width; /* The width of WC. */
3683+
3684+ width = wcwidth (wc);
3685+ column += (width > 0) ? width : 0;
3686+ if (convert_entire_line == 0)
3687+ convert = 0;
3688+ }
3689+ }
3690+
3691+ if (wc == L'\n')
3692+ {
3693+ tab_index = print_tab_index = 0;
3694+ column = pending = 0;
3695+ convert = 1;
3696+ }
3697+ fwrite (bufpos, sizeof(char), mblength, stdout);
3698+ }
3699+ }
e5317bd9 3700+ prev_tab = wc == L'\t';
56ae3f82
SS
3701+ buflen -= mblength;
3702+ bufpos += mblength;
3703+ }
3704+}
3705+#endif
3706+
3707+
3708 void
3709 usage (int status)
3710 {
fca5c2d6 3711@@ -523,7 +744,12 @@ main (int argc, char **argv)
56ae3f82
SS
3712
3713 file_list = (optind < argc ? &argv[optind] : stdin_argv);
3714
3715- unexpand ();
3716+#if HAVE_MBRTOWC
3717+ if (MB_CUR_MAX > 1)
3718+ unexpand_multibyte ();
3719+ else
3720+#endif
3721+ unexpand ();
3722
3723 if (have_read_stdin && fclose (stdin) != 0)
3724 error (EXIT_FAILURE, errno, "-");
fca5c2d6
SS
3725diff -Naurp coreutils-8.25-orig/src/uniq.c coreutils-8.25/src/uniq.c
3726--- coreutils-8.25-orig/src/uniq.c 2016-01-13 05:08:59.000000000 -0600
3727+++ coreutils-8.25/src/uniq.c 2016-02-08 19:07:10.312944654 -0600
fbb9790b 3728@@ -21,6 +21,17 @@
56ae3f82
SS
3729 #include <getopt.h>
3730 #include <sys/types.h>
3731
3732+/* Get mbstate_t, mbrtowc(). */
3733+#if HAVE_WCHAR_H
3734+# include <wchar.h>
3735+#endif
3736+
3737+/* Get isw* functions. */
3738+#if HAVE_WCTYPE_H
3739+# include <wctype.h>
3740+#endif
fbb9790b 3741+#include <assert.h>
56ae3f82
SS
3742+
3743 #include "system.h"
3744 #include "argmatch.h"
3745 #include "linebuffer.h"
fca5c2d6 3746@@ -33,6 +44,18 @@
56ae3f82 3747 #include "xstrtol.h"
fca5c2d6
SS
3748 #include "memcasecmp.h"
3749 #include "quote.h"
56ae3f82
SS
3750+#include "xmemcoll.h"
3751+
3752+/* MB_LEN_MAX is incorrectly defined to be 1 in at least one GCC
3753+ installation; work around this configuration error. */
3754+#if !defined MB_LEN_MAX || MB_LEN_MAX < 2
3755+# define MB_LEN_MAX 16
3756+#endif
3757+
3758+/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t. */
3759+#if HAVE_MBRTOWC && defined mbstate_t
3760+# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0)
3761+#endif
56ae3f82 3762
6987acf5 3763 /* The official name of this program (e.g., no 'g' prefix). */
56ae3f82 3764 #define PROGRAM_NAME "uniq"
fca5c2d6 3765@@ -143,6 +166,10 @@ enum
fbb9790b
SS
3766 GROUP_OPTION = CHAR_MAX + 1
3767 };
56ae3f82
SS
3768
3769+/* Function pointers. */
3770+static char *
3771+(*find_field) (struct linebuffer *line);
3772+
3773 static struct option const longopts[] =
3774 {
3775 {"count", no_argument, NULL, 'c'},
fca5c2d6 3776@@ -252,7 +279,7 @@ size_opt (char const *opt, char const *m
56ae3f82
SS
3777 return a pointer to the beginning of the line's field to be compared. */
3778
e7f6ab54 3779 static char * _GL_ATTRIBUTE_PURE
56ae3f82
SS
3780-find_field (struct linebuffer const *line)
3781+find_field_uni (struct linebuffer *line)
3782 {
3783 size_t count;
3784 char const *lp = line->buffer;
fca5c2d6 3785@@ -272,6 +299,83 @@ find_field (struct linebuffer const *lin
56ae3f82
SS
3786 return line->buffer + i;
3787 }
3788
3789+#if HAVE_MBRTOWC
3790+
3791+# define MBCHAR_TO_WCHAR(WC, MBLENGTH, LP, POS, SIZE, STATEP, CONVFAIL) \
3792+ do \
3793+ { \
3794+ mbstate_t state_bak; \
3795+ \
3796+ CONVFAIL = 0; \
3797+ state_bak = *STATEP; \
3798+ \
3799+ MBLENGTH = mbrtowc (&WC, LP + POS, SIZE - POS, STATEP); \
3800+ \
3801+ switch (MBLENGTH) \
3802+ { \
3803+ case (size_t)-2: \
3804+ case (size_t)-1: \
3805+ *STATEP = state_bak; \
3806+ CONVFAIL++; \
3807+ /* Fall through */ \
3808+ case 0: \
3809+ MBLENGTH = 1; \
3810+ } \
3811+ } \
3812+ while (0)
3813+
3814+static char *
3815+find_field_multi (struct linebuffer *line)
3816+{
3817+ size_t count;
3818+ char *lp = line->buffer;
3819+ size_t size = line->length - 1;
3820+ size_t pos;
3821+ size_t mblength;
3822+ wchar_t wc;
3823+ mbstate_t *statep;
3badd2da 3824+ int convfail = 0;
56ae3f82
SS
3825+
3826+ pos = 0;
3827+ statep = &(line->state);
3828+
3829+ /* skip fields. */
3830+ for (count = 0; count < skip_fields && pos < size; count++)
3831+ {
3832+ while (pos < size)
3833+ {
3834+ MBCHAR_TO_WCHAR (wc, mblength, lp, pos, size, statep, convfail);
fca5c2d6
SS
3835+
3836+ if (convfail || !(iswblank (wc) || wc == '\n'))
56ae3f82
SS
3837+ {
3838+ pos += mblength;
3839+ break;
3840+ }
3841+ pos += mblength;
3842+ }
3843+
3844+ while (pos < size)
3845+ {
3846+ MBCHAR_TO_WCHAR (wc, mblength, lp, pos, size, statep, convfail);
3847+
fca5c2d6 3848+ if (!convfail && (iswblank (wc) || wc == '\n'))
56ae3f82
SS
3849+ break;
3850+
3851+ pos += mblength;
3852+ }
3853+ }
3854+
3855+ /* skip chars. */
3856+ for (count = 0; count < skip_chars && pos < size; count++)
3857+ {
3858+ MBCHAR_TO_WCHAR (wc, mblength, lp, pos, size, statep, convfail);
3859+ pos += mblength;
3860+ }
3861+
3862+ return lp + pos;
3863+}
3864+#endif
3865+
3866 /* Return false if two strings OLD and NEW match, true if not.
3867 OLD and NEW point not to the beginnings of the lines
3868 but rather to the beginnings of the fields to compare.
fca5c2d6 3869@@ -280,6 +384,8 @@ find_field (struct linebuffer const *lin
56ae3f82
SS
3870 static bool
3871 different (char *old, char *new, size_t oldlen, size_t newlen)
3872 {
3873+ char *copy_old, *copy_new;
3874+
3875 if (check_chars < oldlen)
3876 oldlen = check_chars;
3877 if (check_chars < newlen)
fca5c2d6 3878@@ -287,15 +393,104 @@ different (char *old, char *new, size_t
56ae3f82
SS
3879
3880 if (ignore_case)
3881 {
3882- /* FIXME: This should invoke strcoll somehow. */
3883- return oldlen != newlen || memcasecmp (old, new, oldlen);
3884+ size_t i;
3885+
e5317bd9
SS
3886+ copy_old = xmalloc (oldlen + 1);
3887+ copy_new = xmalloc (oldlen + 1);
56ae3f82
SS
3888+
3889+ for (i = 0; i < oldlen; i++)
3890+ {
3891+ copy_old[i] = toupper (old[i]);
3892+ copy_new[i] = toupper (new[i]);
3893+ }
e5317bd9
SS
3894+ bool rc = xmemcoll (copy_old, oldlen, copy_new, newlen);
3895+ free (copy_old);
3896+ free (copy_new);
3897+ return rc;
56ae3f82
SS
3898 }
3899- else if (hard_LC_COLLATE)
3900- return xmemcoll (old, oldlen, new, newlen) != 0;
3901 else
3902- return oldlen != newlen || memcmp (old, new, oldlen);
3903+ {
3904+ copy_old = (char *)old;
3905+ copy_new = (char *)new;
3906+ }
3907+
3908+ return xmemcoll (copy_old, oldlen, copy_new, newlen);
e5317bd9 3909+
fbb9790b
SS
3910 }
3911
56ae3f82
SS
3912+#if HAVE_MBRTOWC
3913+static int
3914+different_multi (const char *old, const char *new, size_t oldlen, size_t newlen, mbstate_t oldstate, mbstate_t newstate)
3915+{
3916+ size_t i, j, chars;
3917+ const char *str[2];
3918+ char *copy[2];
3919+ size_t len[2];
3920+ mbstate_t state[2];
3921+ size_t mblength;
3922+ wchar_t wc, uwc;
3923+ mbstate_t state_bak;
3924+
3925+ str[0] = old;
3926+ str[1] = new;
3927+ len[0] = oldlen;
3928+ len[1] = newlen;
3929+ state[0] = oldstate;
3930+ state[1] = newstate;
3931+
3932+ for (i = 0; i < 2; i++)
3933+ {
e5317bd9 3934+ copy[i] = xmalloc (len[i] + 1);
fbb9790b 3935+ memset (copy[i], '\0', len[i] + 1);
56ae3f82
SS
3936+
3937+ for (j = 0, chars = 0; j < len[i] && chars < check_chars; chars++)
3938+ {
3939+ state_bak = state[i];
3940+ mblength = mbrtowc (&wc, str[i] + j, len[i] - j, &(state[i]));
3941+
3942+ switch (mblength)
3943+ {
3944+ case (size_t)-1:
3945+ case (size_t)-2:
3946+ state[i] = state_bak;
3947+ /* Fall through */
3948+ case 0:
3949+ mblength = 1;
3950+ break;
3951+
3952+ default:
3953+ if (ignore_case)
3954+ {
3955+ uwc = towupper (wc);
3956+
3957+ if (uwc != wc)
3958+ {
3959+ mbstate_t state_wc;
fbb9790b 3960+ size_t mblen;
56ae3f82
SS
3961+
3962+ memset (&state_wc, '\0', sizeof(mbstate_t));
fbb9790b
SS
3963+ mblen = wcrtomb (copy[i] + j, uwc, &state_wc);
3964+ assert (mblen != (size_t)-1);
56ae3f82
SS
3965+ }
3966+ else
3967+ memcpy (copy[i] + j, str[i] + j, mblength);
3968+ }
3969+ else
3970+ memcpy (copy[i] + j, str[i] + j, mblength);
3971+ }
3972+ j += mblength;
3973+ }
3974+ copy[i][j] = '\0';
3975+ len[i] = j;
3976+ }
e5317bd9
SS
3977+ int rc = xmemcoll (copy[0], len[0], copy[1], len[1]);
3978+ free (copy[0]);
3979+ free (copy[1]);
3980+ return rc;
56ae3f82 3981+
fbb9790b 3982+}
56ae3f82 3983+#endif
fbb9790b 3984+
56ae3f82
SS
3985 /* Output the line in linebuffer LINE to standard output
3986 provided that the switches say it should be output.
fbb9790b 3987 MATCH is true if the line matches the previous line.
fca5c2d6 3988@@ -359,19 +554,38 @@ check_file (const char *infile, const ch
1555d43c
SS
3989 char *prevfield IF_LINT ( = NULL);
3990 size_t prevlen IF_LINT ( = 0);
fbb9790b 3991 bool first_group_printed = false;
56ae3f82
SS
3992+#if HAVE_MBRTOWC
3993+ mbstate_t prevstate;
3994+
3995+ memset (&prevstate, '\0', sizeof (mbstate_t));
3996+#endif
3997
3998 while (!feof (stdin))
3999 {
4000 char *thisfield;
4001 size_t thislen;
fbb9790b 4002 bool new_group;
56ae3f82
SS
4003+#if HAVE_MBRTOWC
4004+ mbstate_t thisstate;
4005+#endif
fbb9790b 4006
56ae3f82
SS
4007 if (readlinebuffer_delim (thisline, stdin, delimiter) == 0)
4008 break;
fbb9790b 4009
56ae3f82
SS
4010 thisfield = find_field (thisline);
4011 thislen = thisline->length - 1 - (thisfield - thisline->buffer);
4012+#if HAVE_MBRTOWC
4013+ if (MB_CUR_MAX > 1)
4014+ {
fbb9790b
SS
4015+ thisstate = thisline->state;
4016
4017+ new_group = (prevline->length == 0
4018+ || different_multi (thisfield, prevfield,
4019+ thislen, prevlen,
4020+ thisstate, prevstate));
4021+ }
4022+ else
4023+#endif
4024 new_group = (prevline->length == 0
4025 || different (thisfield, prevfield, thislen, prevlen));
4026
fca5c2d6 4027@@ -389,6 +603,10 @@ check_file (const char *infile, const ch
fbb9790b
SS
4028 SWAP_LINES (prevline, thisline);
4029 prevfield = thisfield;
4030 prevlen = thislen;
4031+#if HAVE_MBRTOWC
4032+ if (MB_CUR_MAX > 1)
56ae3f82 4033+ prevstate = thisstate;
56ae3f82 4034+#endif
fbb9790b
SS
4035 first_group_printed = true;
4036 }
4037 }
fca5c2d6 4038@@ -401,17 +619,26 @@ check_file (const char *infile, const ch
56ae3f82
SS
4039 size_t prevlen;
4040 uintmax_t match_count = 0;
4041 bool first_delimiter = true;
4042+#if HAVE_MBRTOWC
4043+ mbstate_t prevstate;
4044+#endif
4045
4046 if (readlinebuffer_delim (prevline, stdin, delimiter) == 0)
4047 goto closefiles;
4048 prevfield = find_field (prevline);
4049 prevlen = prevline->length - 1 - (prevfield - prevline->buffer);
4050+#if HAVE_MBRTOWC
4051+ prevstate = prevline->state;
4052+#endif
4053
4054 while (!feof (stdin))
4055 {
4056 bool match;
4057 char *thisfield;
4058 size_t thislen;
4059+#if HAVE_MBRTOWC
3badd2da 4060+ mbstate_t thisstate = thisline->state;
56ae3f82
SS
4061+#endif
4062 if (readlinebuffer_delim (thisline, stdin, delimiter) == 0)
4063 {
4064 if (ferror (stdin))
fca5c2d6 4065@@ -420,6 +647,14 @@ check_file (const char *infile, const ch
56ae3f82
SS
4066 }
4067 thisfield = find_field (thisline);
4068 thislen = thisline->length - 1 - (thisfield - thisline->buffer);
4069+#if HAVE_MBRTOWC
4070+ if (MB_CUR_MAX > 1)
4071+ {
56ae3f82
SS
4072+ match = !different_multi (thisfield, prevfield,
4073+ thislen, prevlen, thisstate, prevstate);
4074+ }
4075+ else
4076+#endif
4077 match = !different (thisfield, prevfield, thislen, prevlen);
4078 match_count += match;
4079
fca5c2d6 4080@@ -452,6 +687,9 @@ check_file (const char *infile, const ch
56ae3f82
SS
4081 SWAP_LINES (prevline, thisline);
4082 prevfield = thisfield;
4083 prevlen = thislen;
4084+#if HAVE_MBRTOWC
4085+ prevstate = thisstate;
4086+#endif
4087 if (!match)
4088 match_count = 0;
4089 }
fca5c2d6 4090@@ -498,6 +736,19 @@ main (int argc, char **argv)
56ae3f82
SS
4091
4092 atexit (close_stdout);
4093
4094+#if HAVE_MBRTOWC
4095+ if (MB_CUR_MAX > 1)
4096+ {
4097+ find_field = find_field_multi;
4098+ }
4099+ else
4100+#endif
4101+ {
4102+ find_field = find_field_uni;
4103+ }
4104+
4105+
4106+
4107 skip_chars = 0;
4108 skip_fields = 0;
4109 check_chars = SIZE_MAX;
fca5c2d6
SS
4110diff -Naurp coreutils-8.25-orig/tests/i18n/sort-month.sh coreutils-8.25/tests/i18n/sort-month.sh
4111--- coreutils-8.25-orig/tests/i18n/sort-month.sh 1969-12-31 18:00:00.000000000 -0600
4112+++ coreutils-8.25/tests/i18n/sort-month.sh 2016-02-08 19:07:10.312944654 -0600
4113@@ -0,0 +1,34 @@
4114+#!/bin/sh
4115+# Verify sort -M multi-byte support.
4116+
4117+. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
4118+print_ver_ sort
4119+require_valgrind_
4120+
4121+# Skip this test if some deallocations are
4122+# avoided at process end.
4123+grep '^#define lint 1' $CONFIG_HEADER > /dev/null ||
4124+ skip_ 'Allocation checks only work reliably in "lint" mode'
4125+
4126+export LC_ALL=en_US.UTF-8
4127+locale -k LC_CTYPE | grep -q "charmap.*UTF-8" \
4128+ || skip_ "No UTF-8 locale available"
4129+
4130+# Note the use of ɑ here which expands to
4131+# a wider representation upon case conversion
4132+# which triggered an assertion in sort -M
4133+cat <<EOF > exp
4134+.
4135
4136+EOF
4137+
4138+
4139+# check large mem leak with --month-sort
4140+# https://bugzilla.redhat.com/show_bug.cgi?id=1259942
4141+valgrind --leak-check=full \
4142+ --error-exitcode=1 --errors-for-leak-kinds=definite \
4143+ sort -M < exp > out || fail=1
4144+compare exp out || { fail=1; cat out; }
4145+
4146+
4147+Exit $fail
4148diff -Naurp coreutils-8.25-orig/tests/i18n/sort.sh coreutils-8.25/tests/i18n/sort.sh
4149--- coreutils-8.25-orig/tests/i18n/sort.sh 1969-12-31 18:00:00.000000000 -0600
4150+++ coreutils-8.25/tests/i18n/sort.sh 2016-02-08 19:07:10.312944654 -0600
4151@@ -0,0 +1,29 @@
4152+#!/bin/sh
4153+# Verify sort's multi-byte support.
4154+
4155+. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
4156+print_ver_ sort
4157+
4158+export LC_ALL=en_US.UTF-8
4159+locale -k LC_CTYPE | grep -q "charmap.*UTF-8" \
4160+ || skip_ "No UTF-8 locale available"
4161+
4162+# Enable heap consistency checking on older systems
4163+export MALLOC_CHECK_=2
4164+
4165+
4166+# check buffer overflow issue due to
4167+# expanding multi-byte representation due to case conversion
4168+# https://bugzilla.suse.com/show_bug.cgi?id=928749
4169+cat <<EOF > exp
4170+.
4171
4172+EOF
4173+cat <<EOF | sort -f > out || fail=1
4174+.
4175
4176+EOF
4177+compare exp out || { fail=1; cat out; }
4178+
4179+
4180+Exit $fail
4181diff -Naurp coreutils-8.25-orig/tests/local.mk coreutils-8.25/tests/local.mk
4182--- coreutils-8.25-orig/tests/local.mk 2016-01-16 12:18:13.000000000 -0600
4183+++ coreutils-8.25/tests/local.mk 2016-02-08 19:07:10.313944658 -0600
4184@@ -344,6 +344,9 @@ all_tests = \
e5317bd9
SS
4185 tests/misc/sort-discrim.sh \
4186 tests/misc/sort-files0-from.pl \
4187 tests/misc/sort-float.sh \
4188+ tests/misc/sort-mb-tests.sh \
fca5c2d6
SS
4189+ tests/i18n/sort.sh \
4190+ tests/i18n/sort-month.sh \
e5317bd9
SS
4191 tests/misc/sort-merge.pl \
4192 tests/misc/sort-merge-fdlimit.sh \
4193 tests/misc/sort-month.sh \
fca5c2d6
SS
4194diff -Naurp coreutils-8.25-orig/tests/misc/cut.pl coreutils-8.25/tests/misc/cut.pl
4195--- coreutils-8.25-orig/tests/misc/cut.pl 2016-01-16 12:18:13.000000000 -0600
4196+++ coreutils-8.25/tests/misc/cut.pl 2016-02-08 19:07:10.314944661 -0600
4197@@ -23,9 +23,11 @@ use strict;
e7f6ab54
SS
4198 # Turn off localization of executable's output.
4199 @ENV{qw(LANGUAGE LANG LC_ALL)} = ('C') x 3;
4200
4201-my $mb_locale = $ENV{LOCALE_FR_UTF8};
fbb9790b
SS
4202+my $mb_locale;
4203+# Leave the next line uncommented to enable the multibyte paths
4204+$mb_locale = $ENV{LOCALE_FR_UTF8};
4205 ! defined $mb_locale || $mb_locale eq 'none'
e7f6ab54 4206- and $mb_locale = 'C';
fbb9790b 4207+ and $mb_locale = 'C';
e7f6ab54 4208
56ae3f82 4209 my $prog = 'cut';
6987acf5 4210 my $try = "Try '$prog --help' for more information.\n";
fca5c2d6 4211@@ -240,6 +242,7 @@ if ($mb_locale ne 'C')
fbb9790b
SS
4212 my @new_t = @$t;
4213 my $test_name = shift @new_t;
4214
4215+ next if ($test_name =~ "newline-[12][0-9]");
4216 push @new, ["$test_name-mb", @new_t, {ENV => "LC_ALL=$mb_locale"}];
4217 }
4218 push @Tests, @new;
fca5c2d6
SS
4219diff -Naurp coreutils-8.25-orig/tests/misc/expand.pl coreutils-8.25/tests/misc/expand.pl
4220--- coreutils-8.25-orig/tests/misc/expand.pl 2016-01-16 12:18:13.000000000 -0600
4221+++ coreutils-8.25/tests/misc/expand.pl 2016-02-08 19:07:10.314944661 -0600
4222@@ -23,6 +23,15 @@ use strict;
effd5ec1
SS
4223 # Turn off localization of executable's output.
4224 @ENV{qw(LANGUAGE LANG LC_ALL)} = ('C') x 3;
4225
fbb9790b 4226+#comment out next line to disable multibyte tests
effd5ec1
SS
4227+my $mb_locale = $ENV{LOCALE_FR_UTF8};
4228+! defined $mb_locale || $mb_locale eq 'none'
4229+ and $mb_locale = 'C';
4230+
4231+my $prog = 'expand';
4232+my $try = "Try \`$prog --help' for more information.\n";
4233+my $inval = "$prog: invalid byte, character or field list\n$try";
4234+
4235 my @Tests =
4236 (
4237 ['t1', '--tabs=3', {IN=>"a\tb"}, {OUT=>"a b"}],
fca5c2d6 4238@@ -31,6 +40,37 @@ my @Tests =
effd5ec1
SS
4239 ['i2', '--tabs=3 -i', {IN=>" \ta\tb"}, {OUT=>" a\tb"}],
4240 );
4241
4242+if ($mb_locale ne 'C')
4243+ {
4244+ # Duplicate each test vector, appending "-mb" to the test name and
4245+ # inserting {ENV => "LC_ALL=$mb_locale"} in the copy, so that we
4246+ # provide coverage for the distro-added multi-byte code paths.
4247+ my @new;
4248+ foreach my $t (@Tests)
4249+ {
4250+ my @new_t = @$t;
4251+ my $test_name = shift @new_t;
4252+
4253+ # Depending on whether expand is multi-byte-patched,
4254+ # it emits different diagnostics:
4255+ # non-MB: invalid byte or field list
4256+ # MB: invalid byte, character or field list
4257+ # Adjust the expected error output accordingly.
4258+ if (grep {ref $_ eq 'HASH' && exists $_->{ERR} && $_->{ERR} eq $inval}
4259+ (@new_t))
4260+ {
4261+ my $sub = {ERR_SUBST => 's/, character//'};
4262+ push @new_t, $sub;
4263+ push @$t, $sub;
4264+ }
4265+ push @new, ["$test_name-mb", @new_t, {ENV => "LC_ALL=$mb_locale"}];
4266+ }
4267+ push @Tests, @new;
4268+ }
4269+
4270+
4271+@Tests = triple_test \@Tests;
4272+
4273 my $save_temps = $ENV{DEBUG};
4274 my $verbose = $ENV{VERBOSE};
4275
fca5c2d6
SS
4276diff -Naurp coreutils-8.25-orig/tests/misc/fold.pl coreutils-8.25/tests/misc/fold.pl
4277--- coreutils-8.25-orig/tests/misc/fold.pl 2016-01-16 12:18:13.000000000 -0600
4278+++ coreutils-8.25/tests/misc/fold.pl 2016-02-08 19:07:10.314944661 -0600
4279@@ -20,9 +20,18 @@ use strict;
fbb9790b
SS
4280
4281 (my $program_name = $0) =~ s|.*/||;
4282
4283+my $prog = 'fold';
4284+my $try = "Try \`$prog --help' for more information.\n";
4285+my $inval = "$prog: invalid byte, character or field list\n$try";
4286+
4287 # Turn off localization of executable's output.
4288 @ENV{qw(LANGUAGE LANG LC_ALL)} = ('C') x 3;
4289
4290+# Leave the next line uncommented to enable the multibyte paths
4291+my $mb_locale = $ENV{LOCALE_FR_UTF8};
4292+! defined $mb_locale || $mb_locale eq 'none'
4293+ and $mb_locale = 'C';
4294+
4295 my @Tests =
4296 (
4297 ['s1', '-w2 -s', {IN=>"a\t"}, {OUT=>"a\n\t"}],
fca5c2d6 4298@@ -31,9 +40,48 @@ my @Tests =
fbb9790b
SS
4299 ['s4', '-w4 -s', {IN=>"abc ef\n"}, {OUT=>"abc \nef\n"}],
4300 );
4301
4302+# When a multibyte locale is available, duplicate every test so that
4303+# it is also run with LC_ALL set to that locale.
4304+if ($mb_locale ne 'C')
4305+ {
4306+ # Duplicate each test vector, appending "-mb" to the test name and
4307+ # inserting {ENV => "LC_ALL=$mb_locale"} in the copy, so that we
4308+ # provide coverage for the distro-added multi-byte code paths.
4309+ my @new;
4310+ foreach my $t (@Tests)
4311+ {
4312+ my @new_t = @$t;
4313+ my $test_name = shift @new_t;
4314+
4315+ # Depending on whether fold is multi-byte-patched,
4316+ # it emits different diagnostics:
4317+ # non-MB: invalid byte or field list
4318+ # MB: invalid byte, character or field list
4319+ # Adjust the expected error output accordingly.
4320+ if (grep {ref $_ eq 'HASH' && exists $_->{ERR} && $_->{ERR} eq $inval}
4321+ (@new_t))
4322+ {
4323+ my $sub = {ERR_SUBST => 's/, character//'};
4324+ push @new_t, $sub;
4325+ push @$t, $sub;
4326+ }
4327+ push @new, ["$test_name-mb", @new_t, {ENV => "LC_ALL=$mb_locale"}];
4328+ }
4329+ push @Tests, @new;
4330+ }
4331+
4332+@Tests = triple_test \@Tests;
4333+
4334+# Remember that triple_test creates from each test with exactly one "IN"
4335+# file two more tests (.p and .r suffix on name) corresponding to reading
4336+# input from a file and from a pipe. The pipe-reading test would fail
4337+# due to a race condition about 1 in 20 times.
4338+# Remove the IN_PIPE version of the "output-is-input" test above.
4339+# The others aren't susceptible because they have three inputs each.
4340+@Tests = grep {$_->[0] ne 'output-is-input.p'} @Tests;
4341+
4342 my $save_temps = $ENV{DEBUG};
4343 my $verbose = $ENV{VERBOSE};
4344
4345-my $prog = 'fold';
4346 my $fail = run_tests ($program_name, $prog, \@Tests, $save_temps, $verbose);
4347 exit $fail;
fca5c2d6
SS
4348diff -Naurp coreutils-8.25-orig/tests/misc/join.pl coreutils-8.25/tests/misc/join.pl
4349--- coreutils-8.25-orig/tests/misc/join.pl 2016-01-16 12:18:13.000000000 -0600
4350+++ coreutils-8.25/tests/misc/join.pl 2016-02-08 19:07:10.315944664 -0600
4351@@ -25,6 +25,15 @@ my $limits = getlimits ();
fbb9790b
SS
4352
4353 my $prog = 'join';
4354
4355+my $try = "Try \`$prog --help' for more information.\n";
4356+my $inval = "$prog: invalid byte, character or field list\n$try";
4357+
4358+my $mb_locale;
4359+#Comment out next line to disable multibyte tests
4360+$mb_locale = $ENV{LOCALE_FR_UTF8};
4361+! defined $mb_locale || $mb_locale eq 'none'
4362+ and $mb_locale = 'C';
4363+
4364 my $delim = chr 0247;
4365 sub t_subst ($)
4366 {
fca5c2d6 4367@@ -329,8 +338,49 @@ foreach my $t (@tv)
fbb9790b
SS
4368 push @Tests, $new_ent;
4369 }
4370
4371+# When a multi-byte locale is available ($mb_locale ne 'C'), also run
4372+# every test under that locale.
4373+if ($mb_locale ne 'C')
4374+ {
4375+ # Duplicate each test vector, appending "-mb" to the test name and
4376+ # inserting {ENV => "LC_ALL=$mb_locale"} in the copy, so that we
4377+ # provide coverage for the distro-added multi-byte code paths.
4378+ my @new;
4379+ foreach my $t (@Tests)
4380+ {
4381+ my @new_t = @$t;
4382+ my $test_name = shift @new_t;
4383+
4384+ # Depending on whether join is multi-byte-patched,
4385+ # it emits different diagnostics:
4386+ # non-MB: invalid byte or field list
4387+ # MB: invalid byte, character or field list
4388+ # Adjust the expected error output accordingly.
4389+ if (grep {ref $_ eq 'HASH' && exists $_->{ERR} && $_->{ERR} eq $inval}
4390+ (@new_t))
4391+ {
4392+ my $sub = {ERR_SUBST => 's/, character//'};
4393+ push @new_t, $sub;
4394+ push @$t, $sub;
4395+ }
4396+ # Map "$test_name-mb" back to "$test_name" in error messages that include the test name
4397+ if (grep {ref $_ eq 'HASH' && exists $_->{ERR}}
4398+ (@new_t))
4399+ {
4400+ my $sub2 = {ERR_SUBST => "s/$test_name-mb/$test_name/"};
4401+ push @new_t, $sub2;
4402+ push @$t, $sub2;
4403+ }
4404+ push @new, ["$test_name-mb", @new_t, {ENV => "LC_ALL=$mb_locale"}];
4405+ }
4406+ push @Tests, @new;
4407+ }
4408+
4409 @Tests = triple_test \@Tests;
4410
4411+#skip invalid-j-mb test, it is failing because of the format
4412+@Tests = grep {$_->[0] ne 'invalid-j-mb'} @Tests;
4413+
4414 my $save_temps = $ENV{DEBUG};
4415 my $verbose = $ENV{VERBOSE};
4416
fca5c2d6
SS
4417diff -Naurp coreutils-8.25-orig/tests/misc/sort-mb-tests.sh coreutils-8.25/tests/misc/sort-mb-tests.sh
4418--- coreutils-8.25-orig/tests/misc/sort-mb-tests.sh 1969-12-31 18:00:00.000000000 -0600
4419+++ coreutils-8.25/tests/misc/sort-mb-tests.sh 2016-02-08 19:07:10.315944664 -0600
e5317bd9
SS
4420@@ -0,0 +1,45 @@
4421+#!/bin/sh
4422+# Verify sort's multi-byte support.
4423+
4424+. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
4425+print_ver_ sort
56ae3f82
SS
4426+
4427+export LC_ALL=en_US.UTF-8
e5317bd9
SS
4428+locale -k LC_CTYPE | grep -q "charmap.*UTF-8" \
4429+ || skip_ "No UTF-8 locale available"
4430+
4431+
4432+cat <<EOF > exp
4433+Banana@5
4434+Apple@10
4435+Citrus@20
4436+Cherry@30
4437+EOF
4438+
4439+cat <<EOF | sort -t @ -k2 -n > out || fail=1
4440+Apple@10
4441+Banana@5
4442+Citrus@20
4443+Cherry@30
4444+EOF
4445+
4446+compare exp out || { fail=1; cat out; }
4447+
4448+
4449+cat <<EOF > exp
4450+Citrus@AA20@@5
4451+Cherry@AA30@@10
4452+Apple@AA10@@20
4453+Banana@AA5@@30
4454+EOF
4455+
4456+cat <<EOF | sort -t @ -k4 -n > out || fail=1
4457+Apple@AA10@@20
4458+Banana@AA5@@30
4459+Citrus@AA20@@5
4460+Cherry@AA30@@10
4461+EOF
4462+
4463+compare exp out || { fail=1; cat out; }
4464+
4465+Exit $fail
fca5c2d6
SS
4466diff -Naurp coreutils-8.25-orig/tests/misc/sort-merge.pl coreutils-8.25/tests/misc/sort-merge.pl
4467--- coreutils-8.25-orig/tests/misc/sort-merge.pl 2016-01-16 12:18:14.000000000 -0600
4468+++ coreutils-8.25/tests/misc/sort-merge.pl 2016-02-08 19:07:10.316944667 -0600
4469@@ -26,6 +26,15 @@ my $prog = 'sort';
fbb9790b
SS
4470 # Turn off localization of executable's output.
4471 @ENV{qw(LANGUAGE LANG LC_ALL)} = ('C') x 3;
4472
4473+my $mb_locale;
4474+# uncommented according to upstream commit enabling multibyte paths
4475+$mb_locale = $ENV{LOCALE_FR_UTF8};
4476+! defined $mb_locale || $mb_locale eq 'none'
4477+ and $mb_locale = 'C';
4478+
4479+my $try = "Try \`$prog --help' for more information.\n";
4480+my $inval = "$prog: invalid byte, character or field list\n$try";
4481+
4482 # three empty files and one that says 'foo'
4483 my @inputs = (+(map{{IN=> {"empty$_"=> ''}}}1..3), {IN=> {foo=> "foo\n"}});
4484
fca5c2d6 4485@@ -77,6 +86,39 @@ my @Tests =
fbb9790b
SS
4486 {OUT=>$big_input}],
4487 );
4488
4489+# When a multi-byte locale is available ($mb_locale ne 'C'), also run
4490+# every test under that locale.
4491+if ($mb_locale ne 'C')
4492+ {
4493+ # Duplicate each test vector, appending "-mb" to the test name and
4494+ # inserting {ENV => "LC_ALL=$mb_locale"} in the copy, so that we
4495+ # provide coverage for the distro-added multi-byte code paths.
4496+ my @new;
4497+ foreach my $t (@Tests)
4498+ {
4499+ my @new_t = @$t;
4500+ my $test_name = shift @new_t;
4501+
4502+ # Depending on whether sort is multi-byte-patched,
4503+ # it emits different diagnostics:
4504+ # non-MB: invalid byte or field list
4505+ # MB: invalid byte, character or field list
4506+ # Adjust the expected error output accordingly.
4507+ if (grep {ref $_ eq 'HASH' && exists $_->{ERR} && $_->{ERR} eq $inval}
4508+ (@new_t))
4509+ {
4510+ my $sub = {ERR_SUBST => 's/, character//'};
4511+ push @new_t, $sub;
4512+ push @$t, $sub;
4513+ }
4514+ next if ($test_name =~ "nmerge-.");
4515+ push @new, ["$test_name-mb", @new_t, {ENV => "LC_ALL=$mb_locale"}];
4516+ }
4517+ push @Tests, @new;
4518+ }
4519+
4520+@Tests = triple_test \@Tests;
4521+
4522 my $save_temps = $ENV{DEBUG};
4523 my $verbose = $ENV{VERBOSE};
4524
fca5c2d6
SS
4525diff -Naurp coreutils-8.25-orig/tests/misc/sort.pl coreutils-8.25/tests/misc/sort.pl
4526--- coreutils-8.25-orig/tests/misc/sort.pl 2016-01-16 12:18:14.000000000 -0600
4527+++ coreutils-8.25/tests/misc/sort.pl 2016-02-08 19:07:10.316944667 -0600
4528@@ -24,10 +24,15 @@ my $prog = 'sort';
fbb9790b
SS
4529 # Turn off localization of executable's output.
4530 @ENV{qw(LANGUAGE LANG LC_ALL)} = ('C') x 3;
4531
4532-my $mb_locale = $ENV{LOCALE_FR_UTF8};
4533+my $mb_locale;
4534+#Comment out next line to disable multibyte tests
4535+$mb_locale = $ENV{LOCALE_FR_UTF8};
4536 ! defined $mb_locale || $mb_locale eq 'none'
4537 and $mb_locale = 'C';
4538
4539+my $try = "Try \`$prog --help' for more information.\n";
4540+my $inval = "$prog: invalid byte, character or field list\n$try";
4541+
4542 # Since each test is run with a file name and with redirected stdin,
4543 # the name in the diagnostic is either the file name or "-".
4544 # Normalize each diagnostic to use '-'.
fca5c2d6 4545@@ -424,6 +429,38 @@ foreach my $t (@Tests)
fbb9790b
SS
4546 }
4547 }
4548
4549+if ($mb_locale ne 'C')
4550+ {
4551+ # Duplicate each test vector, appending "-mb" to the test name and
4552+ # inserting {ENV => "LC_ALL=$mb_locale"} in the copy, so that we
4553+ # provide coverage for the distro-added multi-byte code paths.
4554+ my @new;
4555+ foreach my $t (@Tests)
4556+ {
4557+ my @new_t = @$t;
4558+ my $test_name = shift @new_t;
4559+
4560+ # Depending on whether sort is multi-byte-patched,
4561+ # it emits different diagnostics:
4562+ # non-MB: invalid byte or field list
4563+ # MB: invalid byte, character or field list
4564+ # Adjust the expected error output accordingly.
4565+ if (grep {ref $_ eq 'HASH' && exists $_->{ERR} && $_->{ERR} eq $inval}
4566+ (@new_t))
4567+ {
4568+ my $sub = {ERR_SUBST => 's/, character//'};
4569+ push @new_t, $sub;
4570+ push @$t, $sub;
4571+ }
4572+ # Skip several failing tests pending investigation; also skip every test that sets ENV
4573+ next if (grep {ref $_ eq 'HASH' && exists $_->{ENV}} (@new_t));
4574+ next if ($test_name =~ "18g" or $test_name =~ "sort-numeric" or $test_name =~ "08[ab]" or $test_name =~ "03[def]" or $test_name =~ "h4" or $test_name =~ "n1" or $test_name =~ "2[01]a");
fca5c2d6 4575+ next if ($test_name =~ "11[ab]"); # avoid FP: expected result differs to MB result due to collation rules.
fbb9790b
SS
4576+ push @new, ["$test_name-mb", @new_t, {ENV => "LC_ALL=$mb_locale"}];
4577+ }
4578+ push @Tests, @new;
4579+ }
4580+
4581 @Tests = triple_test \@Tests;
4582
4583 # Remember that triple_test creates from each test with exactly one "IN"
fca5c2d6 4584@@ -433,6 +470,7 @@ foreach my $t (@Tests)
fbb9790b
SS
4585 # Remove the IN_PIPE version of the "output-is-input" test above.
4586 # The others aren't susceptible because they have three inputs each.
4587 @Tests = grep {$_->[0] ne 'output-is-input.p'} @Tests;
4588+@Tests = grep {$_->[0] ne 'output-is-input-mb.p'} @Tests;
4589
4590 my $save_temps = $ENV{DEBUG};
4591 my $verbose = $ENV{VERBOSE};
fca5c2d6
SS
4592diff -Naurp coreutils-8.25-orig/tests/misc/unexpand.pl coreutils-8.25/tests/misc/unexpand.pl
4593--- coreutils-8.25-orig/tests/misc/unexpand.pl 2016-01-16 12:18:14.000000000 -0600
4594+++ coreutils-8.25/tests/misc/unexpand.pl 2016-02-08 19:07:10.317944671 -0600
4595@@ -27,6 +27,14 @@ my $limits = getlimits ();
fbb9790b
SS
4596
4597 my $prog = 'unexpand';
4598
4599+# comment out next line to disable multibyte tests
4600+my $mb_locale = $ENV{LOCALE_FR_UTF8};
4601+! defined $mb_locale || $mb_locale eq 'none'
4602+ and $mb_locale = 'C';
4603+
4604+my $try = "Try \`$prog --help' for more information.\n";
4605+my $inval = "$prog: invalid byte, character or field list\n$try";
4606+
4607 my @Tests =
4608 (
4609 ['a1', {IN=> ' 'x 1 ."y\n"}, {OUT=> ' 'x 1 ."y\n"}],
fca5c2d6 4610@@ -92,6 +100,37 @@ my @Tests =
fbb9790b
SS
4611 {EXIT => 1}, {ERR => "$prog: tab stop value is too large\n"}],
4612 );
4613
4614+if ($mb_locale ne 'C')
4615+ {
4616+ # Duplicate each test vector, appending "-mb" to the test name and
4617+ # inserting {ENV => "LC_ALL=$mb_locale"} in the copy, so that we
4618+ # provide coverage for the distro-added multi-byte code paths.
4619+ my @new;
4620+ foreach my $t (@Tests)
4621+ {
4622+ my @new_t = @$t;
4623+ my $test_name = shift @new_t;
4624+
4625+ # Depending on whether unexpand is multi-byte-patched,
4626+ # it emits different diagnostics:
4627+ # non-MB: invalid byte or field list
4628+ # MB: invalid byte, character or field list
4629+ # Adjust the expected error output accordingly.
4630+ if (grep {ref $_ eq 'HASH' && exists $_->{ERR} && $_->{ERR} eq $inval}
4631+ (@new_t))
4632+ {
4633+ my $sub = {ERR_SUBST => 's/, character//'};
4634+ push @new_t, $sub;
4635+ push @$t, $sub;
4636+ }
4637+ next if ($test_name =~ 'b-1');
4638+ push @new, ["$test_name-mb", @new_t, {ENV => "LC_ALL=$mb_locale"}];
4639+ }
4640+ push @Tests, @new;
4641+ }
4642+
4643+@Tests = triple_test \@Tests;
4644+
4645 my $save_temps = $ENV{DEBUG};
4646 my $verbose = $ENV{VERBOSE};
4647
fca5c2d6
SS
4648diff -Naurp coreutils-8.25-orig/tests/misc/uniq.pl coreutils-8.25/tests/misc/uniq.pl
4649--- coreutils-8.25-orig/tests/misc/uniq.pl 2016-01-16 12:18:14.000000000 -0600
4650+++ coreutils-8.25/tests/misc/uniq.pl 2016-02-08 19:07:10.317944671 -0600
4651@@ -23,9 +23,17 @@ my $limits = getlimits ();
fbb9790b
SS
4652 my $prog = 'uniq';
4653 my $try = "Try '$prog --help' for more information.\n";
4654
4655+my $inval = "$prog: invalid byte, character or field list\n$try";
4656+
4657 # Turn off localization of executable's output.
4658 @ENV{qw(LANGUAGE LANG LC_ALL)} = ('C') x 3;
4659
4660+my $mb_locale;
4661+#Comment out next line to disable multibyte tests
4662+$mb_locale = $ENV{LOCALE_FR_UTF8};
4663+! defined $mb_locale || $mb_locale eq 'none'
4664+ and $mb_locale = 'C';
4665+
4666 # When possible, create a "-z"-testing variant of each test.
4667 sub add_z_variants($)
4668 {
fca5c2d6 4669@@ -262,6 +270,53 @@ foreach my $t (@Tests)
fbb9790b
SS
4670 and push @$t, {ENV=>'_POSIX2_VERSION=199209'};
4671 }
4672
4673+if ($mb_locale ne 'C')
4674+ {
4675+ # Duplicate each test vector, appending "-mb" to the test name and
4676+ # inserting {ENV => "LC_ALL=$mb_locale"} in the copy, so that we
4677+ # provide coverage for the distro-added multi-byte code paths.
4678+ my @new;
4679+ foreach my $t (@Tests)
4680+ {
4681+ my @new_t = @$t;
4682+ my $test_name = shift @new_t;
4683+
4684+ # Depending on whether uniq is multi-byte-patched,
4685+ # it emits different diagnostics:
4686+ # non-MB: invalid byte or field list
4687+ # MB: invalid byte, character or field list
4688+ # Adjust the expected error output accordingly.
4689+ if (grep {ref $_ eq 'HASH' && exists $_->{ERR} && $_->{ERR} eq $inval}
4690+ (@new_t))
4691+ {
4692+ my $sub = {ERR_SUBST => 's/, character//'};
4693+ push @new_t, $sub;
4694+ push @$t, $sub;
4695+ }
4696+ # In test #145, replace each ‘...’ by '...'.
4697+ if ($test_name =~ "145")
4698+ {
4699+ my $sub = { ERR_SUBST => "s/‘([^’]+)’/'\$1'/g"};
4700+ push @new_t, $sub;
4701+ push @$t, $sub;
4702+ }
4703+ next if ( $test_name =~ "schar"
4704+ or $test_name =~ "^obs-plus"
4705+ or $test_name =~ "119");
4706+ push @new, ["$test_name-mb", @new_t, {ENV => "LC_ALL=$mb_locale"}];
4707+ }
4708+ push @Tests, @new;
4709+ }
4710+
4711+# Remember that triple_test creates from each test with exactly one "IN"
4712+# file two more tests (.p and .r suffix on name) corresponding to reading
4713+# input from a file and from a pipe. The pipe-reading test would fail
4714+# due to a race condition about 1 in 20 times.
4715+# Remove the IN_PIPE version of the "output-is-input" test above.
4716+# The others aren't susceptible because they have three inputs each.
4717+
4718+@Tests = grep {$_->[0] ne 'output-is-input.p'} @Tests;
4719+
4720 @Tests = add_z_variants \@Tests;
4721 @Tests = triple_test \@Tests;
4722
fca5c2d6
SS
4723diff -Naurp coreutils-8.25-orig/tests/pr/pr-tests.pl coreutils-8.25/tests/pr/pr-tests.pl
4724--- coreutils-8.25-orig/tests/pr/pr-tests.pl 2016-01-16 12:18:14.000000000 -0600
4725+++ coreutils-8.25/tests/pr/pr-tests.pl 2016-02-08 19:07:10.318944674 -0600
4726@@ -24,6 +24,15 @@ use strict;
fbb9790b 4727 my $prog = 'pr';
fca5c2d6 4728 my $normalize_strerror = "s/': .*/'/";
fbb9790b
SS
4729
4730+my $mb_locale;
4731+#Comment out the following line to disable multibyte tests
4732+$mb_locale = $ENV{LOCALE_FR_UTF8};
4733+! defined $mb_locale || $mb_locale eq 'none'
4734+ and $mb_locale = 'C';
4735+
4736+my $try = "Try \`$prog --help' for more information.\n";
4737+my $inval = "$prog: invalid byte, character or field list\n$try";
4738+
4739 my @tv = (
4740
4741 # -b option is no longer an official option. But it's still working to
fca5c2d6 4742@@ -467,8 +476,48 @@ push @Tests,
fbb9790b
SS
4743 {IN=>{3=>"x\ty\tz\n"}},
4744 {OUT=>join("\t", qw(a b c m n o x y z)) . "\n"} ];
4745
4746+# When a multi-byte locale is available ($mb_locale ne 'C'), also run
4747+# every test under that locale.
4748+if ($mb_locale ne 'C')
4749+ {
4750+ # Duplicate each test vector, appending "-mb" to the test name and
4751+ # inserting {ENV => "LC_ALL=$mb_locale"} in the copy, so that we
4752+ # provide coverage for the distro-added multi-byte code paths.
4753+ my @new;
4754+ foreach my $t (@Tests)
4755+ {
4756+ my @new_t = @$t;
4757+ my $test_name = shift @new_t;
4758+
4759+ # Depending on whether pr is multi-byte-patched,
4760+ # it emits different diagnostics:
4761+ # non-MB: invalid byte or field list
4762+ # MB: invalid byte, character or field list
4763+ # Adjust the expected error output accordingly.
4764+ if (grep {ref $_ eq 'HASH' && exists $_->{ERR} && $_->{ERR} eq $inval}
4765+ (@new_t))
4766+ {
4767+ my $sub = {ERR_SUBST => 's/, character//'};
4768+ push @new_t, $sub;
4769+ push @$t, $sub;
4770+ }
4771+ #temporarily skip some failing tests
4772+ next if ($test_name =~ "col-0" or $test_name =~ "col-inval");
4773+ push @new, ["$test_name-mb", @new_t, {ENV => "LC_ALL=$mb_locale"}];
4774+ }
4775+ push @Tests, @new;
4776+ }
4777+
4778 @Tests = triple_test \@Tests;
4779
4780+# Remember that triple_test creates from each test with exactly one "IN"
4781+# file two more tests (.p and .r suffix on name) corresponding to reading
4782+# input from a file and from a pipe. The pipe-reading test would fail
4783+# due to a race condition about 1 in 20 times.
4784+# Remove the IN_PIPE version of the "output-is-input" test above.
4785+# The others aren't susceptible because they have three inputs each.
4786+@Tests = grep {$_->[0] ne 'output-is-input.p'} @Tests;
4787+
4788 my $save_temps = $ENV{DEBUG};
4789 my $verbose = $ENV{VERBOSE};
4790