From: Lukáš Zaoral Date: Fri, 6 Mar 2026 14:13:17 +0000 (+0000) Subject: expand,unexpand: support multi-byte input X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=2b92c16d26d5905eb340436335cdfa19be105cae;p=thirdparty%2Fcoreutils.git expand,unexpand: support multi-byte input * src/expand.c: Use mbbuf to support multi-byte input. * src/unexpand.c: Likewise. * tests/expand/mb.sh: New multi-byte test. * tests/unexpand/mb.sh: Likewise. * tests/local.mk: Reference new tests. * NEWS: Mention the improvement. --- diff --git a/NEWS b/NEWS index 3639e1ba39..cec03a581b 100644 --- a/NEWS +++ b/NEWS @@ -34,6 +34,8 @@ GNU coreutils NEWS -*- outline -*- 'df --local' recognises more file system types as remote. Specifically: autofs, ncpfs, smb, smb2, gfs, gfs2, userlandfs. + 'expand' and 'unexpand' now support multi-byte characters. + 'groups' and 'id' will now exit sooner after a write error, which is significant when listing information for many users. diff --git a/src/expand.c b/src/expand.c index cbf659c17e..6d4223c9b2 100644 --- a/src/expand.c +++ b/src/expand.c @@ -37,7 +37,11 @@ #include #include #include + #include "system.h" +#include "ioblksize.h" +#include "mcel.h" +#include "mbbuf.h" #include "expand-common.h" /* The official name of this program (e.g., no 'g' prefix). */ @@ -103,10 +107,14 @@ expand (void) if (!fp) return; + static char line_in[IO_BUFSIZE]; + mbbuf_t mbbuf; + mbbuf_init (&mbbuf, line_in, sizeof line_in, fp); + while (true) { /* Input character, or EOF. */ - int c; + mcel_t g; /* If true, perform translations. */ bool convert = true; @@ -126,12 +134,16 @@ expand (void) do { - while ((c = getc (fp)) < 0 && (fp = next_file (fp))) - continue; + while ((g = mbbuf_get_char (&mbbuf)).ch == MBBUF_EOF + && (fp = next_file (fp))) + mbbuf_init (&mbbuf, line_in, sizeof line_in, fp); if (convert) { - if (c == '\t') + convert &= convert_entire_line + || !! (c32isblank (g.ch) && ! c32isnbspace (g.ch)); + + if (g.ch == '\t') { /* Column the next input tab stop is on. */ bool last_tab; @@ -142,9 +154,12 @@ expand (void) if (putchar (' ') < 0) write_error (); - c = ' '; + if (putchar (' ') < 0) + write_error (); + + continue; } - else if (c == '\b') + else if (g.ch == '\b') { /* Go back one column, and force recalculation of the next tab stop. */ @@ -153,20 +168,21 @@ expand (void) } else { - if (ckd_add (&column, column, 1)) + int width = c32width (g.ch); + if (ckd_add (&column, column, width < 0 ? 1 : width)) error (EXIT_FAILURE, 0, _("input line is too long")); } - convert &= convert_entire_line || !! isblank (c); } - if (c < 0) + if (g.ch == MBBUF_EOF) return; - if (putchar (c) < 0) + fwrite (mbbuf_char_offset (&mbbuf, g), sizeof (char), g.len, stdout); + if (ferror (stdout)) write_error (); } - while (c != '\n'); + while (g.ch != '\n'); } } diff --git a/src/unexpand.c b/src/unexpand.c index 54b3ae2fe2..16d0f00315 100644 --- a/src/unexpand.c +++ b/src/unexpand.c @@ -38,7 +38,11 @@ #include #include #include + #include "system.h" +#include "ioblksize.h" +#include "mbbuf.h" +#include "mcel.h" #include "expand-common.h" /* The official name of this program (e.g., no 'g' prefix). */ @@ -120,15 +124,19 @@ unexpand (void) if (!fp) return; + static char line_in[IO_BUFSIZE]; + mbbuf_t mbbuf; + mbbuf_init (&mbbuf, line_in, sizeof line_in, fp); + /* The worst case is a non-blank character, then one blank, then a tab stop, then MAX_COLUMN_WIDTH - 1 blanks, then a non-blank; so allocate MAX_COLUMN_WIDTH bytes to store the blanks. */ - pending_blank = ximalloc (max_column_width); + pending_blank = ximalloc (max_column_width * sizeof (char) * MB_LEN_MAX); while (true) { /* Input character, or EOF. */ - int c; + mcel_t g; /* If true, perform translations. */ bool convert = true; @@ -140,6 +148,9 @@ unexpand (void) /* Column of next input character. */ colno column = 0; + /* Column the next input tab stop is on. */ + colno next_tab_column = 0; + /* Index in TAB_LIST of next tab stop to examine. */ idx_t tab_index = 0; @@ -159,28 +170,27 @@ unexpand (void) do { - while ((c = getc (fp)) < 0 && (fp = next_file (fp))) - continue; + while ((g = mbbuf_get_char (&mbbuf)).ch == MBBUF_EOF + && (fp = next_file (fp))) + mbbuf_init (&mbbuf, line_in, sizeof line_in, fp); if (convert) { - bool blank = !! isblank (c); + bool blank = !! (c32isblank (g.ch) && ! c32isnbspace (g.ch)); if (blank) { bool last_tab; - /* Column the next input tab stop is on. */ - colno next_tab_column = get_next_tab_column (column, - &tab_index, - &last_tab); + next_tab_column = get_next_tab_column (column, &tab_index, + &last_tab); if (last_tab) convert = false; if (convert) { - if (c == '\t') + if (g.ch == '\t') { column = next_tab_column; @@ -189,7 +199,7 @@ unexpand (void) } else { - column++; + column += c32width (g.ch); if (! (prev_blank && column == next_tab_column)) { @@ -197,13 +207,18 @@ unexpand (void) will be replaced by tabs. */ if (column == next_tab_column) one_blank_before_tab_stop = true; - pending_blank[pending++] = c; + memcpy (pending_blank + pending, + mbbuf_char_offset (&mbbuf, g), g.len); + pending += g.len; prev_blank = true; continue; } /* Replace the pending blanks by a tab or two. */ - pending_blank[0] = c = '\t'; + g.len = 0; + if (putc ('\t', stdout) < 0) + write_error (); + pending_blank[0] = '\t'; } /* Discard pending blanks, unless it was a single @@ -211,17 +226,18 @@ unexpand (void) pending = one_blank_before_tab_stop; } } - else if (c == '\b') + else if (g.ch == '\b') { /* Go back one column, and force recalculation of the next tab stop. */ column -= !!column; + next_tab_column = column; tab_index -= !!tab_index; } else { - column++; - if (!column) + int width = c32width (g.ch); + if (ckd_add (&column, column, width < 0 ? 1 : width)) error (EXIT_FAILURE, 0, _("input line is too long")); } @@ -239,16 +255,17 @@ unexpand (void) convert &= convert_entire_line || blank; } - if (c < 0) + if (g.ch == MBBUF_EOF) { free (pending_blank); return; } - if (putchar (c) < 0) + fwrite (mbbuf_char_offset (&mbbuf, g), sizeof (char), g.len, stdout); + if (ferror (stdout)) write_error (); } - while (c != '\n'); + while (g.ch != '\n'); } } diff --git a/tests/expand/mb.sh b/tests/expand/mb.sh new file mode 100755 index 0000000000..10ea160f49 --- /dev/null +++ b/tests/expand/mb.sh @@ -0,0 +1,171 @@ +#!/bin/sh + +# Copyright (C) 2012-2015 Free Software Foundation, Inc. + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src +print_ver_ expand printf + +export LC_ALL=en_US.UTF-8 + +#input containing multibyte characters +cat <<\EOF > in || framework_failure_ +1234567812345678123456781 +. . . . +a b c d +. . . . +ä ö ü ß +. . . . +EOF +env printf ' äöü\t. öüä. \tä xx\n' >> in || framework_failure_ + +cat <<\EOF > exp || framework_failure_ +1234567812345678123456781 +. . . . +a b c d +. . . . +ä ö ü ß +. . . . + äöü . öüä. ä xx +EOF + +expand < in > out || fail=1 +compare exp out > /dev/null 2>&1 || fail=1 + +#multiple files as an input +cat <<\EOF >> exp || framework_failure_ +1234567812345678123456781 +. . . . +a b c d +. . . . +ä ö ü ß +. . . . + äöü . öüä. ä xx +EOF + +expand ./in ./in > out || fail=1 +compare exp out > /dev/null 2>&1 || fail=1 + +#test characters with display widths != 1 +env printf '12345678 +e\t|ascii(1) +\u00E9\t|composed(1) +e\u0301\t|decomposed(1) +\u3000\t|ideo-space(2) +\uFF0D\t|full-hypen(2) +' > in || framework_failure_ + +env printf '12345678 +e |ascii(1) +\u00E9 |composed(1) +e\u0301 |decomposed(1) +\u3000 |ideo-space(2) +\uFF0D |full-hypen(2) +' > exp || framework_failure_ + +expand < in > out || fail=1 +compare exp out > /dev/null 2>&1 || fail=1 + +#shouldn't fail with "input line too long" +#when a line starts with a control character +env printf '\n' > in || framework_failure_ + +expand < in > out || fail=1 +compare in out > /dev/null 2>&1 || fail=1 + +#non-Unicode characters interspersed between Unicode ones +env printf '12345678 +\t\xFF| +\xFF\t| +\t\xFFä| +ä\xFF\t| +\tä\xFF| +\xFF\tä| +äbcdef\xFF\t| +' > in || framework_failure_ + +env printf '12345678 + \xFF| +\xFF | + \xFFä| +ä\xFF | + ä\xFF| +\xFF ä| +äbcdef\xFF | +' > exp || framework_failure_ + +expand < in > out || fail=1 +compare exp out > /dev/null 2>&1 || fail=1 + + + +#BOM header test 1 +env printf "\xEF\xBB\xBF" > in; cat <<\EOF >> in || framework_failure_ +1234567812345678123456781 +. . . . +a b c d +. . . . +ä ö ü ß +. . . . +EOF +env printf ' äöü\t. öüä. \tä xx\n' >> in || framework_failure_ + +env printf "\xEF\xBB\xBF" > exp; cat <<\EOF >> exp || framework_failure_ +1234567812345678123456781 +. . . . +a b c d +. . . . +ä ö ü ß +. . . . + äöü . öüä. ä xx +EOF + +expand < in > out || fail=1 +compare exp out > /dev/null 2>&1 || fail=1 + +env printf '\xEF\xBB\xBF' > in1; cat <<\EOF >> in1 || framework_failure_ +1234567812345678123456781 +. . . . +a b c d +. . . . +ä ö ü ß +. . . . +EOF +env printf ' äöü\t. öüä. \tä xx\n' >> in1 || framework_failure_ + + +env printf '\xEF\xBB\xBF' > exp; cat <<\EOF >> exp || framework_failure_ +1234567812345678123456781 +. . . . +a b c d +. . . . +ä ö ü ß +. . . . + äöü . öüä. ä xx +EOF +env printf '\xEF\xBB\xBF' >> exp; cat <<\EOF >> exp || framework_failure_ +1234567812345678123456781 +. . . . +a b c d +. . . . +ä ö ü ß +. . . . + äöü . öüä. ä xx +EOF + +expand in1 in1 > out || fail=1 +compare exp out > /dev/null 2>&1 || fail=1 + +Exit $fail diff --git a/tests/local.mk b/tests/local.mk index 2d82a0de9a..ef263e9926 100644 --- a/tests/local.mk +++ b/tests/local.mk @@ -342,6 +342,7 @@ all_tests = \ tests/env/env-S-script.sh \ tests/expand/expand.pl \ tests/expand/bounded-memory.sh \ + tests/expand/mb.sh \ tests/expr/expr.pl \ tests/expr/expr-multibyte.pl \ tests/factor/factor.pl \ @@ -504,6 +505,7 @@ all_tests = \ tests/misc/usage_vs_refs.sh \ tests/unexpand/unexpand.pl \ tests/unexpand/bounded-memory.sh \ + tests/unexpand/mb.sh \ tests/uniq/uniq.pl \ tests/uniq/uniq-perf.sh \ tests/uniq/uniq-collate.sh \ diff --git a/tests/unexpand/mb.sh b/tests/unexpand/mb.sh new file mode 100755 index 0000000000..dde30b5941 --- /dev/null +++ b/tests/unexpand/mb.sh @@ -0,0 +1,163 @@ +#!/bin/sh + +# Copyright (C) 2012-2015 Free Software Foundation, Inc. + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src +print_ver_ unexpand printf + +export LC_ALL=en_US.UTF-8 + +#input containing multibyte characters +cat > in <<\EOF +1234567812345678123456781 +. . . . +a b c d +. . . . +ä ö ü ß +. . . . + äöü . öüä. ä xx +EOF + +cat > exp <<\EOF +1234567812345678123456781 +. . . . +a b c d +. . . . +ä ö ü ß +. . . . + äöü . öüä. ä xx +EOF + +unexpand -a < in > out || fail=1 +compare exp out > /dev/null 2>&1 || fail=1 + + +#multiple files as an input +cat >> exp <<\EOF +1234567812345678123456781 +. . . . +a b c d +. . . . +ä ö ü ß +. . . . + äöü . öüä. ä xx +EOF + + +unexpand -a ./in ./in > out || fail=1 +compare exp out > /dev/null 2>&1 || fail=1 + +#test characters with a display width larger than 1 + +env printf '12345678 +e |ascii(1) +\u00E9 |composed(1) +e\u0301 |decomposed(1) +\u3000 |ideo-space(2) +\u3000\u3000\u3000\u3000|ideo-space(2) * 4 +\uFF0D |full-hypen(2) +' > in || framework_failure_ + +env printf '12345678 +e\t|ascii(1) +\u00E9\t|composed(1) +e\u0301\t|decomposed(1) +\t|ideo-space(2) +\t|ideo-space(2) * 4 +\uFF0D\t|full-hypen(2) +' > exp || framework_failure_ + +unexpand -a < in > out || fail=1 +compare exp out > /dev/null 2>&1 || fail=1 + +#test input where a blank of width > 1 is not being substituted +in="$(LC_ALL=en_US.UTF-8 env printf ' \u3000 ö ü ß')" +exp='   ö ü ß' + +unexpand -a < in > out || fail=1 +compare exp out > /dev/null 2>&1 || fail=1 + +#non-Unicode characters interspersed between Unicode ones +env printf '12345678 + \xFF| +\xFF | + \xFFä| +ä\xFF | + ä\xFF| +\xFF ä| +äbcde\xFF | +' > in || framework_failure_ + +env printf '12345678 +\t\xFF| +\xFF\t| +\t\xFFä| +ä\xFF\t| +\tä\xFF| +\xFF\tä| +äbcde\xFF\t| +' > exp || framework_failure_ + +unexpand -a < in > out || fail=1 +compare exp out > /dev/null 2>&1 || fail=1 + +#BOM header test 1 +env printf "\xEF\xBB\xBF" > in; cat <<\EOF >> in || framework_failure_ +1234567812345678123456781 +. . . . +a b c d +. . . . +ä ö ü ß +. . . . + äöü . öüä. ä xx +EOF + +env printf "\xEF\xBB\xBF" > exp; cat <<\EOF >> exp || framework_failure_ +1234567812345678123456781 +. . . . +a b c d +. . . . +ä ö ü ß +. . . . + äöü . öüä. ä xx +EOF + +unexpand -a < in > out || fail=1 +compare exp out > /dev/null 2>&1 || fail=1 + +env printf "\xEF\xBB\xBF" > exp; cat <<\EOF >> exp || framework_failure_ +1234567812345678123456781 +. . . . +a b c d +. . . . +ä ö ü ß +. . . . + äöü . öüä. ä xx +EOF +env printf "\xEF\xBB\xBF" >> exp; cat <<\EOF >> exp || framework_failure_ +1234567812345678123456781 +. . . . +a b c d +. . . . +ä ö ü ß +. . . . + äöü . öüä. ä xx +EOF + +unexpand -a ./in ./in > out || fail=1 +compare exp out > /dev/null 2>&1 || fail=1 + +Exit $fail