* src/expand.c: Use mbbuf to support multi-byte input.
* src/unexpand.c: Likewise.
* tests/expand/mb.sh: New multi-byte test.
* tests/unexpand/mb.sh: Likewise.
* tests/local.mk: Reference new tests.
* NEWS: Mention the improvement.
'df --local' recognises more file system types as remote.
Specifically: autofs, ncpfs, smb, smb2, gfs, gfs2, userlandfs.
+ 'expand' and 'unexpand' now support multi-byte characters.
+
'groups' and 'id' will now exit sooner after a write error,
which is significant when listing information for many users.
#include <stdio.h>
#include <getopt.h>
#include <sys/types.h>
+
#include "system.h"
+#include "ioblksize.h"
+#include "mcel.h"
+#include "mbbuf.h"
#include "expand-common.h"
/* The official name of this program (e.g., no 'g' prefix). */
if (!fp)
return;
+ static char line_in[IO_BUFSIZE];
+ mbbuf_t mbbuf;
+ mbbuf_init (&mbbuf, line_in, sizeof line_in, fp);
+
while (true)
{
/* Input character, or EOF. */
- int c;
+ mcel_t g;
/* If true, perform translations. */
bool convert = true;
do
{
- while ((c = getc (fp)) < 0 && (fp = next_file (fp)))
- continue;
+ while ((g = mbbuf_get_char (&mbbuf)).ch == MBBUF_EOF
+ && (fp = next_file (fp)))
+ mbbuf_init (&mbbuf, line_in, sizeof line_in, fp);
if (convert)
{
- if (c == '\t')
+ convert &= convert_entire_line
+ || !! (c32isblank (g.ch) && ! c32isnbspace (g.ch));
+
+ if (g.ch == '\t')
{
/* Column the next input tab stop is on. */
bool last_tab;
if (putchar (' ') < 0)
write_error ();
- c = ' ';
+ if (putchar (' ') < 0)
+ write_error ();
+
+ continue;
}
- else if (c == '\b')
+ else if (g.ch == '\b')
{
/* Go back one column, and force recalculation of the
next tab stop. */
}
else
{
- if (ckd_add (&column, column, 1))
+ int width = c32width (g.ch);
+ if (ckd_add (&column, column, width < 0 ? 1 : width))
error (EXIT_FAILURE, 0, _("input line is too long"));
}
- convert &= convert_entire_line || !! isblank (c);
}
- if (c < 0)
+ if (g.ch == MBBUF_EOF)
return;
- if (putchar (c) < 0)
+ fwrite (mbbuf_char_offset (&mbbuf, g), sizeof (char), g.len, stdout);
+ if (ferror (stdout))
write_error ();
}
- while (c != '\n');
+ while (g.ch != '\n');
}
}
#include <stdio.h>
#include <getopt.h>
#include <sys/types.h>
+
#include "system.h"
+#include "ioblksize.h"
+#include "mbbuf.h"
+#include "mcel.h"
#include "expand-common.h"
/* The official name of this program (e.g., no 'g' prefix). */
if (!fp)
return;
+ static char line_in[IO_BUFSIZE];
+ mbbuf_t mbbuf;
+ mbbuf_init (&mbbuf, line_in, sizeof line_in, fp);
+
/* The worst case is a non-blank character, then one blank, then a
tab stop, then MAX_COLUMN_WIDTH - 1 blanks, then a non-blank; so
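- allocate MAX_COLUMN_WIDTH bytes to store the blanks. */
+ allocate MAX_COLUMN_WIDTH * MB_LEN_MAX bytes to store the blanks,
+ since each pending blank may be a multi-byte character. */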
- pending_blank = ximalloc (max_column_width);
+ pending_blank = ximalloc (max_column_width * sizeof (char) * MB_LEN_MAX);
while (true)
{
/* Input character, or EOF. */
- int c;
+ mcel_t g;
/* If true, perform translations. */
bool convert = true;
/* Column of next input character. */
colno column = 0;
+ /* Column the next input tab stop is on. */
+ colno next_tab_column = 0;
+
/* Index in TAB_LIST of next tab stop to examine. */
idx_t tab_index = 0;
do
{
- while ((c = getc (fp)) < 0 && (fp = next_file (fp)))
- continue;
+ while ((g = mbbuf_get_char (&mbbuf)).ch == MBBUF_EOF
+ && (fp = next_file (fp)))
+ mbbuf_init (&mbbuf, line_in, sizeof line_in, fp);
if (convert)
{
- bool blank = !! isblank (c);
+ bool blank = !! (c32isblank (g.ch) && ! c32isnbspace (g.ch));
if (blank)
{
bool last_tab;
- /* Column the next input tab stop is on. */
- colno next_tab_column = get_next_tab_column (column,
- &tab_index,
- &last_tab);
+ next_tab_column = get_next_tab_column (column, &tab_index,
+ &last_tab);
if (last_tab)
convert = false;
if (convert)
{
- if (c == '\t')
+ if (g.ch == '\t')
{
column = next_tab_column;
}
else
{
- column++;
+ column += c32width (g.ch);
if (! (prev_blank && column == next_tab_column))
{
will be replaced by tabs. */
if (column == next_tab_column)
one_blank_before_tab_stop = true;
- pending_blank[pending++] = c;
+ memcpy (pending_blank + pending,
+ mbbuf_char_offset (&mbbuf, g), g.len);
+ pending += g.len;
prev_blank = true;
continue;
}
/* Replace the pending blanks by a tab or two. */
- pending_blank[0] = c = '\t';
+ g.len = 0;
+ if (putc ('\t', stdout) < 0)
+ write_error ();
+ pending_blank[0] = '\t';
}
/* Discard pending blanks, unless it was a single
pending = one_blank_before_tab_stop;
}
}
- else if (c == '\b')
+ else if (g.ch == '\b')
{
/* Go back one column, and force recalculation of the
next tab stop. */
column -= !!column;
+ next_tab_column = column;
tab_index -= !!tab_index;
}
else
{
- column++;
- if (!column)
+ int width = c32width (g.ch);
+ if (ckd_add (&column, column, width < 0 ? 1 : width))
error (EXIT_FAILURE, 0, _("input line is too long"));
}
convert &= convert_entire_line || blank;
}
- if (c < 0)
+ if (g.ch == MBBUF_EOF)
{
free (pending_blank);
return;
}
- if (putchar (c) < 0)
+ fwrite (mbbuf_char_offset (&mbbuf, g), sizeof (char), g.len, stdout);
+ if (ferror (stdout))
write_error ();
}
- while (c != '\n');
+ while (g.ch != '\n');
}
}
--- /dev/null
+#!/bin/sh
+
+# Copyright (C) 2012-2015 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
+print_ver_ expand printf
+
+export LC_ALL=en_US.UTF-8
+
+#input containing multibyte characters
+cat <<\EOF > in || framework_failure_
+1234567812345678123456781
+. . . .
+a b c d
+. . . .
+ä ö ü ß
+. . . .
+EOF
+env printf ' äöü\t. öüä. \tä xx\n' >> in || framework_failure_
+
+cat <<\EOF > exp || framework_failure_
+1234567812345678123456781
+. . . .
+a b c d
+. . . .
+ä ö ü ß
+. . . .
+ äöü . öüä. ä xx
+EOF
+
+expand < in > out || fail=1
+compare exp out > /dev/null 2>&1 || fail=1
+
+#multiple files as an input
+cat <<\EOF >> exp || framework_failure_
+1234567812345678123456781
+. . . .
+a b c d
+. . . .
+ä ö ü ß
+. . . .
+ äöü . öüä. ä xx
+EOF
+
+expand ./in ./in > out || fail=1
+compare exp out > /dev/null 2>&1 || fail=1
+
+#test characters with display widths != 1
+env printf '12345678
+e\t|ascii(1)
+\u00E9\t|composed(1)
+e\u0301\t|decomposed(1)
+\u3000\t|ideo-space(2)
+\uFF0D\t|full-hypen(2)
+' > in || framework_failure_
+
+env printf '12345678
+e |ascii(1)
+\u00E9 |composed(1)
+e\u0301 |decomposed(1)
+\u3000 |ideo-space(2)
+\uFF0D |full-hypen(2)
+' > exp || framework_failure_
+
+expand < in > out || fail=1
+compare exp out > /dev/null 2>&1 || fail=1
+
+#shouldn't fail with "input line too long"
+#when a line starts with a control character
+env printf '\n' > in || framework_failure_
+
+expand < in > out || fail=1
+compare in out > /dev/null 2>&1 || fail=1
+
+#non-Unicode characters interspersed between Unicode ones
+env printf '12345678
+\t\xFF|
+\xFF\t|
+\t\xFFä|
+ä\xFF\t|
+\tä\xFF|
+\xFF\tä|
+äbcdef\xFF\t|
+' > in || framework_failure_
+
+env printf '12345678
+ \xFF|
+\xFF |
+ \xFFä|
+ä\xFF |
+ ä\xFF|
+\xFF ä|
+äbcdef\xFF |
+' > exp || framework_failure_
+
+expand < in > out || fail=1
+compare exp out > /dev/null 2>&1 || fail=1
+
+
+
+#BOM header test 1
+env printf "\xEF\xBB\xBF" > in; cat <<\EOF >> in || framework_failure_
+1234567812345678123456781
+. . . .
+a b c d
+. . . .
+ä ö ü ß
+. . . .
+EOF
+env printf ' äöü\t. öüä. \tä xx\n' >> in || framework_failure_
+
+env printf "\xEF\xBB\xBF" > exp; cat <<\EOF >> exp || framework_failure_
+1234567812345678123456781
+. . . .
+a b c d
+. . . .
+ä ö ü ß
+. . . .
+ äöü . öüä. ä xx
+EOF
+
+expand < in > out || fail=1
+compare exp out > /dev/null 2>&1 || fail=1
+
+env printf '\xEF\xBB\xBF' > in1; cat <<\EOF >> in1 || framework_failure_
+1234567812345678123456781
+. . . .
+a b c d
+. . . .
+ä ö ü ß
+. . . .
+EOF
+env printf ' äöü\t. öüä. \tä xx\n' >> in1 || framework_failure_
+
+
+env printf '\xEF\xBB\xBF' > exp; cat <<\EOF >> exp || framework_failure_
+1234567812345678123456781
+. . . .
+a b c d
+. . . .
+ä ö ü ß
+. . . .
+ äöü . öüä. ä xx
+EOF
+env printf '\xEF\xBB\xBF' >> exp; cat <<\EOF >> exp || framework_failure_
+1234567812345678123456781
+. . . .
+a b c d
+. . . .
+ä ö ü ß
+. . . .
+ äöü . öüä. ä xx
+EOF
+
+expand in1 in1 > out || fail=1
+compare exp out > /dev/null 2>&1 || fail=1
+
+Exit $fail
tests/env/env-S-script.sh \
tests/expand/expand.pl \
tests/expand/bounded-memory.sh \
+ tests/expand/mb.sh \
tests/expr/expr.pl \
tests/expr/expr-multibyte.pl \
tests/factor/factor.pl \
tests/misc/usage_vs_refs.sh \
tests/unexpand/unexpand.pl \
tests/unexpand/bounded-memory.sh \
+ tests/unexpand/mb.sh \
tests/uniq/uniq.pl \
tests/uniq/uniq-perf.sh \
tests/uniq/uniq-collate.sh \
--- /dev/null
+#!/bin/sh
+
+# Copyright (C) 2012-2015 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
+print_ver_ unexpand printf
+
+export LC_ALL=en_US.UTF-8
+
+#input containing multibyte characters
+cat > in <<\EOF
+1234567812345678123456781
+. . . .
+a b c d
+. . . .
+ä ö ü ß
+. . . .
+ äöü . öüä. ä xx
+EOF
+
+cat > exp <<\EOF
+1234567812345678123456781
+. . . .
+a b c d
+. . . .
+ä ö ü ß
+. . . .
+ äöü . öüä. ä xx
+EOF
+
+unexpand -a < in > out || fail=1
+compare exp out > /dev/null 2>&1 || fail=1
+
+
+#multiple files as an input
+cat >> exp <<\EOF
+1234567812345678123456781
+. . . .
+a b c d
+. . . .
+ä ö ü ß
+. . . .
+ äöü . öüä. ä xx
+EOF
+
+
+unexpand -a ./in ./in > out || fail=1
+compare exp out > /dev/null 2>&1 || fail=1
+
+#test characters with display widths != 1
+
+env printf '12345678
+e |ascii(1)
+\u00E9 |composed(1)
+e\u0301 |decomposed(1)
+\u3000 |ideo-space(2)
+\u3000\u3000\u3000\u3000|ideo-space(2) * 4
+\uFF0D |full-hypen(2)
+' > in || framework_failure_
+
+env printf '12345678
+e\t|ascii(1)
+\u00E9\t|composed(1)
+e\u0301\t|decomposed(1)
+\t|ideo-space(2)
+\t|ideo-space(2) * 4
+\uFF0D\t|full-hypen(2)
+' > exp || framework_failure_
+
+unexpand -a < in > out || fail=1
+compare exp out > /dev/null 2>&1 || fail=1
+
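+#test input where a blank of width > 1 is not being substituted
+#FIXME: the two assignments below only set the shell variables $in and
+#$exp; the files 'in' and 'exp' read by the commands that follow are
+#still the ones written by the previous test, so this case is not
+#actually exercised.  Write the data to the files instead.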
+in="$(LC_ALL=en_US.UTF-8 env printf ' \u3000 ö ü ß')"
+exp=' ö ü ß'
+
+unexpand -a < in > out || fail=1
+compare exp out > /dev/null 2>&1 || fail=1
+
+#non-Unicode characters interspersed between Unicode ones
+env printf '12345678
+ \xFF|
+\xFF |
+ \xFFä|
+ä\xFF |
+ ä\xFF|
+\xFF ä|
+äbcde\xFF |
+' > in || framework_failure_
+
+env printf '12345678
+\t\xFF|
+\xFF\t|
+\t\xFFä|
+ä\xFF\t|
+\tä\xFF|
+\xFF\tä|
+äbcde\xFF\t|
+' > exp || framework_failure_
+
+unexpand -a < in > out || fail=1
+compare exp out > /dev/null 2>&1 || fail=1
+
+#BOM header test 1
+env printf "\xEF\xBB\xBF" > in; cat <<\EOF >> in || framework_failure_
+1234567812345678123456781
+. . . .
+a b c d
+. . . .
+ä ö ü ß
+. . . .
+ äöü . öüä. ä xx
+EOF
+
+env printf "\xEF\xBB\xBF" > exp; cat <<\EOF >> exp || framework_failure_
+1234567812345678123456781
+. . . .
+a b c d
+. . . .
+ä ö ü ß
+. . . .
+ äöü . öüä. ä xx
+EOF
+
+unexpand -a < in > out || fail=1
+compare exp out > /dev/null 2>&1 || fail=1
+
+env printf "\xEF\xBB\xBF" > exp; cat <<\EOF >> exp || framework_failure_
+1234567812345678123456781
+. . . .
+a b c d
+. . . .
+ä ö ü ß
+. . . .
+ äöü . öüä. ä xx
+EOF
+env printf "\xEF\xBB\xBF" >> exp; cat <<\EOF >> exp || framework_failure_
+1234567812345678123456781
+. . . .
+a b c d
+. . . .
+ä ö ü ß
+. . . .
+ äöü . öüä. ä xx
+EOF
+
+unexpand -a ./in ./in > out || fail=1
+compare exp out > /dev/null 2>&1 || fail=1
+
+Exit $fail