/* True if we have ever read standard input. */
static bool have_read_stdin;
+/* If true, don't split multibyte characters in byte mode.
+ Set when the -n (--no-partial) option is given. */
+static bool no_split;
+
/* If true, interpret each run of whitespace as one field delimiter. */
static bool whitespace_delimited;
{"characters", required_argument, NULL, 'c'},
{"fields", required_argument, NULL, 'f'},
{"delimiter", required_argument, NULL, 'd'},
+ {"no-partial", no_argument, NULL, 'n'},
{"whitespace-delimited", no_argument, NULL, 'w'},
{"only-delimited", no_argument, NULL, 's'},
{"output-delimiter", required_argument, NULL, OUTPUT_DELIMITER_OPTION},
no delimiter character, unless the -s option is specified\n\
"));
oputs (_("\
- -n\n\
- (ignored)\n\
+ -n, --no-partial\n\
+ with -b, don't output partial multi-byte characters\n\
"));
oputs (_("\
-s, --only-delimited\n\
}
}
+/* Read from STREAM, printing selected bytes without splitting
+ multibyte characters.
+
+ Input is consumed one character at a time with mbbuf_get_char.
+ For each character, next_item and print_kth are called once per
+ byte (g.len times). The character is written in full when at
+ least one of its bytes is selected and every byte from the first
+ selected one through the end of the character is selected too;
+ otherwise none of its bytes are written.
+
+ A character equal to line_delim ends the current line: line_delim
+ is written and the per-line state (byte_idx, print_delimiter,
+ current_rp) is reset. At end of input, line_delim is written only
+ if byte_idx is nonzero.
+
+ When output_delimiter_string is not output_delimiter_default, it is
+ written before a character whose first selected byte satisfies
+ is_range_start_index, except before the first character written on
+ the line.
+
+ NOTE(review): suffix_selected is never set back to true, so a
+ character whose bytes are selected, then unselected, then selected
+ again through its final byte (e.g. a 3-byte character with -b1,3)
+ is dropped. Confirm this is the intended reading of -n. */
+
+static void
+cut_bytes_no_split (FILE *stream)
+{
+ uintmax_t byte_idx = 0;
+ bool print_delimiter = false;
+ static char line_in[IO_BUFSIZE];
+ mbbuf_t mbbuf;
+
+ current_rp = frp;
+ mbbuf_init (&mbbuf, line_in, sizeof line_in, stream);
+
+ while (true)
+ {
+ mcel_t g = mbbuf_get_char (&mbbuf); /* g.len is used below as the
+ character's length in bytes. */
+
+ if (g.ch == line_delim)
+ {
+ if (putchar (line_delim) < 0)
+ write_error ();
+ byte_idx = 0; /* New line: reset the per-line state. */
+ print_delimiter = false;
+ current_rp = frp;
+ }
+ else if (g.ch == MBBUF_EOF)
+ {
+ if (byte_idx > 0) /* Presumably an unterminated last line;
+ terminate it on output. */
+ {
+ if (putchar (line_delim) < 0)
+ write_error ();
+ }
+ break;
+ }
+ else
+ {
+ bool first_selected_is_range_start = false;
+ bool seen_selected = false;
+ bool suffix_selected = true; /* Cleared once an unselected byte
+ follows a selected one. */
+
+ for (idx_t i = 0; i < g.len; i++)
+ {
+ next_item (&byte_idx); /* Called for every byte of the character,
+ whether or not it ends up printed. */
+ if (print_kth (byte_idx))
+ {
+ if (!seen_selected)
+ {
+ seen_selected = true;
+ first_selected_is_range_start
+ = is_range_start_index (byte_idx);
+ }
+ }
+ else if (seen_selected)
+ suffix_selected = false;
+ }
+
+ if (seen_selected && suffix_selected) /* Whole character or
+ nothing. */
+ {
+ if (output_delimiter_string != output_delimiter_default)
+ {
+ if (print_delimiter && first_selected_is_range_start)
+ write_bytes (output_delimiter_string,
+ output_delimiter_length);
+ print_delimiter = true; /* Later range starts on this line
+ get a delimiter. */
+ }
+
+ write_bytes (mbbuf_char_offset (&mbbuf, g), g.len);
+ }
+ }
+ }
+}
+
/* Read from STREAM, printing to standard output any selected characters. */
static void
break;
case 'n':
+ no_split = true;
break;
case 's':
unreachable ();
case CUT_MODE_BYTES:
- cut_stream = cut_bytes;
+ cut_stream = MB_CUR_MAX <= 1 || !no_split
+ ? cut_bytes : cut_bytes_no_split;
break;
case CUT_MODE_CHARACTERS:
{ENV => "LC_ALL=$mb_locale"}],
['mb-char-5', '-c1-2', {IN=>"\xc3x\n"}, {OUT=>"\xc3x\n"},
{ENV => "LC_ALL=$mb_locale"}],
+ # Note mb-byte-n-1 and mb-byte-n-4 differ from coreutils-i18n patch,
+ # which outputs a character if any byte is selected.
+ # I.e., the i18n patch may output more bytes than the requested range.
+ # Also mb-byte-n-3 differs from coreutils-i18n patch,
+ # but that looks like a bug in that patch rather than a design choice.
+ ['mb-byte-n-1', qw(-b1 -n), {IN=>"\xc3\xa9x\n"}, {OUT=>"\n"},
+ {ENV => "LC_ALL=$mb_locale"}],
+ ['mb-byte-n-2', qw(-b2 -n), {IN=>"\xc3\xa9x\n"}, {OUT=>"\xc3\xa9\n"},
+ {ENV => "LC_ALL=$mb_locale"}],
+ ['mb-byte-n-3', qw(-b1-2 -n), {IN=>"\xc3\xa9x\n"}, {OUT=>"\xc3\xa9\n"},
+ {ENV => "LC_ALL=$mb_locale"}],
+ ['mb-byte-n-4', qw(-b1,3 -n), {IN=>"\xc3\xa9x\n"}, {OUT=>"x\n"},
+ {ENV => "LC_ALL=$mb_locale"}],
+ ['mb-byte-n-5', qw(-b2-3 -n), {IN=>"\xc3\xa9x\n"}, {OUT=>"\xc3\xa9x\n"},
+ {ENV => "LC_ALL=$mb_locale"}],
+ ['mb-byte-n-6', qw(-b2 -n), {IN=>"\xe2\x82\xacx\n"}, {OUT=>"\n"},
+ {ENV => "LC_ALL=$mb_locale"}],
+ ['mb-byte-n-7', qw(-b3 -n), {IN=>"\xe2\x82\xacx\n"},
+ {OUT=>"\xe2\x82\xac\n"},
+ {ENV => "LC_ALL=$mb_locale"}],
+ ['mb-byte-n-8', qw(-b2-3 -n), {IN=>"\xe2\x82\xacx\n"},
+ {OUT=>"\xe2\x82\xac\n"},
+ {ENV => "LC_ALL=$mb_locale"}],
['mb-delim-1', '-d', "\xc3\xa9", '-f2',
{IN=>"a\xc3\xa9b\xc3\xa9c\n"}, {OUT=>"b\n"},
{ENV => "LC_ALL=$mb_locale"}],