From: Collin Funk Date: Thu, 21 Aug 2025 04:13:52 +0000 (-0700) Subject: fold: add the --characters option X-Git-Url: http://git.ipfire.org/gitweb/?a=commitdiff_plain;ds=inline;p=thirdparty%2Fcoreutils.git fold: add the --characters option * src/fold.c: Include mcel.h. (count_bytes): Remove variable. (counting_mode, last_character_width): New variables. (shortopts, long_options): Add the option. (adjust_column): If --characters is in used account for number of characters instead of their width. (fold_file): Use getline and iterate over the result with mcel functions to handle multibyte characters. (main): Check for the option. * src/local.mk (src_fold_LDADD): Add $(LIBC32CONV), $(LIBUNISTRING), and $(MBRTOWC_LIB). * tests/fold/fold-characters.sh: New file. * tests/fold/fold-spaces.sh: New file. * tests/fold/fold-nbsp.sh: New file. * tests/local.mk (all_tests): Add the tests. * NEWS: Mention the new option. * doc/coreutils.texi (fold invocation): Likewise. --- diff --git a/NEWS b/NEWS index 2a80566585..f2e7c9e6ee 100644 --- a/NEWS +++ b/NEWS @@ -83,6 +83,9 @@ GNU coreutils NEWS -*- outline -*- Iranian locale (fa_IR) and for the Ethiopian locale (am_ET), and also does so more consistently for the Thailand locale (th_TH.UTF-8). + fold now supports the --characters (-c) option to count multi-byte + characters instead of the number of columns. + nproc now honors any cgroup v2 configured CPU quotas, which may reduce the effective number of processors available. diff --git a/doc/coreutils.texi b/doc/coreutils.texi index c874ffc61c..3f0931e1a9 100644 --- a/doc/coreutils.texi +++ b/doc/coreutils.texi @@ -2964,6 +2964,13 @@ Count bytes rather than columns, so that tabs, backspaces, and carriage returns are each counted as taking up one column, just like other characters. +@item -c +@itemx --characters +@opindex -c +@opindex --characters +Count characters rather than columns, meaning that lines containing +characters wider than one column will be visually longer. + @item -s @itemx --spaces @opindex -s diff --git a/src/fold.c b/src/fold.c index b64aad491a..153ce15a61 100644 --- a/src/fold.c +++ b/src/fold.c @@ -25,6 +25,7 @@ #include "system.h" #include "fadvise.h" +#include "mcel.h" #include "xdectoint.h" #define TAB_WIDTH 8 @@ -37,17 +38,26 @@ /* If nonzero, try to break on whitespace. */ static bool break_spaces; -/* If nonzero, count bytes, not column positions. */ -static bool count_bytes; +/* Mode to operate in. */ +static enum + { + COUNT_COLUMNS, + COUNT_BYTES, + COUNT_CHARACTERS + } counting_mode = COUNT_COLUMNS; /* If nonzero, at least one of the files we read was standard input. */ static bool have_read_stdin; -static char const shortopts[] = "bsw:0::1::2::3::4::5::6::7::8::9::"; +/* Width of last read character. */ +static int last_character_width = 0; + +static char const shortopts[] = "bcsw:0::1::2::3::4::5::6::7::8::9::"; static struct option const longopts[] = { {"bytes", no_argument, nullptr, 'b'}, + {"characters", no_argument, nullptr, 'c'}, {"spaces", no_argument, nullptr, 's'}, {"width", required_argument, nullptr, 'w'}, {GETOPT_HELP_OPTION_DECL}, @@ -75,6 +85,7 @@ Wrap input lines in each FILE, writing to standard output.\n\ fputs (_("\ -b, --bytes count bytes rather than columns\n\ + -c, --characters count characters rather than columns\n\ -s, --spaces break at spaces\n\ -w, --width=WIDTH use WIDTH columns instead of 80\n\ "), stdout); @@ -90,24 +101,28 @@ Wrap input lines in each FILE, writing to standard output.\n\ The first column is 0. */ static size_t -adjust_column (size_t column, char c) +adjust_column (size_t column, mcel_t g) { - if (!count_bytes) + if (counting_mode != COUNT_BYTES) { - if (c == '\b') + if (g.ch == '\b') { if (column > 0) - column--; + column -= last_character_width; } - else if (c == '\r') + else if (g.ch == '\r') column = 0; - else if (c == '\t') + else if (g.ch == '\t') column += TAB_WIDTH - column % TAB_WIDTH; - else /* if (isprint (c)) */ - column++; + else /* if (c32isprint (g.ch)) */ + { + last_character_width = (counting_mode == COUNT_CHARACTERS + ? 1 : c32width (g.ch)); + column += last_character_width; + } } else - column++; + column += g.len; return column; } @@ -119,11 +134,13 @@ static bool fold_file (char const *filename, size_t width) { FILE *istream; - int c; size_t column = 0; /* Screen column where next char will go. */ idx_t offset_out = 0; /* Index in 'line_out' for next char. */ static char *line_out = nullptr; static idx_t allocated_out = 0; + static char *line_in = nullptr; + static size_t allocated_in = 0; + static ssize_t length_in = 0; int saved_errno; if (STREQ (filename, "-")) @@ -142,74 +159,90 @@ fold_file (char const *filename, size_t width) fadvise (istream, FADVISE_SEQUENTIAL); - while ((c = getc (istream)) != EOF) + while (0 <= (length_in = getline (&line_in, &allocated_in, istream))) { - if (allocated_out - offset_out <= 1) - line_out = xpalloc (line_out, &allocated_out, 1, -1, sizeof *line_out); - - if (c == '\n') + char *p = line_in; + char *lim = p + length_in; + mcel_t g; + for (; p < lim; p += g.len) { - line_out[offset_out++] = c; - fwrite (line_out, sizeof (char), offset_out, stdout); - column = offset_out = 0; - continue; - } - - rescan: - column = adjust_column (column, c); - - if (column > width) - { - /* This character would make the line too long. - Print the line plus a newline, and make this character - start the next line. */ - if (break_spaces) + g = mcel_scan (p, lim); + if (allocated_out - offset_out <= g.len) + line_out = xpalloc (line_out, &allocated_out, g.len, -1, + sizeof *line_out); + if (g.ch == '\n') { - bool found_blank = false; - idx_t logical_end = offset_out; + memcpy (line_out + offset_out, p, g.len); + offset_out += g.len; + fwrite (line_out, sizeof (char), offset_out, stdout); + column = offset_out = 0; + continue; + } + rescan: + column = adjust_column (column, g); - /* Look for the last blank. */ - while (logical_end) + if (column > width) + { + /* This character would make the line too long. + Print the line plus a newline, and make this character + start the next line. */ + if (break_spaces) { - --logical_end; - if (isblank (to_uchar (line_out[logical_end]))) + int space_length = 0; + idx_t logical_end = offset_out; + char *logical_p = line_out; + char *logical_lim = logical_p + logical_end; + + for (mcel_t g2; logical_p < logical_lim; logical_p += g2.len) { - found_blank = true; - break; + g2 = mcel_scan (logical_p, logical_lim); + if (c32isblank (g2.ch)) + { + space_length = g2.len; + logical_end = logical_p - line_out; + } + } + + if (space_length) + { + logical_end += space_length; + /* Found a blank. Don't output the part after it. */ + fwrite (line_out, sizeof (char), logical_end, stdout); + putchar ('\n'); + /* Move the remainder to the beginning of the next line. + The areas being copied here might overlap. */ + memmove (line_out, line_out + logical_end, + offset_out - logical_end); + offset_out -= logical_end; + column = 0; + char *printed_p = line_out; + char *printed_lim = printed_p + offset_out; + for (mcel_t g2; printed_p < printed_lim; + printed_p += g2.len) + { + g2 = mcel_scan (printed_p, printed_lim); + column = adjust_column (column, g2); + } + goto rescan; } } - if (found_blank) + if (offset_out == 0) { - /* Found a blank. Don't output the part after it. */ - logical_end++; - fwrite (line_out, sizeof (char), logical_end, stdout); - putchar ('\n'); - /* Move the remainder to the beginning of the next line. - The areas being copied here might overlap. */ - memmove (line_out, line_out + logical_end, - offset_out - logical_end); - offset_out -= logical_end; - column = 0; - for (idx_t i = 0; i < offset_out; i++) - column = adjust_column (column, line_out[i]); - goto rescan; + memcpy (line_out + offset_out, p, g.len); + offset_out += g.len; + continue; } - } - if (offset_out == 0) - { - line_out[offset_out++] = c; - continue; + line_out[offset_out++] = '\n'; + fwrite (line_out, sizeof (char), offset_out, stdout); + column = offset_out = 0; + goto rescan; } - line_out[offset_out++] = '\n'; - fwrite (line_out, sizeof (char), offset_out, stdout); - column = offset_out = 0; - goto rescan; + memcpy (line_out + offset_out, p, g.len); + offset_out += g.len; } - - line_out[offset_out++] = c; } saved_errno = errno; @@ -249,7 +282,7 @@ main (int argc, char **argv) atexit (close_stdout); - break_spaces = count_bytes = have_read_stdin = false; + break_spaces = have_read_stdin = false; while ((optc = getopt_long (argc, argv, shortopts, longopts, nullptr)) != -1) { @@ -258,7 +291,11 @@ main (int argc, char **argv) switch (optc) { case 'b': /* Count bytes rather than columns. */ - count_bytes = true; + counting_mode = COUNT_BYTES; + break; + + case 'c': /* Count characters rather than columns. */ + counting_mode = COUNT_CHARACTERS; break; case 's': /* Break at word boundaries. */ diff --git a/src/local.mk b/src/local.mk index c7c77a7c95..3f93a75078 100644 --- a/src/local.mk +++ b/src/local.mk @@ -336,6 +336,9 @@ src_sort_LDADD += $(LIBPMULTITHREAD) # for pthread_sigmask src_sort_LDADD += $(PTHREAD_SIGMASK_LIB) +# for mbrtowc, mbfile +src_fold_LDADD += $(LIBC32CONV) $(LIBUNISTRING) $(MBRTOWC_LIB) + # Get the release year from lib/version-etc.c. RELEASE_YEAR = \ `sed -n '/.*COPYRIGHT_YEAR = \([0-9][0-9][0-9][0-9]\) };/s//\1/p' \ diff --git a/tests/fold/fold-characters.sh b/tests/fold/fold-characters.sh new file mode 100755 index 0000000000..0b22aad6be --- /dev/null +++ b/tests/fold/fold-characters.sh @@ -0,0 +1,61 @@ +#!/bin/sh +# Test fold --characters. + +# Copyright (C) 2025 Free Software Foundation, Inc. + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src +print_ver_ fold printf + +test "$LOCALE_FR_UTF8" != none || skip_ "French UTF-8 locale not available" + +LC_ALL=$LOCALE_FR_UTF8 +export LC_ALL + +# The string "뉐뉐뉐" is 3 characters, but occupies 6 columns. +env printf '\uB250\uB250\uB250\n' > input1 || framework_failure_ +env printf '\uB250\uB250\n\uB250\n' > column-exp1 || framework_failure_ + +fold -w 5 input1 > column-out1 || fail=1 +compare column-exp1 column-out1 || fail=1 + +# Should be the same as the input. +fold --characters -w 5 input1 > characters-out1 || fail=1 +compare input1 characters-out1 || fail=1 + +# Test with 50 2 column wide characters. +for i in $(seq 50); do + env printf '\uFF1A' >> input2 || framework_failure_ + env printf '\uFF1A' >> column-exp2 || framework_failure_ + env printf '\uFF1A' >> character-exp2 || framework_failure_ + if test $(($i % 5)) -eq 0; then + env printf '\n' >> column-exp2 || framework_failure_ + fi + if test $(($i % 10)) -eq 0; then + env printf '\n' >> character-exp2 || framework_failure_ + fi +done + +env printf '\n' >> input2 || framework_failure_ + +# 5 characters per line. +fold -w 10 input2 > column-out2 || fail=1 +compare column-exp2 column-out2 || fail=1 + +# 10 characters per line. +fold --characters -w 10 input2 > character-out2 || fail=1 +compare character-exp2 character-out2 || fail=1 + +Exit $fail diff --git a/tests/fold/fold-nbsp.sh b/tests/fold/fold-nbsp.sh new file mode 100755 index 0000000000..2c1e8fd511 --- /dev/null +++ b/tests/fold/fold-nbsp.sh @@ -0,0 +1,41 @@ +#!/bin/sh +# Test fold --spaces with various Unicode non-breaking space characters. + +# Copyright (C) 2025 Free Software Foundation, Inc. + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src +print_ver_ fold printf + +test "$LOCALE_FR_UTF8" != none || skip_ "French UTF-8 locale not available" + +LC_ALL=$LOCALE_FR_UTF8 +export LC_ALL + +# One U+2007 FIGURE SPACE characters. +env printf 'abcdefghijklmnop\u2007qrstuvwxyz\n' > input1 || framework_failure_ +env printf 'abcdefghij\nklmnop\u2007qrs\ntuvwxyz\n'> exp1 || framework_failure_ +fold --spaces --width 10 input1 > out1 || fail=1 +compare exp1 out1 || fail=1 + +# Two U+00A0 NO-BREAK SPACE characters. +env printf 'abcdefghijklmnop\u00A0\u00A0qrstuvwxyz\n' > input2 \ + || framework_failure_ +env printf 'abcdefghij\nklmnop\u00A0\u00A0qr\nstuvwxyz\n'> exp2 \ + || framework_failure_ +fold --spaces --width 10 input2 > out2 || fail=1 +compare exp2 out2 || fail=1 + +Exit $fail diff --git a/tests/fold/fold-spaces.sh b/tests/fold/fold-spaces.sh new file mode 100755 index 0000000000..9ea95bf029 --- /dev/null +++ b/tests/fold/fold-spaces.sh @@ -0,0 +1,41 @@ +#!/bin/sh +# Test fold --spaces with various Unicode breaking space characters. + +# Copyright (C) 2025 Free Software Foundation, Inc. + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src +print_ver_ fold printf + +test "$LOCALE_FR_UTF8" != none || skip_ "French UTF-8 locale not available" + +LC_ALL=$LOCALE_FR_UTF8 +export LC_ALL + +# One U+2002 EN SPACE characters. +env printf 'abcdefghijklmnop\u2002qrstuvwxyz\n' > input1 || framework_failure_ +env printf 'abcdefghij\nklmnop\u2002\nqrstuvwxyz\n'> exp1 || framework_failure_ +fold --spaces --width 10 input1 > out1 || fail=1 +compare exp1 out1 || fail=1 + +# Two U+2003 EM SPACE characters. +env printf 'abcdefghijklmnop\u2003\u2003qrstuvwxyz\n' > input2 \ + || framework_failure_ +env printf 'abcdefghij\nklmnop\u2003\u2003\nqrstuvwxyz\n'> exp2 \ + || framework_failure_ +fold --spaces --width 10 input2 > out2 || fail=1 +compare exp2 out2 || fail=1 + +Exit $fail diff --git a/tests/local.mk b/tests/local.mk index 3fbf442ee6..d883c00ba0 100644 --- a/tests/local.mk +++ b/tests/local.mk @@ -326,6 +326,9 @@ all_tests = \ tests/factor/factor.pl \ tests/factor/factor-parallel.sh \ tests/misc/false-status.sh \ + tests/fold/fold-characters.sh \ + tests/fold/fold-nbsp.sh \ + tests/fold/fold-spaces.sh \ tests/misc/fold.pl \ tests/groups/groups-dash.sh \ tests/groups/groups-process-all.sh \