fold: add the --characters option

author Collin Funk <collin.funk1@gmail.com>

Thu, 21 Aug 2025 04:13:52 +0000 (21:13 -0700)

committer Collin Funk <collin.funk1@gmail.com>

Sat, 23 Aug 2025 05:09:50 +0000 (22:09 -0700)
author Collin Funk <collin.funk1@gmail.com>
Thu, 21 Aug 2025 04:13:52 +0000 (21:13 -0700)
committer Collin Funk <collin.funk1@gmail.com>
Sat, 23 Aug 2025 05:09:50 +0000 (22:09 -0700)
diff --git a/NEWS b/NEWS

index 2a8056658590a726a24fce2166ff1edb3639ba9a..f2e7c9e6ee5b0e0822b150c81346fd32f7c2eb50 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -83,6 +83,9 @@ GNU coreutils NEWS                                    -*- outline -*-
    Iranian locale (fa_IR) and for the Ethiopian locale (am_ET), and also
    does so more consistently for the Thailand locale (th_TH.UTF-8).
  
+  fold now supports the --characters (-c) option to count characters,
+  including multi-byte ones, instead of screen columns.
+
    nproc now honors any cgroup v2 configured CPU quotas,
    which may reduce the effective number of processors available.
  
diff --git a/doc/coreutils.texi b/doc/coreutils.texi

index c874ffc61cc5076dbdcc8b674de52e8e94c543f0..3f0931e1a9928e3bea497bf4e77a4226c1e37639 100644 (file)
--- a/doc/coreutils.texi
+++ b/doc/coreutils.texi
@@ -2964,6 +2964,13 @@ Count bytes rather than columns, so that tabs, backspaces, and carriage
  returns are each counted as taking up one column, just like other
  characters.
  
+@item -c
+@itemx --characters
+@opindex -c
+@opindex --characters
+Count characters rather than columns, meaning that lines containing
+characters wider than one column will be visually longer.
+
  @item -s
  @itemx --spaces
  @opindex -s
diff --git a/src/fold.c b/src/fold.c

index b64aad491addbe7b897694086545c44cb7b652e0..153ce15a61d69498c55bc90483f42ad4a9bb11f4 100644 (file)
--- a/src/fold.c
+++ b/src/fold.c
@@ -25,6 +25,7 @@
  
  #include "system.h"
  #include "fadvise.h"
+#include "mcel.h"
  #include "xdectoint.h"
  
  #define TAB_WIDTH 8
@@ -37,17 +38,26 @@
  /* If nonzero, try to break on whitespace. */
  static bool break_spaces;
  
-/* If nonzero, count bytes, not column positions. */
-static bool count_bytes;
+/* Mode to operate in.  */
+static enum
+  {
+    COUNT_COLUMNS,
+    COUNT_BYTES,
+    COUNT_CHARACTERS
+  } counting_mode = COUNT_COLUMNS;
  
  /* If nonzero, at least one of the files we read was standard input. */
  static bool have_read_stdin;
  
-static char const shortopts[] = "bsw:0::1::2::3::4::5::6::7::8::9::";
+/* Width of last read character.  */
+static int last_character_width = 0;
+
+static char const shortopts[] = "bcsw:0::1::2::3::4::5::6::7::8::9::";
  
  static struct option const longopts[] =
  {
    {"bytes", no_argument, nullptr, 'b'},
+  {"characters", no_argument, nullptr, 'c'},
    {"spaces", no_argument, nullptr, 's'},
    {"width", required_argument, nullptr, 'w'},
    {GETOPT_HELP_OPTION_DECL},
@@ -75,6 +85,7 @@ Wrap input lines in each FILE, writing to standard output.\n\
  
        fputs (_("\
    -b, --bytes         count bytes rather than columns\n\
+  -c, --characters    count characters rather than columns\n\
    -s, --spaces        break at spaces\n\
    -w, --width=WIDTH   use WIDTH columns instead of 80\n\
  "), stdout);
@@ -90,24 +101,28 @@ Wrap input lines in each FILE, writing to standard output.\n\
     The first column is 0. */
  
  static size_t
-adjust_column (size_t column, char c)
+adjust_column (size_t column, mcel_t g)
  {
-  if (!count_bytes)
+  if (counting_mode != COUNT_BYTES)
      {
-      if (c == '\b')
+      if (g.ch == '\b')
          {
            if (column > 0)
-            column--;
+            column -= last_character_width;
          }
-      else if (c == '\r')
+      else if (g.ch == '\r')
          column = 0;
-      else if (c == '\t')
+      else if (g.ch == '\t')
          column += TAB_WIDTH - column % TAB_WIDTH;
-      else /* if (isprint (c)) */
-        column++;
+      else /* if (c32isprint (g.ch)) */
+        {
+          last_character_width = (counting_mode == COUNT_CHARACTERS
+                                  ? 1 : c32width (g.ch));
+          column += last_character_width;
+        }
      }
    else
-    column++;
+    column += g.len;
    return column;
  }
  
@@ -119,11 +134,13 @@ static bool
  fold_file (char const *filename, size_t width)
  {
    FILE *istream;
-  int c;
    size_t column = 0;           /* Screen column where next char will go. */
    idx_t offset_out = 0;                /* Index in 'line_out' for next char. */
    static char *line_out = nullptr;
    static idx_t allocated_out = 0;
+  static char *line_in = nullptr;
+  static size_t allocated_in = 0;
+  static ssize_t length_in = 0;
    int saved_errno;
  
    if (STREQ (filename, "-"))
@@ -142,74 +159,90 @@ fold_file (char const *filename, size_t width)
  
    fadvise (istream, FADVISE_SEQUENTIAL);
  
-  while ((c = getc (istream)) != EOF)
+  while (0 <= (length_in = getline (&line_in, &allocated_in, istream)))
      {
-      if (allocated_out - offset_out <= 1)
-        line_out = xpalloc (line_out, &allocated_out, 1, -1, sizeof *line_out);
-
-      if (c == '\n')
+      char *p = line_in;
+      char *lim = p + length_in;
+      mcel_t g;
+      for (; p < lim; p += g.len)
          {
-          line_out[offset_out++] = c;
-          fwrite (line_out, sizeof (char), offset_out, stdout);
-          column = offset_out = 0;
-          continue;
-        }
-
-    rescan:
-      column = adjust_column (column, c);
-
-      if (column > width)
-        {
-          /* This character would make the line too long.
-             Print the line plus a newline, and make this character
-             start the next line. */
-          if (break_spaces)
+          g = mcel_scan (p, lim);
+          if (allocated_out - offset_out <= g.len)
+            line_out = xpalloc (line_out, &allocated_out, g.len, -1,
+                                sizeof *line_out);
+          if (g.ch == '\n')
              {
-              bool found_blank = false;
-              idx_t logical_end = offset_out;
+              memcpy (line_out + offset_out, p, g.len);
+              offset_out += g.len;
+              fwrite (line_out, sizeof (char), offset_out, stdout);
+              column = offset_out = 0;
+              continue;
+            }
+        rescan:
+          column = adjust_column (column, g);
  
-              /* Look for the last blank. */
-              while (logical_end)
+          if (column > width)
+            {
+              /* This character would make the line too long.
+                 Print the line plus a newline, and make this character
+                 start the next line. */
+              if (break_spaces)
                  {
-                  --logical_end;
-                  if (isblank (to_uchar (line_out[logical_end])))
+                  int space_length = 0;
+                  idx_t logical_end = offset_out;
+                  char *logical_p = line_out;
+                  char *logical_lim = logical_p + logical_end;
+
+                  for (mcel_t g2; logical_p < logical_lim; logical_p += g2.len)
                      {
-                      found_blank = true;
-                      break;
+                      g2 = mcel_scan (logical_p, logical_lim);
+                      if (c32isblank (g2.ch))
+                        {
+                          space_length = g2.len;
+                          logical_end = logical_p - line_out;
+                        }
+                    }
+
+                  if (space_length)
+                    {
+                      logical_end += space_length;
+                      /* Found a blank.  Don't output the part after it. */
+                      fwrite (line_out, sizeof (char), logical_end, stdout);
+                      putchar ('\n');
+                      /* Move the remainder to the beginning of the next line.
+                         The areas being copied here might overlap. */
+                      memmove (line_out, line_out + logical_end,
+                               offset_out - logical_end);
+                      offset_out -= logical_end;
+                      column = 0;
+                      char *printed_p = line_out;
+                      char *printed_lim = printed_p + offset_out;
+                      for (mcel_t g2; printed_p < printed_lim;
+                           printed_p += g2.len)
+                        {
+                          g2 = mcel_scan (printed_p, printed_lim);
+                          column = adjust_column (column, g2);
+                        }
+                      goto rescan;
                      }
                  }
  
-              if (found_blank)
+              if (offset_out == 0)
                  {
-                  /* Found a blank.  Don't output the part after it. */
-                  logical_end++;
-                  fwrite (line_out, sizeof (char), logical_end, stdout);
-                  putchar ('\n');
-                  /* Move the remainder to the beginning of the next line.
-                     The areas being copied here might overlap. */
-                  memmove (line_out, line_out + logical_end,
-                           offset_out - logical_end);
-                  offset_out -= logical_end;
-                  column = 0;
-                  for (idx_t i = 0; i < offset_out; i++)
-                    column = adjust_column (column, line_out[i]);
-                  goto rescan;
+                  memcpy (line_out + offset_out, p, g.len);
+                  offset_out += g.len;
+                  continue;
                  }
-            }
  
-          if (offset_out == 0)
-            {
-              line_out[offset_out++] = c;
-              continue;
+              line_out[offset_out++] = '\n';
+              fwrite (line_out, sizeof (char), offset_out, stdout);
+              column = offset_out = 0;
+              goto rescan;
              }
  
-          line_out[offset_out++] = '\n';
-          fwrite (line_out, sizeof (char), offset_out, stdout);
-          column = offset_out = 0;
-          goto rescan;
+          memcpy (line_out + offset_out, p, g.len);
+          offset_out += g.len;
          }
-
-      line_out[offset_out++] = c;
      }
  
    saved_errno = errno;
@@ -249,7 +282,7 @@ main (int argc, char **argv)
  
    atexit (close_stdout);
  
-  break_spaces = count_bytes = have_read_stdin = false;
+  break_spaces = have_read_stdin = false;
  
    while ((optc = getopt_long (argc, argv, shortopts, longopts, nullptr)) != -1)
      {
@@ -258,7 +291,11 @@ main (int argc, char **argv)
        switch (optc)
          {
          case 'b':              /* Count bytes rather than columns. */
-          count_bytes = true;
+          counting_mode = COUNT_BYTES;
+          break;
+
+        case 'c':               /* Count characters rather than columns. */
+          counting_mode = COUNT_CHARACTERS;
            break;
  
          case 's':              /* Break at word boundaries. */
diff --git a/src/local.mk b/src/local.mk

index c7c77a7c95d52d37e2df3b4218f7b63bb822000d..3f93a750787b037cfe88825162e7679a1efda77b 100644 (file)
--- a/src/local.mk
+++ b/src/local.mk
@@ -336,6 +336,9 @@ src_sort_LDADD += $(LIBPMULTITHREAD)
  # for pthread_sigmask
  src_sort_LDADD += $(PTHREAD_SIGMASK_LIB)
  
+# for mbrtowc, mbfile
+src_fold_LDADD += $(LIBC32CONV) $(LIBUNISTRING) $(MBRTOWC_LIB)
+
  # Get the release year from lib/version-etc.c.
  RELEASE_YEAR = \
    `sed -n '/.*COPYRIGHT_YEAR = \([0-9][0-9][0-9][0-9]\) };/s//\1/p' \
diff --git a/tests/fold/fold-characters.sh b/tests/fold/fold-characters.sh

new file mode 100755 (executable)

index 0000000..0b22aad
--- /dev/null
+++ b/tests/fold/fold-characters.sh
@@ -0,0 +1,61 @@
+#!/bin/sh
+# Test fold --characters.
+
+# Copyright (C) 2025 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
+print_ver_ fold printf
+
+test "$LOCALE_FR_UTF8" != none || skip_ "French UTF-8 locale not available"
+
+LC_ALL=$LOCALE_FR_UTF8
+export LC_ALL
+
+# The string "뉐뉐뉐" is 3 characters, but occupies 6 columns.
+env printf '\uB250\uB250\uB250\n' > input1 || framework_failure_
+env printf '\uB250\uB250\n\uB250\n' > column-exp1 || framework_failure_
+
+fold -w 5 input1 > column-out1 || fail=1
+compare column-exp1 column-out1 || fail=1
+
+# Should be the same as the input.
+fold --characters -w 5 input1 > characters-out1 || fail=1
+compare input1 characters-out1 || fail=1
+
+# Test with 50 two-column-wide characters.
+for i in $(seq 50); do
+  env printf '\uFF1A' >> input2 || framework_failure_
+  env printf '\uFF1A' >> column-exp2 || framework_failure_
+  env printf '\uFF1A' >> character-exp2 || framework_failure_
+  if test $(($i % 5)) -eq 0; then
+    env printf '\n' >> column-exp2 || framework_failure_
+  fi
+  if test $(($i % 10)) -eq 0; then
+    env printf '\n' >> character-exp2 || framework_failure_
+  fi
+done
+
+env printf '\n' >> input2 || framework_failure_
+
+# 5 characters per line.
+fold -w 10 input2 > column-out2 || fail=1
+compare column-exp2 column-out2 || fail=1
+
+# 10 characters per line.
+fold --characters -w 10 input2 > character-out2 || fail=1
+compare character-exp2 character-out2 || fail=1
+
+Exit $fail
diff --git a/tests/fold/fold-nbsp.sh b/tests/fold/fold-nbsp.sh

new file mode 100755 (executable)

index 0000000..2c1e8fd
--- /dev/null
+++ b/tests/fold/fold-nbsp.sh
@@ -0,0 +1,41 @@
+#!/bin/sh
+# Test fold --spaces with various Unicode non-breaking space characters.
+
+# Copyright (C) 2025 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
+print_ver_ fold printf
+
+test "$LOCALE_FR_UTF8" != none || skip_ "French UTF-8 locale not available"
+
+LC_ALL=$LOCALE_FR_UTF8
+export LC_ALL
+
+# One U+2007 FIGURE SPACE character.
+env printf 'abcdefghijklmnop\u2007qrstuvwxyz\n' > input1 || framework_failure_
+env printf 'abcdefghij\nklmnop\u2007qrs\ntuvwxyz\n'> exp1 || framework_failure_
+fold --spaces --width 10 input1 > out1 || fail=1
+compare exp1 out1 || fail=1
+
+# Two U+00A0 NO-BREAK SPACE characters.
+env printf 'abcdefghijklmnop\u00A0\u00A0qrstuvwxyz\n' > input2 \
+  || framework_failure_
+env printf 'abcdefghij\nklmnop\u00A0\u00A0qr\nstuvwxyz\n'> exp2 \
+  || framework_failure_
+fold --spaces --width 10 input2 > out2 || fail=1
+compare exp2 out2 || fail=1
+
+Exit $fail
diff --git a/tests/fold/fold-spaces.sh b/tests/fold/fold-spaces.sh

new file mode 100755 (executable)

index 0000000..9ea95bf
--- /dev/null
+++ b/tests/fold/fold-spaces.sh
@@ -0,0 +1,41 @@
+#!/bin/sh
+# Test fold --spaces with various Unicode breaking space characters.
+
+# Copyright (C) 2025 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
+print_ver_ fold printf
+
+test "$LOCALE_FR_UTF8" != none || skip_ "French UTF-8 locale not available"
+
+LC_ALL=$LOCALE_FR_UTF8
+export LC_ALL
+
+# One U+2002 EN SPACE character.
+env printf 'abcdefghijklmnop\u2002qrstuvwxyz\n' > input1 || framework_failure_
+env printf 'abcdefghij\nklmnop\u2002\nqrstuvwxyz\n'> exp1 || framework_failure_
+fold --spaces --width 10 input1 > out1 || fail=1
+compare exp1 out1 || fail=1
+
+# Two U+2003 EM SPACE characters.
+env printf 'abcdefghijklmnop\u2003\u2003qrstuvwxyz\n' > input2 \
+  || framework_failure_
+env printf 'abcdefghij\nklmnop\u2003\u2003\nqrstuvwxyz\n'> exp2 \
+  || framework_failure_
+fold --spaces --width 10 input2 > out2 || fail=1
+compare exp2 out2 || fail=1
+
+Exit $fail
diff --git a/tests/local.mk b/tests/local.mk

index 3fbf442ee6a5402db2669e2733e7d37f545beef5..d883c00ba0cdd61059795daa424db0a8e2a032df 100644 (file)
--- a/tests/local.mk
+++ b/tests/local.mk
@@ -326,6 +326,9 @@ all_tests =                                 \
    tests/factor/factor.pl                       \
    tests/factor/factor-parallel.sh              \
    tests/misc/false-status.sh                   \
+  tests/fold/fold-characters.sh                        \
+  tests/fold/fold-nbsp.sh                      \
+  tests/fold/fold-spaces.sh                    \
    tests/misc/fold.pl                           \
    tests/groups/groups-dash.sh                  \
    tests/groups/groups-process-all.sh           \
author	Collin Funk <collin.funk1@gmail.com>
	Thu, 21 Aug 2025 04:13:52 +0000 (21:13 -0700)
committer	Collin Funk <collin.funk1@gmail.com>
	Sat, 23 Aug 2025 05:09:50 +0000 (22:09 -0700)
NEWS		patch \| blob \| blame \| history
doc/coreutils.texi		patch \| blob \| blame \| history
src/fold.c		patch \| blob \| blame \| history
src/local.mk		patch \| blob \| blame \| history
tests/fold/fold-characters.sh	[new file with mode: 0755]	patch \| blob
tests/fold/fold-nbsp.sh	[new file with mode: 0755]	patch \| blob
tests/fold/fold-spaces.sh	[new file with mode: 0755]	patch \| blob
tests/local.mk		patch \| blob \| blame \| history