numfmt: fix issues with multi-byte blanks

author Pádraig Brady <P@draigBrady.com>

Fri, 17 Oct 2025 18:14:21 +0000 (19:14 +0100)

committer Pádraig Brady <P@draigBrady.com>

Sat, 18 Oct 2025 17:37:10 +0000 (18:37 +0100)
author Pádraig Brady <P@draigBrady.com>
Fri, 17 Oct 2025 18:14:21 +0000 (19:14 +0100)
committer Pádraig Brady <P@draigBrady.com>
Sat, 18 Oct 2025 17:37:10 +0000 (18:37 +0100)
diff --git a/NEWS b/NEWS

index b345132715943aec4f226f0000a2e6c7e4b294d7..f80363f87f3ca3fb18695aabf13b1bb4e15b18d6 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -21,6 +21,9 @@ GNU coreutils NEWS                                    -*- outline -*-
    'numfmt' no longer reads out-of-bounds memory with trailing blanks in input.
    [bug introduced with numfmt in coreutils-8.21]
  
+  'numfmt' no longer outputs invalid characters with multi-byte blanks in input.
+  [bug introduced in coreutils-9.5]
+
    'rm -d DIR' no longer fails on Ceph snapshot directories.
    Although these directories are nonempty, 'rmdir DIR' succeeds on them.
    [bug introduced in coreutils-8.16]
diff --git a/src/numfmt.c b/src/numfmt.c

index 26f9180542d2e81d22a58cee866df74213f869e2..67458558a747d6f9f724882fcc7d2e066a88bf9e 100644 (file)
--- a/src/numfmt.c
+++ b/src/numfmt.c
@@ -1150,7 +1150,7 @@ parse_format_string (char const *fmt)
        errno = 0;
        user_precision = strtol (fmt + i, &endptr, 10);
        if (errno == ERANGE || user_precision < 0 || SIZE_MAX < user_precision
-          || isblank (fmt[i]) || fmt[i] == '+')
+          || c_isblank (fmt[i]) || fmt[i] == '+')
          {
            /* Note we disallow negative user_precision to be
               consistent with printf(1).  POSIX states that
@@ -1340,15 +1340,18 @@ process_suffixed_number (char *text, long double *result,
          devmsg ("no valid suffix found\n");
      }
  
-  /* Skip white space - always.  */
-  char *p = text;
-  while (*p && isblank (to_uchar (*p)))
-    ++p;
+  /* Skip blanks - always.  */
+  char *p = skip_str_matching (text, newline_or_blank, true);
  
    /* setup auto-padding.  */
    if (auto_padding)
      {
-      padding_width = text < p || 1 < field ? strlen (text) : 0;
+      padding_width = text < p || 1 < field
+                      ? mbswidth (text,
+                                  MBSW_REJECT_INVALID | MBSW_REJECT_UNPRINTABLE)
+                      : 0;
+      if (padding_width < 0)
+        padding_width = strlen (text);
        devmsg ("setting Auto-Padding to %jd characters\n", padding_width);
      }
  
@@ -1455,7 +1458,8 @@ process_line (char *line, bool newline)
  
      if (*line != '\0')
        {
-        /* nul terminate the current field string and process */
+        /* NUL terminate the current field string and process */
+        char end_field = *line;
          *line = '\0';
  
          if (! process_field (next, field))
@@ -1463,7 +1467,15 @@ process_line (char *line, bool newline)
  
          fputc ((delimiter == DELIMITER_DEFAULT) ?
                 ' ' : delimiter, stdout);
-        ++line;
+
+        if (delimiter != DELIMITER_DEFAULT)
+          line++;
+        else
+          {
+            *line = end_field;
+            mcel_t g = mcel_scanz (line);
+            line += g.len;
+          }
        }
      else
        {
diff --git a/tests/misc/numfmt.pl b/tests/misc/numfmt.pl

index ff22c730319052a1294bbe0e88b1c1922fcfe892..2f03efd1c14e7a3937f6465084b7a8694da9a78d 100755 (executable)
--- a/tests/misc/numfmt.pl
+++ b/tests/misc/numfmt.pl
@@ -1172,6 +1172,20 @@ my @Locale_Tests =
               {ENV=>"LC_ALL=$locale"}],
       ['lcl-suf-6', "--from=auto '2\xe2\x81\xa0Ki'", {OUT => "2048"},
               {ENV=>"LC_ALL=$locale"}],
+     # multi-byte blank char (em space, \u2003)
+     #   Ensure trailing multi-byte blanks skipped
+     ['lcl-suf-7', "'2\xe2\x80\x83 '", {OUT => "2  "},
+             {ENV=>"LC_ALL=$locale"}],
+     ['lcl-suf-8', "-d '' --from=auto '2Ki\xe2\x80\x83 '", {OUT => "2048"},
+             {ENV=>"LC_ALL=$locale"}],
+     #   Ensure multi-byte blank field separators not corrupted
+     ['lcl-suf-9',  "--field=1 '1\xe2\x80\x832'", {OUT => "1 2"},
+             {ENV=>"LC_ALL=$locale"}],
+     ['lcl-suf-10', "--field=2 '1\xe2\x80\x832'", {OUT => "1 2"},
+             {ENV=>"LC_ALL=$locale"}],
+     #   Ensure multi-byte blank field separators width determined correctly
+     ['lcl-suf-11', "--field=2 '1 \xe2\x80\x832'",
+             {OUT => "1  2"}, {ENV=>"LC_ALL=$locale"}],
  
    );
  if ($locale ne 'C')
author	Pádraig Brady <P@draigBrady.com>
	Fri, 17 Oct 2025 18:14:21 +0000 (19:14 +0100)
committer	Pádraig Brady <P@draigBrady.com>
	Sat, 18 Oct 2025 17:37:10 +0000 (18:37 +0100)
NEWS		patch | blob | blame | history
src/numfmt.c		patch | blob | blame | history
tests/misc/numfmt.pl		patch | blob | blame | history