numfmt: support multi-byte --delimiter

author Pádraig Brady <P@draigBrady.com>

Sat, 18 Oct 2025 16:44:49 +0000 (17:44 +0100)

committer Pádraig Brady <P@draigBrady.com>

Mon, 20 Oct 2025 12:13:24 +0000 (13:13 +0100)
author Pádraig Brady <P@draigBrady.com>
Sat, 18 Oct 2025 16:44:49 +0000 (17:44 +0100)
committer Pádraig Brady <P@draigBrady.com>
Mon, 20 Oct 2025 12:13:24 +0000 (13:13 +0100)
diff --git a/NEWS b/NEWS

index f80363f87f3ca3fb18695aabf13b1bb4e15b18d6..5d70543a16d4b786d4919b5e0c6e64ce7af1cf24 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -44,7 +44,8 @@ GNU coreutils NEWS                                    -*- outline -*-
  ** Improvements
  
    numfmt now parses numbers with a non-breaking space character before a unit,
-  and numbers containing grouping characters from the current locale.
+  and parses numbers containing grouping characters from the current locale.
+  It also supports a multi-byte --delimiter character.
  
    wc -l now operates 10% faster on hosts that support AVX512 instructions.
  
diff --git a/bootstrap.conf b/bootstrap.conf

index 8f91943417c0e333fe5a223868ab624bb0e9c4ad..5125d66970b3bc8b85e18c1107e92424b0c2d421 100644 (file)
--- a/bootstrap.conf
+++ b/bootstrap.conf
@@ -176,6 +176,7 @@ gnulib_modules="
    mbs_endswith
    mbschr
    mbslen
+  mbsstr
    mbswidth
    mbszero
    mcel-prefer
diff --git a/src/numfmt.c b/src/numfmt.c

index cc80ccc5d85e733d0c8ace7a455e932680fd85dd..0f0a8770b59c7085187cb265d215d0e029365555 100644 (file)
--- a/src/numfmt.c
+++ b/src/numfmt.c
@@ -156,9 +156,6 @@ static struct option const longopts[] =
    {nullptr, 0, nullptr, 0}
  };
  
-/* If delimiter has this value, blanks separate fields.  */
-enum { DELIMITER_DEFAULT = CHAR_MAX + 1 };
-
  /* Maximum number of digits we can safely handle
     without precision loss, if scaling is 'none'.  */
  enum { MAX_UNSCALED_DIGITS = LDBL_DIG };
@@ -194,8 +191,8 @@ static int conv_exit_code = EXIT_CONVERSION_WARNINGS;
  /* auto-pad each line based on skipped whitespace.  */
  static int auto_padding = 0;
  
-/* field delimiter */
-static int delimiter = DELIMITER_DEFAULT;
+/* field delimiter - if nullptr, blanks separate fields.  */
+static char const *delimiter = nullptr;
  
  /* line delimiter.  */
  static unsigned char line_delim = '\n';
@@ -1374,14 +1371,10 @@ next_field (char **line)
    char *field_start = *line;
    char *field_end   = field_start;
  
-  if (delimiter != DELIMITER_DEFAULT)
+  if (delimiter)
      {
-      if (*field_start != delimiter)
-        {
-          while (*field_end && *field_end != delimiter)
-            ++field_end;
-        }
-      /* else empty field */
+      if (! *delimiter || ! (field_end = mbsstr (field_start, delimiter)))
+        field_end = strchr (field_start, '\0');
      }
    else
      {
@@ -1462,11 +1455,13 @@ process_line (char *line, bool newline)
          if (! process_field (next, field))
            valid_number = false;
  
-        fputc ((delimiter == DELIMITER_DEFAULT) ?
-               ' ' : delimiter, stdout);
+        if (delimiter != nullptr)
+          fputs (delimiter, stdout);
+        else
+          fputc (' ', stdout);
  
-        if (delimiter != DELIMITER_DEFAULT)
-          line++;
+        if (delimiter)
+          line += MAX (strlen (delimiter), 1);
          else
            {
              *line = end_field;
@@ -1573,10 +1568,17 @@ main (int argc, char **argv)
  
          case 'd':
            /* Interpret -d '' to mean 'use the NUL byte as the delimiter.'  */
-          if (optarg[0] != '\0' && optarg[1] != '\0')
-            error (EXIT_FAILURE, 0,
-                   _("the delimiter must be a single character"));
-          delimiter = optarg[0];
+          if (optarg[0] != '\0')
+            {
+              mcel_t g = mcel_scanz (optarg);
+              /* Note we always allow single bytes, especially since mcel
+                 explicitly does not avoid https://sourceware.org/PR29511.
+                 I.e., we ignore g.err, and rely on g.len==1 with g.err.  */
+              if (optarg[g.len] != '\0')
+                error (EXIT_FAILURE, 0,
+                       _("the delimiter must be a single character"));
+            }
+          delimiter = optarg;
            break;
  
          case 'z':
@@ -1642,7 +1644,7 @@ main (int argc, char **argv)
        && !grouping && (padding_width == 0) && (format_str == nullptr))
      error (0, 0, _("no conversion option specified"));
  
-  if (debug && unit_separator && delimiter == DELIMITER_DEFAULT)
+  if (debug && unit_separator && delimiter == nullptr)
      error (0, 0,
             _("field delimiters have higher precedence than unit separators"));
  
@@ -1657,7 +1659,7 @@ main (int argc, char **argv)
          error (0, 0, _("grouping has no effect in this locale"));
      }
  
-  auto_padding = (padding_width == 0 && delimiter == DELIMITER_DEFAULT);
+  auto_padding = (padding_width == 0 && delimiter == nullptr);
  
    if (inval_style != inval_abort)
      conv_exit_code = 0;
diff --git a/tests/misc/numfmt.pl b/tests/misc/numfmt.pl

index 2f03efd1c14e7a3937f6465084b7a8694da9a78d..75de1a9f97ecac96b0718263aa7e6e08cfb2a59e 100755 (executable)
--- a/tests/misc/numfmt.pl
+++ b/tests/misc/numfmt.pl
@@ -283,6 +283,9 @@ my @Tests =
       ['delim-4', '--delimiter=: --from=auto 40M:60M',  {OUT=>'40000000:60M'}],
       ['delim-5', '-d: --field=2 --from=auto :40M:60M',  {OUT=>':40000000:60M'}],
       ['delim-6', '-d: --field 3 --from=auto 40M:60M', {OUT=>"40M:60M"}],
+     # Ensure we don't hit https://sourceware.org/PR29511
+     ['delim-7', "-d '\xc2' --field=2 --invalid=ignore '1\xc2\xb72K'",
+             {OUT => "1\xc2\xb72K"}],
       ['delim-err-1', '-d,, --to=si 1', {EXIT=>1},
               {ERR => "$prog: the delimiter must be a single character\n"}],
  
@@ -1187,6 +1190,10 @@ my @Locale_Tests =
       ['lcl-suf-11', "--field=2 '1 \xe2\x80\x832'",
               {OUT => "1  2"}, {ENV=>"LC_ALL=$locale"}],
  
+     # Support multi-byte delimiter
+     ['lcl-delim-1', "-d '\xc2\xb7' --field=2 --from=auto '1\xc2\xb72K'",
+             {OUT => "1\xc2\xb72000"}, {ENV=>"LC_ALL=$locale"}],
+
    );
  if ($locale ne 'C')
    {
author	Pádraig Brady <P@draigBrady.com>
	Sat, 18 Oct 2025 16:44:49 +0000 (17:44 +0100)
committer	Pádraig Brady <P@draigBrady.com>
	Mon, 20 Oct 2025 12:13:24 +0000 (13:13 +0100)
NEWS		patch | blob | blame | history
bootstrap.conf		patch | blob | blame | history
src/numfmt.c		patch | blob | blame | history
tests/misc/numfmt.pl		patch | blob | blame | history