numfmt: add --unit-separator

author Pádraig Brady <P@draigBrady.com>

Thu, 9 Oct 2025 13:24:12 +0000 (14:24 +0100)

committer Pádraig Brady <P@draigBrady.com>

Fri, 17 Oct 2025 18:16:26 +0000 (19:16 +0100)
author Pádraig Brady <P@draigBrady.com>
Thu, 9 Oct 2025 13:24:12 +0000 (14:24 +0100)
committer Pádraig Brady <P@draigBrady.com>
Fri, 17 Oct 2025 18:16:26 +0000 (19:16 +0100)
diff --git a/NEWS b/NEWS

index a07fe298cdb4ca71845d284b9b1a449e43498152..b345132715943aec4f226f0000a2e6c7e4b294d7 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -33,6 +33,11 @@ GNU coreutils NEWS                                    -*- outline -*-
    that use the GNU extension /NUM or +NUM formats.
    [bug introduced in coreutils-8.28]
  
+** New Features
+
+  'numfmt' now accepts the --unit-separator=SEP option, to output or accept
+  a separator between the number and unit.  For example: "1234 M".
+
  ** Improvements
  
    numfmt now parses numbers with a non-breaking space character before a unit,
diff --git a/THANKS.in b/THANKS.in

index 8c97a81388bdf053c3962a76feee02f41607639e..8f6af1b61219126f8246106986fb69f0def31cdc 100644 (file)
--- a/THANKS.in
+++ b/THANKS.in
@@ -315,6 +315,7 @@ Joey Hess                           joeyh@debian.org
  Johan Boule                         bohan@bohan.dyndns.org
  Johan Danielsson                    joda@pdc.kth.se
  Johannes Altmanninger               aclopte@gmail.com
+Johannes Schauer Marin Rodrigues    josch@debian.org
  John Bley                           jbb6@acpub.duke.edu
  John Gatewood Ham                   zappaman@alphabox.compsci.buu.ac.th
  John Gotts                          jgotts@umich.edu
diff --git a/doc/coreutils.texi b/doc/coreutils.texi

index b50e5f7246758c9a86b78932413fca7ae9ea049f..89534db726ccc2dfa43637274f4fc7637f026844 100644 (file)
--- a/doc/coreutils.texi
+++ b/doc/coreutils.texi
@@ -19544,6 +19544,23 @@ the output numbers represent other units (e.g. to represent @samp{4,000,000}
  bytes in blocks of 1kB, use @samp{--to=si --to-unit=1000}).
  Suffixes are handled as with @samp{--from=auto}.
  
+@item --unit-separator=@var{sep}
+@opindex --unit-separator
+Support a separator @var{sep} between the number and unit,
+with @option{--from} or @option{--to} auto-scaled units.
+By default a blank or non-breaking space character is accepted on input,
+and no separator is printed on output.
+When parsing input, the specified unit separator has lower precedence
+than field delimiters.  See the @option{--delimiter} option above.
+
+Examples:
+@example
+Add a space on output: @option{--unit-separator=' '}
+Disable blanks on input: @option{--unit-separator=''}
+Support blanks on input: @option{--delimiter=''}
+Ditto and output non-breaking space: @option{-d '' --unit-separator=$'\u00A0'}
+@end example
+
  @optZeroTerminated
  @newlineFieldSeparator
  
diff --git a/src/numfmt.c b/src/numfmt.c

index 1a744770faed32bbc78cfb193956d4dbbdcfe98c..26f9180542d2e81d22a58cee866df74213f869e2 100644 (file)
--- a/src/numfmt.c
+++ b/src/numfmt.c
@@ -60,7 +60,8 @@ enum
    DEV_DEBUG_OPTION,
    HEADER_OPTION,
    FORMAT_OPTION,
-  INVALID_OPTION
+  INVALID_OPTION,
+  UNIT_SEPARATOR_OPTION
  };
  
  enum scale_type
@@ -140,6 +141,7 @@ static struct option const longopts[] =
    {"round", required_argument, nullptr, ROUND_OPTION},
    {"padding", required_argument, nullptr, PADDING_OPTION},
    {"suffix", required_argument, nullptr, SUFFIX_OPTION},
+  {"unit-separator", required_argument, nullptr, UNIT_SEPARATOR_OPTION},
    {"grouping", no_argument, nullptr, GROUPING_OPTION},
    {"delimiter", required_argument, nullptr, 'd'},
    {"field", required_argument, nullptr, FIELD_OPTION},
@@ -172,6 +174,7 @@ static enum scale_type scale_to = scale_none;
  static enum round_type round_style = round_from_zero;
  static enum inval_type inval_style = inval_abort;
  static char const *suffix = nullptr;
+static char const *unit_separator = nullptr;
  static uintmax_t from_unit_size = 1;
  static uintmax_t to_unit_size = 1;
  static int grouping = 0;
@@ -658,10 +661,24 @@ simple_strtod_human (char const *input_str,
      {
        /* process suffix.  */
  
-      /* Skip a single blank or NBSP between the number and suffix.  */
-      mcel_t g = mcel_scanz (*endptr);
-      if (c32isblank (g.ch) || c32isnbspace (g.ch))
-        (*endptr) += g.len;
+      /* Skip a single blank, NBSP or specified unit separator.
+         Note an explicit empty --unit-separator disables blank matching.  */
+      bool matched_unit_sep = false;
+      if (unit_separator)
+        {
+          size_t sep_len = strlen (unit_separator);
+          if (STREQ_LEN (*endptr, unit_separator, sep_len))
+            {
+              matched_unit_sep = true;
+              (*endptr) += sep_len;
+            }
+        }
+      if (!matched_unit_sep)
+        {
+          mcel_t g = mcel_scanz (*endptr);
+          if (c32isblank (g.ch) || c32isnbspace (g.ch))
+            (*endptr) += g.len;
+        }
  
        if (**endptr == '\0')
          break;  /* Treat as no suffix.  */
@@ -768,7 +785,7 @@ double_to_human (long double val, int precision,
                   char *buf, idx_t buf_size,
                   enum scale_type scale, int group, enum round_type round)
  {
-  char fmt[sizeof "%'0.*Lfi%s%s%s" + INT_STRLEN_BOUND (zero_padding_width)];
+  char fmt[sizeof "%'0.*Lfi%s%s%s%s" + INT_STRLEN_BOUND (zero_padding_width)];
    char *pfmt = fmt;
    *pfmt++ = '%';
  
@@ -835,11 +852,12 @@ double_to_human (long double val, int precision,
  
    devmsg ("  after rounding, value=%Lf * %0.f ^ %d\n", val, scale_base, power);
  
-  strcpy (pfmt, ".*Lf%s%s%s");
+  strcpy (pfmt, ".*Lf%s%s%s%s");
  
    int prec = user_precision == -1 ? show_decimal_point : user_precision;
  
    return snprintf (buf, buf_size, fmt, prec, val,
+                   (power > 0 && unit_separator) ? unit_separator : "",
                     power == 1 && scale == scale_SI
                     ? "k" : suffix_power_char (power),
                     &"i"[! (scale == scale_IEC_I && 0 < power)],
@@ -954,6 +972,10 @@ Reformat NUMBER(s), or the numbers from standard input if none are specified.\n\
        fputs (_("\
        --suffix=SUFFIX  add SUFFIX to output numbers, and accept optional\n\
                           SUFFIX in input numbers\n\
+"), stdout);
+      fputs (_("\
+      --unit-separator=SEP  insert SEP between number and unit on output,\n\
+                         and accept optional SEP in input numbers\n\
  "), stdout);
        fputs (_("\
        --to=UNIT        auto-scale output numbers to UNITs; see UNIT below\n\
@@ -1556,6 +1578,10 @@ main (int argc, char **argv)
            suffix = optarg;
            break;
  
+        case UNIT_SEPARATOR_OPTION:
+          unit_separator = optarg;
+          break;
+
          case DEBUG_OPTION:
            debug = true;
            break;
@@ -1607,6 +1633,10 @@ main (int argc, char **argv)
        && !grouping && (padding_width == 0) && (format_str == nullptr))
      error (0, 0, _("no conversion option specified"));
  
+  if (debug && unit_separator && delimiter == DELIMITER_DEFAULT)
+    error (0, 0,
+           _("field delimiters have higher precedence than unit separators"));
+
    if (format_str)
      parse_format_string (format_str);
  
diff --git a/tests/misc/numfmt.pl b/tests/misc/numfmt.pl

index 1d3c4202c76597069633573e50954bbf36c48a48..ff22c730319052a1294bbe0e88b1c1922fcfe892 100755 (executable)
--- a/tests/misc/numfmt.pl
+++ b/tests/misc/numfmt.pl
@@ -173,6 +173,77 @@ my @Tests =
       ['suf-24', "-d '' --from=auto '2  '", {OUT=>'2'}],
       ['suf-25', "-d '' --from=auto '2K '", {OUT=>'2000'}],
  
+     ## Unit Separator
+     # Output with space separator
+     ['unit-sep-1', '--to=si --unit-separator=" " 1000',  {OUT=>"1.0 k"}],
+     ['unit-sep-2', '--to=iec --unit-separator=" " 1024', {OUT=>"1.0 K"}],
+     ['unit-sep-3', '--to=iec-i --unit-separator=" " 2048', {OUT=>"2.0 Ki"}],
+
+     # Output with multi-character separator
+     ['unit-sep-4', '--to=si --unit-separator="__" 1000', {OUT=>"1.0__k"}],
+     ['unit-sep-5', '--to=iec --unit-separator="::" 2048', {OUT=>"2.0::K"}],
+
+     # Input with space separator
+     ['unit-sep-6', '-d "" --from=si --unit-sep=" " "1 K"', {OUT=>"1000"}],
+     ['unit-sep-7', '-d "" --from=iec --unit-sep=" " "2 M"', {OUT=>"2097152"}],
+
+     # Input with multi-character separator
+     ['unit-sep-8', '-d "" --from=si --unit-separator="  "',
+      {IN_PIPE=>"1  K\n2  M\n3  G\n"},
+      {OUT=>"1000\n2000000\n3000000000"}],
+     ['unit-sep-9', '--from=iec --unit-separator="'."\xC2\xA0".'"',
+      {IN_PIPE=>"4\xC2\xA0K\n"}, {OUT=>"4096"}],
+     ['unit-sep-10', '--from=iec --unit-separator="::"',
+      {IN_PIPE=>"4::K\n"}, {OUT=>"4096"}],
+
+     # Input with empty separator
+     ['unit-sep-11', '-d "" --from=si --unit-separator=""',
+      {IN_PIPE=>"1K\n2M\n3G\n"},
+      {OUT=>"1000\n2000000\n3000000000"}],
+     ['unit-sep-12', '-d "" --from=si --unit-separator="" "1 K"',
+      {ERR=>"$prog: invalid suffix in input: '1 K'\n"},
+      {EXIT=>2}],
+
+     # Combined with suffix
+     ['unit-sep-13', '--to=si --unit-separator=" " --suffix=B 1000',
+      {OUT=>"1.0 kB"}],
+     ['unit-sep-14', '--to=si --unit-separator=" " --suffix=" B" 1000',
+      {OUT=>"1.0 k B"}],
+     ['unit-sep-15', '-d "" --from=si --unit-separator=" " --suffix=B',
+      {IN_PIPE=>"5 KB\n"}, {OUT=>"5000B"}],
+
+     # No separator when there's no unit (power=0)
+     ['unit-sep-16', '--to=si --unit-separator=" " 500', {OUT=>"500"}],
+
+     # Round-trip test
+     ['unit-sep-17', '--from=iec --to=iec --unit-separator="_"',
+      {IN_PIPE=>"1_K\n"}, {OUT=>"1.0_K"}],
+
+     # Currently field delimiters have higher precedence than unit separators.
+     # Even if this is changed in future, the following should hold.
+
+     # The space should act as a field delimiter here
+     ['unit-sep-18', '--from=si --unit-separator=" " "1 K_Field2"',
+      {OUT=>"1 K_Field2"}],
+     # Same as above but with 'i' suffix - should split at space with --from=si
+     ['unit-sep-19', '--from=si --unit-separator=" " "5 Ki_Field2"',
+      {OUT=>"5 Ki_Field2"}],
+     # With --from=auto, Ki followed by invalid char should also split
+     ['unit-sep-20', '--from=auto --unit-separator=" " "5 Ki_Field2"',
+      {OUT=>"5 Ki_Field2"}],
+     # With custom delimiter, space after K should not be treated as delimiter
+     ['unit-sep-21', '-d: --from=si --unit-separator=" " "5 K:Field2"',
+      {OUT=>"5000:Field2"}],
+     # Fail case: space after K with custom delimiter should error
+     ['unit-sep-22-fail', '-d: --from=si --unit-separator=" " "5 K Field2"',
+      {ERR=>"$prog: invalid suffix in input '5 K Field2': 'Field2'\n"},
+      {EXIT=>2}],
+
+     # If the unit separator were consumed before the delimiter char,
+     # this would change to outputting "5000 2"
+     ['unit-sep-23', '--from=si --field=1 --unit-separator=" " -d " " "5 K 2"',
+      {OUT=>"5 K 2"}],
+
       ## GROUPING
  
       # "C" locale - no grouping (locale-specific tests, below)
author	Pádraig Brady <P@draigBrady.com>
	Thu, 9 Oct 2025 13:24:12 +0000 (14:24 +0100)
committer	Pádraig Brady <P@draigBrady.com>
	Fri, 17 Oct 2025 18:16:26 +0000 (19:16 +0100)
NEWS		patch \| blob \| blame \| history
THANKS.in		patch \| blob \| blame \| history
doc/coreutils.texi		patch \| blob \| blame \| history
src/numfmt.c		patch \| blob \| blame \| history
tests/misc/numfmt.pl		patch \| blob \| blame \| history