numfmt: support reading numbers with NBSP before unit

author Pádraig Brady <P@draigBrady.com>

Tue, 14 Oct 2025 15:17:56 +0000 (16:17 +0100)

committer Pádraig Brady <P@draigBrady.com>

Fri, 17 Oct 2025 16:26:25 +0000 (17:26 +0100)
author Pádraig Brady <P@draigBrady.com>
Tue, 14 Oct 2025 15:17:56 +0000 (16:17 +0100)
committer Pádraig Brady <P@draigBrady.com>
Fri, 17 Oct 2025 16:26:25 +0000 (17:26 +0100)
diff --git a/NEWS b/NEWS

index e6053a04b3d78746eddccd64f11fdc2b09f10a23..40d443942e4c6b7667ade95d48e58d3c46c3be2a 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -35,6 +35,8 @@ GNU coreutils NEWS                                    -*- outline -*-
  
  ** Improvements
  
+  numfmt now parses numbers with a non-breaking space character before a unit.
+
    wc -l now operates 10% faster on hosts that support AVX512 instructions.
  
  
diff --git a/doc/coreutils.texi b/doc/coreutils.texi

index 26c9209a32fb388985429631e8a5898f9be6f987..b50e5f7246758c9a86b78932413fca7ae9ea049f 100644 (file)
--- a/doc/coreutils.texi
+++ b/doc/coreutils.texi
@@ -19447,7 +19447,7 @@ Print (to standard error) warning messages about possible erroneous usage.
  @itemx --delimiter=@var{d}
  @opindex -d
  @opindex --delimiter
-Use the character @var{d} as input field separator (default: whitespace).
+Use the character @var{d} as input field separator (default: newline or blank).
  Using non-default delimiter turns off automatic padding.
  
  @item --field=@var{fields}
diff --git a/src/numfmt.c b/src/numfmt.c

index 0cc12689e2c64aa366035d46a748bef0597c256c..fbf104b5119dc044552bb74ad240a13592938757 100644 (file)
--- a/src/numfmt.c
+++ b/src/numfmt.c
@@ -25,6 +25,7 @@
  #include "argmatch.h"
  #include "c-ctype.h"
  #include "mbswidth.h"
+#include "mcel.h"
  #include "quote.h"
  #include "skipchars.h"
  #include "system.h"
@@ -210,6 +211,11 @@ static int decimal_point_length;
  /* debugging for developers.  Enables devmsg().  */
  static bool dev_debug = false;
  
+static bool
+newline_or_blank (mcel_t g)
+{
+  return g.ch == '\n' || c32isblank (g.ch);
+}
  
  static inline int
  default_scale_base (enum scale_type scale)
@@ -645,15 +651,23 @@ simple_strtod_human (char const *input_str,
      {
        /* process suffix.  */
  
-      /* Skip any blanks between the number and suffix.  */
-      while (isblank (to_uchar (**endptr)))
-        (*endptr)++;
+      /* Skip a single blank or NBSP between the number and suffix.  */
+      mcel_t g = mcel_scanz (*endptr);
+      if (c32isblank (g.ch) || c32isnbspace (g.ch))
+        (*endptr) += g.len;
  
        if (**endptr == '\0')
          break;  /* Treat as no suffix.  */
  
        if (!valid_suffix (**endptr))
-        return SSE_INVALID_SUFFIX;
+        {
+          /* Trailing blanks and newlines are allowed.  */
+          *endptr = skip_str_matching (*endptr, newline_or_blank, true);
+          if (**endptr == '\0')
+            break;
+
+          return SSE_INVALID_SUFFIX;
+        }
  
        if (allowed_scaling == scale_none)
          return SSE_VALID_BUT_FORBIDDEN_SUFFIX;
@@ -680,6 +694,9 @@ simple_strtod_human (char const *input_str,
  
        *precision = 0;  /* Reset, to select precision based on scale.  */
  
+      /* Trailing blanks and newlines are allowed.  */
+      *endptr = skip_str_matching (*endptr, newline_or_blank, true);
+
        break;
      }
  
@@ -1320,12 +1337,6 @@ process_suffixed_number (char *text, long double *result,
    return (e == SSE_OK || e == SSE_OK_PRECISION_LOSS);
  }
  
-static bool
-newline_or_blank (mcel_t g)
-{
-  return g.ch == '\n' || c32isblank (g.ch);
-}
-
  /* Return a pointer to the beginning of the next field in line.
     The line pointer is moved to the end of the next field. */
  static char*
diff --git a/tests/misc/numfmt.pl b/tests/misc/numfmt.pl

index 4dd9718c9cbf1ab52390556b4eb70d83319ec46e..85c888cd8e3954cccf535d1bed5904ab80a21e16 100755 (executable)
--- a/tests/misc/numfmt.pl
+++ b/tests/misc/numfmt.pl
@@ -164,6 +164,14 @@ my @Tests =
        '--suffix=Foo' . 'x' x 122 . 'y 0',
        {OUT => '0Foo' . 'x' x 122 . 'y'}],
       ['suf-21', "-d '' --from=si '4  '",         {OUT => "4"}],
+     # Multiple spaces between number and suffix should be rejected
+     ['suf-22', "-d '' --from=auto '2  K'",
+             {ERR => "$prog: invalid suffix in input: '2  K'\n"},
+             {EXIT => 2}],
+     # Trailing spaces should be accepted
+     ['suf-23', "-d '' --from=auto '2 '",  {OUT=>'2'}],
+     ['suf-24', "-d '' --from=auto '2  '", {OUT=>'2'}],
+     ['suf-25', "-d '' --from=auto '2K '", {OUT=>'2000'}],
  
       ## GROUPING
  
@@ -1067,6 +1075,21 @@ my @Locale_Tests =
       ['lcl-fmt-7', '--format="%0\'\'6f" 1234',{OUT=>"01${lg}234"},
               {ENV=>"LC_ALL=$locale"}],
  
+     # A single blank or NBSP is accepted between number and suffix
+     ['lcl-suf-1', "-d '' --from=auto '2 K'",      {OUT => "2000"},
+             {ENV=>"LC_ALL=$locale"}],
+     ['lcl-suf-2', "-d '' --from=auto '2\tK'",      {OUT => "2000"},
+             {ENV=>"LC_ALL=$locale"}],
+     # NBSP characters: U+00A0, U+2007, U+202F, U+2060
+     ['lcl-suf-3', "--from=auto '2\xc2\xa0K'", {OUT => "2000"},
+             {ENV=>"LC_ALL=$locale"}],
+     ['lcl-suf-4', "--from=auto '2\xe2\x80\x87Ki'", {OUT => "2048"},
+             {ENV=>"LC_ALL=$locale"}],
+     ['lcl-suf-5', "--from=auto '2\xe2\x80\xafK'", {OUT => "2000"},
+             {ENV=>"LC_ALL=$locale"}],
+     ['lcl-suf-6', "--from=auto '2\xe2\x81\xa0Ki'", {OUT => "2048"},
+             {ENV=>"LC_ALL=$locale"}],
+
    );
  if ($locale ne 'C')
    {
author	Pádraig Brady <P@draigBrady.com>
	Tue, 14 Oct 2025 15:17:56 +0000 (16:17 +0100)
committer	Pádraig Brady <P@draigBrady.com>
	Fri, 17 Oct 2025 16:26:25 +0000 (17:26 +0100)
NEWS		patch \| blob \| blame \| history
doc/coreutils.texi		patch \| blob \| blame \| history
src/numfmt.c		patch \| blob \| blame \| history
tests/misc/numfmt.pl		patch \| blob \| blame \| history