numfmt: implement support for field ranges

author Dylan Cali <calid1984@gmail.com>

Fri, 5 Sep 2014 09:42:02 +0000 (04:42 -0500)

committer Pádraig Brady <P@draigBrady.com>

Fri, 19 Jun 2015 18:59:21 +0000 (19:59 +0100)
author Dylan Cali <calid1984@gmail.com>
Fri, 5 Sep 2014 09:42:02 +0000 (04:42 -0500)
committer Pádraig Brady <P@draigBrady.com>
Fri, 19 Jun 2015 18:59:21 +0000 (19:59 +0100)
diff --git a/NEWS b/NEWS

index 9d69da3301aaae5337917ba8e196b57599c26a41..9c551d5149a9f8117da5e8285ad0ef9aff68840d 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -70,6 +70,8 @@ GNU coreutils NEWS                                    -*- outline -*-
    dd accepts a new status=progress level to print data transfer statistics
    on stderr approximately every second.
  
+  numfmt can now process multiple fields using field ranges similar to cut.
+
    split accepts a new --separator option to select a record separator character
    other than the default newline character.
  
diff --git a/bootstrap.conf b/bootstrap.conf

index 320e7f581a348cc98809ed66afcb6fc82d29235a..5b6ec58e5047b4b00582805c0e4e9ee97450387a 100644 (file)
--- a/bootstrap.conf
+++ b/bootstrap.conf
@@ -34,6 +34,7 @@ gnulib_modules="
    argv-iter
    assert
    autobuild
+  linked-list
    backupfile
    base64
    buffer-lcm
@@ -270,6 +271,7 @@ gnulib_modules="
    xgetcwd
    xgetgroups
    xgethostname
+  xlist
    xmemcoll
    xnanosleep
    xprintf
diff --git a/doc/coreutils.texi b/doc/coreutils.texi

index 08316c9289a3f8312436c20097dbf544109d3f1d..9197cb426d9fba0563f92de739240a7dbf929d15 100644 (file)
--- a/doc/coreutils.texi
+++ b/doc/coreutils.texi
@@ -16892,9 +16892,19 @@ Print (to standard error) warning messages about possible erroneous usage.
  Use the character @var{d} as input field separator (default: whitespace).
  @emph{Note}: Using non-default delimiter turns off automatic padding.
  
-@item --field=@var{n}
+@item --field=@var{fields}
  @opindex --field
-Convert the number in input field @var{n} (default: 1).
+Convert the numbers in the input fields selected by @var{fields} (default: 1).
+@var{fields} supports @command{cut} style field ranges:
+
+@example
+N    N'th field, counted from 1
+N-   from N'th field, to end of line
+N-M  from N'th to M'th field (inclusive)
+-M   from first to M'th field (inclusive)
+-    all fields
+@end example
+
  
  @item --format=@var{format}
  @opindex --format
diff --git a/gnulib b/gnulib

index 9a417cf7d48fa231c937c53626da6c45d09e6b3e..d0302f003873b8c633d2023ab98aa6c4045b32e8 160000 (submodule)
--- a/gnulib
+++ b/gnulib
@@ -1 +1 @@
-Subproject commit 9a417cf7d48fa231c937c53626da6c45d09e6b3e
+Subproject commit d0302f003873b8c633d2023ab98aa6c4045b32e8
diff --git a/src/numfmt.c b/src/numfmt.c

index c03329f0416b6c366d30f20c80b19a6756a985b2..18243dd9f9257ee0cfbee68eb2742fe0804af612 100644 (file)
--- a/src/numfmt.c
+++ b/src/numfmt.c
@@ -29,6 +29,8 @@
  #include "system.h"
  #include "xstrtol.h"
  #include "xstrndup.h"
+#include "gl_linked_list.h"
+#include "gl_xlist.h"
  
  /* The official name of this program (e.g., no 'g' prefix).  */
  #define PROGRAM_NAME "numfmt"
@@ -182,7 +184,10 @@ static int conv_exit_code = EXIT_CONVERSION_WARNINGS;
  /* auto-pad each line based on skipped whitespace.  */
  static int auto_padding = 0;
  static mbs_align_t padding_alignment = MBS_ALIGN_RIGHT;
-static long int field = 1;
+static bool all_fields = false;
+static size_t all_fields_after = 0;
+static size_t all_fields_before = 0;
+static gl_list_t field_list;
  static int delimiter = DELIMITER_DEFAULT;
  
  /* if non-zero, the first 'header' lines from STDIN are skipped.  */
@@ -854,7 +859,8 @@ Reformat NUMBER(s), or the numbers from standard input if none are specified.\n\
    -d, --delimiter=X    use X instead of whitespace for field delimiter\n\
  "), stdout);
        fputs (_("\
-      --field=N        replace the number in input field N (default is 1)\n\
+      --field=FIELDS   replace the numbers in these input fields (default=1)\n\
+                         see FIELDS below\n\
  "), stdout);
        fputs (_("\
        --format=FORMAT  use printf style floating-point FORMAT;\n\
@@ -932,6 +938,16 @@ UNIT options:\n"), stdout);
                 1Mi = 1048576,\n\
                 ...\n"), stdout);
  
+      fputs (_("\n\
+FIELDS supports cut(1) style field ranges:\n\
+  N    N'th field, counted from 1\n\
+  N-   from N'th field, to end of line\n\
+  N-M  from N'th to M'th field (inclusive)\n\
+  -M   from first to M'th field (inclusive)\n\
+  -    all fields\n\
+Multiple fields/ranges can be separated with commas\n\
+"), stdout);
+
        fputs (_("\n\
  FORMAT must be suitable for printing one floating-point argument '%f'.\n\
  Optional quote (%'f) will enable --grouping (if supported by current locale).\n\
@@ -960,7 +976,7 @@ Examples:\n\
             -> \"1000\"\n\
    $ echo 1K | %s --from=iec\n\
             -> \"1024\"\n\
-  $ df -B1 | %s --header --field 2 --to=si\n\
+  $ df -B1 | %s --header --field 2-4 --to=si\n\
    $ ls -l  | %s --header --field 5 --to=iec\n\
    $ ls -lh | %s --header --field 5 --from=iec --padding=10\n\
    $ ls -lh | %s --header --field 5 --from=iec --format %%10f\n"),
@@ -1182,7 +1198,8 @@ print_padded_number (void)
  /* Converts the TEXT number string to the requested representation,
     and handles automatic suffix addition.  */
  static int
-process_suffixed_number (char *text, long double *result, size_t *precision)
+process_suffixed_number (char *text, long double *result,
+                         size_t *precision, long int field)
  {
    if (suffix && strlen (text) > strlen (suffix))
      {
@@ -1233,139 +1250,253 @@ process_suffixed_number (char *text, long double *result, size_t *precision)
    return (e == SSE_OK || e == SSE_OK_PRECISION_LOSS);
  }
  
-/* Skip the requested number of fields in the input string.
-   Returns a pointer to the *delimiter* of the requested field,
-   or a pointer to NUL (if reached the end of the string).  */
-static inline char * _GL_ATTRIBUTE_PURE
-skip_fields (char *buf, int fields)
+typedef struct range_pair
  {
-  char *ptr = buf;
-  if (delimiter != DELIMITER_DEFAULT)
-    {
-      if (*ptr == delimiter)
-        fields--;
-      while (*ptr && fields--)
-        {
-          while (*ptr && *ptr == delimiter)
-            ++ptr;
-          while (*ptr && *ptr != delimiter)
-            ++ptr;
-        }
-    }
-  else
-    while (*ptr && fields--)
-      {
-        while (*ptr && isblank (to_uchar (*ptr)))
-          ++ptr;
-        while (*ptr && !isblank (to_uchar (*ptr)))
-          ++ptr;
-      }
-  return ptr;
+  size_t lo;
+  size_t hi;
+} range_pair_t;
+
+static int
+sort_field (const void *elt1, const void *elt2)
+{
+  range_pair_t* rp1 = (range_pair_t*) elt1;
+  range_pair_t* rp2 = (range_pair_t*) elt2;
+
+  if (rp1->lo < rp2->lo)
+    return -1;
+
+  return rp1->lo > rp2->lo;
  }
  
-/* Parse a delimited string, and extracts the requested field.
-   NOTE: the input buffer is modified.
+static int
+match_field (const void *elt1, const void *elt2)
+{
+  range_pair_t* rp = (range_pair_t*) elt1;
+  size_t field = *(size_t*) elt2;
  
-   TODO:
-     Maybe support multiple fields, though can always pipe output
-     into another numfmt to process other fields.
-     Maybe default to processing all fields rather than just first?
+  if (rp->lo <= field && field <= rp->hi)
+    return 0;
+
+  if (rp->lo < field)
+    return -1;
+
+  return 1;
+}
  
-   Output:
-     _PREFIX, _DATA, _SUFFIX will point to the relevant positions
-     in the input string, or be NULL if such a part doesn't exist.  */
  static void
-extract_fields (char *line, int _field,
-                char ** _prefix, char ** _data, char ** _suffix)
+free_field (const void *elt)
  {
-  char *ptr = line;
-  *_prefix = NULL;
-  *_data = NULL;
-  *_suffix = NULL;
+  void *p = (void *)elt;
+  free (p);
+}
  
-  devmsg ("extracting Fields:\n  input: %s\n  field: %d\n",
-          quote (line), _field);
+/* Add the specified fields to field_list.
+   The format recognized is similar to cut.
+   TODO: Refactor the more performant cut implementation
+   for use by both utilities.  */
+static void
+parse_field_arg (char *optarg)
+{
  
-  if (field > 1)
+  char *start, *end;
+  range_pair_t *rp;
+  size_t field_val;
+  size_t range_val = 0;
+
+  start = end = optarg;
+
+  if (STREQ (optarg, "-"))
      {
-      /* skip the requested number of fields.  */
-      *_prefix = line;
-      ptr = skip_fields (line, field - 1);
-      if (*ptr == '\0')
-        {
-          /* not enough fields in the input - print warning?  */
-          devmsg ("  TOO FEW FIELDS!\n  prefix: %s\n", quote (*_prefix));
-          return;
-        }
+      all_fields = true;
  
-      *ptr = '\0';
-      ++ptr;
+      return;
      }
  
-  *_data = ptr;
-  *_suffix = skip_fields (*_data, 1);
-  if (**_suffix)
+  if (*start == '-')
      {
-      /* there is a suffix (i.e., the field is not the last on the line),
-         so null-terminate the _data before it.  */
-      **_suffix = '\0';
-      ++(*_suffix);
+      /* range -M */
+      ++start;
+
+      all_fields_before = strtol (start, &end, 10);
+
+      if (start == end || all_fields_before <=0)
+        error (EXIT_FAILURE, 0, _("invalid field value %s"),
+               quote (start));
+
+      return;
      }
-  else
-    *_suffix = NULL;
  
-  devmsg ("  prefix: %s\n  number: %s\n  suffix: %s\n",
-          quote_n (0, *_prefix ? *_prefix : ""),
-          quote_n (1, *_data),
-          quote_n (2, *_suffix ? *_suffix : ""));
-}
+  field_list = gl_list_create_empty (GL_LINKED_LIST,
+                                     NULL, NULL, free_field, false);
  
+  while (*end != '\0') {
+    field_val = strtol (start, &end, 10);
  
-/* Convert a number in a given line of text.
-   NEWLINE specifies whether to output a '\n' for this "line".  */
-static int
-process_line (char *line, bool newline)
-{
-  char *pre, *num, *suf;
-  long double val = 0;
-  size_t precision = 0;
-  int valid_number = 0;
+    if (start == end || field_val <=0)
+      error (EXIT_FAILURE, 0, _("invalid field value %s"),
+             quote (start));
  
-  extract_fields (line, field, &pre, &num, &suf);
-  if (!num)
-    if (inval_style != inval_ignore)
-      error (conv_exit_code, 0, _("input line is too short, "
-                                  "no numbers found to convert in field %ld"),
-           field);
+    if (! range_val)
+      {
+        /* field N */
+        rp = xmalloc (sizeof (*rp));
+        rp->lo = rp->hi = field_val;
+        gl_sortedlist_add (field_list, sort_field, rp);
+      }
+    else
+      {
+        /* range N-M
+           The last field was the start of the field range. The current
+           field is the end of the field range.  We already added the
+           start field, so add a single pair covering the remaining
+           fields, from start + 1 through the range end.  */
+        if (field_val < range_val)
+          error (EXIT_FAILURE, 0, _("invalid decreasing range"));
+        rp = xmalloc (sizeof (*rp));
+        rp->lo = range_val + 1;
+        rp->hi = field_val;
+        gl_sortedlist_add (field_list, sort_field, rp);
+
+        range_val = 0;
+      }
  
-  if (num)
-    {
-      valid_number = process_suffixed_number (num, &val, &precision);
-      if (valid_number)
-        valid_number = prepare_padded_number (val, precision);
+    switch (*end) {
+      case ',':
+        /* discrete field separator */
+        ++end;
+        start = end;
+        break;
+
+      case '-':
+        /* field range separator */
+        ++end;
+        start = end;
+        range_val = field_val;
+        break;
      }
+  }
  
-  if (pre)
-    fputs (pre, stdout);
+  if (range_val)
+    {
+      /* range N-
+         range_val was not reset indicating optarg
+         ended with a trailing '-' */
+      all_fields_after = range_val;
+    }
+}
  
-  if (pre && num)
-    fputc ((delimiter == DELIMITER_DEFAULT) ? ' ' : delimiter, stdout);
+/* Return a pointer to the beginning of the next field in line.
+   The line pointer is moved to the end of the next field. */
+static char*
+next_field (char **line)
+{
+  char *field_start = *line;
+  char *field_end   = field_start;
  
-  if (valid_number)
+  if (delimiter != DELIMITER_DEFAULT)
      {
-      print_padded_number ();
+      if (*field_start != delimiter)
+        {
+          while (*field_end && *field_end != delimiter)
+            ++field_end;
+        }
+      /* else empty field */
      }
    else
      {
-      if (num)
-        fputs (num, stdout);
+      /* keep any space prefix in the returned field */
+      while (*field_end && isblank (to_uchar (*field_end)))
+        ++field_end;
+
+      while (*field_end && !isblank (to_uchar (*field_end)))
+        ++field_end;
      }
  
-  if (suf)
+  *line = field_end;
+  return field_start;
+}
+
+static bool
+include_field (size_t field)
+{
+  if (all_fields)
+    return true;
+
+  if (all_fields_after && all_fields_after <= field)
+    return true;
+
+  if (all_fields_before && field <= all_fields_before)
+    return true;
+
+  /* default to field 1 */
+  if (! field_list)
+    return field == 1;
+
+  return gl_sortedlist_search (field_list, match_field, &field);
+}
+
+/* Convert and output the given field.  If it is not included in the set
+   of fields to process, just output the original text unchanged.  */
+static bool
+process_field (char *text, size_t field)
+{
+  long double val = 0;
+  size_t precision = 0;
+  bool valid_number = true;
+
+  if (include_field (field))
      {
-      fputc ((delimiter == DELIMITER_DEFAULT) ? ' ' : delimiter, stdout);
-      fputs (suf, stdout);
+      valid_number =
+        process_suffixed_number (text, &val, &precision, field);
+
+      if (valid_number)
+        valid_number = prepare_padded_number (val, precision);
+
+      if (valid_number)
+        print_padded_number ();
+      else
+        fputs (text, stdout);
      }
+  else
+    fputs (text, stdout);
+
+  return valid_number;
+}
+
+/* Convert number in a given line of text.
+   NEWLINE specifies whether to output a '\n' for this "line".  */
+static int
+process_line (char *line, bool newline)
+{
+  char *next;
+  size_t field = 0;
+  bool valid_number = true;
+
+  while (true) {
+    ++field;
+    next = next_field (&line);
+
+    if (*line != '\0')
+      {
+        /* nul terminate the current field string and process */
+        *line = '\0';
+
+        if (! process_field (next, field))
+          valid_number = false;
+
+        fputc ((delimiter == DELIMITER_DEFAULT) ?
+               ' ' : delimiter, stdout);
+        ++line;
+      }
+    else
+      {
+        /* end of the line, process the last field and finish */
+        if (! process_field (next, field))
+          valid_number = false;
+
+        break;
+      }
+  }
  
    if (newline)
      putchar ('\n');
@@ -1441,10 +1572,12 @@ main (int argc, char **argv)
            break;
  
          case FIELD_OPTION:
-          if (xstrtol (optarg, NULL, 10, &field, "") != LONGINT_OK
-              || field <= 0)
-            error (EXIT_FAILURE, 0, _("invalid field value %s"),
-                   quote (optarg));
+          if (all_fields || all_fields_before || all_fields_after || field_list)
+            {
+              error (EXIT_FAILURE, 0,
+                     _("multiple field specifications"));
+            }
+          parse_field_arg (optarg);
            break;
  
          case 'd':
@@ -1556,10 +1689,14 @@ main (int argc, char **argv)
          error (0, errno, _("error reading input"));
      }
  
+#ifdef lint
    free (padding_buffer);
    free (format_str_prefix);
    free (format_str_suffix);
  
+  if (field_list)
+    gl_list_free (field_list);
+#endif
  
    if (debug && !valid_numbers)
      error (0, 0, _("failed to convert some of the input numbers"));
diff --git a/tests/misc/numfmt.pl b/tests/misc/numfmt.pl

index e8640c0f7f2385fdf162a8c0c2d5fd20c5b4f1c6..630d18707c4c377def4645fc6289321a7483cef9 100755 (executable)
--- a/tests/misc/numfmt.pl
+++ b/tests/misc/numfmt.pl
@@ -194,21 +194,16 @@ my @Tests =
       ['delim-3', '--delimiter=" " --from=auto "40M Foo"',{OUT=>'40000000 Foo'}],
       ['delim-4', '--delimiter=: --from=auto 40M:60M',  {OUT=>'40000000:60M'}],
       ['delim-5', '-d: --field=2 --from=auto :40M:60M',  {OUT=>':40000000:60M'}],
-     ['delim-6', '--delimiter=: --field 3 --from=auto 40M:60M',
-             {EXIT=>2},
-             {ERR=>"$prog: input line is too short, no numbers found " .
-                   "to convert in field 3\n"}],
+     ['delim-6', '-d: --field 3 --from=auto 40M:60M', {OUT=>"40M:60M"}],
  
       #Fields
       ['field-1', '--field A',
               {ERR => "$prog: invalid field value 'A'\n"},
               {EXIT => '1'}],
-     ['field-1.1', '--field -5',
-             {ERR => "$prog: invalid field value '-5'\n"},
-             {EXIT => '1'}],
       ['field-2', '--field 2 --from=auto "Hello 40M World 90G"',
               {OUT=>'Hello 40000000 World 90G'}],
       ['field-3', '--field 3 --from=auto "Hello 40M World 90G"',
+             {OUT=>"Hello 40M "},
               {ERR=>"$prog: invalid number: 'World'\n"},
               {EXIT => 2},],
       # Last field - no text after number
@@ -223,10 +218,32 @@ my @Tests =
               {OUT=>"Hello:40000000:World:90G"}],
  
       # not enough fields
-     ['field-8', '--field 3 --to=si "Hello World"',
-             {EXIT=>2},
-             {ERR=>"$prog: input line is too short, no numbers found " .
-                   "to convert in field 3\n"}],
+     ['field-8', '--field 3 --to=si "Hello World"', {OUT=>"Hello World"}],
+
+     # Multiple fields
+     ['field-range-1', '--field 2,4 --to=si "1000 2000 3000 4000 5000"',
+             {OUT=>"1000 2.0K 3000 4.0K 5000"}],
+
+     ['field-range-2', '--field 2-4 --to=si "1000 2000 3000 4000 5000"',
+             {OUT=>"1000 2.0K 3.0K 4.0K 5000"}],
+
+     ['field-range-3', '--field 1,2,3-5 --to=si "1000 2000 3000 4000 5000"',
+             {OUT=>"1.0K 2.0K 3.0K 4.0K 5.0K"}],
+
+     ['field-range-4', '--field 1-5 --to=si "1000 2000 3000 4000 5000"',
+             {OUT=>"1.0K 2.0K 3.0K 4.0K 5.0K"}],
+
+     ['field-range-5', '--field 1-3,5 --to=si "1000 2000 3000 4000 5000"',
+             {OUT=>"1.0K 2.0K 3.0K 4000 5.0K"}],
+
+     ['field-range-6', '--field 3- --to=si "1000 2000 3000 4000 5000"',
+             {OUT=>"1000 2000 3.0K 4.0K 5.0K"}],
+
+     ['field-range-7', '--field -3 --to=si "1000 2000 3000 4000 5000"',
+             {OUT=>"1.0K 2.0K 3.0K 4000 5000"}],
+
+     ['all-fields-1', '--field=- --to=si "1000 2000 3000 4000 5000"',
+             {OUT=>"1.0K 2.0K 3.0K 4.0K 5.0K"}],
  
       # Auto-consume white-space, setup auto-padding
       ['whitespace-1', '--to=si --field 2 "A    500 B"', {OUT=>"A    500 B"}],
@@ -679,9 +696,6 @@ my @Tests =
       ['devdebug-11', '---debug --format "%\'-10f" 10000',{OUT=>"10000     "},
               {ERR=>""},
               {ERR_SUBST=>"s/.*//msg"}],
-     ['devdebug-12', '---debug --field 2 A',{OUT=>""},
-             {ERR=>""}, {EXIT=>2},
-             {ERR_SUBST=>"s/.*//msg"}],
  
       # Invalid parameters
       ['help-1', '--foobar',
@@ -787,11 +801,6 @@ my @Tests =
               {ERR => "$prog: invalid number: 'World'\n"},
               {OUT => "Hello 40M World 90G\n"},
               {EXIT => 2}],
-     ['ign-err-6', '--invalid=fail --field 3 --to=si "Hello World"',
-             {ERR => "$prog: input line is too short, no numbers found " .
-                     "to convert in field 3\n"},
-             {OUT => "Hello World\n"},
-             {EXIT => 2}],
       ['ign-err-7', '--invalid=fail --from=si "foo"',
               {ERR => "$prog: invalid number: 'foo'\n"},
               {OUT => "foo\n"},
@@ -855,13 +864,6 @@ my @Tests =
               {OUT => "A 1000 x\nB Foo y\nC 2.8G z\n"},
               {ERR => "$prog: invalid number: 'Foo'\n"},
               {EXIT => 2}],
-     # one of the lines is too short
-     ['ign-err-m3.2', '--invalid=fail --field 2 --from=si --to=iec',
-             {IN_PIPE => "A 1K x\nB\nC 3G z\n"},
-             {OUT => "A 1000 x\nB\nC 2.8G z\n"},
-             {ERR => "$prog: input line is too short, no numbers found " .
-                     "to convert in field 2\n"},
-             {EXIT => 2}],
      );
  
  my @Locale_Tests =
author	Dylan Cali <calid1984@gmail.com>
	Fri, 5 Sep 2014 09:42:02 +0000 (04:42 -0500)
committer	Pádraig Brady <P@draigBrady.com>
	Fri, 19 Jun 2015 18:59:21 +0000 (19:59 +0100)
NEWS		patch \| blob \| blame \| history
bootstrap.conf		patch \| blob \| blame \| history
doc/coreutils.texi		patch \| blob \| blame \| history
gnulib		patch \| blob \| blame \| history
src/numfmt.c		patch \| blob \| blame \| history
tests/misc/numfmt.pl		patch \| blob \| blame \| history