wc: increase I/O size from 16KiB to 256KiB

author Pádraig Brady <P@draigBrady.com>

Thu, 9 May 2024 13:03:38 +0000 (14:03 +0100)

committer Pádraig Brady <P@draigBrady.com>

Thu, 9 May 2024 14:03:56 +0000 (15:03 +0100)
author Pádraig Brady <P@draigBrady.com>
Thu, 9 May 2024 13:03:38 +0000 (14:03 +0100)
committer Pádraig Brady <P@draigBrady.com>
Thu, 9 May 2024 14:03:56 +0000 (15:03 +0100)
diff --git a/NEWS b/NEWS

index 7e8ccb34f76a200a150b3ad88389e540bb74af76..febb9ac684ec69fb2d865b07bc7507a3a57c442c 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -31,6 +31,10 @@ GNU coreutils NEWS                                    -*- outline -*-
    sort operates more efficiently when used on pseudo files with
    an apparent size of 0, like those in /proc.
  
+  wc now reads a minimum of 256KiB at a time.
+  This was previously 16KiB and increasing to 256KiB was seen to increase
+  wc -l performance by about 10% when reading cached files on modern systems.
+
  
  * Noteworthy changes in release 9.5 (2024-03-28) [stable]
  
diff --git a/src/wc.c b/src/wc.c

index d70ad39363e860632a29c68a34aa7fc4f53dc62d..21ffa74d9faedca3e500fccbf84dd3cc244abd4c 100644 (file)
--- a/src/wc.c
+++ b/src/wc.c
@@ -34,6 +34,7 @@
  #include <xbinary-io.h>
  
  #include "system.h"
+#include "ioblksize.h"
  #include "wc.h"
  
  /* The official name of this program (e.g., no 'g' prefix).  */
@@ -43,9 +44,6 @@
    proper_name ("Paul Rubin"), \
    proper_name ("David MacKenzie")
  
-/* Size of atomic reads. */
-#define BUFFER_SIZE (16 * 1024)
-
  static bool wc_isprint[UCHAR_MAX + 1];
  static bool wc_isspace[UCHAR_MAX + 1];
  
@@ -262,8 +260,8 @@ wc_lines (int fd)
  
    while (true)
      {
-      char buf[BUFFER_SIZE + 1];
-      ssize_t bytes_read = read (fd, buf, BUFFER_SIZE);
+      char buf[IO_BUFSIZE + 1];
+      ssize_t bytes_read = read (fd, buf, IO_BUFSIZE);
        if (bytes_read <= 0)
          return (struct wc_lines) { bytes_read == 0 ? 0 : errno, lines, bytes };
  
@@ -304,7 +302,7 @@ static bool
  wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos)
  {
    int err = 0;
-  char buf[BUFFER_SIZE + 1];
+  char buf[IO_BUFSIZE + 1];
    intmax_t lines, words, chars, bytes, linelength;
    bool count_bytes, count_chars, count_complicated;
    char const *file = file_x ? file_x : _("standard input");
@@ -331,7 +329,7 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos)
  
    /* When counting only bytes, save some line- and word-counting
       overhead.  If FD is a 'regular' Unix file, using lseek is enough
-     to get its 'size' in bytes.  Otherwise, read blocks of BUFFER_SIZE
+     to get its 'size' in bytes.  Otherwise, read blocks of IO_BUFSIZE
       bytes at a time until EOF.  Note that the 'size' (number of bytes)
       that wc reports is smaller than stats.st_size when the file is not
       positioned at its beginning.  That's why the lseek calls below are
@@ -386,7 +384,7 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos)
          {
            fdadvise (fd, 0, 0, FADVISE_SEQUENTIAL);
            for (ssize_t bytes_read;
-               (bytes_read = read (fd, buf, BUFFER_SIZE));
+               (bytes_read = read (fd, buf, IO_BUFSIZE));
                 bytes += bytes_read)
              if (bytes_read < 0)
                {
@@ -413,7 +411,7 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos)
        idx_t prev = 0; /* Number of bytes carried over from previous round.  */
  
        for (ssize_t bytes_read;
-           ((bytes_read = read (fd, buf + prev, BUFFER_SIZE - prev))
+           ((bytes_read = read (fd, buf + prev, IO_BUFSIZE - prev))
              || prev);
             )
          {
@@ -448,7 +446,7 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos)
  
                    if (scanbytes < n)
                      {
-                      if (n == (size_t) -2 && plim - p < BUFFER_SIZE
+                      if (n == (size_t) -2 && plim - p < IO_BUFSIZE
                            && bytes_read)
                          {
                            /* An incomplete character that is not ridiculously
@@ -553,7 +551,7 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos)
        bool in_word = false;
        intmax_t linepos = 0;
  
-      for (ssize_t bytes_read; (bytes_read = read (fd, buf, BUFFER_SIZE)); )
+      for (ssize_t bytes_read; (bytes_read = read (fd, buf, IO_BUFSIZE)); )
          {
            if (bytes_read < 0)
              {
diff --git a/src/wc_avx2.c b/src/wc_avx2.c

index 5ec714759651c64493adb01da32c9ab25088dfc7..c3f76a625ace2630c65af20516e0565c19df88a7 100644 (file)
--- a/src/wc_avx2.c
+++ b/src/wc_avx2.c
@@ -17,13 +17,11 @@
  #include <config.h>
  
  #include "wc.h"
-
  #include "system.h"
+#include "ioblksize.h"
  
  #include <x86intrin.h>
  
-#define BUFSIZE 16384
-
  /* Read FD and return a summary.  */
  extern struct wc_lines
  wc_lines_avx2 (int fd)
@@ -35,7 +33,7 @@ wc_lines_avx2 (int fd)
  
    while (true)
      {
-       __m256i avx_buf[BUFSIZE / sizeof (__m256i)];
+       __m256i avx_buf[IO_BUFSIZE / sizeof (__m256i)];
        ssize_t bytes_read = read (fd, avx_buf, sizeof avx_buf);
        if (bytes_read <= 0)
          return (struct wc_lines) { bytes_read == 0 ? 0 : errno, lines, bytes };
author	Pádraig Brady <P@draigBrady.com>
	Thu, 9 May 2024 13:03:38 +0000 (14:03 +0100)
committer	Pádraig Brady <P@draigBrady.com>
	Thu, 9 May 2024 14:03:56 +0000 (15:03 +0100)
NEWS		patch | blob | blame | history
src/wc.c		patch | blob | blame | history
src/wc_avx2.c		patch | blob | blame | history