]> git.ipfire.org Git - thirdparty/postgresql.git/commitdiff
Provide pg_preadv() and pg_pwritev().
authorThomas Munro <tmunro@postgresql.org>
Mon, 11 Jan 2021 01:37:13 +0000 (14:37 +1300)
committerThomas Munro <tmunro@postgresql.org>
Mon, 11 Jan 2021 02:24:38 +0000 (15:24 +1300)
Provide synchronous vectored file I/O routines.  These map to preadv()
and pwritev(), with fallback implementations for systems that don't have
them.  Also provide a wrapper pg_pwritev_with_retry() that automatically
retries on short writes.

Reviewed-by: Tom Lane <tgl@sss.pgh.pa.us>
Reviewed-by: Andres Freund <andres@anarazel.de>
Discussion: https://postgr.es/m/CA%2BhUKGJA%2Bu-220VONeoREBXJ9P3S94Y7J%2BkqCnTYmahvZJwM%3Dg%40mail.gmail.com

configure
configure.ac
src/include/pg_config.h.in
src/include/port.h
src/include/port/pg_iovec.h [new file with mode: 0644]
src/port/Makefile
src/port/pread.c
src/port/pwrite.c
src/tools/msvc/Solution.pm

index e51b8ce6ec3500df208abd2b012d3f5678e95c90..b917a2a1c9dbf5b4db4a517556fbd3c17e63335f 100755 (executable)
--- a/configure
+++ b/configure
@@ -13061,7 +13061,7 @@ $as_echo "#define HAVE_STDBOOL_H 1" >>confdefs.h
 fi
 
 
-for ac_header in atomic.h copyfile.h execinfo.h getopt.h ifaddrs.h langinfo.h mbarrier.h poll.h sys/epoll.h sys/event.h sys/ipc.h sys/prctl.h sys/procctl.h sys/pstat.h sys/resource.h sys/select.h sys/sem.h sys/shm.h sys/sockio.h sys/tas.h sys/un.h termios.h ucred.h wctype.h
+for ac_header in atomic.h copyfile.h execinfo.h getopt.h ifaddrs.h langinfo.h mbarrier.h poll.h sys/epoll.h sys/event.h sys/ipc.h sys/prctl.h sys/procctl.h sys/pstat.h sys/resource.h sys/select.h sys/sem.h sys/shm.h sys/sockio.h sys/tas.h sys/uio.h sys/un.h termios.h ucred.h wctype.h
 do :
   as_ac_Header=`$as_echo "ac_cv_header_$ac_header" | $as_tr_sh`
 ac_fn_c_check_header_mongrel "$LINENO" "$ac_header" "$as_ac_Header" "$ac_includes_default"
@@ -15155,7 +15155,7 @@ fi
 LIBS_including_readline="$LIBS"
 LIBS=`echo "$LIBS" | sed -e 's/-ledit//g' -e 's/-lreadline//g'`
 
-for ac_func in backtrace_symbols clock_gettime copyfile fdatasync getifaddrs getpeerucred getrlimit kqueue mbstowcs_l memset_s poll posix_fallocate ppoll pstat pthread_is_threaded_np readlink setproctitle setproctitle_fast setsid shm_open strchrnul strsignal symlink sync_file_range uselocale wcstombs_l
+for ac_func in backtrace_symbols clock_gettime copyfile fdatasync getifaddrs getpeerucred getrlimit kqueue mbstowcs_l memset_s poll posix_fallocate ppoll pread preadv pstat pthread_is_threaded_np pwrite pwritev readlink readv setproctitle setproctitle_fast setsid shm_open strchrnul strsignal symlink sync_file_range uselocale wcstombs_l writev
 do :
   as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh`
 ac_fn_c_check_func "$LINENO" "$ac_func" "$as_ac_var"
@@ -15832,32 +15832,6 @@ esac
 
 fi
 
-ac_fn_c_check_func "$LINENO" "pread" "ac_cv_func_pread"
-if test "x$ac_cv_func_pread" = xyes; then :
-  $as_echo "#define HAVE_PREAD 1" >>confdefs.h
-
-else
-  case " $LIBOBJS " in
-  *" pread.$ac_objext "* ) ;;
-  *) LIBOBJS="$LIBOBJS pread.$ac_objext"
- ;;
-esac
-
-fi
-
-ac_fn_c_check_func "$LINENO" "pwrite" "ac_cv_func_pwrite"
-if test "x$ac_cv_func_pwrite" = xyes; then :
-  $as_echo "#define HAVE_PWRITE 1" >>confdefs.h
-
-else
-  case " $LIBOBJS " in
-  *" pwrite.$ac_objext "* ) ;;
-  *) LIBOBJS="$LIBOBJS pwrite.$ac_objext"
- ;;
-esac
-
-fi
-
 ac_fn_c_check_func "$LINENO" "random" "ac_cv_func_random"
 if test "x$ac_cv_func_random" = xyes; then :
   $as_echo "#define HAVE_RANDOM 1" >>confdefs.h
index 054839f0f25b3e2c25dffd27cb3e3d7102e17dc1..838d47dc22e2a05299047dcf1375c67e78a725b6 100644 (file)
@@ -1331,6 +1331,7 @@ AC_CHECK_HEADERS(m4_normalize([
        sys/shm.h
        sys/sockio.h
        sys/tas.h
+       sys/uio.h
        sys/un.h
        termios.h
        ucred.h
@@ -1660,9 +1661,14 @@ AC_CHECK_FUNCS(m4_normalize([
        poll
        posix_fallocate
        ppoll
+       pread
+       preadv
        pstat
        pthread_is_threaded_np
+       pwrite
+       pwritev
        readlink
+       readv
        setproctitle
        setproctitle_fast
        setsid
@@ -1673,6 +1679,7 @@ AC_CHECK_FUNCS(m4_normalize([
        sync_file_range
        uselocale
        wcstombs_l
+       writev
 ]))
 
 # These typically are compiler builtins, for which AC_CHECK_FUNCS fails.
@@ -1733,8 +1740,6 @@ AC_REPLACE_FUNCS(m4_normalize([
        inet_aton
        link
        mkdtemp
-       pread
-       pwrite
        random
        srandom
        strlcat
index ddaa9e8e182155e16bbb7032d197f3f4c1693551..f4d9f3b408d9f705067d6beab2e392acd89fe1a8 100644 (file)
 /* Define to 1 if you have the `pread' function. */
 #undef HAVE_PREAD
 
+/* Define to 1 if you have the `preadv' function. */
+#undef HAVE_PREADV
+
 /* Define to 1 if you have the `pstat' function. */
 #undef HAVE_PSTAT
 
 /* Define to 1 if you have the `pwrite' function. */
 #undef HAVE_PWRITE
 
+/* Define to 1 if you have the `pwritev' function. */
+#undef HAVE_PWRITEV
+
 /* Define to 1 if you have the `random' function. */
 #undef HAVE_RANDOM
 
 /* Define to 1 if you have the `readlink' function. */
 #undef HAVE_READLINK
 
+/* Define to 1 if you have the `readv' function. */
+#undef HAVE_READV
+
 /* Define to 1 if you have the global variable
    'rl_completion_append_character'. */
 #undef HAVE_RL_COMPLETION_APPEND_CHARACTER
 /* Define to 1 if you have the <sys/ucred.h> header file. */
 #undef HAVE_SYS_UCRED_H
 
+/* Define to 1 if you have the <sys/uio.h> header file. */
+#undef HAVE_SYS_UIO_H
+
 /* Define to 1 if you have the <sys/un.h> header file. */
 #undef HAVE_SYS_UN_H
 
 /* Define to 1 if you have the <winldap.h> header file. */
 #undef HAVE_WINLDAP_H
 
+/* Define to 1 if you have the `writev' function. */
+#undef HAVE_WRITEV
+
 /* Define to 1 if you have the `X509_get_signature_nid' function. */
 #undef HAVE_X509_GET_SIGNATURE_NID
 
index 3e9d4fcd3769aa7d9db9047d0a7d90bd29f59eb1..6486db9fddec02d2bdf32940d06d239909ff9d93 100644 (file)
@@ -431,6 +431,8 @@ extern ssize_t pg_pread(int fd, void *buf, size_t nbyte, off_t offset);
 extern ssize_t pg_pwrite(int fd, const void *buf, size_t nbyte, off_t offset);
 #endif
 
+/* For pg_pwritev() and pg_preadv(), see port/pg_iovec.h. */
+
 #if !HAVE_DECL_STRLCAT
 extern size_t strlcat(char *dst, const char *src, size_t siz);
 #endif
diff --git a/src/include/port/pg_iovec.h b/src/include/port/pg_iovec.h
new file mode 100644 (file)
index 0000000..335f35b
--- /dev/null
@@ -0,0 +1,59 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_iovec.h
+ *       Header for the vectored I/O functions in src/port/p{read,write}.c.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/port/pg_iovec.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef PG_IOVEC_H
+#define PG_IOVEC_H
+
+#include <limits.h>
+
+/*
+ * Take struct iovec from <sys/uio.h> where that header exists; otherwise
+ * define our own POSIX-compatible iovec struct.
+ */
+#ifdef HAVE_SYS_UIO_H
+#include <sys/uio.h>
+#else
+struct iovec
+{
+       void       *iov_base;          /* start of this buffer */
+       size_t          iov_len;        /* size of this buffer, in bytes */
+};
+#endif
+
+/*
+ * If <limits.h> didn't define IOV_MAX, define our own.  POSIX requires at
+ * least 16.
+ */
+#ifndef IOV_MAX
+#define IOV_MAX 16
+#endif
+
+/* Define a reasonable maximum that is safe to use on the stack. */
+#define PG_IOV_MAX Min(IOV_MAX, 32)
+
+/*
+ * pg_preadv: if configure found a native preadv(), pg_preadv is simply an
+ * alias for it.  Otherwise a replacement function of that name is declared
+ * here (see src/port/pread.c); note that, unlike a native positioned read,
+ * that replacement changes the current file position.
+ */
+#ifdef HAVE_PREADV
+#define pg_preadv preadv
+#else
+extern ssize_t pg_preadv(int fd, const struct iovec *iov, int iovcnt, off_t offset);
+#endif
+
+/*
+ * pg_pwritev: same arrangement as pg_preadv above, for writes (the
+ * replacement lives in src/port/pwrite.c).
+ */
+#ifdef HAVE_PWRITEV
+#define pg_pwritev pwritev
+#else
+extern ssize_t pg_pwritev(int fd, const struct iovec *iov, int iovcnt, off_t offset);
+#endif
+
+/*
+ * Wrapper around pg_pwritev() that keeps going after a short write.  It is
+ * declared unconditionally, whether or not a native pwritev() exists, and
+ * accepts at most PG_IOV_MAX iovecs.
+ */
+extern ssize_t pg_pwritev_with_retry(int fd,
+                                                                        const struct iovec *iov,
+                                                                        int iovcnt,
+                                                                        off_t offset);
+
+#endif                                                 /* PG_IOVEC_H */
index e41b005c4f1bf9cd77714f608d1d351483f905fa..bc4923ce840e19da46ca1dc5a711fffe4c731a9c 100644 (file)
@@ -53,6 +53,8 @@ OBJS = \
        pgstrcasecmp.o \
        pgstrsignal.o \
        pqsignal.o \
+       pread.o \
+       pwrite.o \
        qsort.o \
        qsort_arg.o \
        quotes.o \
index 486f07a7dffcc79f730edb283bccb6d7d71f0991..a5ae2759fa0e5e69fa6a86b4fa2d9de50ea34da6 100644 (file)
@@ -1,7 +1,7 @@
 /*-------------------------------------------------------------------------
  *
  * pread.c
- *       Implementation of pread(2) for platforms that lack one.
+ *       Implementation of pread[v](2) for platforms that lack one.
  *
  * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
  *
@@ -9,7 +9,8 @@
  *       src/port/pread.c
  *
  * Note that this implementation changes the current file position, unlike
- * the POSIX function, so we use the name pg_pread().
+ * the POSIX function, so we use the name pg_pread().  Likewise for the
+ * iovec version.
  *
  *-------------------------------------------------------------------------
  */
@@ -23,6 +24,9 @@
 #include <unistd.h>
 #endif
 
+#include "port/pg_iovec.h"
+
+#ifndef HAVE_PREAD
 ssize_t
 pg_pread(int fd, void *buf, size_t size, off_t offset)
 {
@@ -56,3 +60,38 @@ pg_pread(int fd, void *buf, size_t size, off_t offset)
        return read(fd, buf, size);
 #endif
 }
+#endif
+
+#ifndef HAVE_PREADV
+/*
+ * pg_preadv
+ *
+ * Stand-in for preadv() on platforms without one: read from fd, beginning at
+ * 'offset', into the iovcnt buffers described by iov.  Returns the number of
+ * bytes read, or -1 on error.  As noted at the top of this file, the current
+ * file position may be changed as a side effect.
+ */
+ssize_t
+pg_preadv(int fd, const struct iovec *iov, int iovcnt, off_t offset)
+{
+#ifdef HAVE_READV
+       /* Anything but a single buffer: seek first, then let readv() scatter. */
+       if (iovcnt != 1)
+       {
+               if (lseek(fd, offset, SEEK_SET) < 0)
+                       return -1;
+               return readv(fd, iov, iovcnt);
+       }
+
+       /* A lone buffer doesn't need a vectored call. */
+       return pg_pread(fd, iov->iov_base, iov->iov_len, offset);
+#else
+       ssize_t         total = 0;
+       int                     idx = 0;
+
+       /* No readv() either: issue one pg_pread() per buffer, in order. */
+       while (idx < iovcnt)
+       {
+               const struct iovec *cur = &iov[idx];
+               ssize_t         nread = pg_pread(fd, cur->iov_base, cur->iov_len, offset);
+
+               /*
+                * A failure on the very first buffer is reported as an error; on a
+                * later buffer we report the bytes gathered so far instead.
+                */
+               if (nread < 0)
+                       return (idx == 0) ? -1 : total;
+
+               total += nread;
+               offset += nread;
+
+               /* A short read ends the transfer early. */
+               if (nread < cur->iov_len)
+                       break;
+
+               ++idx;
+       }
+       return total;
+#endif
+}
+#endif
index 282b27115e509ab22660807cbb74ce5356b6f539..e029f44bc0ce030f1ca87403aa674bacfdc1b268 100644 (file)
@@ -1,7 +1,7 @@
 /*-------------------------------------------------------------------------
  *
  * pwrite.c
- *       Implementation of pwrite(2) for platforms that lack one.
+ *       Implementation of pwrite[v](2) for platforms that lack one.
  *
  * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
  *
@@ -9,7 +9,8 @@
  *       src/port/pwrite.c
  *
  * Note that this implementation changes the current file position, unlike
- * the POSIX function, so we use the name pg_pwrite().
+ * the POSIX function, so we use the name pg_pwrite().  Likewise for the
+ * iovec version.
  *
  *-------------------------------------------------------------------------
  */
@@ -23,6 +24,9 @@
 #include <unistd.h>
 #endif
 
+#include "port/pg_iovec.h"
+
+#ifndef HAVE_PWRITE
 ssize_t
 pg_pwrite(int fd, const void *buf, size_t size, off_t offset)
 {
@@ -53,3 +57,102 @@ pg_pwrite(int fd, const void *buf, size_t size, off_t offset)
        return write(fd, buf, size);
 #endif
 }
+#endif
+
+#ifndef HAVE_PWRITEV
+/*
+ * pg_pwritev
+ *
+ * Stand-in for pwritev() on platforms without one: write the iovcnt buffers
+ * described by iov to fd, beginning at 'offset'.  Returns the number of bytes
+ * written, or -1 on error.  As noted at the top of this file, the current
+ * file position may be changed as a side effect.
+ */
+ssize_t
+pg_pwritev(int fd, const struct iovec *iov, int iovcnt, off_t offset)
+{
+#ifdef HAVE_WRITEV
+       /* Anything but a single buffer: seek first, then let writev() gather. */
+       if (iovcnt != 1)
+       {
+               if (lseek(fd, offset, SEEK_SET) < 0)
+                       return -1;
+               return writev(fd, iov, iovcnt);
+       }
+
+       /* A lone buffer doesn't need a vectored call. */
+       return pg_pwrite(fd, iov->iov_base, iov->iov_len, offset);
+#else
+       ssize_t         total = 0;
+       int                     idx = 0;
+
+       /* No writev() either: issue one pg_pwrite() per buffer, in order. */
+       while (idx < iovcnt)
+       {
+               const struct iovec *cur = &iov[idx];
+               ssize_t         nwritten = pg_pwrite(fd, cur->iov_base, cur->iov_len, offset);
+
+               /*
+                * A failure on the very first buffer is reported as an error; on a
+                * later buffer we report the bytes written so far instead.
+                */
+               if (nwritten < 0)
+                       return (idx == 0) ? -1 : total;
+
+               total += nwritten;
+               offset += nwritten;
+
+               /* A short write ends the transfer early. */
+               if (nwritten < cur->iov_len)
+                       break;
+
+               ++idx;
+       }
+       return total;
+#endif
+}
+#endif
+
+/*
+ * pg_pwritev_with_retry
+ *
+ * A convenience wrapper for pg_pwritev() that retries on partial write, until
+ * everything described by iov[0 .. iovcnt-1] has been written starting at
+ * 'offset'.
+ *
+ * Returns the total number of bytes written on success.  Returns -1 if
+ * pg_pwritev() reports an error, or (with errno set to EINVAL) if iovcnt is
+ * larger than PG_IOV_MAX.  If an error is returned, it is unspecified how
+ * much has been written.
+ *
+ * The caller's iovec array is never modified: after a short write the
+ * unfinished tail is copied into a local array, which is what gets adjusted.
+ */
+ssize_t
+pg_pwritev_with_retry(int fd, const struct iovec *iov, int iovcnt, off_t offset)
+{
+       struct iovec iov_copy[PG_IOV_MAX];
+       ssize_t         sum = 0;
+       ssize_t         part;
+
+       /* We'd better have space to make a copy, in case we need to retry. */
+       if (iovcnt > PG_IOV_MAX)
+       {
+               errno = EINVAL;
+               return -1;
+       }
+
+       for (;;)
+       {
+               /* Write as much as we can. */
+               part = pg_pwritev(fd, iov, iovcnt, offset);
+               if (part < 0)
+                       return -1;
+
+#ifdef SIMULATE_SHORT_WRITE
+               /* Testing aid: pretend no more than 4096 bytes were accepted. */
+               part = Min(part, 4096);
+#endif
+
+               /* Count our progress. */
+               sum += part;
+               offset += part;
+
+               /*
+                * Step over iovecs that are done.  Afterwards 'part' holds only the
+                * bytes already written from the first unfinished iovec.
+                */
+               while (iovcnt > 0 && iov->iov_len <= part)
+               {
+                       part -= iov->iov_len;
+                       ++iov;
+                       --iovcnt;
+               }
+
+               /* Are they all done? */
+               if (iovcnt == 0)
+               {
+                       /*
+                        * We don't expect to be credited with more bytes than we asked
+                        * to write.  Code in src/port can also be linked into frontend
+                        * programs, where elog() is not available, so check this with
+                        * an assertion rather than raising an error.
+                        */
+                       Assert(part == 0);
+                       break;
+               }
+
+               /*
+                * Move whatever's left to the front of our mutable copy and adjust the
+                * leading iovec.  memmove() is required because on later iterations
+                * 'iov' already points into iov_copy, so the regions can overlap.
+                */
+               Assert(iovcnt > 0);
+               memmove(iov_copy, iov, sizeof(*iov) * iovcnt);
+               Assert(iov->iov_len > part);
+               iov_copy[0].iov_base = (char *) iov_copy[0].iov_base + part;
+               iov_copy[0].iov_len -= part;
+               iov = iov_copy;
+       }
+
+       return sum;
+}
index 95d4e826b1d054652ec8db83f397dfef68d8abe4..59a42bea97a7f19939fb323bd5b3878397ac716a 100644 (file)
@@ -329,17 +329,20 @@ sub GenerateFiles
                HAVE_PPC_LWARX_MUTEX_HINT   => undef,
                HAVE_PPOLL                  => undef,
                HAVE_PREAD                  => undef,
+               HAVE_PREADV                 => undef,
                HAVE_PSTAT                  => undef,
                HAVE_PS_STRINGS             => undef,
                HAVE_PTHREAD                => undef,
                HAVE_PTHREAD_IS_THREADED_NP => undef,
                HAVE_PTHREAD_PRIO_INHERIT   => undef,
                HAVE_PWRITE                 => undef,
+               HAVE_PWRITEV                => undef,
                HAVE_RANDOM                 => undef,
                HAVE_READLINE_H             => undef,
                HAVE_READLINE_HISTORY_H     => undef,
                HAVE_READLINE_READLINE_H    => undef,
                HAVE_READLINK               => undef,
+               HAVE_READV                  => undef,
                HAVE_RL_COMPLETION_APPEND_CHARACTER      => undef,
                HAVE_RL_COMPLETION_MATCHES               => undef,
                HAVE_RL_COMPLETION_SUPPRESS_QUOTE        => undef,
@@ -400,6 +403,7 @@ sub GenerateFiles
                HAVE_SYS_TAS_H                           => undef,
                HAVE_SYS_TYPES_H                         => 1,
                HAVE_SYS_UCRED_H                         => undef,
+               HAVE_SYS_UIO_H                           => undef,
                HAVE_SYS_UN_H                            => undef,
                HAVE_TERMIOS_H                           => undef,
                HAVE_TYPEOF                              => undef,
@@ -418,6 +422,7 @@ sub GenerateFiles
                HAVE_WINLDAP_H                           => undef,
                HAVE_WCSTOMBS_L                          => 1,
                HAVE_WCTYPE_H                            => 1,
+               HAVE_WRITEV                              => undef,
                HAVE_X509_GET_SIGNATURE_NID              => 1,
                HAVE_X86_64_POPCNTQ                      => undef,
                HAVE__BOOL                               => undef,