xfs: fix transaction leak on remote attr set/remove failure

[thirdparty/xfsprogs-dev.git] / libxfs / rdwr.c
diff --git a/libxfs/rdwr.c b/libxfs/rdwr.c

index 3fe0fc3a5c7cafb6871baf5491c89cfbac602607..14a4633e9fa60d3a722d11b257b879ed74e67adf 100644 (file)
--- a/libxfs/rdwr.c
+++ b/libxfs/rdwr.c
@@ -1,33 +1,63 @@
+// SPDX-License-Identifier: GPL-2.0
  /*
   * Copyright (c) 2000-2006 Silicon Graphics, Inc.
   * All Rights Reserved.
+ */
+
+
+#include "libxfs_priv.h"
+#include "init.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode_buf.h"
+#include "xfs_inode_fork.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+
+#include "libxfs.h"            /* for LIBXFS_EXIT_ON_FAILURE */
+
+/*
+ * Important design/architecture note:
   *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
+ * The userspace code that uses the buffer cache is much less constrained than
+ * the kernel code. The userspace code is pretty nasty in places, especially
+ * when it comes to buffer error handling.  Very little of the userspace code
+ * outside libxfs clears bp->b_error - very little code even checks it - so the
+ * libxfs code is tripping on stale errors left by the userspace code.
   *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
+ * We can't clear errors or zero buffer contents in libxfs_getbuf-* like we do
+ * in the kernel, because those functions are used by the libxfs_readbuf_*
+ * functions and hence need to leave the buffers unchanged on cache hits. This
+ * is actually the only way to gather a write error from a libxfs_writebuf()
+ * call - you need to get the buffer again so you can check bp->b_error field -
+ * assuming that the buffer is still in the cache when you check, that is.
   *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ * This is very different to the kernel code which does not release buffers on a
+ * write so we can wait on IO and check errors. The kernel buffer cache also
+ * guarantees a buffer of a known initial state from xfs_buf_get() even on a
+ * cache hit.
+ *
+ * IOWs, userspace is behaving quite differently to the kernel and as a result
+ * it leaks errors from reads, invalidations and writes through
+ * libxfs_getbuf/libxfs_readbuf.
+ *
+ * The result of this is that until the userspace code outside libxfs is cleaned
+ * up, functions that release buffers from userspace control (i.e.
+ * libxfs_writebuf/libxfs_putbuf) need to zero bp->b_error to prevent
+ * propagation of stale errors into future buffer operations.
   */
  
-#include <xfs/libxfs.h>
-#include <xfs/xfs_log.h>
-#include <xfs/xfs_log_priv.h>
-#include "init.h"
-
  #define BDSTRAT_SIZE   (256 * 1024)
-#define min(x, y)      ((x) < (y) ? (x) : (y))
  
  #define IO_BCOMPARE_CHECK
  
-void
-libxfs_device_zero(dev_t dev, xfs_daddr_t start, uint len)
+/* XXX: (dgc) Propagate errors, only exit if fail-on-error flag set */
+int
+libxfs_device_zero(struct xfs_buftarg *btp, xfs_daddr_t start, uint len)
  {
         xfs_off_t       start_offset, end_offset, offset;
         ssize_t         zsize, bytes;
@@ -43,10 +73,10 @@ libxfs_device_zero(dev_t dev, xfs_daddr_t start, uint len)
         }
         memset(z, 0, zsize);
  
-       fd = libxfs_device_to_fd(dev);
+       fd = libxfs_device_to_fd(btp->dev);
         start_offset = LIBXFS_BBTOOFF64(start);
  
-       if ((lseek64(fd, start_offset, SEEK_SET)) < 0) {
+       if ((lseek(fd, start_offset, SEEK_SET)) < 0) {
                 fprintf(stderr, _("%s: %s seek to offset %llu failed: %s\n"),
                         progname, __FUNCTION__,
                         (unsigned long long)start_offset, strerror(errno));
@@ -68,6 +98,7 @@ libxfs_device_zero(dev_t dev, xfs_daddr_t start, uint len)
                 offset += bytes;
         }
         free(z);
+       return 0;
  }
  
  static void unmount_record(void *p)
@@ -75,111 +106,268 @@ static void unmount_record(void *p)
         xlog_op_header_t        *op = (xlog_op_header_t *)p;
         /* the data section must be 32 bit size aligned */
         struct {
-           __uint16_t magic;
-           __uint16_t pad1;
-           __uint32_t pad2; /* may as well make it 64 bits */
+           uint16_t magic;
+           uint16_t pad1;
+           uint32_t pad2; /* may as well make it 64 bits */
         } magic = { XLOG_UNMOUNT_TYPE, 0, 0 };
  
         memset(p, 0, BBSIZE);
-       INT_SET(op->oh_tid,             ARCH_CONVERT, 1);
-       INT_SET(op->oh_len,             ARCH_CONVERT, sizeof(magic));
-       INT_SET(op->oh_clientid,        ARCH_CONVERT, XFS_LOG);
-       INT_SET(op->oh_flags,           ARCH_CONVERT, XLOG_UNMOUNT_TRANS);
-       INT_SET(op->oh_res2,            ARCH_CONVERT, 0);
+       /* dummy tid to mark this as written from userspace */
+       op->oh_tid = cpu_to_be32(0xb0c0d0d0);
+       op->oh_len = cpu_to_be32(sizeof(magic));
+       op->oh_clientid = XFS_LOG;
+       op->oh_flags = XLOG_UNMOUNT_TRANS;
+       op->oh_res2 = 0;
  
         /* and the data for this op */
         memcpy((char *)p + sizeof(xlog_op_header_t), &magic, sizeof(magic));
  }
  
-static xfs_caddr_t next(xfs_caddr_t ptr, int offset, void *private)
+static char *next(
+       char            *ptr,
+       int             offset,
+       void            *private)
  {
-       xfs_buf_t       *buf = (xfs_buf_t *)private;
+       struct xfs_buf  *buf = (struct xfs_buf *)private;
  
-       if (XFS_BUF_COUNT(buf) < (int)(ptr - XFS_BUF_PTR(buf)) + offset)
+       if (buf &&
+           (buf->b_bcount < (int)(ptr - (char *)buf->b_addr) + offset))
                 abort();
+
         return ptr + offset;
  }
  
+/*
+ * Format the log. The caller provides either a buftarg which is used to access
+ * the log via buffers or a direct pointer to a buffer that encapsulates the
+ * entire log.
+ */
  int
  libxfs_log_clear(
-       dev_t                   device,
+       struct xfs_buftarg      *btp,
+       char                    *dptr,
         xfs_daddr_t             start,
-       uint                    length,
+       uint                    length,         /* basic blocks */
         uuid_t                  *fs_uuid,
         int                     version,
-       int                     sunit,
-       int                     fmt)
+       int                     sunit,          /* bytes */
+       int                     fmt,
+       int                     cycle,
+       bool                    max)
  {
-       xfs_buf_t               *bp;
+       struct xfs_buf          *bp = NULL;
         int                     len;
-
-       if (!device || !fs_uuid)
+       xfs_lsn_t               lsn;
+       xfs_lsn_t               tail_lsn;
+       xfs_daddr_t             blk;
+       xfs_daddr_t             end_blk;
+       char                    *ptr;
+
+       if (((btp && dptr) || (!btp && !dptr)) ||
+           (btp && !btp->dev) || !fs_uuid)
                 return -EINVAL;
  
         /* first zero the log */
-       libxfs_device_zero(device, start, length);
+       if (btp)
+               libxfs_device_zero(btp, start, length);
+       else
+               memset(dptr, 0, BBTOB(length));
  
-       /* then write a log record header */
+       /*
+        * Initialize the log record length and LSNs. XLOG_INIT_CYCLE is a
+        * special reset case where we only write a single record where the lsn
+        * and tail_lsn match. Otherwise, the record lsn starts at block 0 of
+        * the specified cycle and points tail_lsn at the last record of the
+        * previous cycle.
+        */
         len = ((version == 2) && sunit) ? BTOBB(sunit) : 2;
-       len = MAX(len, 2);
-       bp = libxfs_getbufr(device, start, len);
-       libxfs_log_header(XFS_BUF_PTR(bp),
-                         fs_uuid, version, sunit, fmt, next, bp);
-       bp->b_flags |= LIBXFS_B_DIRTY;
-       libxfs_putbufr(bp);
+       len = max(len, 2);
+       lsn = xlog_assign_lsn(cycle, 0);
+       if (cycle == XLOG_INIT_CYCLE)
+               tail_lsn = lsn;
+       else
+               tail_lsn = xlog_assign_lsn(cycle - 1, length - len);
+
+       /* write out the first log record */
+       ptr = dptr;
+       if (btp) {
+               bp = libxfs_getbufr(btp, start, len);
+               ptr = bp->b_addr;
+       }
+       libxfs_log_header(ptr, fs_uuid, version, sunit, fmt, lsn, tail_lsn,
+                         next, bp);
+       if (bp) {
+               bp->b_flags |= LIBXFS_B_DIRTY;
+               libxfs_putbufr(bp);
+       }
+
+       /*
+        * There's nothing else to do if this is a log reset. The kernel detects
+        * the rest of the log is zeroed and starts at cycle 1.
+        */
+       if (cycle == XLOG_INIT_CYCLE)
+               return 0;
+
+       /*
+        * Bump the record size for a full log format if the caller allows it.
+        * This is primarily for performance reasons and most callers don't care
+        * about record size since the log is clean after we're done.
+        */
+       if (max)
+               len = BTOBB(BDSTRAT_SIZE);
+
+       /*
+        * Otherwise, fill everything beyond the initial record with records of
+        * the previous cycle so the kernel head/tail detection works correctly.
+        *
+        * We don't particularly care about the record size or content here.
+        * It's only important that the headers are in place such that the
+        * kernel finds 1.) a clean log and 2.) the correct current cycle value.
+        * Therefore, bump up the record size to the max to use larger I/Os and
+        * improve performance.
+        */
+       cycle--;
+       blk = start + len;
+       if (dptr)
+               dptr += BBTOB(len);
+       end_blk = start + length;
+
+       len = min(end_blk - blk, len);
+       while (blk < end_blk) {
+               lsn = xlog_assign_lsn(cycle, blk - start);
+               tail_lsn = xlog_assign_lsn(cycle, blk - start - len);
+
+               ptr = dptr;
+               if (btp) {
+                       bp = libxfs_getbufr(btp, blk, len);
+                       ptr = bp->b_addr;
+               }
+               /*
+                * Note: pass the full buffer length as the sunit to initialize
+                * the entire buffer.
+                */
+               libxfs_log_header(ptr, fs_uuid, version, BBTOB(len), fmt, lsn,
+                                 tail_lsn, next, bp);
+               if (bp) {
+                       bp->b_flags |= LIBXFS_B_DIRTY;
+                       libxfs_putbufr(bp);
+               }
+
+               blk += len;
+               if (dptr)
+                       dptr += BBTOB(len);
+               len = min(end_blk - blk, len);
+       }
+
         return 0;
  }
  
  int
  libxfs_log_header(
-       xfs_caddr_t             caddr,
+       char                    *caddr,
         uuid_t                  *fs_uuid,
         int                     version,
         int                     sunit,
         int                     fmt,
+       xfs_lsn_t               lsn,
+       xfs_lsn_t               tail_lsn,
         libxfs_get_block_t      *nextfunc,
         void                    *private)
  {
         xlog_rec_header_t       *head = (xlog_rec_header_t *)caddr;
-       xfs_caddr_t             p = caddr;
-       uint                    cycle_lsn;
+       char                    *p = caddr;
+       __be32                  cycle_lsn;
         int                     i, len;
+       int                     hdrs = 1;
+
+       if (lsn == NULLCOMMITLSN)
+               lsn = xlog_assign_lsn(XLOG_INIT_CYCLE, 0);
+       if (tail_lsn == NULLCOMMITLSN)
+               tail_lsn = lsn;
  
         len = ((version == 2) && sunit) ? BTOBB(sunit) : 1;
  
-       /* note that oh_tid actually contains the cycle number
-        * and the tid is stored in h_cycle_data[0] - that's the
-        * way things end up on disk.
-        */
         memset(p, 0, BBSIZE);
-       INT_SET(head->h_magicno,        ARCH_CONVERT, XLOG_HEADER_MAGIC_NUM);
-       INT_SET(head->h_cycle,          ARCH_CONVERT, 1);
-       INT_SET(head->h_version,        ARCH_CONVERT, version);
-       if (len != 1)
-               INT_SET(head->h_len,            ARCH_CONVERT, sunit - BBSIZE);
-       else
-               INT_SET(head->h_len,            ARCH_CONVERT, 20);
-       INT_SET(head->h_chksum,         ARCH_CONVERT, 0);
-       INT_SET(head->h_prev_block,     ARCH_CONVERT, -1);
-       INT_SET(head->h_num_logops,     ARCH_CONVERT, 1);
-       INT_SET(head->h_cycle_data[0],  ARCH_CONVERT, 0xb0c0d0d0);
-       INT_SET(head->h_fmt,            ARCH_CONVERT, fmt);
-       INT_SET(head->h_size,           ARCH_CONVERT, XLOG_HEADER_CYCLE_SIZE);
-
-       ASSIGN_ANY_LSN_DISK(head->h_lsn, 1, 0);
-       ASSIGN_ANY_LSN_DISK(head->h_tail_lsn, 1, 0);
+       head->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM);
+       head->h_cycle = cpu_to_be32(CYCLE_LSN(lsn));
+       head->h_version = cpu_to_be32(version);
+       head->h_crc = cpu_to_le32(0);
+       head->h_prev_block = cpu_to_be32(-1);
+       head->h_num_logops = cpu_to_be32(1);
+       head->h_fmt = cpu_to_be32(fmt);
+       head->h_size = cpu_to_be32(max(sunit, XLOG_BIG_RECORD_BSIZE));
+
+       head->h_lsn = cpu_to_be64(lsn);
+       head->h_tail_lsn = cpu_to_be64(tail_lsn);
  
         memcpy(&head->h_fs_uuid, fs_uuid, sizeof(uuid_t));
  
-       len = MAX(len, 2);
+       /*
+        * The kernel expects to see either a log record header magic value or
+        * the LSN cycle at the top of every log block. The first word of each
+        * non-header block is copied to the record headers and replaced with
+        * the cycle value (see xlog_[un]pack_data() and xlog_get_cycle() for
+        * details).
+        *
+        * Even though we only ever write an unmount record (one block), we
+        * support writing log records up to the max log buffer size of 256k to
+        * improve log format performance. This means a record can require up
+        * to 8 headers (1 rec. header + 7 ext. headers) for the packed cycle
+        * data (each header supports 32k of data).
+        */
+       cycle_lsn = CYCLE_LSN_DISK(head->h_lsn);
+       if (version == 2 && sunit > XLOG_HEADER_CYCLE_SIZE) {
+               hdrs = sunit / XLOG_HEADER_CYCLE_SIZE;
+               if (sunit % XLOG_HEADER_CYCLE_SIZE)
+                       hdrs++;
+       }
+
+       /*
+        * A fixed number of extended headers is expected based on h_size. If
+        * required, format those now so the unmount record is located
+        * correctly.
+        *
+        * Since we only write an unmount record, we only need one h_cycle_data
+        * entry for the unmount record block. The subsequent record data
+        * blocks are zeroed, which means we can stamp them directly with the
+        * cycle and zero the rest of the cycle data in the extended headers.
+        */
+       if (hdrs > 1) {
+               for (i = 1; i < hdrs; i++) {
+                       p = nextfunc(p, BBSIZE, private);
+                       memset(p, 0, BBSIZE);
+                       /* xlog_rec_ext_header.xh_cycle */
+                       *(__be32 *)p = cycle_lsn;
+               }
+       }
+
+       /*
+        * The total length is the max of the stripe unit or 2 basic block
+        * minimum (1 hdr blk + 1 data blk). The record length is the total
+        * minus however many header blocks are required.
+        */
+       head->h_len = cpu_to_be32(max(BBTOB(2), sunit) - hdrs * BBSIZE);
+
+       /*
+        * Write out the unmount record, pack the first word into the record
+        * header and stamp the block with the cycle.
+        */
         p = nextfunc(p, BBSIZE, private);
         unmount_record(p);
  
-       cycle_lsn = CYCLE_LSN_DISK(head->h_lsn);
-       for (i = 2; i < len; i++) {
+       head->h_cycle_data[0] = *(__be32 *)p;
+       *(__be32 *)p = cycle_lsn;
+
+       /*
+        * Finally, zero all remaining blocks in the record and stamp each with
+        * the cycle. We don't need to pack any of these blocks because the
+        * cycle data in the headers has already been zeroed.
+        */
+       len = max(len, hdrs + 1);
+       for (i = hdrs + 1; i < len; i++) {
                 p = nextfunc(p, BBSIZE, private);
                 memset(p, 0, BBSIZE);
-               *(uint *)p = cycle_lsn;
+               *(__be32 *)p = cycle_lsn;
         }
  
         return BBTOB(len);
@@ -193,56 +381,93 @@ libxfs_log_header(
  #ifdef XFS_BUF_TRACING
  
  #undef libxfs_readbuf
+#undef libxfs_readbuf_map
  #undef libxfs_writebuf
  #undef libxfs_getbuf
+#undef libxfs_getbuf_map
+#undef libxfs_getbuf_flags
  #undef libxfs_putbuf
  
-xfs_buf_t      *libxfs_readbuf(dev_t, xfs_daddr_t, int, int);
+xfs_buf_t      *libxfs_readbuf(struct xfs_buftarg *, xfs_daddr_t, int, int,
+                               const struct xfs_buf_ops *);
+xfs_buf_t      *libxfs_readbuf_map(struct xfs_buftarg *, struct xfs_buf_map *,
+                               int, int, const struct xfs_buf_ops *);
  int            libxfs_writebuf(xfs_buf_t *, int);
-xfs_buf_t      *libxfs_getbuf(dev_t, xfs_daddr_t, int);
+xfs_buf_t      *libxfs_getbuf(struct xfs_buftarg *, xfs_daddr_t, int);
+xfs_buf_t      *libxfs_getbuf_map(struct xfs_buftarg *, struct xfs_buf_map *,
+                               int, int);
+xfs_buf_t      *libxfs_getbuf_flags(struct xfs_buftarg *, xfs_daddr_t, int,
+                               unsigned int);
  void           libxfs_putbuf (xfs_buf_t *);
  
+#define        __add_trace(bp, func, file, line)       \
+do {                                           \
+       if (bp) {                               \
+               (bp)->b_func = (func);          \
+               (bp)->b_file = (file);          \
+               (bp)->b_line = (line);          \
+       }                                       \
+} while (0)
+
  xfs_buf_t *
-libxfs_trace_readbuf(const char *func, const char *file, int line, dev_t dev, xfs_daddr_t blkno, int len, int flags)
+libxfs_trace_readbuf(const char *func, const char *file, int line,
+               struct xfs_buftarg *btp, xfs_daddr_t blkno, int len, int flags,
+               const struct xfs_buf_ops *ops)
  {
-       xfs_buf_t       *bp = libxfs_readbuf(dev, blkno, len, flags);
-
-       bp->b_func = func;
-       bp->b_file = file;
-       bp->b_line = line;
+       xfs_buf_t       *bp = libxfs_readbuf(btp, blkno, len, flags, ops);
+       __add_trace(bp, func, file, line);
+       return bp;
+}
  
+xfs_buf_t *
+libxfs_trace_readbuf_map(const char *func, const char *file, int line,
+               struct xfs_buftarg *btp, struct xfs_buf_map *map, int nmaps, int flags,
+               const struct xfs_buf_ops *ops)
+{
+       xfs_buf_t       *bp = libxfs_readbuf_map(btp, map, nmaps, flags, ops);
+       __add_trace(bp, func, file, line);
         return bp;
  }
  
  int
  libxfs_trace_writebuf(const char *func, const char *file, int line, xfs_buf_t *bp, int flags)
  {
-       bp->b_func = func;
-       bp->b_file = file;
-       bp->b_line = line;
-
+       __add_trace(bp, func, file, line);
         return libxfs_writebuf(bp, flags);
  }
  
  xfs_buf_t *
-libxfs_trace_getbuf(const char *func, const char *file, int line, dev_t device, xfs_daddr_t blkno, int len)
+libxfs_trace_getbuf(const char *func, const char *file, int line,
+               struct xfs_buftarg *btp, xfs_daddr_t blkno, int len)
  {
-       xfs_buf_t       *bp = libxfs_getbuf(device, blkno, len);
+       xfs_buf_t       *bp = libxfs_getbuf(btp, blkno, len);
+       __add_trace(bp, func, file, line);
+       return bp;
+}
  
-       bp->b_func = func;
-       bp->b_file = file;
-       bp->b_line = line;
+xfs_buf_t *
+libxfs_trace_getbuf_map(const char *func, const char *file, int line,
+               struct xfs_buftarg *btp, struct xfs_buf_map *map, int nmaps,
+               int flags)
+{
+       xfs_buf_t       *bp = libxfs_getbuf_map(btp, map, nmaps, flags);
+       __add_trace(bp, func, file, line);
+       return bp;
+}
  
+xfs_buf_t *
+libxfs_trace_getbuf_flags(const char *func, const char *file, int line,
+               struct xfs_buftarg *btp, xfs_daddr_t blkno, int len, unsigned int flags)
+{
+       xfs_buf_t       *bp = libxfs_getbuf_flags(btp, blkno, len, flags);
+       __add_trace(bp, func, file, line);
         return bp;
  }
  
  void
  libxfs_trace_putbuf(const char *func, const char *file, int line, xfs_buf_t *bp)
  {
-       bp->b_func = func;
-       bp->b_file = file;
-       bp->b_line = line;
-
+       __add_trace(bp, func, file, line);
         libxfs_putbuf(bp);
  }
  
@@ -253,65 +478,90 @@ libxfs_trace_putbuf(const char *func, const char *file, int line, xfs_buf_t *bp)
  xfs_buf_t *
  libxfs_getsb(xfs_mount_t *mp, int flags)
  {
-       return libxfs_readbuf(mp->m_dev, XFS_SB_DADDR,
-                               XFS_FSS_TO_BB(mp, 1), flags);
+       return libxfs_readbuf(mp->m_ddev_targp, XFS_SB_DADDR,
+                               XFS_FSS_TO_BB(mp, 1), flags, &xfs_sb_buf_ops);
  }
  
-xfs_zone_t                     *xfs_buf_zone;
+kmem_zone_t                    *xfs_buf_zone;
  
  static struct cache_mru                xfs_buf_freelist =
         {{&xfs_buf_freelist.cm_list, &xfs_buf_freelist.cm_list},
          0, PTHREAD_MUTEX_INITIALIZER };
  
-typedef struct {
-       dev_t           device;
-       xfs_daddr_t     blkno;
-       unsigned int    bblen;
-} xfs_bufkey_t;
+/*
+ * The bufkey is used to pass the new buffer information to the cache object
+ * allocation routine. Because discontiguous buffers need to pass different
+ * information, we need fields to pass that information. However, because the
+ * blkno and bblen are needed for the initial cache entry lookup (i.e. for
+ * bcompare), a non-null map/nmaps is what switches the allocation over to
+ * discontiguous buffer initialisation instead of contiguous initialisation.
+ */
+struct xfs_bufkey {
+       struct xfs_buftarg      *buftarg;
+       xfs_daddr_t             blkno;
+       unsigned int            bblen;
+       struct xfs_buf_map      *map;
+       int                     nmaps;
+};
  
+/*  2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */
+#define GOLDEN_RATIO_PRIME     0x9e37fffffffc0001UL
+#define CACHE_LINE_SIZE                64
  static unsigned int
-libxfs_bhash(cache_key_t key, unsigned int hashsize)
+libxfs_bhash(cache_key_t key, unsigned int hashsize, unsigned int hashshift)
  {
-       return (((unsigned int)((xfs_bufkey_t *)key)->blkno) >> 5) % hashsize;
+       uint64_t        hashval = ((struct xfs_bufkey *)key)->blkno;
+       uint64_t        tmp;
+
+       tmp = hashval ^ (GOLDEN_RATIO_PRIME + hashval) / CACHE_LINE_SIZE;
+       tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> hashshift);
+       return tmp % hashsize;
  }
  
  static int
  libxfs_bcompare(struct cache_node *node, cache_key_t key)
  {
-       xfs_buf_t       *bp = (xfs_buf_t *)node;
-       xfs_bufkey_t    *bkey = (xfs_bufkey_t *)key;
+       struct xfs_buf  *bp = (struct xfs_buf *)node;
+       struct xfs_bufkey *bkey = (struct xfs_bufkey *)key;
  
+       if (bp->b_target->dev == bkey->buftarg->dev &&
+           bp->b_bn == bkey->blkno) {
+               if (bp->b_bcount == BBTOB(bkey->bblen))
+                       return CACHE_HIT;
  #ifdef IO_BCOMPARE_CHECK
-       if (bp->b_dev == bkey->device &&
-           bp->b_blkno == bkey->blkno &&
-           bp->b_bcount != BBTOB(bkey->bblen))
-               fprintf(stderr, "%lx: Badness in key lookup (length)\n"
-                       "bp=(bno %llu, len %u bytes) key=(bno %llu, len %u bytes)\n",
-                       pthread_self(),
-                       (unsigned long long)bp->b_blkno, (int)bp->b_bcount,
-                       (unsigned long long)bkey->blkno, BBTOB(bkey->bblen));
+               if (!(libxfs_bcache->c_flags & CACHE_MISCOMPARE_PURGE)) {
+                       fprintf(stderr,
+       "%lx: Badness in key lookup (length)\n"
+       "bp=(bno 0x%llx, len %u bytes) key=(bno 0x%llx, len %u bytes)\n",
+                               pthread_self(),
+                               (unsigned long long)bp->b_bn, (int)bp->b_bcount,
+                               (unsigned long long)bkey->blkno,
+                               BBTOB(bkey->bblen));
+               }
  #endif
-
-       return (bp->b_dev == bkey->device &&
-               bp->b_blkno == bkey->blkno &&
-               bp->b_bcount == BBTOB(bkey->bblen));
+               return CACHE_PURGE;
+       }
+       return CACHE_MISS;
  }
  
  void
  libxfs_bprint(xfs_buf_t *bp)
  {
-       fprintf(stderr, "Buffer 0x%p blkno=%llu bytes=%u flags=0x%x count=%u\n",
-               bp, (unsigned long long)bp->b_blkno, (unsigned)bp->b_bcount,
+       fprintf(stderr, "Buffer %p blkno=%llu bytes=%u flags=0x%x count=%u\n",
+               bp, (unsigned long long)bp->b_bn, (unsigned)bp->b_bcount,
                 bp->b_flags, bp->b_node.cn_count);
  }
  
  static void
-libxfs_initbuf(xfs_buf_t *bp, dev_t device, xfs_daddr_t bno, unsigned int bytes)
+__initbuf(xfs_buf_t *bp, struct xfs_buftarg *btp, xfs_daddr_t bno,
+               unsigned int bytes)
  {
         bp->b_flags = 0;
-       bp->b_blkno = bno;
+       bp->b_bn = bno;
         bp->b_bcount = bytes;
-       bp->b_dev = device;
+       bp->b_length = BTOBB(bytes);
+       bp->b_target = btp;
+       bp->b_error = 0;
         if (!bp->b_addr)
                 bp->b_addr = memalign(libxfs_device_alignment(), bytes);
         if (!bp->b_addr) {
@@ -321,22 +571,68 @@ libxfs_initbuf(xfs_buf_t *bp, dev_t device, xfs_daddr_t bno, unsigned int bytes)
                         strerror(errno));
                 exit(1);
         }
+       memset(bp->b_addr, 0, bytes);
  #ifdef XFS_BUF_TRACING
         list_head_init(&bp->b_lock_list);
  #endif
         pthread_mutex_init(&bp->b_lock, NULL);
+       bp->b_holder = 0;
+       bp->b_recur = 0;
+       bp->b_ops = NULL;
+
+       if (!bp->b_maps) {
+               bp->b_nmaps = 1;
+               bp->b_maps = &bp->__b_map;
+               bp->b_maps[0].bm_bn = bp->b_bn;
+               bp->b_maps[0].bm_len = bp->b_length;
+       }
+}
+
+static void
+libxfs_initbuf(xfs_buf_t *bp, struct xfs_buftarg *btp, xfs_daddr_t bno,
+               unsigned int bytes)
+{
+       __initbuf(bp, btp, bno, bytes);
+}
+
+static void
+libxfs_initbuf_map(xfs_buf_t *bp, struct xfs_buftarg *btp,
+               struct xfs_buf_map *map, int nmaps)
+{
+       unsigned int bytes = 0;
+       int i;
+
+       bytes = sizeof(struct xfs_buf_map) * nmaps;
+       bp->b_maps = malloc(bytes);
+       if (!bp->b_maps) {
+               fprintf(stderr,
+                       _("%s: %s can't malloc %u bytes: %s\n"),
+                       progname, __FUNCTION__, bytes,
+                       strerror(errno));
+               exit(1);
+       }
+       bp->b_nmaps = nmaps;
+
+       bytes = 0;
+       for ( i = 0; i < nmaps; i++) {
+               bp->b_maps[i].bm_bn = map[i].bm_bn;
+               bp->b_maps[i].bm_len = map[i].bm_len;
+               bytes += BBTOB(map[i].bm_len);
+       }
+
+       __initbuf(bp, btp, map[0].bm_bn, bytes);
+       bp->b_flags |= LIBXFS_B_DISCONTIG;
  }
  
  xfs_buf_t *
-libxfs_getbufr(dev_t device, xfs_daddr_t blkno, int bblen)
+__libxfs_getbufr(int blen)
  {
         xfs_buf_t       *bp;
-       int             blen = BBTOB(bblen);
  
         /*
          * first look for a buffer that can be used as-is,
          * if one cannot be found, see if there is a buffer,
-        * and if so, free it's buffer and set b_addr to NULL
+        * and if so, free its buffer and set b_addr to NULL
          * before calling libxfs_initbuf.
          */
         pthread_mutex_lock(&xfs_buf_freelist.cm_mutex);
@@ -353,22 +649,71 @@ libxfs_getbufr(dev_t device, xfs_daddr_t blkno, int bblen)
                         list_del_init(&bp->b_node.cn_mru);
                         free(bp->b_addr);
                         bp->b_addr = NULL;
+                       if (bp->b_maps != &bp->__b_map)
+                               free(bp->b_maps);
+                       bp->b_maps = NULL;
                 }
         } else
-               bp = libxfs_zone_zalloc(xfs_buf_zone);
+               bp = kmem_zone_zalloc(xfs_buf_zone, 0);
         pthread_mutex_unlock(&xfs_buf_freelist.cm_mutex);
+       bp->b_ops = NULL;
+       if (bp->b_flags & LIBXFS_B_DIRTY)
+               fprintf(stderr, "found dirty buffer (bulk) on free list!");
+
+       return bp;
+}
+
+xfs_buf_t *
+libxfs_getbufr(struct xfs_buftarg *btp, xfs_daddr_t blkno, int bblen)
+{
+       xfs_buf_t       *bp;
+       int             blen = BBTOB(bblen);
  
-       if (bp != NULL)
-               libxfs_initbuf(bp, device, blkno, blen);
+       bp =__libxfs_getbufr(blen);
+       if (bp)
+               libxfs_initbuf(bp, btp, blkno, blen);
  #ifdef IO_DEBUG
-       printf("%lx: %s: allocated %u bytes buffer, key=%llu(%llu), %p\n",
-               pthread_self(), __FUNCTION__, BBTOB(len),
+       printf("%lx: %s: allocated %u bytes buffer, key=0x%llx(0x%llx), %p\n",
+               pthread_self(), __FUNCTION__, blen,
                 (long long)LIBXFS_BBTOOFF64(blkno), (long long)blkno, bp);
  #endif
  
         return bp;
  }
  
+xfs_buf_t *
+libxfs_getbufr_map(struct xfs_buftarg *btp, xfs_daddr_t blkno, int bblen,
+               struct xfs_buf_map *map, int nmaps)
+{
+       xfs_buf_t       *bp;
+       int             blen = BBTOB(bblen);
+
+       if (!map || !nmaps) {
+               fprintf(stderr,
+                       _("%s: %s invalid map %p or nmaps %d\n"),
+                       progname, __FUNCTION__, map, nmaps);
+               exit(1);
+       }
+
+       if (blkno != map[0].bm_bn) {
+               fprintf(stderr,
+                       _("%s: %s map blkno 0x%llx doesn't match key 0x%llx\n"),
+                       progname, __FUNCTION__, (long long)map[0].bm_bn,
+                       (long long)blkno);
+               exit(1);
+       }
+
+       bp =__libxfs_getbufr(blen);
+       if (bp)
+               libxfs_initbuf_map(bp, btp, map, nmaps);
+#ifdef IO_DEBUG
+       printf("%lx: %s: allocated %u bytes buffer, key=0x%llx(0x%llx), %p\n",
+               pthread_self(), __FUNCTION__, blen,
+               (long long)LIBXFS_BBTOOFF64(blkno), (long long)blkno, bp);
+#endif
+
+       return bp;
+}
  
  #ifdef XFS_BUF_TRACING
  struct list_head       lock_buf_list = {&lock_buf_list, &lock_buf_list};
@@ -377,43 +722,144 @@ int                      lock_buf_count = 0;
  
  extern int     use_xfs_buf_lock;
  
-xfs_buf_t *
-libxfs_getbuf(dev_t device, xfs_daddr_t blkno, int len)
+static struct xfs_buf *
+__cache_lookup(struct xfs_bufkey *key, unsigned int flags)
  {
-       xfs_buf_t       *bp;
-       xfs_bufkey_t    key;
-       int             miss;
+       struct xfs_buf  *bp;
+
+       cache_node_get(libxfs_bcache, key, (struct cache_node **)&bp);
+       if (!bp)
+               return NULL;
+
+       if (use_xfs_buf_lock) {
+               int ret;
+
+               ret = pthread_mutex_trylock(&bp->b_lock);
+               if (ret) {
+                       ASSERT(ret == EAGAIN);
+                       if (flags & LIBXFS_GETBUF_TRYLOCK)
+                               goto out_put;
+
+                       if (pthread_equal(bp->b_holder, pthread_self())) {
+                               fprintf(stderr,
+       _("Warning: recursive buffer locking at block %" PRIu64 " detected\n"),
+                                       key->blkno);
+                               bp->b_recur++;
+                               return bp;
+                       } else {
+                               pthread_mutex_lock(&bp->b_lock);
+                       }
+               }
  
-       key.device = device;
-       key.blkno = blkno;
-       key.bblen = len;
+               bp->b_holder = pthread_self();
+       }
  
-       miss = cache_node_get(libxfs_bcache, &key, (struct cache_node **)&bp);
-       if (bp) {
-               if (use_xfs_buf_lock)
-                       pthread_mutex_lock(&bp->b_lock);
-               cache_node_set_priority(libxfs_bcache, (struct cache_node *)bp,
-                       cache_node_get_priority((struct cache_node *)bp) -
+       cache_node_set_priority(libxfs_bcache, (struct cache_node *)bp,
+               cache_node_get_priority((struct cache_node *)bp) -
                                                 CACHE_PREFETCH_PRIORITY);
  #ifdef XFS_BUF_TRACING
-               pthread_mutex_lock(&libxfs_bcache->c_mutex);
-               lock_buf_count++;
-               list_add(&bp->b_lock_list, &lock_buf_list);
-               pthread_mutex_unlock(&libxfs_bcache->c_mutex);
+       pthread_mutex_lock(&libxfs_bcache->c_mutex);
+       lock_buf_count++;
+       list_add(&bp->b_lock_list, &lock_buf_list);
+       pthread_mutex_unlock(&libxfs_bcache->c_mutex);
  #endif
  #ifdef IO_DEBUG
-               printf("%lx %s: %s buffer %p for bno = %llu\n",
-                       pthread_self(), __FUNCTION__, miss ? "miss" : "hit",
-                       bp, (long long)LIBXFS_BBTOOFF64(blkno));
+       printf("%lx %s: hit buffer %p for bno = 0x%llx/0x%llx\n",
+               pthread_self(), __FUNCTION__,
+               bp, bp->b_bn, (long long)LIBXFS_BBTOOFF64(key->blkno));
  #endif
+
+       return bp;
+out_put:
+       cache_node_put(libxfs_bcache, (struct cache_node *)bp);
+       return NULL;
+}
+
+struct xfs_buf *
+libxfs_getbuf_flags(struct xfs_buftarg *btp, xfs_daddr_t blkno, int len,
+               unsigned int flags)
+{
+       struct xfs_bufkey key = {0};
+
+       key.buftarg = btp;
+       key.blkno = blkno;
+       key.bblen = len;
+
+       return __cache_lookup(&key, flags);
+}
+
+/*
+ * Clean the buffer flags for libxfs_getbuf*(), which wants to return
+ * an unused buffer with clean state.  This prevents CRC errors on a
+ * re-read of a corrupt block that was prefetched and freed.  This
+ * can happen with a massively corrupt directory that is discarded,
+ * but whose blocks are then recycled into expanding lost+found.
+ *
+ * Note however that if the buffer's dirty (prefetch calls getbuf)
+ * we'll leave the state alone because we don't want to discard blocks
+ * that have been fixed.
+ */
+static void
+reset_buf_state(
+       struct xfs_buf  *bp)
+{
+       if (bp && !(bp->b_flags & LIBXFS_B_DIRTY))
+               bp->b_flags &= ~(LIBXFS_B_UNCHECKED | LIBXFS_B_STALE |
+                               LIBXFS_B_UPTODATE);
+}
+
+struct xfs_buf *
+libxfs_getbuf(struct xfs_buftarg *btp, xfs_daddr_t blkno, int len)
+{
+       struct xfs_buf  *bp;
+
+       bp = libxfs_getbuf_flags(btp, blkno, len, 0);
+       reset_buf_state(bp);
+       return bp;
+}
+
+static struct xfs_buf *
+__libxfs_getbuf_map(struct xfs_buftarg *btp, struct xfs_buf_map *map,
+                   int nmaps, int flags)
+{
+       struct xfs_bufkey key = {0};
+       int i;
+
+       if (nmaps == 1)
+               return libxfs_getbuf_flags(btp, map[0].bm_bn, map[0].bm_len,
+                                          flags);
+
+       key.buftarg = btp;
+       key.blkno = map[0].bm_bn;
+       for (i = 0; i < nmaps; i++) {
+               key.bblen += map[i].bm_len;
         }
+       key.map = map;
+       key.nmaps = nmaps;
+
+       return __cache_lookup(&key, flags);
+}
+
+struct xfs_buf *
+libxfs_getbuf_map(struct xfs_buftarg *btp, struct xfs_buf_map *map,
+                 int nmaps, int flags)
+{
+       struct xfs_buf  *bp;
  
+       bp = __libxfs_getbuf_map(btp, map, nmaps, flags);
+       reset_buf_state(bp);
         return bp;
  }
  
  void
  libxfs_putbuf(xfs_buf_t *bp)
  {
+       /*
+        * ensure that any errors on this use of the buffer don't carry
+        * over to the next user.
+        */
+       bp->b_error = 0;
+
  #ifdef XFS_BUF_TRACING
         pthread_mutex_lock(&libxfs_bcache->c_mutex);
         lock_buf_count--;
@@ -421,19 +867,26 @@ libxfs_putbuf(xfs_buf_t *bp)
         list_del_init(&bp->b_lock_list);
         pthread_mutex_unlock(&libxfs_bcache->c_mutex);
  #endif
-       if (use_xfs_buf_lock)
-               pthread_mutex_unlock(&bp->b_lock);
+       if (use_xfs_buf_lock) {
+               if (bp->b_recur) {
+                       bp->b_recur--;
+               } else {
+                       bp->b_holder = 0;
+                       pthread_mutex_unlock(&bp->b_lock);
+               }
+       }
+
         cache_node_put(libxfs_bcache, (struct cache_node *)bp);
  }
  
  void
  libxfs_purgebuf(xfs_buf_t *bp)
  {
-       xfs_bufkey_t    key;
+       struct xfs_bufkey key = {0};
  
-       key.device = bp->b_dev;
-       key.blkno = bp->b_blkno;
-       key.bblen = bp->b_bcount >> BBSHIFT;
+       key.buftarg = bp->b_target;
+       key.blkno = bp->b_bn;
+       key.bblen = bp->b_length;
  
         cache_node_purge(libxfs_bcache, &key, (struct cache_node *)bp);
  }
@@ -441,91 +894,284 @@ libxfs_purgebuf(xfs_buf_t *bp)
  static struct cache_node *
  libxfs_balloc(cache_key_t key)
  {
-       xfs_bufkey_t    *bufkey = (xfs_bufkey_t *)key;
+       struct xfs_bufkey *bufkey = (struct xfs_bufkey *)key;
+
+       if (bufkey->map)
+               return (struct cache_node *)
+                      libxfs_getbufr_map(bufkey->buftarg,
+                                         bufkey->blkno, bufkey->bblen,
+                                         bufkey->map, bufkey->nmaps);
+       return (struct cache_node *)libxfs_getbufr(bufkey->buftarg,
+                                         bufkey->blkno, bufkey->bblen);
+}
+
  
-       return (struct cache_node *)libxfs_getbufr(bufkey->device,
-                                       bufkey->blkno, bufkey->bblen);
+static int
+__read_buf(int fd, void *buf, int len, off64_t offset, int flags)
+{
+       int     sts;
+
+       sts = pread(fd, buf, len, offset);
+       if (sts < 0) {
+               int error = errno;
+               fprintf(stderr, _("%s: read failed: %s\n"),
+                       progname, strerror(error));
+               if (flags & LIBXFS_EXIT_ON_FAILURE)
+                       exit(1);
+               return -error;
+       } else if (sts != len) {
+               fprintf(stderr, _("%s: error - read only %d of %d bytes\n"),
+                       progname, sts, len);
+               if (flags & LIBXFS_EXIT_ON_FAILURE)
+                       exit(1);
+               return -EIO;
+       }
+       return 0;
  }
  
  int
-libxfs_readbufr(dev_t dev, xfs_daddr_t blkno, xfs_buf_t *bp, int len, int flags)
+libxfs_readbufr(struct xfs_buftarg *btp, xfs_daddr_t blkno, xfs_buf_t *bp,
+               int len, int flags)
  {
-       int     fd = libxfs_device_to_fd(dev);
+       int     fd = libxfs_device_to_fd(btp->dev);
         int     bytes = BBTOB(len);
+       int     error;
  
         ASSERT(BBTOB(len) <= bp->b_bcount);
  
-       if (pread64(fd, bp->b_addr, bytes, LIBXFS_BBTOOFF64(blkno)) < 0) {
-               fprintf(stderr, _("%s: read failed: %s\n"),
-                       progname, strerror(errno));
-               if (flags & LIBXFS_EXIT_ON_FAILURE)
-                       exit(1);
-               return errno;
-       }
+       error = __read_buf(fd, bp->b_addr, bytes, LIBXFS_BBTOOFF64(blkno), flags);
+       if (!error &&
+           bp->b_target->dev == btp->dev &&
+           bp->b_bn == blkno &&
+           bp->b_bcount == bytes)
+               bp->b_flags |= LIBXFS_B_UPTODATE;
  #ifdef IO_DEBUG
-       printf("%lx: %s: read %u bytes, blkno=%llu(%llu), %p\n",
-               pthread_self(), __FUNCTION__, bytes,
+       printf("%lx: %s: read %u bytes, error %d, blkno=0x%llx(0x%llx), %p\n",
+               pthread_self(), __FUNCTION__, bytes, error,
                 (long long)LIBXFS_BBTOOFF64(blkno), (long long)blkno, bp);
  #endif
-       if (bp->b_dev == dev &&
-           bp->b_blkno == blkno &&
-           bp->b_bcount == bytes)
-               bp->b_flags |= LIBXFS_B_UPTODATE;
-       return 0;
+       return error;
+}
+
+void
+libxfs_readbuf_verify(struct xfs_buf *bp, const struct xfs_buf_ops *ops)
+{
+       if (!ops)
+               return;
+       bp->b_ops = ops;
+       bp->b_ops->verify_read(bp);
+       bp->b_flags &= ~LIBXFS_B_UNCHECKED;
  }
  
+
  xfs_buf_t *
-libxfs_readbuf(dev_t dev, xfs_daddr_t blkno, int len, int flags)
+libxfs_readbuf(struct xfs_buftarg *btp, xfs_daddr_t blkno, int len, int flags,
+               const struct xfs_buf_ops *ops)
  {
         xfs_buf_t       *bp;
         int             error;
  
-       bp = libxfs_getbuf(dev, blkno, len);
-       if (bp && !(bp->b_flags & (LIBXFS_B_UPTODATE|LIBXFS_B_DIRTY))) {
-               error = libxfs_readbufr(dev, blkno, bp, len, flags);
+       bp = libxfs_getbuf_flags(btp, blkno, len, 0);
+       if (!bp)
+               return NULL;
+
+       /*
+        * if the buffer was prefetched, it is likely that it was not validated.
+        * Hence if we are supplied an ops function and the buffer is marked as
+        * unchecked, we need to validate it now.
+        *
+        * We do this verification even if the buffer is dirty - the
+        * verification is almost certainly going to fail the CRC check in this
+        * case as a dirty buffer has not had the CRC recalculated. However, we
+        * should never be dirtying unchecked buffers, so a verification
+        * failure here on a dirty, unchecked buffer indicates we've screwed up
+        * somewhere else.
+        */
+       bp->b_error = 0;
+       if ((bp->b_flags & (LIBXFS_B_UPTODATE|LIBXFS_B_DIRTY))) {
+               if (bp->b_flags & LIBXFS_B_UNCHECKED)
+                       libxfs_readbuf_verify(bp, ops);
+               return bp;
+       }
+
+       /*
+        * Set the ops on a cache miss (i.e. first physical read) as the
+        * verifier may change the ops to match the type of buffer it contains.
+        * A cache hit might reset the verifier to the original type if we set
+        * it again, but it won't get called again and set to match the buffer
+        * contents. *cough* xfs_da_node_buf_ops *cough*.
+        */
+       error = libxfs_readbufr(btp, blkno, bp, len, flags);
+       if (error)
+               bp->b_error = error;
+       else
+               libxfs_readbuf_verify(bp, ops);
+       return bp;
+}
+
+int
+libxfs_readbufr_map(struct xfs_buftarg *btp, struct xfs_buf *bp, int flags)
+{
+       int     fd;
+       int     error = 0;
+       void    *buf;
+       int     i;
+
+       fd = libxfs_device_to_fd(btp->dev);
+       buf = bp->b_addr;
+       for (i = 0; i < bp->b_nmaps; i++) {
+               off64_t offset = LIBXFS_BBTOOFF64(bp->b_maps[i].bm_bn);
+               int len = BBTOB(bp->b_maps[i].bm_len);
+
+               error = __read_buf(fd, buf, len, offset, flags);
                 if (error) {
-                       libxfs_putbuf(bp);
-                       return NULL;
+                       bp->b_error = error;
+                       break;
                 }
+               buf += len;
         }
+
+       if (!error)
+               bp->b_flags |= LIBXFS_B_UPTODATE;
+#ifdef IO_DEBUG
+       printf("%lx: %s: read %lu bytes, error %d, blkno=%llu(%llu), %p\n",
+               pthread_self(), __FUNCTION__, buf - (char *)bp->b_addr, error,
+               (long long)LIBXFS_BBTOOFF64(bp->b_bn), (long long)bp->b_bn, bp);
+#endif
+       return error;
+}
+
+struct xfs_buf *
+libxfs_readbuf_map(struct xfs_buftarg *btp, struct xfs_buf_map *map, int nmaps,
+               int flags, const struct xfs_buf_ops *ops)
+{
+       struct xfs_buf  *bp;
+       int             error = 0;
+
+       if (nmaps == 1)
+               return libxfs_readbuf(btp, map[0].bm_bn, map[0].bm_len,
+                                       flags, ops);
+
+       bp = __libxfs_getbuf_map(btp, map, nmaps, 0);
+       if (!bp)
+               return NULL;
+
+       bp->b_error = 0;
+       if ((bp->b_flags & (LIBXFS_B_UPTODATE|LIBXFS_B_DIRTY))) {
+               if (bp->b_flags & LIBXFS_B_UNCHECKED)
+                       libxfs_readbuf_verify(bp, ops);
+               return bp;
+       }
+       error = libxfs_readbufr_map(btp, bp, flags);
+       if (!error)
+               libxfs_readbuf_verify(bp, ops);
+
+#ifdef IO_DEBUGX
+       printf("%lx: %s: read %lu bytes, error %d, blkno=%llu(%llu), %p\n",
+               pthread_self(), __FUNCTION__, buf - (char *)bp->b_addr, error,
+               (long long)LIBXFS_BBTOOFF64(bp->b_bn), (long long)bp->b_bn, bp);
+#endif
         return bp;
  }
  
-int
-libxfs_writebufr(xfs_buf_t *bp)
+static int
+__write_buf(int fd, void *buf, int len, off64_t offset, int flags)
  {
         int     sts;
-       int     fd = libxfs_device_to_fd(bp->b_dev);
  
-       sts = pwrite64(fd, bp->b_addr, bp->b_bcount, LIBXFS_BBTOOFF64(bp->b_blkno));
+       sts = pwrite(fd, buf, len, offset);
         if (sts < 0) {
-               fprintf(stderr, _("%s: pwrite64 failed: %s\n"),
-                       progname, strerror(errno));
-               if (bp->b_flags & LIBXFS_B_EXIT)
+               int error = errno;
+               fprintf(stderr, _("%s: pwrite failed: %s\n"),
+                       progname, strerror(error));
+               if (flags & LIBXFS_B_EXIT)
                         exit(1);
-               return errno;
-       }
-       else if (sts != bp->b_bcount) {
-               fprintf(stderr, _("%s: error - wrote only %d of %d bytes\n"),
-                       progname, sts, bp->b_bcount);
-               if (bp->b_flags & LIBXFS_B_EXIT)
+               return -error;
+       } else if (sts != len) {
+               fprintf(stderr, _("%s: error - pwrite only %d of %d bytes\n"),
+                       progname, sts, len);
+               if (flags & LIBXFS_B_EXIT)
                         exit(1);
-               return EIO;
+               return -EIO;
+       }
+       return 0;
+}
+
+int
+libxfs_writebufr(xfs_buf_t *bp)
+{
+       int     fd = libxfs_device_to_fd(bp->b_target->dev);
+
+       /*
+        * we never write buffers that are marked stale. This indicates they
+        * contain data that has been invalidated, and even if the buffer is
+        * dirty it must *never* be written. Verifiers are wonderful for finding
+        * bugs like this. Make sure the error is obvious as to the cause.
+        */
+       if (bp->b_flags & LIBXFS_B_STALE) {
+               bp->b_error = -ESTALE;
+               return bp->b_error;
         }
+
+       /*
+        * clear any pre-existing error status on the buffer. This can occur if
+        * the buffer is corrupt on disk and the repair process doesn't clear
+        * the error before fixing and writing it back.
+        */
+       bp->b_error = 0;
+       if (bp->b_ops) {
+               bp->b_ops->verify_write(bp);
+               if (bp->b_error) {
+                       fprintf(stderr,
+       _("%s: write verifer failed on %s bno 0x%llx/0x%x\n"),
+                               __func__, bp->b_ops->name,
+                               (long long)bp->b_bn, bp->b_bcount);
+                       return bp->b_error;
+               }
+       }
+
+       if (!(bp->b_flags & LIBXFS_B_DISCONTIG)) {
+               bp->b_error = __write_buf(fd, bp->b_addr, bp->b_bcount,
+                                   LIBXFS_BBTOOFF64(bp->b_bn), bp->b_flags);
+       } else {
+               int     i;
+               void    *buf = bp->b_addr;
+
+               for (i = 0; i < bp->b_nmaps; i++) {
+                       off64_t offset = LIBXFS_BBTOOFF64(bp->b_maps[i].bm_bn);
+                       int len = BBTOB(bp->b_maps[i].bm_len);
+
+                       bp->b_error = __write_buf(fd, buf, len, offset,
+                                                 bp->b_flags);
+                       if (bp->b_error)
+                               break;
+                       buf += len;
+               }
+       }
+
  #ifdef IO_DEBUG
-       printf("%lx: %s: wrote %u bytes, blkno=%llu(%llu), %p\n",
+       printf("%lx: %s: wrote %u bytes, blkno=%llu(%llu), %p, error %d\n",
                         pthread_self(), __FUNCTION__, bp->b_bcount,
-                       (long long)LIBXFS_BBTOOFF64(bp->b_blkno),
-                       (long long)bp->b_blkno, bp);
+                       (long long)LIBXFS_BBTOOFF64(bp->b_bn),
+                       (long long)bp->b_bn, bp, bp->b_error);
  #endif
-       bp->b_flags |= LIBXFS_B_UPTODATE;
-       bp->b_flags &= ~(LIBXFS_B_DIRTY | LIBXFS_B_EXIT);
-       return 0;
+       if (!bp->b_error) {
+               bp->b_flags |= LIBXFS_B_UPTODATE;
+               bp->b_flags &= ~(LIBXFS_B_DIRTY | LIBXFS_B_EXIT |
+                                LIBXFS_B_UNCHECKED);
+       }
+       return bp->b_error;
  }
  
  int
  libxfs_writebuf_int(xfs_buf_t *bp, int flags)
  {
+       /*
+        * Clear any error hanging over from reading the buffer. This prevents
+        * subsequent reads after this write from seeing stale errors.
+        */
+       bp->b_error = 0;
+       bp->b_flags &= ~LIBXFS_B_STALE;
         bp->b_flags |= (LIBXFS_B_DIRTY | flags);
         return 0;
  }
@@ -533,6 +1179,18 @@ libxfs_writebuf_int(xfs_buf_t *bp, int flags)
  int
  libxfs_writebuf(xfs_buf_t *bp, int flags)
  {
+#ifdef IO_DEBUG
+       printf("%lx: %s: dirty blkno=%llu(%llu)\n",
+                       pthread_self(), __FUNCTION__,
+                       (long long)LIBXFS_BBTOOFF64(bp->b_bn),
+                       (long long)bp->b_bn);
+#endif
+       /*
+        * Clear any error hanging over from reading the buffer. This prevents
+        * subsequent reads after this write from seeing stale errors.
+        */
+       bp->b_error = 0;
+       bp->b_flags &= ~LIBXFS_B_STALE;
         bp->b_flags |= (LIBXFS_B_DIRTY | flags);
         libxfs_putbuf(bp);
         return 0;
@@ -544,8 +1202,8 @@ libxfs_iomove(xfs_buf_t *bp, uint boff, int len, void *data, int flags)
  #ifdef IO_DEBUG
         if (boff + len > bp->b_bcount) {
                 printf("Badness, iomove out of range!\n"
-                       "bp=(bno %llu, bytes %u) range=(boff %u, bytes %u)\n",
-                       (long long)bp->b_blkno, bp->b_bcount, boff, len);
+                       "bp=(bno 0x%llx, bytes %u) range=(boff %u, bytes %u)\n",
+                       (long long)bp->b_bn, bp->b_bcount, boff, len);
                 abort();
         }
  #endif
@@ -563,23 +1221,26 @@ libxfs_iomove(xfs_buf_t *bp, uint boff, int len, void *data, int flags)
  }
  
  static void
-libxfs_brelse(struct cache_node *node)
+libxfs_brelse(
+       struct cache_node       *node)
  {
-       xfs_buf_t               *bp = (xfs_buf_t *)node;
+       struct xfs_buf          *bp = (struct xfs_buf *)node;
  
-       if (bp != NULL) {
-               if (bp->b_flags & LIBXFS_B_DIRTY)
-                       libxfs_writebufr(bp);
-               pthread_mutex_lock(&xfs_buf_freelist.cm_mutex);
-               list_add(&bp->b_node.cn_mru, &xfs_buf_freelist.cm_list);
-               pthread_mutex_unlock(&xfs_buf_freelist.cm_mutex);
-       }
+       if (!bp)
+               return;
+       if (bp->b_flags & LIBXFS_B_DIRTY)
+               fprintf(stderr,
+                       "releasing dirty buffer to free list!");
+
+       pthread_mutex_lock(&xfs_buf_freelist.cm_mutex);
+       list_add(&bp->b_node.cn_mru, &xfs_buf_freelist.cm_list);
+       pthread_mutex_unlock(&xfs_buf_freelist.cm_mutex);
  }
  
  static unsigned int
  libxfs_bulkrelse(
-       struct cache            *cache,
-       struct list_head        *list)
+       struct cache            *cache,
+       struct list_head        *list)
  {
         xfs_buf_t               *bp;
         int                     count = 0;
@@ -589,29 +1250,59 @@ libxfs_bulkrelse(
  
         list_for_each_entry(bp, list, b_node.cn_mru) {
                 if (bp->b_flags & LIBXFS_B_DIRTY)
-                       libxfs_writebufr(bp);
+                       fprintf(stderr,
+                               "releasing dirty buffer (bulk) to free list!");
                 count++;
         }
  
         pthread_mutex_lock(&xfs_buf_freelist.cm_mutex);
-       __list_splice(list, &xfs_buf_freelist.cm_list);
+       list_splice(list, &xfs_buf_freelist.cm_list);
         pthread_mutex_unlock(&xfs_buf_freelist.cm_mutex);
  
         return count;
  }
  
-static void
-libxfs_bflush(struct cache_node *node)
+/*
+ * Free everything from the xfs_buf_freelist MRU, used at final teardown
+ */
+void
+libxfs_bcache_free(void)
  {
-       xfs_buf_t               *bp = (xfs_buf_t *)node;
+       struct list_head        *cm_list;
+       xfs_buf_t               *bp, *next;
+
+       cm_list = &xfs_buf_freelist.cm_list;
+       list_for_each_entry_safe(bp, next, cm_list, b_node.cn_mru) {
+               free(bp->b_addr);
+               if (bp->b_maps != &bp->__b_map)
+                       free(bp->b_maps);
+               kmem_zone_free(xfs_buf_zone, bp);
+       }
+}
  
-       if ((bp != NULL) && (bp->b_flags & LIBXFS_B_DIRTY))
-               libxfs_writebufr(bp);
+/*
+ * When a buffer is marked dirty, the error is cleared. Hence if we are trying
+ * to flush a buffer prior to cache reclaim that has an error on it, it means
+ * we've already tried to flush it and it failed. Prevent repeated corruption
+ * errors from being reported by skipping such buffers - when the corruption is
+ * fixed the buffer will be marked dirty again and we can write it again.
+ */
+static int
+libxfs_bflush(
+       struct cache_node       *node)
+{
+       struct xfs_buf          *bp = (struct xfs_buf *)node;
+
+       if (!bp->b_error && bp->b_flags & LIBXFS_B_DIRTY)
+               return libxfs_writebufr(bp);
+       return bp->b_error;
  }
  
  void
  libxfs_putbufr(xfs_buf_t *bp)
  {
+       if (bp->b_flags & LIBXFS_B_DIRTY)
+               libxfs_writebufr(bp);
         libxfs_brelse((struct cache_node *)bp);
  }
  
@@ -635,179 +1326,102 @@ libxfs_bcache_overflowed(void)
  }
  
  struct cache_operations libxfs_bcache_operations = {
-       /* .hash */     libxfs_bhash,
-       /* .alloc */    libxfs_balloc,
-       /* .flush */    libxfs_bflush,
-       /* .relse */    libxfs_brelse,
-       /* .compare */  libxfs_bcompare,
-       /* .bulkrelse */libxfs_bulkrelse
+       .hash           = libxfs_bhash,
+       .alloc          = libxfs_balloc,
+       .flush          = libxfs_bflush,
+       .relse          = libxfs_brelse,
+       .compare        = libxfs_bcompare,
+       .bulkrelse      = libxfs_bulkrelse
  };
  
  
  /*
- * Simple memory interface
+ * Inode cache stubs.
   */
  
-xfs_zone_t *
-libxfs_zone_init(int size, char *name)
-{
-       xfs_zone_t      *ptr;
-
-       if ((ptr = malloc(sizeof(xfs_zone_t))) == NULL) {
-               fprintf(stderr, _("%s: zone init failed (%s, %d bytes): %s\n"),
-                       progname, name, (int)sizeof(xfs_zone_t), strerror(errno));
-               exit(1);
-       }
-       ptr->zone_unitsize = size;
-       ptr->zone_name = name;
-#ifdef MEM_DEBUG
-       ptr->allocated = 0;
-       fprintf(stderr, "new zone %p for \"%s\", size=%d\n", ptr, name, size);
-#endif
-       return ptr;
-}
+kmem_zone_t            *xfs_inode_zone;
+extern kmem_zone_t     *xfs_ili_zone;
  
-void *
-libxfs_zone_zalloc(xfs_zone_t *z)
+/*
+ * If there are inline format data / attr forks attached to this inode,
+ * make sure they're not corrupt.
+ */
+bool
+libxfs_inode_verify_forks(
+       struct xfs_inode        *ip,
+       struct xfs_ifork_ops    *ops)
  {
-       void    *ptr;
-
-       if ((ptr = calloc(z->zone_unitsize, 1)) == NULL) {
-               fprintf(stderr, _("%s: zone calloc failed (%s, %d bytes): %s\n"),
-                       progname, z->zone_name, z->zone_unitsize,
-                       strerror(errno));
-               exit(1);
+       struct xfs_ifork        *ifp;
+       xfs_failaddr_t          fa;
+
+       if (!ops)
+               return true;
+
+       fa = xfs_ifork_verify_data(ip, ops);
+       if (fa) {
+               ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+               xfs_inode_verifier_error(ip, -EFSCORRUPTED, "data fork",
+                               ifp->if_u1.if_data, ifp->if_bytes, fa);
+               return false;
         }
-#ifdef MEM_DEBUG
-       z->allocated++;
-       fprintf(stderr, "## zone alloc'd item %p from %s (%d bytes) (%d active)\n",
-               ptr, z->zone_name,  z->zone_unitsize,
-               z->allocated);
-#endif
-       return ptr;
-}
  
-void
-libxfs_zone_free(xfs_zone_t *z, void *ptr)
-{
-#ifdef MEM_DEBUG
-       z->allocated--;
-       fprintf(stderr, "## zone freed item %p from %s (%d bytes) (%d active)\n",
-               ptr, z->zone_name, z->zone_unitsize,
-               z->allocated);
-#endif
-       if (ptr != NULL) {
-               free(ptr);
-               ptr = NULL;
+       fa = xfs_ifork_verify_attr(ip, ops);
+       if (fa) {
+               ifp = XFS_IFORK_PTR(ip, XFS_ATTR_FORK);
+               xfs_inode_verifier_error(ip, -EFSCORRUPTED, "attr fork",
+                               ifp ? ifp->if_u1.if_data : NULL,
+                               ifp ? ifp->if_bytes : 0, fa);
+               return false;
         }
+       return true;
  }
  
-void *
-libxfs_malloc(size_t size)
+int
+libxfs_iget(
+       struct xfs_mount        *mp,
+       struct xfs_trans        *tp,
+       xfs_ino_t               ino,
+       uint                    lock_flags,
+       struct xfs_inode        **ipp,
+       struct xfs_ifork_ops    *ifork_ops)
  {
-       void    *ptr;
-
-       if ((ptr = calloc(1, size)) == NULL) {
-               fprintf(stderr, _("%s: calloc failed (%d bytes): %s\n"),
-                       progname, (int)size, strerror(errno));
-               exit(1);
+       struct xfs_inode        *ip;
+       int                     error = 0;
+
+       ip = kmem_zone_zalloc(xfs_inode_zone, 0);
+       if (!ip)
+               return -ENOMEM;
+
+       ip->i_ino = ino;
+       ip->i_mount = mp;
+       error = xfs_iread(mp, tp, ip, 0);
+       if (error) {
+               kmem_zone_free(xfs_inode_zone, ip);
+               *ipp = NULL;
+               return error;
         }
-#ifdef MEM_DEBUG
-       fprintf(stderr, "## calloc'd item %p size %d bytes\n", ptr, size);
-#endif
-       return ptr;
-}
  
-void
-libxfs_free(void *ptr)
-{
-#ifdef MEM_DEBUG
-       fprintf(stderr, "## freed item %p\n", ptr);
-#endif
-       if (ptr != NULL) {
-               free(ptr);
-               ptr = NULL;
+       if (!libxfs_inode_verify_forks(ip, ifork_ops)) {
+               libxfs_iput(ip);
+               return -EFSCORRUPTED;
         }
-}
  
-void *
-libxfs_realloc(void *ptr, size_t size)
-{
-#ifdef MEM_DEBUG
-       void *optr=ptr;
-#endif
-       if ((ptr = realloc(ptr, size)) == NULL) {
-               fprintf(stderr, _("%s: realloc failed (%d bytes): %s\n"),
-                       progname, (int)size, strerror(errno));
-               exit(1);
-       }
-#ifdef MEM_DEBUG
-       fprintf(stderr, "## realloc'd item %p now %p size %d bytes\n",
-               optr, ptr, size);
-#endif
-       return ptr;
-}
-
-
-/*
- * Inode cache interfaces
- */
-
-extern xfs_zone_t      *xfs_ili_zone;
-extern xfs_zone_t      *xfs_inode_zone;
-
-static unsigned int
-libxfs_ihash(cache_key_t key, unsigned int hashsize)
-{
-       return ((unsigned int)*(xfs_ino_t *)key) % hashsize;
-}
-
-static int
-libxfs_icompare(struct cache_node *node, cache_key_t key)
-{
-       xfs_inode_t     *ip = (xfs_inode_t *)node;
-
-       return (ip->i_ino == *(xfs_ino_t *)key);
-}
-
-int
-libxfs_iget(xfs_mount_t *mp, xfs_trans_t *tp, xfs_ino_t ino, uint lock_flags,
-               xfs_inode_t **ipp, xfs_daddr_t bno)
-{
-       xfs_inode_t     *ip;
-       int             error = 0;
+       /*
+        * set up the inode ops structure that the libxfs code relies on
+        */
+       if (XFS_ISDIR(ip))
+               ip->d_ops = mp->m_dir_inode_ops;
+       else
+               ip->d_ops = mp->m_nondir_inode_ops;
  
-       if (cache_node_get(libxfs_icache, &ino, (struct cache_node **)&ip)) {
-#ifdef INO_DEBUG
-               fprintf(stderr, "%s: allocated inode, ino=%llu(%llu), %p\n",
-                       __FUNCTION__, (unsigned long long)ino, bno, ip);
-#endif
-               if ((error = libxfs_iread(mp, tp, ino, ip, bno))) {
-                       cache_node_purge(libxfs_icache, &ino,
-                                       (struct cache_node *)ip);
-                       ip = NULL;
-               }
-       }
         *ipp = ip;
-       return error;
-}
-
-void
-libxfs_iput(xfs_inode_t *ip, uint lock_flags)
-{
-       cache_node_put(libxfs_icache, (struct cache_node *)ip);
-}
-
-static struct cache_node *
-libxfs_ialloc(cache_key_t key)
-{
-       return libxfs_zone_zalloc(xfs_inode_zone);
+       return 0;
  }
  
  static void
  libxfs_idestroy(xfs_inode_t *ip)
  {
-       switch (ip->i_d.di_mode & S_IFMT) {
+       switch (VFS_I(ip)->i_mode & S_IFMT) {
                 case S_IFREG:
                 case S_IFDIR:
                 case S_IFLNK:
@@ -816,34 +1430,16 @@ libxfs_idestroy(xfs_inode_t *ip)
         }
         if (ip->i_afp)
                 libxfs_idestroy_fork(ip, XFS_ATTR_FORK);
-}
-
-static void
-libxfs_irelse(struct cache_node *node)
-{
-       xfs_inode_t     *ip = (xfs_inode_t *)node;
-
-       if (ip != NULL) {
-               if (ip->i_itemp)
-                       libxfs_zone_free(xfs_ili_zone, ip->i_itemp);
-               ip->i_itemp = NULL;
-               libxfs_idestroy(ip);
-               libxfs_zone_free(xfs_inode_zone, ip);
-               ip = NULL;
-       }
+       if (ip->i_cowfp)
+               xfs_idestroy_fork(ip, XFS_COW_FORK);
  }
  
  void
-libxfs_icache_purge(void)
+libxfs_iput(xfs_inode_t *ip)
  {
-       cache_purge(libxfs_icache);
+       if (ip->i_itemp)
+               kmem_zone_free(xfs_ili_zone, ip->i_itemp);
+       ip->i_itemp = NULL;
+       libxfs_idestroy(ip);
+       kmem_zone_free(xfs_inode_zone, ip);
  }
-
-struct cache_operations libxfs_icache_operations = {
-       /* .hash */     libxfs_ihash,
-       /* .alloc */    libxfs_ialloc,
-       /* .flush */    NULL,
-       /* .relse */    libxfs_irelse,
-       /* .compare */  libxfs_icompare,
-       /* .bulkrelse */ NULL
-};