+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2000-2006 Silicon Graphics, Inc.
* All Rights Reserved.
+ */
+
+
+#include "libxfs_priv.h"
+#include "init.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode_buf.h"
+#include "xfs_inode_fork.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+
+#include "libxfs.h" /* for LIBXFS_EXIT_ON_FAILURE */
+
+/*
+ * Important design/architecture note:
+ *
+ * The userspace code that uses the buffer cache is much less constrained than
+ * the kernel code. The userspace code is pretty nasty in places, especially
+ * when it comes to buffer error handling. Very little of the userspace code
+ * outside libxfs clears bp->b_error - very little code even checks it - so the
+ * libxfs code is tripping on stale errors left by the userspace code.
+ *
+ * We can't clear errors or zero buffer contents in libxfs_getbuf-* like we do
+ * in the kernel, because those functions are used by the libxfs_readbuf_*
+ * functions and hence need to leave the buffers unchanged on cache hits. This
+ * is actually the only way to gather a write error from a libxfs_writebuf()
+ * call - you need to get the buffer again so you can check bp->b_error field -
+ * assuming that the buffer is still in the cache when you check, that is.
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
+ * This is very different to the kernel code which does not release buffers on a
+ * write so we can wait on IO and check errors. The kernel buffer cache also
+ * guarantees a buffer of a known initial state from xfs_buf_get() even on a
+ * cache hit.
*
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
+ * IOWs, userspace is behaving quite differently to the kernel and as a result
+ * it leaks errors from reads, invalidations and writes through
+ * libxfs_getbuf/libxfs_readbuf.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ * The result of this is that until the userspace code outside libxfs is cleaned
+ * up, functions that release buffers from userspace control (i.e.
+ * libxfs_writebuf/libxfs_putbuf) need to zero bp->b_error to prevent
+ * propagation of stale errors into future buffer operations.
*/
-#include <xfs/libxfs.h>
-#include "init.h"
-
#define BDSTRAT_SIZE (256 * 1024)
#define IO_BCOMPARE_CHECK
-void
+/* XXX: (dgc) Propagate errors, only exit if fail-on-error flag set */
+int
libxfs_device_zero(struct xfs_buftarg *btp, xfs_daddr_t start, uint len)
{
xfs_off_t start_offset, end_offset, offset;
fd = libxfs_device_to_fd(btp->dev);
start_offset = LIBXFS_BBTOOFF64(start);
- if ((lseek64(fd, start_offset, SEEK_SET)) < 0) {
+ if ((lseek(fd, start_offset, SEEK_SET)) < 0) {
fprintf(stderr, _("%s: %s seek to offset %llu failed: %s\n"),
progname, __FUNCTION__,
(unsigned long long)start_offset, strerror(errno));
offset += bytes;
}
free(z);
+ return 0;
}
static void unmount_record(void *p)
xlog_op_header_t *op = (xlog_op_header_t *)p;
/* the data section must be 32 bit size aligned */
struct {
- __uint16_t magic;
- __uint16_t pad1;
- __uint32_t pad2; /* may as well make it 64 bits */
+ uint16_t magic;
+ uint16_t pad1;
+ uint32_t pad2; /* may as well make it 64 bits */
} magic = { XLOG_UNMOUNT_TYPE, 0, 0 };
memset(p, 0, BBSIZE);
- op->oh_tid = cpu_to_be32(1);
+ /* dummy tid to mark this as written from userspace */
+ op->oh_tid = cpu_to_be32(0xb0c0d0d0);
op->oh_len = cpu_to_be32(sizeof(magic));
op->oh_clientid = XFS_LOG;
op->oh_flags = XLOG_UNMOUNT_TRANS;
memcpy((char *)p + sizeof(xlog_op_header_t), &magic, sizeof(magic));
}
-static xfs_caddr_t next(xfs_caddr_t ptr, int offset, void *private)
+static char *next(
+ char *ptr,
+ int offset,
+ void *private)
{
- xfs_buf_t *buf = (xfs_buf_t *)private;
+ struct xfs_buf *buf = (struct xfs_buf *)private;
- if (XFS_BUF_COUNT(buf) < (int)(ptr - XFS_BUF_PTR(buf)) + offset)
+ if (buf &&
+ (buf->b_bcount < (int)(ptr - (char *)buf->b_addr) + offset))
abort();
+
return ptr + offset;
}
+/*
+ * Format the log. The caller provides either a buftarg which is used to access
+ * the log via buffers or a direct pointer to a buffer that encapsulates the
+ * entire log.
+ */
int
libxfs_log_clear(
struct xfs_buftarg *btp,
+ char *dptr,
xfs_daddr_t start,
- uint length,
+ uint length, /* basic blocks */
uuid_t *fs_uuid,
int version,
- int sunit,
- int fmt)
+ int sunit, /* bytes */
+ int fmt,
+ int cycle,
+ bool max)
{
- xfs_buf_t *bp;
+ struct xfs_buf *bp = NULL;
int len;
-
- if (!btp->dev || !fs_uuid)
+ xfs_lsn_t lsn;
+ xfs_lsn_t tail_lsn;
+ xfs_daddr_t blk;
+ xfs_daddr_t end_blk;
+ char *ptr;
+
+ if (((btp && dptr) || (!btp && !dptr)) ||
+ (btp && !btp->dev) || !fs_uuid)
return -EINVAL;
/* first zero the log */
- libxfs_device_zero(btp, start, length);
+ if (btp)
+ libxfs_device_zero(btp, start, length);
+ else
+ memset(dptr, 0, BBTOB(length));
- /* then write a log record header */
+ /*
+ * Initialize the log record length and LSNs. XLOG_INIT_CYCLE is a
+ * special reset case where we only write a single record where the lsn
+ * and tail_lsn match. Otherwise, the record lsn starts at block 0 of
+ * the specified cycle and points tail_lsn at the last record of the
+ * previous cycle.
+ */
len = ((version == 2) && sunit) ? BTOBB(sunit) : 2;
- len = MAX(len, 2);
- bp = libxfs_getbufr(btp, start, len);
- libxfs_log_header(XFS_BUF_PTR(bp),
- fs_uuid, version, sunit, fmt, next, bp);
- bp->b_flags |= LIBXFS_B_DIRTY;
- libxfs_putbufr(bp);
+ len = max(len, 2);
+ lsn = xlog_assign_lsn(cycle, 0);
+ if (cycle == XLOG_INIT_CYCLE)
+ tail_lsn = lsn;
+ else
+ tail_lsn = xlog_assign_lsn(cycle - 1, length - len);
+
+ /* write out the first log record */
+ ptr = dptr;
+ if (btp) {
+ bp = libxfs_getbufr(btp, start, len);
+ ptr = bp->b_addr;
+ }
+ libxfs_log_header(ptr, fs_uuid, version, sunit, fmt, lsn, tail_lsn,
+ next, bp);
+ if (bp) {
+ bp->b_flags |= LIBXFS_B_DIRTY;
+ libxfs_putbufr(bp);
+ }
+
+ /*
+ * There's nothing else to do if this is a log reset. The kernel detects
+ * the rest of the log is zeroed and starts at cycle 1.
+ */
+ if (cycle == XLOG_INIT_CYCLE)
+ return 0;
+
+ /*
+ * Bump the record size for a full log format if the caller allows it.
+ * This is primarily for performance reasons and most callers don't care
+ * about record size since the log is clean after we're done.
+ */
+ if (max)
+ len = BTOBB(BDSTRAT_SIZE);
+
+ /*
+ * Otherwise, fill everything beyond the initial record with records of
+ * the previous cycle so the kernel head/tail detection works correctly.
+ *
+ * We don't particularly care about the record size or content here.
+ * It's only important that the headers are in place such that the
+ * kernel finds 1.) a clean log and 2.) the correct current cycle value.
+ * Therefore, bump up the record size to the max to use larger I/Os and
+ * improve performance.
+ */
+ cycle--;
+ blk = start + len;
+ if (dptr)
+ dptr += BBTOB(len);
+ end_blk = start + length;
+
+ len = min(end_blk - blk, len);
+ while (blk < end_blk) {
+ lsn = xlog_assign_lsn(cycle, blk - start);
+ tail_lsn = xlog_assign_lsn(cycle, blk - start - len);
+
+ ptr = dptr;
+ if (btp) {
+ bp = libxfs_getbufr(btp, blk, len);
+ ptr = bp->b_addr;
+ }
+ /*
+ * Note: pass the full buffer length as the sunit to initialize
+ * the entire buffer.
+ */
+ libxfs_log_header(ptr, fs_uuid, version, BBTOB(len), fmt, lsn,
+ tail_lsn, next, bp);
+ if (bp) {
+ bp->b_flags |= LIBXFS_B_DIRTY;
+ libxfs_putbufr(bp);
+ }
+
+ blk += len;
+ if (dptr)
+ dptr += BBTOB(len);
+ len = min(end_blk - blk, len);
+ }
+
return 0;
}
int
libxfs_log_header(
- xfs_caddr_t caddr,
+ char *caddr,
uuid_t *fs_uuid,
int version,
int sunit,
int fmt,
+ xfs_lsn_t lsn,
+ xfs_lsn_t tail_lsn,
libxfs_get_block_t *nextfunc,
void *private)
{
xlog_rec_header_t *head = (xlog_rec_header_t *)caddr;
- xfs_caddr_t p = caddr;
+ char *p = caddr;
__be32 cycle_lsn;
int i, len;
+ int hdrs = 1;
+
+ if (lsn == NULLCOMMITLSN)
+ lsn = xlog_assign_lsn(XLOG_INIT_CYCLE, 0);
+ if (tail_lsn == NULLCOMMITLSN)
+ tail_lsn = lsn;
len = ((version == 2) && sunit) ? BTOBB(sunit) : 1;
- /* note that oh_tid actually contains the cycle number
- * and the tid is stored in h_cycle_data[0] - that's the
- * way things end up on disk.
- */
memset(p, 0, BBSIZE);
head->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM);
- head->h_cycle = cpu_to_be32(1);
+ head->h_cycle = cpu_to_be32(CYCLE_LSN(lsn));
head->h_version = cpu_to_be32(version);
- if (len != 1)
- head->h_len = cpu_to_be32(sunit - BBSIZE);
- else
- head->h_len = cpu_to_be32(20);
- head->h_crc = cpu_to_be32(0);
+ head->h_crc = cpu_to_le32(0);
head->h_prev_block = cpu_to_be32(-1);
head->h_num_logops = cpu_to_be32(1);
- head->h_cycle_data[0] = cpu_to_be32(0xb0c0d0d0);
head->h_fmt = cpu_to_be32(fmt);
- head->h_size = cpu_to_be32(XLOG_HEADER_CYCLE_SIZE);
+ head->h_size = cpu_to_be32(max(sunit, XLOG_BIG_RECORD_BSIZE));
- head->h_lsn = cpu_to_be64(xlog_assign_lsn(1, 0));
- head->h_tail_lsn = cpu_to_be64(xlog_assign_lsn(1, 0));
+ head->h_lsn = cpu_to_be64(lsn);
+ head->h_tail_lsn = cpu_to_be64(tail_lsn);
memcpy(&head->h_fs_uuid, fs_uuid, sizeof(uuid_t));
- len = MAX(len, 2);
+ /*
+ * The kernel expects to see either a log record header magic value or
+ * the LSN cycle at the top of every log block. The first word of each
+ * non-header block is copied to the record headers and replaced with
+ * the cycle value (see xlog_[un]pack_data() and xlog_get_cycle() for
+ * details).
+ *
+ * Even though we only ever write an unmount record (one block), we
+ * support writing log records up to the max log buffer size of 256k to
+ * improve log format performance. This means a record can require up
+ * to 8 headers (1 rec. header + 7 ext. headers) for the packed cycle
+ * data (each header supports 32k of data).
+ */
+ cycle_lsn = CYCLE_LSN_DISK(head->h_lsn);
+ if (version == 2 && sunit > XLOG_HEADER_CYCLE_SIZE) {
+ hdrs = sunit / XLOG_HEADER_CYCLE_SIZE;
+ if (sunit % XLOG_HEADER_CYCLE_SIZE)
+ hdrs++;
+ }
+
+ /*
+ * A fixed number of extended headers is expected based on h_size. If
+ * required, format those now so the unmount record is located
+ * correctly.
+ *
+ * Since we only write an unmount record, we only need one h_cycle_data
+ * entry for the unmount record block. The subsequent record data
+ * blocks are zeroed, which means we can stamp them directly with the
+ * cycle and zero the rest of the cycle data in the extended headers.
+ */
+ if (hdrs > 1) {
+ for (i = 1; i < hdrs; i++) {
+ p = nextfunc(p, BBSIZE, private);
+ memset(p, 0, BBSIZE);
+ /* xlog_rec_ext_header.xh_cycle */
+ *(__be32 *)p = cycle_lsn;
+ }
+ }
+
+ /*
+ * The total length is the max of the stripe unit or 2 basic block
+ * minimum (1 hdr blk + 1 data blk). The record length is the total
+ * minus however many header blocks are required.
+ */
+ head->h_len = cpu_to_be32(max(BBTOB(2), sunit) - hdrs * BBSIZE);
+
+ /*
+ * Write out the unmount record, pack the first word into the record
+ * header and stamp the block with the cycle.
+ */
p = nextfunc(p, BBSIZE, private);
unmount_record(p);
- cycle_lsn = CYCLE_LSN_DISK(head->h_lsn);
- for (i = 2; i < len; i++) {
+ head->h_cycle_data[0] = *(__be32 *)p;
+ *(__be32 *)p = cycle_lsn;
+
+ /*
+ * Finally, zero all remaining blocks in the record and stamp each with
+ * the cycle. We don't need to pack any of these blocks because the
+ * cycle data in the headers has already been zeroed.
+ */
+ len = max(len, hdrs + 1);
+ for (i = hdrs + 1; i < len; i++) {
p = nextfunc(p, BBSIZE, private);
memset(p, 0, BBSIZE);
*(__be32 *)p = cycle_lsn;
void
libxfs_bprint(xfs_buf_t *bp)
{
- fprintf(stderr, "Buffer 0x%p blkno=%llu bytes=%u flags=0x%x count=%u\n",
+ fprintf(stderr, "Buffer %p blkno=%llu bytes=%u flags=0x%x count=%u\n",
bp, (unsigned long long)bp->b_bn, (unsigned)bp->b_bcount,
bp->b_flags, bp->b_node.cn_count);
}
strerror(errno));
exit(1);
}
+ memset(bp->b_addr, 0, bytes);
#ifdef XFS_BUF_TRACING
list_head_init(&bp->b_lock_list);
#endif
bp->b_holder = 0;
bp->b_recur = 0;
bp->b_ops = NULL;
+
+ if (!bp->b_maps) {
+ bp->b_nmaps = 1;
+ bp->b_maps = &bp->__b_map;
+ bp->b_maps[0].bm_bn = bp->b_bn;
+ bp->b_maps[0].bm_len = bp->b_length;
+ }
}
static void
int i;
bytes = sizeof(struct xfs_buf_map) * nmaps;
- bp->b_map = malloc(bytes);
- if (!bp->b_map) {
+ bp->b_maps = malloc(bytes);
+ if (!bp->b_maps) {
fprintf(stderr,
_("%s: %s can't malloc %u bytes: %s\n"),
progname, __FUNCTION__, bytes,
bytes = 0;
for ( i = 0; i < nmaps; i++) {
- bp->b_map[i].bm_bn = map[i].bm_bn;
- bp->b_map[i].bm_len = map[i].bm_len;
+ bp->b_maps[i].bm_bn = map[i].bm_bn;
+ bp->b_maps[i].bm_len = map[i].bm_len;
bytes += BBTOB(map[i].bm_len);
}
list_del_init(&bp->b_node.cn_mru);
free(bp->b_addr);
bp->b_addr = NULL;
- free(bp->b_map);
- bp->b_map = NULL;
+ if (bp->b_maps != &bp->__b_map)
+ free(bp->b_maps);
+ bp->b_maps = NULL;
}
} else
bp = kmem_zone_zalloc(xfs_buf_zone, 0);
pthread_mutex_unlock(&xfs_buf_freelist.cm_mutex);
bp->b_ops = NULL;
+ if (bp->b_flags & LIBXFS_B_DIRTY)
+ fprintf(stderr, "found dirty buffer (bulk) on free list!");
return bp;
}
return __cache_lookup(&key, flags);
}
+/*
+ * Clean the buffer flags for libxfs_getbuf*(), which wants to return
+ * an unused buffer with clean state. This prevents CRC errors on a
+ * re-read of a corrupt block that was prefetched and freed. This
+ * can happen with a massively corrupt directory that is discarded,
+ * but whose blocks are then recycled into expanding lost+found.
+ *
+ * Note however that if the buffer's dirty (prefetch calls getbuf)
+ * we'll leave the state alone because we don't want to discard blocks
+ * that have been fixed.
+ */
+static void
+reset_buf_state(
+ struct xfs_buf *bp)
+{
+ if (bp && !(bp->b_flags & LIBXFS_B_DIRTY))
+ bp->b_flags &= ~(LIBXFS_B_UNCHECKED | LIBXFS_B_STALE |
+ LIBXFS_B_UPTODATE);
+}
+
struct xfs_buf *
libxfs_getbuf(struct xfs_buftarg *btp, xfs_daddr_t blkno, int len)
{
- return libxfs_getbuf_flags(btp, blkno, len, 0);
+ struct xfs_buf *bp;
+
+ bp = libxfs_getbuf_flags(btp, blkno, len, 0);
+ reset_buf_state(bp);
+ return bp;
}
-struct xfs_buf *
-libxfs_getbuf_map(struct xfs_buftarg *btp, struct xfs_buf_map *map,
- int nmaps, int flags)
+static struct xfs_buf *
+__libxfs_getbuf_map(struct xfs_buftarg *btp, struct xfs_buf_map *map,
+ int nmaps, int flags)
{
struct xfs_bufkey key = {0};
int i;
return __cache_lookup(&key, flags);
}
+struct xfs_buf *
+libxfs_getbuf_map(struct xfs_buftarg *btp, struct xfs_buf_map *map,
+ int nmaps, int flags)
+{
+ struct xfs_buf *bp;
+
+ bp = __libxfs_getbuf_map(btp, map, nmaps, flags);
+ reset_buf_state(bp);
+ return bp;
+}
+
void
libxfs_putbuf(xfs_buf_t *bp)
{
+ /*
+ * ensure that any errors on this use of the buffer don't carry
+ * over to the next user.
+ */
+ bp->b_error = 0;
+
#ifdef XFS_BUF_TRACING
pthread_mutex_lock(&libxfs_bcache->c_mutex);
lock_buf_count--;
pthread_mutex_unlock(&bp->b_lock);
}
}
+
cache_node_put(libxfs_bcache, (struct cache_node *)bp);
}
{
int sts;
- sts = pread64(fd, buf, len, offset);
+ sts = pread(fd, buf, len, offset);
if (sts < 0) {
int error = errno;
fprintf(stderr, _("%s: read failed: %s\n"),
progname, strerror(error));
if (flags & LIBXFS_EXIT_ON_FAILURE)
exit(1);
- return error;
+ return -error;
} else if (sts != len) {
fprintf(stderr, _("%s: error - read only %d of %d bytes\n"),
progname, sts, len);
if (flags & LIBXFS_EXIT_ON_FAILURE)
exit(1);
- return EIO;
+ return -EIO;
}
return 0;
}
xfs_buf_t *bp;
int error;
- bp = libxfs_getbuf(btp, blkno, len);
+ bp = libxfs_getbuf_flags(btp, blkno, len, 0);
if (!bp)
return NULL;
{
int fd;
int error = 0;
- char *buf;
+ void *buf;
int i;
fd = libxfs_device_to_fd(btp->dev);
buf = bp->b_addr;
for (i = 0; i < bp->b_nmaps; i++) {
- off64_t offset = LIBXFS_BBTOOFF64(bp->b_map[i].bm_bn);
- int len = BBTOB(bp->b_map[i].bm_len);
+ off64_t offset = LIBXFS_BBTOOFF64(bp->b_maps[i].bm_bn);
+ int len = BBTOB(bp->b_maps[i].bm_len);
error = __read_buf(fd, buf, len, offset, flags);
if (error) {
if (!error)
bp->b_flags |= LIBXFS_B_UPTODATE;
#ifdef IO_DEBUG
- printf("%lx: %s: read %u bytes, error %d, blkno=0x%llx(0x%llx), %p\n",
- pthread_self(), __FUNCTION__, , error,
- (long long)LIBXFS_BBTOOFF64(blkno), (long long)blkno, bp);
+ printf("%lx: %s: read %lu bytes, error %d, blkno=%llu(%llu), %p\n",
+ pthread_self(), __FUNCTION__, buf - (char *)bp->b_addr, error,
+ (long long)LIBXFS_BBTOOFF64(bp->b_bn), (long long)bp->b_bn, bp);
#endif
return error;
}
return libxfs_readbuf(btp, map[0].bm_bn, map[0].bm_len,
flags, ops);
- bp = libxfs_getbuf_map(btp, map, nmaps, 0);
+ bp = __libxfs_getbuf_map(btp, map, nmaps, 0);
if (!bp)
return NULL;
if (!error)
libxfs_readbuf_verify(bp, ops);
-#ifdef IO_DEBUG
+#ifdef IO_DEBUGX
printf("%lx: %s: read %lu bytes, error %d, blkno=%llu(%llu), %p\n",
pthread_self(), __FUNCTION__, buf - (char *)bp->b_addr, error,
(long long)LIBXFS_BBTOOFF64(bp->b_bn), (long long)bp->b_bn, bp);
{
int sts;
- sts = pwrite64(fd, buf, len, offset);
+ sts = pwrite(fd, buf, len, offset);
if (sts < 0) {
int error = errno;
- fprintf(stderr, _("%s: pwrite64 failed: %s\n"),
+ fprintf(stderr, _("%s: pwrite failed: %s\n"),
progname, strerror(error));
if (flags & LIBXFS_B_EXIT)
exit(1);
- return error;
+ return -error;
} else if (sts != len) {
- fprintf(stderr, _("%s: error - pwrite64 only %d of %d bytes\n"),
+ fprintf(stderr, _("%s: error - pwrite only %d of %d bytes\n"),
progname, sts, len);
if (flags & LIBXFS_B_EXIT)
exit(1);
- return EIO;
+ return -EIO;
}
return 0;
}
libxfs_writebufr(xfs_buf_t *bp)
{
int fd = libxfs_device_to_fd(bp->b_target->dev);
- int error = 0;
/*
* we never write buffers that are marked stale. This indicates they
* bugs like this. Make sure the error is obvious as to the cause.
*/
if (bp->b_flags & LIBXFS_B_STALE) {
- bp->b_error = ESTALE;
+ bp->b_error = -ESTALE;
return bp->b_error;
}
bp->b_ops->verify_write(bp);
if (bp->b_error) {
fprintf(stderr,
- _("%s: write verifer failed on bno 0x%llx/0x%x\n"),
- __func__, (long long)bp->b_bn, bp->b_bcount);
+ _("%s: write verifer failed on %s bno 0x%llx/0x%x\n"),
+ __func__, bp->b_ops->name,
+ (long long)bp->b_bn, bp->b_bcount);
return bp->b_error;
}
}
if (!(bp->b_flags & LIBXFS_B_DISCONTIG)) {
- error = __write_buf(fd, bp->b_addr, bp->b_bcount,
+ bp->b_error = __write_buf(fd, bp->b_addr, bp->b_bcount,
LIBXFS_BBTOOFF64(bp->b_bn), bp->b_flags);
} else {
int i;
- char *buf = bp->b_addr;
+ void *buf = bp->b_addr;
for (i = 0; i < bp->b_nmaps; i++) {
- off64_t offset = LIBXFS_BBTOOFF64(bp->b_map[i].bm_bn);
- int len = BBTOB(bp->b_map[i].bm_len);
+ off64_t offset = LIBXFS_BBTOOFF64(bp->b_maps[i].bm_bn);
+ int len = BBTOB(bp->b_maps[i].bm_len);
- error = __write_buf(fd, buf, len, offset, bp->b_flags);
- if (error) {
- bp->b_error = error;
+ bp->b_error = __write_buf(fd, buf, len, offset,
+ bp->b_flags);
+ if (bp->b_error)
break;
- }
buf += len;
}
}
#ifdef IO_DEBUG
- printf("%lx: %s: wrote %u bytes, blkno=%llu(%llu), %p\n",
+ printf("%lx: %s: wrote %u bytes, blkno=%llu(%llu), %p, error %d\n",
pthread_self(), __FUNCTION__, bp->b_bcount,
(long long)LIBXFS_BBTOOFF64(bp->b_bn),
- (long long)bp->b_bn, bp);
+ (long long)bp->b_bn, bp, bp->b_error);
#endif
- if (!error) {
+ if (!bp->b_error) {
bp->b_flags |= LIBXFS_B_UPTODATE;
bp->b_flags &= ~(LIBXFS_B_DIRTY | LIBXFS_B_EXIT |
LIBXFS_B_UNCHECKED);
}
- return error;
+ return bp->b_error;
}
int
* subsequent reads after this write from seeing stale errors.
*/
bp->b_error = 0;
+ bp->b_flags &= ~LIBXFS_B_STALE;
bp->b_flags |= (LIBXFS_B_DIRTY | flags);
return 0;
}
* subsequent reads after this write from seeing stale errors.
*/
bp->b_error = 0;
+ bp->b_flags &= ~LIBXFS_B_STALE;
bp->b_flags |= (LIBXFS_B_DIRTY | flags);
libxfs_putbuf(bp);
return 0;
}
static void
-libxfs_brelse(struct cache_node *node)
+libxfs_brelse(
+ struct cache_node *node)
{
- xfs_buf_t *bp = (xfs_buf_t *)node;
+ struct xfs_buf *bp = (struct xfs_buf *)node;
- if (bp != NULL) {
- if (bp->b_flags & LIBXFS_B_DIRTY)
- libxfs_writebufr(bp);
- pthread_mutex_lock(&xfs_buf_freelist.cm_mutex);
- list_add(&bp->b_node.cn_mru, &xfs_buf_freelist.cm_list);
- pthread_mutex_unlock(&xfs_buf_freelist.cm_mutex);
- }
+ if (!bp)
+ return;
+ if (bp->b_flags & LIBXFS_B_DIRTY)
+ fprintf(stderr,
+ "releasing dirty buffer to free list!");
+
+ pthread_mutex_lock(&xfs_buf_freelist.cm_mutex);
+ list_add(&bp->b_node.cn_mru, &xfs_buf_freelist.cm_list);
+ pthread_mutex_unlock(&xfs_buf_freelist.cm_mutex);
}
static unsigned int
libxfs_bulkrelse(
- struct cache *cache,
- struct list_head *list)
+ struct cache *cache,
+ struct list_head *list)
{
xfs_buf_t *bp;
int count = 0;
list_for_each_entry(bp, list, b_node.cn_mru) {
if (bp->b_flags & LIBXFS_B_DIRTY)
- libxfs_writebufr(bp);
+ fprintf(stderr,
+ "releasing dirty buffer (bulk) to free list!");
count++;
}
pthread_mutex_lock(&xfs_buf_freelist.cm_mutex);
- __list_splice(list, &xfs_buf_freelist.cm_list);
+ list_splice(list, &xfs_buf_freelist.cm_list);
pthread_mutex_unlock(&xfs_buf_freelist.cm_mutex);
return count;
}
-static void
-libxfs_bflush(struct cache_node *node)
+/*
+ * Free everything from the xfs_buf_freelist MRU, used at final teardown
+ */
+void
+libxfs_bcache_free(void)
{
- xfs_buf_t *bp = (xfs_buf_t *)node;
+ struct list_head *cm_list;
+ xfs_buf_t *bp, *next;
+
+ cm_list = &xfs_buf_freelist.cm_list;
+ list_for_each_entry_safe(bp, next, cm_list, b_node.cn_mru) {
+ free(bp->b_addr);
+ if (bp->b_maps != &bp->__b_map)
+ free(bp->b_maps);
+ kmem_zone_free(xfs_buf_zone, bp);
+ }
+}
- if ((bp != NULL) && (bp->b_flags & LIBXFS_B_DIRTY))
- libxfs_writebufr(bp);
+/*
+ * When a buffer is marked dirty, the error is cleared. Hence if we are trying
+ * to flush a buffer prior to cache reclaim that has an error on it, it means
+ * we've already tried to flush it and it failed. Prevent repeated corruption
+ * errors from being reported by skipping such buffers - when the corruption is
+ * fixed the buffer will be marked dirty again and we can write it again.
+ */
+static int
+libxfs_bflush(
+ struct cache_node *node)
+{
+ struct xfs_buf *bp = (struct xfs_buf *)node;
+
+ if (!bp->b_error && bp->b_flags & LIBXFS_B_DIRTY)
+ return libxfs_writebufr(bp);
+ return bp->b_error;
}
void
libxfs_putbufr(xfs_buf_t *bp)
{
+ if (bp->b_flags & LIBXFS_B_DIRTY)
+ libxfs_writebufr(bp);
libxfs_brelse((struct cache_node *)bp);
}
}
struct cache_operations libxfs_bcache_operations = {
- /* .hash */ libxfs_bhash,
- /* .alloc */ libxfs_balloc,
- /* .flush */ libxfs_bflush,
- /* .relse */ libxfs_brelse,
- /* .compare */ libxfs_bcompare,
- /* .bulkrelse */libxfs_bulkrelse
+ .hash = libxfs_bhash,
+ .alloc = libxfs_balloc,
+ .flush = libxfs_bflush,
+ .relse = libxfs_brelse,
+ .compare = libxfs_bcompare,
+ .bulkrelse = libxfs_bulkrelse
};
* Inode cache stubs.
*/
+kmem_zone_t *xfs_inode_zone;
extern kmem_zone_t *xfs_ili_zone;
-extern kmem_zone_t *xfs_inode_zone;
+
+/*
+ * If there are inline format data / attr forks attached to this inode,
+ * make sure they're not corrupt.
+ */
+bool
+libxfs_inode_verify_forks(
+ struct xfs_inode *ip,
+ struct xfs_ifork_ops *ops)
+{
+ struct xfs_ifork *ifp;
+ xfs_failaddr_t fa;
+
+ if (!ops)
+ return true;
+
+ fa = xfs_ifork_verify_data(ip, ops);
+ if (fa) {
+ ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+ xfs_inode_verifier_error(ip, -EFSCORRUPTED, "data fork",
+ ifp->if_u1.if_data, ifp->if_bytes, fa);
+ return false;
+ }
+
+ fa = xfs_ifork_verify_attr(ip, ops);
+ if (fa) {
+ ifp = XFS_IFORK_PTR(ip, XFS_ATTR_FORK);
+ xfs_inode_verifier_error(ip, -EFSCORRUPTED, "attr fork",
+ ifp ? ifp->if_u1.if_data : NULL,
+ ifp ? ifp->if_bytes : 0, fa);
+ return false;
+ }
+ return true;
+}
int
-libxfs_iget(xfs_mount_t *mp, xfs_trans_t *tp, xfs_ino_t ino, uint lock_flags,
- xfs_inode_t **ipp, xfs_daddr_t bno)
+libxfs_iget(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ xfs_ino_t ino,
+ uint lock_flags,
+ struct xfs_inode **ipp,
+ struct xfs_ifork_ops *ifork_ops)
{
- xfs_inode_t *ip;
- int error = 0;
+ struct xfs_inode *ip;
+ int error = 0;
ip = kmem_zone_zalloc(xfs_inode_zone, 0);
if (!ip)
- return ENOMEM;
+ return -ENOMEM;
ip->i_ino = ino;
ip->i_mount = mp;
- error = xfs_iread(mp, tp, ip, bno);
+ error = xfs_iread(mp, tp, ip, 0);
if (error) {
kmem_zone_free(xfs_inode_zone, ip);
*ipp = NULL;
return error;
}
+ if (!libxfs_inode_verify_forks(ip, ifork_ops)) {
+ libxfs_iput(ip);
+ return -EFSCORRUPTED;
+ }
+
+ /*
+ * set up the inode ops structure that the libxfs code relies on
+ */
+ if (XFS_ISDIR(ip))
+ ip->d_ops = mp->m_dir_inode_ops;
+ else
+ ip->d_ops = mp->m_nondir_inode_ops;
+
*ipp = ip;
return 0;
}
static void
libxfs_idestroy(xfs_inode_t *ip)
{
- switch (ip->i_d.di_mode & S_IFMT) {
+ switch (VFS_I(ip)->i_mode & S_IFMT) {
case S_IFREG:
case S_IFDIR:
case S_IFLNK:
}
if (ip->i_afp)
libxfs_idestroy_fork(ip, XFS_ATTR_FORK);
+ if (ip->i_cowfp)
+ xfs_idestroy_fork(ip, XFS_COW_FORK);
}
void
-libxfs_iput(xfs_inode_t *ip, uint lock_flags)
+libxfs_iput(xfs_inode_t *ip)
{
if (ip->i_itemp)
kmem_zone_free(xfs_ili_zone, ip->i_itemp);