xfs: fix transaction leak on remote attr set/remove failure

[thirdparty/xfsprogs-dev.git] / libxfs / rdwr.c
diff --git a/libxfs/rdwr.c b/libxfs/rdwr.c

index da781daa47257747244ff30dd5e55429f4066cf8..14a4633e9fa60d3a722d11b257b879ed74e67adf 100644 (file)
--- a/libxfs/rdwr.c
+++ b/libxfs/rdwr.c
@@ -1,19 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0
  /*
   * Copyright (c) 2000-2006 Silicon Graphics, Inc.
   * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
   */
  
  
@@ -67,7 +55,8 @@
  
  #define IO_BCOMPARE_CHECK
  
-void
+/* XXX: (dgc) Propagate errors, only exit if fail-on-error flag set */
+int
  libxfs_device_zero(struct xfs_buftarg *btp, xfs_daddr_t start, uint len)
  {
         xfs_off_t       start_offset, end_offset, offset;
@@ -87,7 +76,7 @@ libxfs_device_zero(struct xfs_buftarg *btp, xfs_daddr_t start, uint len)
         fd = libxfs_device_to_fd(btp->dev);
         start_offset = LIBXFS_BBTOOFF64(start);
  
-       if ((lseek64(fd, start_offset, SEEK_SET)) < 0) {
+       if ((lseek(fd, start_offset, SEEK_SET)) < 0) {
                 fprintf(stderr, _("%s: %s seek to offset %llu failed: %s\n"),
                         progname, __FUNCTION__,
                         (unsigned long long)start_offset, strerror(errno));
@@ -109,6 +98,7 @@ libxfs_device_zero(struct xfs_buftarg *btp, xfs_daddr_t start, uint len)
                 offset += bytes;
         }
         free(z);
+       return 0;
  }
  
  static void unmount_record(void *p)
@@ -116,9 +106,9 @@ static void unmount_record(void *p)
         xlog_op_header_t        *op = (xlog_op_header_t *)p;
         /* the data section must be 32 bit size aligned */
         struct {
-           __uint16_t magic;
-           __uint16_t pad1;
-           __uint32_t pad2; /* may as well make it 64 bits */
+           uint16_t magic;
+           uint16_t pad1;
+           uint32_t pad2; /* may as well make it 64 bits */
         } magic = { XLOG_UNMOUNT_TYPE, 0, 0 };
  
         memset(p, 0, BBSIZE);
@@ -133,49 +123,142 @@ static void unmount_record(void *p)
         memcpy((char *)p + sizeof(xlog_op_header_t), &magic, sizeof(magic));
  }
  
-static char *next(char *ptr, int offset, void *private)
+static char *next(
+       char            *ptr,
+       int             offset,
+       void            *private)
  {
-       xfs_buf_t       *buf = (xfs_buf_t *)private;
+       struct xfs_buf  *buf = (struct xfs_buf *)private;
  
-       if (XFS_BUF_COUNT(buf) < (int)(ptr - XFS_BUF_PTR(buf)) + offset)
+       if (buf &&
+           (buf->b_bcount < (int)(ptr - (char *)buf->b_addr) + offset))
                 abort();
+
         return ptr + offset;
  }
  
+/*
+ * Format the log. The caller provides either a buftarg which is used to access
+ * the log via buffers or a direct pointer to a buffer that encapsulates the
+ * entire log.
+ */
  int
  libxfs_log_clear(
         struct xfs_buftarg      *btp,
+       char                    *dptr,
         xfs_daddr_t             start,
-       uint                    length,
+       uint                    length,         /* basic blocks */
         uuid_t                  *fs_uuid,
         int                     version,
-       int                     sunit,
+       int                     sunit,          /* bytes */
         int                     fmt,
-       int                     cycle)
+       int                     cycle,
+       bool                    max)
  {
-       xfs_buf_t               *bp;
+       struct xfs_buf          *bp = NULL;
         int                     len;
         xfs_lsn_t               lsn;
+       xfs_lsn_t               tail_lsn;
+       xfs_daddr_t             blk;
+       xfs_daddr_t             end_blk;
+       char                    *ptr;
  
-       if (!btp->dev || !fs_uuid)
+       if (((btp && dptr) || (!btp && !dptr)) ||
+           (btp && !btp->dev) || !fs_uuid)
                 return -EINVAL;
  
-       if (cycle != XLOG_INIT_CYCLE)
-               return -EINVAL;
+       /* first zero the log */
+       if (btp)
+               libxfs_device_zero(btp, start, length);
+       else
+               memset(dptr, 0, BBTOB(length));
  
+       /*
+        * Initialize the log record length and LSNs. XLOG_INIT_CYCLE is a
+        * special reset case where we only write a single record where the lsn
+        * and tail_lsn match. Otherwise, the record lsn starts at block 0 of
+        * the specified cycle and points tail_lsn at the last record of the
+        * previous cycle.
+        */
+       len = ((version == 2) && sunit) ? BTOBB(sunit) : 2;
+       len = max(len, 2);
         lsn = xlog_assign_lsn(cycle, 0);
+       if (cycle == XLOG_INIT_CYCLE)
+               tail_lsn = lsn;
+       else
+               tail_lsn = xlog_assign_lsn(cycle - 1, length - len);
  
-       /* first zero the log */
-       libxfs_device_zero(btp, start, length);
+       /* write out the first log record */
+       ptr = dptr;
+       if (btp) {
+               bp = libxfs_getbufr(btp, start, len);
+               ptr = bp->b_addr;
+       }
+       libxfs_log_header(ptr, fs_uuid, version, sunit, fmt, lsn, tail_lsn,
+                         next, bp);
+       if (bp) {
+               bp->b_flags |= LIBXFS_B_DIRTY;
+               libxfs_putbufr(bp);
+       }
+
+       /*
+        * There's nothing else to do if this is a log reset. The kernel detects
+        * the rest of the log is zeroed and starts at cycle 1.
+        */
+       if (cycle == XLOG_INIT_CYCLE)
+               return 0;
+
+       /*
+        * Bump the record size for a full log format if the caller allows it.
+        * This is primarily for performance reasons and most callers don't care
+        * about record size since the log is clean after we're done.
+        */
+       if (max)
+               len = BTOBB(BDSTRAT_SIZE);
+
+       /*
+        * Otherwise, fill everything beyond the initial record with records of
+        * the previous cycle so the kernel head/tail detection works correctly.
+        *
+        * We don't particularly care about the record size or content here.
+        * It's only important that the headers are in place such that the
+        * kernel finds 1.) a clean log and 2.) the correct current cycle value.
+        * Therefore, bump up the record size to the max to use larger I/Os and
+        * improve performance.
+        */
+       cycle--;
+       blk = start + len;
+       if (dptr)
+               dptr += BBTOB(len);
+       end_blk = start + length;
+
+       len = min(end_blk - blk, len);
+       while (blk < end_blk) {
+               lsn = xlog_assign_lsn(cycle, blk - start);
+               tail_lsn = xlog_assign_lsn(cycle, blk - start - len);
+
+               ptr = dptr;
+               if (btp) {
+                       bp = libxfs_getbufr(btp, blk, len);
+                       ptr = bp->b_addr;
+               }
+               /*
+                * Note: pass the full buffer length as the sunit to initialize
+                * the entire buffer.
+                */
+               libxfs_log_header(ptr, fs_uuid, version, BBTOB(len), fmt, lsn,
+                                 tail_lsn, next, bp);
+               if (bp) {
+                       bp->b_flags |= LIBXFS_B_DIRTY;
+                       libxfs_putbufr(bp);
+               }
+
+               blk += len;
+               if (dptr)
+                       dptr += BBTOB(len);
+               len = min(end_blk - blk, len);
+       }
  
-       /* then write a log record header */
-       len = ((version == 2) && sunit) ? BTOBB(sunit) : 2;
-       len = MAX(len, 2);
-       bp = libxfs_getbufr(btp, start, len);
-       libxfs_log_header(XFS_BUF_PTR(bp), fs_uuid, version, sunit, fmt, lsn,
-                         lsn, next, bp);
-       bp->b_flags |= LIBXFS_B_DIRTY;
-       libxfs_putbufr(bp);
         return 0;
  }
  
@@ -195,6 +278,7 @@ libxfs_log_header(
         char                    *p = caddr;
         __be32                  cycle_lsn;
         int                     i, len;
+       int                     hdrs = 1;
  
         if (lsn == NULLCOMMITLSN)
                 lsn = xlog_assign_lsn(XLOG_INIT_CYCLE, 0);
@@ -207,41 +291,80 @@ libxfs_log_header(
         head->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM);
         head->h_cycle = cpu_to_be32(CYCLE_LSN(lsn));
         head->h_version = cpu_to_be32(version);
-       if (len != 1)
-               head->h_len = cpu_to_be32(sunit - BBSIZE);
-       else
-               head->h_len = cpu_to_be32(20);
         head->h_crc = cpu_to_le32(0);
         head->h_prev_block = cpu_to_be32(-1);
         head->h_num_logops = cpu_to_be32(1);
         head->h_fmt = cpu_to_be32(fmt);
-       head->h_size = cpu_to_be32(XLOG_HEADER_CYCLE_SIZE);
+       head->h_size = cpu_to_be32(max(sunit, XLOG_BIG_RECORD_BSIZE));
  
         head->h_lsn = cpu_to_be64(lsn);
         head->h_tail_lsn = cpu_to_be64(tail_lsn);
  
         memcpy(&head->h_fs_uuid, fs_uuid, sizeof(uuid_t));
  
-       p = nextfunc(p, BBSIZE, private);
-       unmount_record(p);
-
         /*
-        * The kernel expects to see either a log record header magic or the LSN
-        * cycle at the top of every log block (for example, see
-        * xlog_[un]pack_data() and xlog_get_cycle()). Pack the unmount record
-        * block appropriately here.
+        * The kernel expects to see either a log record header magic value or
+        * the LSN cycle at the top of every log block. The first word of each
+        * non-header block is copied to the record headers and replaced with
+        * the cycle value (see xlog_[un]pack_data() and xlog_get_cycle() for
+        * details).
+        *
+        * Even though we only ever write an unmount record (one block), we
+        * support writing log records up to the max log buffer size of 256k to
+        * improve log format performance. This means a record can require up
+        * to 8 headers (1 rec. header + 7 ext. headers) for the packed cycle
+        * data (each header supports 32k of data).
          */
         cycle_lsn = CYCLE_LSN_DISK(head->h_lsn);
+       if (version == 2 && sunit > XLOG_HEADER_CYCLE_SIZE) {
+               hdrs = sunit / XLOG_HEADER_CYCLE_SIZE;
+               if (sunit % XLOG_HEADER_CYCLE_SIZE)
+                       hdrs++;
+       }
+
+       /*
+        * A fixed number of extended headers is expected based on h_size. If
+        * required, format those now so the unmount record is located
+        * correctly.
+        *
+        * Since we only write an unmount record, we only need one h_cycle_data
+        * entry for the unmount record block. The subsequent record data
+        * blocks are zeroed, which means we can stamp them directly with the
+        * cycle and zero the rest of the cycle data in the extended headers.
+        */
+       if (hdrs > 1) {
+               for (i = 1; i < hdrs; i++) {
+                       p = nextfunc(p, BBSIZE, private);
+                       memset(p, 0, BBSIZE);
+                       /* xlog_rec_ext_header.xh_cycle */
+                       *(__be32 *)p = cycle_lsn;
+               }
+       }
+
+       /*
+        * The total length is the max of the stripe unit or 2 basic block
+        * minimum (1 hdr blk + 1 data blk). The record length is the total
+        * minus however many header blocks are required.
+        */
+       head->h_len = cpu_to_be32(max(BBTOB(2), sunit) - hdrs * BBSIZE);
+
+       /*
+        * Write out the unmount record, pack the first word into the record
+        * header and stamp the block with the cycle.
+        */
+       p = nextfunc(p, BBSIZE, private);
+       unmount_record(p);
+
         head->h_cycle_data[0] = *(__be32 *)p;
         *(__be32 *)p = cycle_lsn;
  
         /*
-        * Now zero any remaining blocks in the record and stamp with the cycle.
-        * Note that we don't need to swap into h_cycle_data because it has
-        * already been initialized to zero.
+        * Finally, zero all remaining blocks in the record and stamp each with
+        * the cycle. We don't need to pack any of these blocks because the
+        * cycle data in the headers has already been zeroed.
          */
-       len = MAX(len, 2);
-       for (i = 2; i < len; i++) {
+       len = max(len, hdrs + 1);
+       for (i = hdrs + 1; i < len; i++) {
                 p = nextfunc(p, BBSIZE, private);
                 memset(p, 0, BBSIZE);
                 *(__be32 *)p = cycle_lsn;
@@ -424,7 +547,7 @@ libxfs_bcompare(struct cache_node *node, cache_key_t key)
  void
  libxfs_bprint(xfs_buf_t *bp)
  {
-       fprintf(stderr, "Buffer 0x%p blkno=%llu bytes=%u flags=0x%x count=%u\n",
+       fprintf(stderr, "Buffer %p blkno=%llu bytes=%u flags=0x%x count=%u\n",
                 bp, (unsigned long long)bp->b_bn, (unsigned)bp->b_bcount,
                 bp->b_flags, bp->b_node.cn_count);
  }
@@ -456,6 +579,13 @@ __initbuf(xfs_buf_t *bp, struct xfs_buftarg *btp, xfs_daddr_t bno,
         bp->b_holder = 0;
         bp->b_recur = 0;
         bp->b_ops = NULL;
+
+       if (!bp->b_maps) {
+               bp->b_nmaps = 1;
+               bp->b_maps = &bp->__b_map;
+               bp->b_maps[0].bm_bn = bp->b_bn;
+               bp->b_maps[0].bm_len = bp->b_length;
+       }
  }
  
  static void
@@ -473,8 +603,8 @@ libxfs_initbuf_map(xfs_buf_t *bp, struct xfs_buftarg *btp,
         int i;
  
         bytes = sizeof(struct xfs_buf_map) * nmaps;
-       bp->b_map = malloc(bytes);
-       if (!bp->b_map) {
+       bp->b_maps = malloc(bytes);
+       if (!bp->b_maps) {
                 fprintf(stderr,
                         _("%s: %s can't malloc %u bytes: %s\n"),
                         progname, __FUNCTION__, bytes,
@@ -485,8 +615,8 @@ libxfs_initbuf_map(xfs_buf_t *bp, struct xfs_buftarg *btp,
  
         bytes = 0;
         for ( i = 0; i < nmaps; i++) {
-               bp->b_map[i].bm_bn = map[i].bm_bn;
-               bp->b_map[i].bm_len = map[i].bm_len;
+               bp->b_maps[i].bm_bn = map[i].bm_bn;
+               bp->b_maps[i].bm_len = map[i].bm_len;
                 bytes += BBTOB(map[i].bm_len);
         }
  
@@ -519,13 +649,16 @@ __libxfs_getbufr(int blen)
                         list_del_init(&bp->b_node.cn_mru);
                         free(bp->b_addr);
                         bp->b_addr = NULL;
-                       free(bp->b_map);
-                       bp->b_map = NULL;
+                       if (bp->b_maps != &bp->__b_map)
+                               free(bp->b_maps);
+                       bp->b_maps = NULL;
                 }
         } else
                 bp = kmem_zone_zalloc(xfs_buf_zone, 0);
         pthread_mutex_unlock(&xfs_buf_freelist.cm_mutex);
         bp->b_ops = NULL;
+       if (bp->b_flags & LIBXFS_B_DIRTY)
+               fprintf(stderr, "found dirty buffer (bulk) on free list!");
  
         return bp;
  }
@@ -778,14 +911,14 @@ __read_buf(int fd, void *buf, int len, off64_t offset, int flags)
  {
         int     sts;
  
-       sts = pread64(fd, buf, len, offset);
+       sts = pread(fd, buf, len, offset);
         if (sts < 0) {
-               int error = -errno;
+               int error = errno;
                 fprintf(stderr, _("%s: read failed: %s\n"),
                         progname, strerror(error));
                 if (flags & LIBXFS_EXIT_ON_FAILURE)
                         exit(1);
-               return error;
+               return -error;
         } else if (sts != len) {
                 fprintf(stderr, _("%s: error - read only %d of %d bytes\n"),
                         progname, sts, len);
@@ -881,14 +1014,14 @@ libxfs_readbufr_map(struct xfs_buftarg *btp, struct xfs_buf *bp, int flags)
  {
         int     fd;
         int     error = 0;
-       char    *buf;
+       void    *buf;
         int     i;
  
         fd = libxfs_device_to_fd(btp->dev);
         buf = bp->b_addr;
         for (i = 0; i < bp->b_nmaps; i++) {
-               off64_t offset = LIBXFS_BBTOOFF64(bp->b_map[i].bm_bn);
-               int len = BBTOB(bp->b_map[i].bm_len);
+               off64_t offset = LIBXFS_BBTOOFF64(bp->b_maps[i].bm_bn);
+               int len = BBTOB(bp->b_maps[i].bm_len);
  
                 error = __read_buf(fd, buf, len, offset, flags);
                 if (error) {
@@ -901,9 +1034,9 @@ libxfs_readbufr_map(struct xfs_buftarg *btp, struct xfs_buf *bp, int flags)
         if (!error)
                 bp->b_flags |= LIBXFS_B_UPTODATE;
  #ifdef IO_DEBUG
-       printf("%lx: %s: read %u bytes, error %d, blkno=0x%llx(0x%llx), %p\n",
-               pthread_self(), __FUNCTION__, , error,
-               (long long)LIBXFS_BBTOOFF64(blkno), (long long)blkno, bp);
+       printf("%lx: %s: read %lu bytes, error %d, blkno=%llu(%llu), %p\n",
+               pthread_self(), __FUNCTION__, buf - (char *)bp->b_addr, error,
+               (long long)LIBXFS_BBTOOFF64(bp->b_bn), (long long)bp->b_bn, bp);
  #endif
         return error;
  }
@@ -933,7 +1066,7 @@ libxfs_readbuf_map(struct xfs_buftarg *btp, struct xfs_buf_map *map, int nmaps,
         if (!error)
                 libxfs_readbuf_verify(bp, ops);
  
-#ifdef IO_DEBUG
+#ifdef IO_DEBUGX
         printf("%lx: %s: read %lu bytes, error %d, blkno=%llu(%llu), %p\n",
                 pthread_self(), __FUNCTION__, buf - (char *)bp->b_addr, error,
                 (long long)LIBXFS_BBTOOFF64(bp->b_bn), (long long)bp->b_bn, bp);
@@ -946,16 +1079,16 @@ __write_buf(int fd, void *buf, int len, off64_t offset, int flags)
  {
         int     sts;
  
-       sts = pwrite64(fd, buf, len, offset);
+       sts = pwrite(fd, buf, len, offset);
         if (sts < 0) {
-               int error = -errno;
-               fprintf(stderr, _("%s: pwrite64 failed: %s\n"),
+               int error = errno;
+               fprintf(stderr, _("%s: pwrite failed: %s\n"),
                         progname, strerror(error));
                 if (flags & LIBXFS_B_EXIT)
                         exit(1);
-               return error;
+               return -error;
         } else if (sts != len) {
-               fprintf(stderr, _("%s: error - pwrite64 only %d of %d bytes\n"),
+               fprintf(stderr, _("%s: error - pwrite only %d of %d bytes\n"),
                         progname, sts, len);
                 if (flags & LIBXFS_B_EXIT)
                         exit(1);
@@ -968,7 +1101,6 @@ int
  libxfs_writebufr(xfs_buf_t *bp)
  {
         int     fd = libxfs_device_to_fd(bp->b_target->dev);
-       int     error = 0;
  
         /*
          * we never write buffers that are marked stale. This indicates they
@@ -991,28 +1123,28 @@ libxfs_writebufr(xfs_buf_t *bp)
                 bp->b_ops->verify_write(bp);
                 if (bp->b_error) {
                         fprintf(stderr,
-       _("%s: write verifer failed on bno 0x%llx/0x%x\n"),
-                               __func__, (long long)bp->b_bn, bp->b_bcount);
+       _("%s: write verifer failed on %s bno 0x%llx/0x%x\n"),
+                               __func__, bp->b_ops->name,
+                               (long long)bp->b_bn, bp->b_bcount);
                         return bp->b_error;
                 }
         }
  
         if (!(bp->b_flags & LIBXFS_B_DISCONTIG)) {
-               error = __write_buf(fd, bp->b_addr, bp->b_bcount,
+               bp->b_error = __write_buf(fd, bp->b_addr, bp->b_bcount,
                                     LIBXFS_BBTOOFF64(bp->b_bn), bp->b_flags);
         } else {
                 int     i;
-               char    *buf = bp->b_addr;
+               void    *buf = bp->b_addr;
  
                 for (i = 0; i < bp->b_nmaps; i++) {
-                       off64_t offset = LIBXFS_BBTOOFF64(bp->b_map[i].bm_bn);
-                       int len = BBTOB(bp->b_map[i].bm_len);
+                       off64_t offset = LIBXFS_BBTOOFF64(bp->b_maps[i].bm_bn);
+                       int len = BBTOB(bp->b_maps[i].bm_len);
  
-                       error = __write_buf(fd, buf, len, offset, bp->b_flags);
-                       if (error) {
-                               bp->b_error = error;
+                       bp->b_error = __write_buf(fd, buf, len, offset,
+                                                 bp->b_flags);
+                       if (bp->b_error)
                                 break;
-                       }
                         buf += len;
                 }
         }
@@ -1021,14 +1153,14 @@ libxfs_writebufr(xfs_buf_t *bp)
         printf("%lx: %s: wrote %u bytes, blkno=%llu(%llu), %p, error %d\n",
                         pthread_self(), __FUNCTION__, bp->b_bcount,
                         (long long)LIBXFS_BBTOOFF64(bp->b_bn),
-                       (long long)bp->b_bn, bp, error);
+                       (long long)bp->b_bn, bp, bp->b_error);
  #endif
-       if (!error) {
+       if (!bp->b_error) {
                 bp->b_flags |= LIBXFS_B_UPTODATE;
                 bp->b_flags &= ~(LIBXFS_B_DIRTY | LIBXFS_B_EXIT |
                                  LIBXFS_B_UNCHECKED);
         }
-       return error;
+       return bp->b_error;
  }
  
  int
@@ -1089,23 +1221,26 @@ libxfs_iomove(xfs_buf_t *bp, uint boff, int len, void *data, int flags)
  }
  
  static void
-libxfs_brelse(struct cache_node *node)
+libxfs_brelse(
+       struct cache_node       *node)
  {
-       xfs_buf_t               *bp = (xfs_buf_t *)node;
+       struct xfs_buf          *bp = (struct xfs_buf *)node;
  
-       if (bp != NULL) {
-               if (bp->b_flags & LIBXFS_B_DIRTY)
-                       libxfs_writebufr(bp);
-               pthread_mutex_lock(&xfs_buf_freelist.cm_mutex);
-               list_add(&bp->b_node.cn_mru, &xfs_buf_freelist.cm_list);
-               pthread_mutex_unlock(&xfs_buf_freelist.cm_mutex);
-       }
+       if (!bp)
+               return;
+       if (bp->b_flags & LIBXFS_B_DIRTY)
+               fprintf(stderr,
+                       "releasing dirty buffer to free list!");
+
+       pthread_mutex_lock(&xfs_buf_freelist.cm_mutex);
+       list_add(&bp->b_node.cn_mru, &xfs_buf_freelist.cm_list);
+       pthread_mutex_unlock(&xfs_buf_freelist.cm_mutex);
  }
  
  static unsigned int
  libxfs_bulkrelse(
-       struct cache            *cache,
-       struct list_head        *list)
+       struct cache            *cache,
+       struct list_head        *list)
  {
         xfs_buf_t               *bp;
         int                     count = 0;
@@ -1115,29 +1250,59 @@ libxfs_bulkrelse(
  
         list_for_each_entry(bp, list, b_node.cn_mru) {
                 if (bp->b_flags & LIBXFS_B_DIRTY)
-                       libxfs_writebufr(bp);
+                       fprintf(stderr,
+                               "releasing dirty buffer (bulk) to free list!");
                 count++;
         }
  
         pthread_mutex_lock(&xfs_buf_freelist.cm_mutex);
-       __list_splice(list, &xfs_buf_freelist.cm_list);
+       list_splice(list, &xfs_buf_freelist.cm_list);
         pthread_mutex_unlock(&xfs_buf_freelist.cm_mutex);
  
         return count;
  }
  
-static void
-libxfs_bflush(struct cache_node *node)
+/*
+ * Free everything from the xfs_buf_freelist MRU, used at final teardown
+ */
+void
+libxfs_bcache_free(void)
  {
-       xfs_buf_t               *bp = (xfs_buf_t *)node;
+       struct list_head        *cm_list;
+       xfs_buf_t               *bp, *next;
+
+       cm_list = &xfs_buf_freelist.cm_list;
+       list_for_each_entry_safe(bp, next, cm_list, b_node.cn_mru) {
+               free(bp->b_addr);
+               if (bp->b_maps != &bp->__b_map)
+                       free(bp->b_maps);
+               kmem_zone_free(xfs_buf_zone, bp);
+       }
+}
  
-       if ((bp != NULL) && (bp->b_flags & LIBXFS_B_DIRTY))
-               libxfs_writebufr(bp);
+/*
+ * When a buffer is marked dirty, the error is cleared. Hence if we are trying
+ * to flush a buffer prior to cache reclaim that has an error on it, it means
+ * we've already tried to flush it and it failed. Prevent repeated corruption
+ * errors from being reported by skipping such buffers - when the corruption is
+ * fixed the buffer will be marked dirty again and we can write it again.
+ */
+static int
+libxfs_bflush(
+       struct cache_node       *node)
+{
+       struct xfs_buf          *bp = (struct xfs_buf *)node;
+
+       if (!bp->b_error && bp->b_flags & LIBXFS_B_DIRTY)
+               return libxfs_writebufr(bp);
+       return bp->b_error;
  }
  
  void
  libxfs_putbufr(xfs_buf_t *bp)
  {
+       if (bp->b_flags & LIBXFS_B_DIRTY)
+               libxfs_writebufr(bp);
         libxfs_brelse((struct cache_node *)bp);
  }
  
@@ -1174,15 +1339,54 @@ struct cache_operations libxfs_bcache_operations = {
   * Inode cache stubs.
   */
  
+kmem_zone_t            *xfs_inode_zone;
  extern kmem_zone_t     *xfs_ili_zone;
-extern kmem_zone_t     *xfs_inode_zone;
+
+/*
+ * If there are inline format data / attr forks attached to this inode,
+ * make sure they're not corrupt.
+ */
+bool
+libxfs_inode_verify_forks(
+       struct xfs_inode        *ip,
+       struct xfs_ifork_ops    *ops)
+{
+       struct xfs_ifork        *ifp;
+       xfs_failaddr_t          fa;
+
+       if (!ops)
+               return true;
+
+       fa = xfs_ifork_verify_data(ip, ops);
+       if (fa) {
+               ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+               xfs_inode_verifier_error(ip, -EFSCORRUPTED, "data fork",
+                               ifp->if_u1.if_data, ifp->if_bytes, fa);
+               return false;
+       }
+
+       fa = xfs_ifork_verify_attr(ip, ops);
+       if (fa) {
+               ifp = XFS_IFORK_PTR(ip, XFS_ATTR_FORK);
+               xfs_inode_verifier_error(ip, -EFSCORRUPTED, "attr fork",
+                               ifp ? ifp->if_u1.if_data : NULL,
+                               ifp ? ifp->if_bytes : 0, fa);
+               return false;
+       }
+       return true;
+}
  
  int
-libxfs_iget(xfs_mount_t *mp, xfs_trans_t *tp, xfs_ino_t ino, uint lock_flags,
-               xfs_inode_t **ipp, xfs_daddr_t bno)
+libxfs_iget(
+       struct xfs_mount        *mp,
+       struct xfs_trans        *tp,
+       xfs_ino_t               ino,
+       uint                    lock_flags,
+       struct xfs_inode        **ipp,
+       struct xfs_ifork_ops    *ifork_ops)
  {
-       xfs_inode_t     *ip;
-       int             error = 0;
+       struct xfs_inode        *ip;
+       int                     error = 0;
  
         ip = kmem_zone_zalloc(xfs_inode_zone, 0);
         if (!ip)
@@ -1190,17 +1394,22 @@ libxfs_iget(xfs_mount_t *mp, xfs_trans_t *tp, xfs_ino_t ino, uint lock_flags,
  
         ip->i_ino = ino;
         ip->i_mount = mp;
-       error = xfs_iread(mp, tp, ip, bno);
+       error = xfs_iread(mp, tp, ip, 0);
         if (error) {
                 kmem_zone_free(xfs_inode_zone, ip);
                 *ipp = NULL;
                 return error;
         }
  
+       if (!libxfs_inode_verify_forks(ip, ifork_ops)) {
+               libxfs_iput(ip);
+               return -EFSCORRUPTED;
+       }
+
         /*
          * set up the inode ops structure that the libxfs code relies on
          */
-       if (S_ISDIR(ip->i_d.di_mode))
+       if (XFS_ISDIR(ip))
                 ip->d_ops = mp->m_dir_inode_ops;
         else
                 ip->d_ops = mp->m_nondir_inode_ops;
@@ -1212,7 +1421,7 @@ libxfs_iget(xfs_mount_t *mp, xfs_trans_t *tp, xfs_ino_t ino, uint lock_flags,
  static void
  libxfs_idestroy(xfs_inode_t *ip)
  {
-       switch (ip->i_d.di_mode & S_IFMT) {
+       switch (VFS_I(ip)->i_mode & S_IFMT) {
                 case S_IFREG:
                 case S_IFDIR:
                 case S_IFLNK:
@@ -1221,6 +1430,8 @@ libxfs_idestroy(xfs_inode_t *ip)
         }
         if (ip->i_afp)
                 libxfs_idestroy_fork(ip, XFS_ATTR_FORK);
+       if (ip->i_cowfp)
+               xfs_idestroy_fork(ip, XFS_COW_FORK);
  }
  
  void