+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2000-2006 Silicon Graphics, Inc.
* All Rights Reserved.
+ */
+
+
+#include "libxfs_priv.h"
+#include "init.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode_buf.h"
+#include "xfs_inode_fork.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+
+#include "libxfs.h" /* for LIBXFS_EXIT_ON_FAILURE */
+
+/*
+ * Important design/architecture note:
+ *
+ * The userspace code that uses the buffer cache is much less constrained than
+ * the kernel code. The userspace code is pretty nasty in places, especially
+ * when it comes to buffer error handling. Very little of the userspace code
+ * outside libxfs clears bp->b_error - very little code even checks it - so the
+ * libxfs code is tripping on stale errors left by the userspace code.
+ *
+ * We can't clear errors or zero buffer contents in libxfs_getbuf-* like we do
+ * in the kernel, because those functions are used by the libxfs_readbuf_*
+ * functions and hence need to leave the buffers unchanged on cache hits. This
+ * is actually the only way to gather a write error from a libxfs_writebuf()
+ * call - you need to get the buffer again so you can check bp->b_error field -
+ * assuming that the buffer is still in the cache when you check, that is.
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
+ * This is very different to the kernel code which does not release buffers on a
+ * write so we can wait on IO and check errors. The kernel buffer cache also
+ * guarantees a buffer of a known initial state from xfs_buf_get() even on a
+ * cache hit.
*
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
+ * IOWs, userspace is behaving quite differently to the kernel and as a result
+ * it leaks errors from reads, invalidations and writes through
+ * libxfs_getbuf/libxfs_readbuf.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ * The result of this is that until the userspace code outside libxfs is cleaned
+ * up, functions that release buffers from userspace control (i.e.
+ * libxfs_writebuf/libxfs_putbuf) need to zero bp->b_error to prevent
+ * propagation of stale errors into future buffer operations.
*/
-#include <xfs/libxfs.h>
-#include "init.h"
-
#define BDSTRAT_SIZE (256 * 1024)
#define IO_BCOMPARE_CHECK
-void
+/* XXX: (dgc) Propagate errors, only exit if fail-on-error flag set */
+int
libxfs_device_zero(struct xfs_buftarg *btp, xfs_daddr_t start, uint len)
{
xfs_off_t start_offset, end_offset, offset;
fd = libxfs_device_to_fd(btp->dev);
start_offset = LIBXFS_BBTOOFF64(start);
- if ((lseek64(fd, start_offset, SEEK_SET)) < 0) {
+ if ((lseek(fd, start_offset, SEEK_SET)) < 0) {
fprintf(stderr, _("%s: %s seek to offset %llu failed: %s\n"),
progname, __FUNCTION__,
(unsigned long long)start_offset, strerror(errno));
offset += bytes;
}
free(z);
+ return 0;
}
static void unmount_record(void *p)
xlog_op_header_t *op = (xlog_op_header_t *)p;
/* the data section must be 32 bit size aligned */
struct {
- __uint16_t magic;
- __uint16_t pad1;
- __uint32_t pad2; /* may as well make it 64 bits */
+ uint16_t magic;
+ uint16_t pad1;
+ uint32_t pad2; /* may as well make it 64 bits */
} magic = { XLOG_UNMOUNT_TYPE, 0, 0 };
memset(p, 0, BBSIZE);
- op->oh_tid = cpu_to_be32(1);
+ /* dummy tid to mark this as written from userspace */
+ op->oh_tid = cpu_to_be32(0xb0c0d0d0);
op->oh_len = cpu_to_be32(sizeof(magic));
op->oh_clientid = XFS_LOG;
op->oh_flags = XLOG_UNMOUNT_TRANS;
memcpy((char *)p + sizeof(xlog_op_header_t), &magic, sizeof(magic));
}
-static xfs_caddr_t next(xfs_caddr_t ptr, int offset, void *private)
+static char *next(
+ char *ptr,
+ int offset,
+ void *private)
{
- xfs_buf_t *buf = (xfs_buf_t *)private;
+ struct xfs_buf *buf = (struct xfs_buf *)private;
- if (XFS_BUF_COUNT(buf) < (int)(ptr - XFS_BUF_PTR(buf)) + offset)
+ if (buf &&
+ (buf->b_bcount < (int)(ptr - (char *)buf->b_addr) + offset))
abort();
+
return ptr + offset;
}
+/*
+ * Format the log. The caller provides either a buftarg which is used to access
+ * the log via buffers or a direct pointer to a buffer that encapsulates the
+ * entire log.
+ */
int
libxfs_log_clear(
struct xfs_buftarg *btp,
+ char *dptr,
xfs_daddr_t start,
- uint length,
+ uint length, /* basic blocks */
uuid_t *fs_uuid,
int version,
- int sunit,
- int fmt)
+ int sunit, /* bytes */
+ int fmt,
+ int cycle,
+ bool max)
{
- xfs_buf_t *bp;
+ struct xfs_buf *bp = NULL;
int len;
-
- if (!btp->dev || !fs_uuid)
+ xfs_lsn_t lsn;
+ xfs_lsn_t tail_lsn;
+ xfs_daddr_t blk;
+ xfs_daddr_t end_blk;
+ char *ptr;
+
+ if (((btp && dptr) || (!btp && !dptr)) ||
+ (btp && !btp->dev) || !fs_uuid)
return -EINVAL;
/* first zero the log */
- libxfs_device_zero(btp, start, length);
+ if (btp)
+ libxfs_device_zero(btp, start, length);
+ else
+ memset(dptr, 0, BBTOB(length));
- /* then write a log record header */
+ /*
+ * Initialize the log record length and LSNs. XLOG_INIT_CYCLE is a
+ * special reset case where we only write a single record where the lsn
+ * and tail_lsn match. Otherwise, the record lsn starts at block 0 of
+ * the specified cycle and points tail_lsn at the last record of the
+ * previous cycle.
+ */
len = ((version == 2) && sunit) ? BTOBB(sunit) : 2;
- len = MAX(len, 2);
- bp = libxfs_getbufr(btp, start, len);
- libxfs_log_header(XFS_BUF_PTR(bp),
- fs_uuid, version, sunit, fmt, next, bp);
- bp->b_flags |= LIBXFS_B_DIRTY;
- libxfs_putbufr(bp);
+ len = max(len, 2);
+ lsn = xlog_assign_lsn(cycle, 0);
+ if (cycle == XLOG_INIT_CYCLE)
+ tail_lsn = lsn;
+ else
+ tail_lsn = xlog_assign_lsn(cycle - 1, length - len);
+
+ /* write out the first log record */
+ ptr = dptr;
+ if (btp) {
+ bp = libxfs_getbufr(btp, start, len);
+ ptr = bp->b_addr;
+ }
+ libxfs_log_header(ptr, fs_uuid, version, sunit, fmt, lsn, tail_lsn,
+ next, bp);
+ if (bp) {
+ bp->b_flags |= LIBXFS_B_DIRTY;
+ libxfs_putbufr(bp);
+ }
+
+ /*
+ * There's nothing else to do if this is a log reset. The kernel detects
+ * the rest of the log is zeroed and starts at cycle 1.
+ */
+ if (cycle == XLOG_INIT_CYCLE)
+ return 0;
+
+ /*
+ * Bump the record size for a full log format if the caller allows it.
+ * This is primarily for performance reasons and most callers don't care
+ * about record size since the log is clean after we're done.
+ */
+ if (max)
+ len = BTOBB(BDSTRAT_SIZE);
+
+ /*
+ * Otherwise, fill everything beyond the initial record with records of
+ * the previous cycle so the kernel head/tail detection works correctly.
+ *
+ * We don't particularly care about the record size or content here.
+ * It's only important that the headers are in place such that the
+ * kernel finds 1.) a clean log and 2.) the correct current cycle value.
+ * Therefore, bump up the record size to the max to use larger I/Os and
+ * improve performance.
+ */
+ cycle--;
+ blk = start + len;
+ if (dptr)
+ dptr += BBTOB(len);
+ end_blk = start + length;
+
+ len = min(end_blk - blk, len);
+ while (blk < end_blk) {
+ lsn = xlog_assign_lsn(cycle, blk - start);
+ tail_lsn = xlog_assign_lsn(cycle, blk - start - len);
+
+ ptr = dptr;
+ if (btp) {
+ bp = libxfs_getbufr(btp, blk, len);
+ ptr = bp->b_addr;
+ }
+ /*
+ * Note: pass the full buffer length as the sunit to initialize
+ * the entire buffer.
+ */
+ libxfs_log_header(ptr, fs_uuid, version, BBTOB(len), fmt, lsn,
+ tail_lsn, next, bp);
+ if (bp) {
+ bp->b_flags |= LIBXFS_B_DIRTY;
+ libxfs_putbufr(bp);
+ }
+
+ blk += len;
+ if (dptr)
+ dptr += BBTOB(len);
+ len = min(end_blk - blk, len);
+ }
+
return 0;
}
int
libxfs_log_header(
- xfs_caddr_t caddr,
+ char *caddr,
uuid_t *fs_uuid,
int version,
int sunit,
int fmt,
+ xfs_lsn_t lsn,
+ xfs_lsn_t tail_lsn,
libxfs_get_block_t *nextfunc,
void *private)
{
xlog_rec_header_t *head = (xlog_rec_header_t *)caddr;
- xfs_caddr_t p = caddr;
+ char *p = caddr;
__be32 cycle_lsn;
int i, len;
+ int hdrs = 1;
+
+ if (lsn == NULLCOMMITLSN)
+ lsn = xlog_assign_lsn(XLOG_INIT_CYCLE, 0);
+ if (tail_lsn == NULLCOMMITLSN)
+ tail_lsn = lsn;
len = ((version == 2) && sunit) ? BTOBB(sunit) : 1;
- /* note that oh_tid actually contains the cycle number
- * and the tid is stored in h_cycle_data[0] - that's the
- * way things end up on disk.
- */
memset(p, 0, BBSIZE);
head->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM);
- head->h_cycle = cpu_to_be32(1);
+ head->h_cycle = cpu_to_be32(CYCLE_LSN(lsn));
head->h_version = cpu_to_be32(version);
- if (len != 1)
- head->h_len = cpu_to_be32(sunit - BBSIZE);
- else
- head->h_len = cpu_to_be32(20);
- head->h_crc = cpu_to_be32(0);
+ head->h_crc = cpu_to_le32(0);
head->h_prev_block = cpu_to_be32(-1);
head->h_num_logops = cpu_to_be32(1);
- head->h_cycle_data[0] = cpu_to_be32(0xb0c0d0d0);
head->h_fmt = cpu_to_be32(fmt);
- head->h_size = cpu_to_be32(XLOG_HEADER_CYCLE_SIZE);
+ head->h_size = cpu_to_be32(max(sunit, XLOG_BIG_RECORD_BSIZE));
- head->h_lsn = cpu_to_be64(xlog_assign_lsn(1, 0));
- head->h_tail_lsn = cpu_to_be64(xlog_assign_lsn(1, 0));
+ head->h_lsn = cpu_to_be64(lsn);
+ head->h_tail_lsn = cpu_to_be64(tail_lsn);
memcpy(&head->h_fs_uuid, fs_uuid, sizeof(uuid_t));
- len = MAX(len, 2);
+ /*
+ * The kernel expects to see either a log record header magic value or
+ * the LSN cycle at the top of every log block. The first word of each
+ * non-header block is copied to the record headers and replaced with
+ * the cycle value (see xlog_[un]pack_data() and xlog_get_cycle() for
+ * details).
+ *
+ * Even though we only ever write an unmount record (one block), we
+ * support writing log records up to the max log buffer size of 256k to
+ * improve log format performance. This means a record can require up
+ * to 8 headers (1 rec. header + 7 ext. headers) for the packed cycle
+ * data (each header supports 32k of data).
+ */
+ cycle_lsn = CYCLE_LSN_DISK(head->h_lsn);
+ if (version == 2 && sunit > XLOG_HEADER_CYCLE_SIZE) {
+ hdrs = sunit / XLOG_HEADER_CYCLE_SIZE;
+ if (sunit % XLOG_HEADER_CYCLE_SIZE)
+ hdrs++;
+ }
+
+ /*
+ * A fixed number of extended headers is expected based on h_size. If
+ * required, format those now so the unmount record is located
+ * correctly.
+ *
+ * Since we only write an unmount record, we only need one h_cycle_data
+ * entry for the unmount record block. The subsequent record data
+ * blocks are zeroed, which means we can stamp them directly with the
+ * cycle and zero the rest of the cycle data in the extended headers.
+ */
+ if (hdrs > 1) {
+ for (i = 1; i < hdrs; i++) {
+ p = nextfunc(p, BBSIZE, private);
+ memset(p, 0, BBSIZE);
+ /* xlog_rec_ext_header.xh_cycle */
+ *(__be32 *)p = cycle_lsn;
+ }
+ }
+
+ /*
+ * The total length is the max of the stripe unit or 2 basic block
+ * minimum (1 hdr blk + 1 data blk). The record length is the total
+ * minus however many header blocks are required.
+ */
+ head->h_len = cpu_to_be32(max(BBTOB(2), sunit) - hdrs * BBSIZE);
+
+ /*
+ * Write out the unmount record, pack the first word into the record
+ * header and stamp the block with the cycle.
+ */
p = nextfunc(p, BBSIZE, private);
unmount_record(p);
- cycle_lsn = CYCLE_LSN_DISK(head->h_lsn);
- for (i = 2; i < len; i++) {
+ head->h_cycle_data[0] = *(__be32 *)p;
+ *(__be32 *)p = cycle_lsn;
+
+ /*
+ * Finally, zero all remaining blocks in the record and stamp each with
+ * the cycle. We don't need to pack any of these blocks because the
+ * cycle data in the headers has already been zeroed.
+ */
+ len = max(len, hdrs + 1);
+ for (i = hdrs + 1; i < len; i++) {
p = nextfunc(p, BBSIZE, private);
memset(p, 0, BBSIZE);
*(__be32 *)p = cycle_lsn;
void
libxfs_bprint(xfs_buf_t *bp)
{
- fprintf(stderr, "Buffer 0x%p blkno=%llu bytes=%u flags=0x%x count=%u\n",
+ fprintf(stderr, "Buffer %p blkno=%llu bytes=%u flags=0x%x count=%u\n",
bp, (unsigned long long)bp->b_bn, (unsigned)bp->b_bcount,
bp->b_flags, bp->b_node.cn_count);
}
strerror(errno));
exit(1);
}
+ memset(bp->b_addr, 0, bytes);
#ifdef XFS_BUF_TRACING
list_head_init(&bp->b_lock_list);
#endif
bp->b_holder = 0;
bp->b_recur = 0;
bp->b_ops = NULL;
+
+ if (!bp->b_maps) {
+ bp->b_nmaps = 1;
+ bp->b_maps = &bp->__b_map;
+ bp->b_maps[0].bm_bn = bp->b_bn;
+ bp->b_maps[0].bm_len = bp->b_length;
+ }
}
static void
int i;
bytes = sizeof(struct xfs_buf_map) * nmaps;
- bp->b_map = malloc(bytes);
- if (!bp->b_map) {
+ bp->b_maps = malloc(bytes);
+ if (!bp->b_maps) {
fprintf(stderr,
_("%s: %s can't malloc %u bytes: %s\n"),
progname, __FUNCTION__, bytes,
bytes = 0;
for ( i = 0; i < nmaps; i++) {
- bp->b_map[i].bm_bn = map[i].bm_bn;
- bp->b_map[i].bm_len = map[i].bm_len;
+ bp->b_maps[i].bm_bn = map[i].bm_bn;
+ bp->b_maps[i].bm_len = map[i].bm_len;
bytes += BBTOB(map[i].bm_len);
}
list_del_init(&bp->b_node.cn_mru);
free(bp->b_addr);
bp->b_addr = NULL;
- free(bp->b_map);
- bp->b_map = NULL;
+ if (bp->b_maps != &bp->__b_map)
+ free(bp->b_maps);
+ bp->b_maps = NULL;
}
} else
bp = kmem_zone_zalloc(xfs_buf_zone, 0);
pthread_mutex_unlock(&xfs_buf_freelist.cm_mutex);
bp->b_ops = NULL;
+ if (bp->b_flags & LIBXFS_B_DIRTY)
+ fprintf(stderr, "found dirty buffer (bulk) on free list!");
return bp;
}
return __cache_lookup(&key, flags);
}
+/*
+ * Clean the buffer flags for libxfs_getbuf*(), which wants to return
+ * an unused buffer with clean state. This prevents CRC errors on a
+ * re-read of a corrupt block that was prefetched and freed. This
+ * can happen with a massively corrupt directory that is discarded,
+ * but whose blocks are then recycled into expanding lost+found.
+ *
+ * Note however that if the buffer's dirty (prefetch calls getbuf)
+ * we'll leave the state alone because we don't want to discard blocks
+ * that have been fixed.
+ */
+static void
+reset_buf_state(
+ struct xfs_buf *bp)
+{
+ if (bp && !(bp->b_flags & LIBXFS_B_DIRTY))
+ bp->b_flags &= ~(LIBXFS_B_UNCHECKED | LIBXFS_B_STALE |
+ LIBXFS_B_UPTODATE);
+}
+
struct xfs_buf *
libxfs_getbuf(struct xfs_buftarg *btp, xfs_daddr_t blkno, int len)
{
- return libxfs_getbuf_flags(btp, blkno, len, 0);
+ struct xfs_buf *bp;
+
+ bp = libxfs_getbuf_flags(btp, blkno, len, 0);
+ reset_buf_state(bp);
+ return bp;
}
-struct xfs_buf *
-libxfs_getbuf_map(struct xfs_buftarg *btp, struct xfs_buf_map *map,
- int nmaps, int flags)
+static struct xfs_buf *
+__libxfs_getbuf_map(struct xfs_buftarg *btp, struct xfs_buf_map *map,
+ int nmaps, int flags)
{
struct xfs_bufkey key = {0};
int i;
return __cache_lookup(&key, flags);
}
+struct xfs_buf *
+libxfs_getbuf_map(struct xfs_buftarg *btp, struct xfs_buf_map *map,
+ int nmaps, int flags)
+{
+ struct xfs_buf *bp;
+
+ bp = __libxfs_getbuf_map(btp, map, nmaps, flags);
+ reset_buf_state(bp);
+ return bp;
+}
+
void
libxfs_putbuf(xfs_buf_t *bp)
{
+ /*
+ * ensure that any errors on this use of the buffer don't carry
+ * over to the next user.
+ */
+ bp->b_error = 0;
+
#ifdef XFS_BUF_TRACING
pthread_mutex_lock(&libxfs_bcache->c_mutex);
lock_buf_count--;
pthread_mutex_unlock(&bp->b_lock);
}
}
+
cache_node_put(libxfs_bcache, (struct cache_node *)bp);
}
{
int sts;
- sts = pread64(fd, buf, len, offset);
+ sts = pread(fd, buf, len, offset);
if (sts < 0) {
int error = errno;
fprintf(stderr, _("%s: read failed: %s\n"),
progname, strerror(error));
if (flags & LIBXFS_EXIT_ON_FAILURE)
exit(1);
- return error;
+ return -error;
} else if (sts != len) {
fprintf(stderr, _("%s: error - read only %d of %d bytes\n"),
progname, sts, len);
if (flags & LIBXFS_EXIT_ON_FAILURE)
exit(1);
- return EIO;
+ return -EIO;
}
return 0;
}
xfs_buf_t *bp;
int error;
- bp = libxfs_getbuf(btp, blkno, len);
+ bp = libxfs_getbuf_flags(btp, blkno, len, 0);
if (!bp)
return NULL;
{
int fd;
int error = 0;
- char *buf;
+ void *buf;
int i;
fd = libxfs_device_to_fd(btp->dev);
buf = bp->b_addr;
for (i = 0; i < bp->b_nmaps; i++) {
- off64_t offset = LIBXFS_BBTOOFF64(bp->b_map[i].bm_bn);
- int len = BBTOB(bp->b_map[i].bm_len);
+ off64_t offset = LIBXFS_BBTOOFF64(bp->b_maps[i].bm_bn);
+ int len = BBTOB(bp->b_maps[i].bm_len);
error = __read_buf(fd, buf, len, offset, flags);
if (error) {
if (!error)
bp->b_flags |= LIBXFS_B_UPTODATE;
#ifdef IO_DEBUG
- printf("%lx: %s: read %u bytes, error %d, blkno=0x%llx(0x%llx), %p\n",
- pthread_self(), __FUNCTION__, , error,
- (long long)LIBXFS_BBTOOFF64(blkno), (long long)blkno, bp);
+ printf("%lx: %s: read %lu bytes, error %d, blkno=%llu(%llu), %p\n",
+ pthread_self(), __FUNCTION__, buf - (char *)bp->b_addr, error,
+ (long long)LIBXFS_BBTOOFF64(bp->b_bn), (long long)bp->b_bn, bp);
#endif
return error;
}
return libxfs_readbuf(btp, map[0].bm_bn, map[0].bm_len,
flags, ops);
- bp = libxfs_getbuf_map(btp, map, nmaps, 0);
+ bp = __libxfs_getbuf_map(btp, map, nmaps, 0);
if (!bp)
return NULL;
if (!error)
libxfs_readbuf_verify(bp, ops);
-#ifdef IO_DEBUG
+#ifdef IO_DEBUGX
printf("%lx: %s: read %lu bytes, error %d, blkno=%llu(%llu), %p\n",
pthread_self(), __FUNCTION__, buf - (char *)bp->b_addr, error,
(long long)LIBXFS_BBTOOFF64(bp->b_bn), (long long)bp->b_bn, bp);
{
int sts;
- sts = pwrite64(fd, buf, len, offset);
+ sts = pwrite(fd, buf, len, offset);
if (sts < 0) {
int error = errno;
- fprintf(stderr, _("%s: pwrite64 failed: %s\n"),
+ fprintf(stderr, _("%s: pwrite failed: %s\n"),
progname, strerror(error));
if (flags & LIBXFS_B_EXIT)
exit(1);
- return error;
+ return -error;
} else if (sts != len) {
- fprintf(stderr, _("%s: error - pwrite64 only %d of %d bytes\n"),
+ fprintf(stderr, _("%s: error - pwrite only %d of %d bytes\n"),
progname, sts, len);
if (flags & LIBXFS_B_EXIT)
exit(1);
- return EIO;
+ return -EIO;
}
return 0;
}
libxfs_writebufr(xfs_buf_t *bp)
{
int fd = libxfs_device_to_fd(bp->b_target->dev);
- int error = 0;
/*
* we never write buffers that are marked stale. This indicates they
* bugs like this. Make sure the error is obvious as to the cause.
*/
if (bp->b_flags & LIBXFS_B_STALE) {
- bp->b_error = ESTALE;
+ bp->b_error = -ESTALE;
return bp->b_error;
}
bp->b_ops->verify_write(bp);
if (bp->b_error) {
fprintf(stderr,
- _("%s: write verifer failed on bno 0x%llx/0x%x\n"),
- __func__, (long long)bp->b_bn, bp->b_bcount);
+ _("%s: write verifer failed on %s bno 0x%llx/0x%x\n"),
+ __func__, bp->b_ops->name,
+ (long long)bp->b_bn, bp->b_bcount);
return bp->b_error;
}
}
if (!(bp->b_flags & LIBXFS_B_DISCONTIG)) {
- error = __write_buf(fd, bp->b_addr, bp->b_bcount,
+ bp->b_error = __write_buf(fd, bp->b_addr, bp->b_bcount,
LIBXFS_BBTOOFF64(bp->b_bn), bp->b_flags);
} else {
int i;
- char *buf = bp->b_addr;
+ void *buf = bp->b_addr;
for (i = 0; i < bp->b_nmaps; i++) {
- off64_t offset = LIBXFS_BBTOOFF64(bp->b_map[i].bm_bn);
- int len = BBTOB(bp->b_map[i].bm_len);
+ off64_t offset = LIBXFS_BBTOOFF64(bp->b_maps[i].bm_bn);
+ int len = BBTOB(bp->b_maps[i].bm_len);
- error = __write_buf(fd, buf, len, offset, bp->b_flags);
- if (error) {
- bp->b_error = error;
+ bp->b_error = __write_buf(fd, buf, len, offset,
+ bp->b_flags);
+ if (bp->b_error)
break;
- }
buf += len;
}
}
#ifdef IO_DEBUG
- printf("%lx: %s: wrote %u bytes, blkno=%llu(%llu), %p\n",
+ printf("%lx: %s: wrote %u bytes, blkno=%llu(%llu), %p, error %d\n",
pthread_self(), __FUNCTION__, bp->b_bcount,
(long long)LIBXFS_BBTOOFF64(bp->b_bn),
- (long long)bp->b_bn, bp);
+ (long long)bp->b_bn, bp, bp->b_error);
#endif
- if (!error) {
+ if (!bp->b_error) {
bp->b_flags |= LIBXFS_B_UPTODATE;
bp->b_flags &= ~(LIBXFS_B_DIRTY | LIBXFS_B_EXIT |
LIBXFS_B_UNCHECKED);
}
- return error;
+ return bp->b_error;
}
int
* subsequent reads after this write from seeing stale errors.
*/
bp->b_error = 0;
+ bp->b_flags &= ~LIBXFS_B_STALE;
bp->b_flags |= (LIBXFS_B_DIRTY | flags);
return 0;
}
* subsequent reads after this write from seeing stale errors.
*/
bp->b_error = 0;
+ bp->b_flags &= ~LIBXFS_B_STALE;
bp->b_flags |= (LIBXFS_B_DIRTY | flags);
libxfs_putbuf(bp);
return 0;
}
static void
-libxfs_brelse(struct cache_node *node)
+libxfs_brelse(
+ struct cache_node *node)
{
- xfs_buf_t *bp = (xfs_buf_t *)node;
+ struct xfs_buf *bp = (struct xfs_buf *)node;
- if (bp != NULL) {
- if (bp->b_flags & LIBXFS_B_DIRTY)
- libxfs_writebufr(bp);
- pthread_mutex_lock(&xfs_buf_freelist.cm_mutex);
- list_add(&bp->b_node.cn_mru, &xfs_buf_freelist.cm_list);
- pthread_mutex_unlock(&xfs_buf_freelist.cm_mutex);
- }
+ if (!bp)
+ return;
+ if (bp->b_flags & LIBXFS_B_DIRTY)
+ fprintf(stderr,
+ "releasing dirty buffer to free list!");
+
+ pthread_mutex_lock(&xfs_buf_freelist.cm_mutex);
+ list_add(&bp->b_node.cn_mru, &xfs_buf_freelist.cm_list);
+ pthread_mutex_unlock(&xfs_buf_freelist.cm_mutex);
}
static unsigned int
libxfs_bulkrelse(
- struct cache *cache,
- struct list_head *list)
+ struct cache *cache,
+ struct list_head *list)
{
xfs_buf_t *bp;
int count = 0;
list_for_each_entry(bp, list, b_node.cn_mru) {
if (bp->b_flags & LIBXFS_B_DIRTY)
- libxfs_writebufr(bp);
+ fprintf(stderr,
+ "releasing dirty buffer (bulk) to free list!");
count++;
}
pthread_mutex_lock(&xfs_buf_freelist.cm_mutex);
- __list_splice(list, &xfs_buf_freelist.cm_list);
+ list_splice(list, &xfs_buf_freelist.cm_list);
pthread_mutex_unlock(&xfs_buf_freelist.cm_mutex);
return count;
}
-static void
-libxfs_bflush(struct cache_node *node)
+/*
+ * Free everything from the xfs_buf_freelist MRU, used at final teardown
+ */
+void
+libxfs_bcache_free(void)
{
- xfs_buf_t *bp = (xfs_buf_t *)node;
+ struct list_head *cm_list;
+ xfs_buf_t *bp, *next;
+
+ cm_list = &xfs_buf_freelist.cm_list;
+ list_for_each_entry_safe(bp, next, cm_list, b_node.cn_mru) {
+ free(bp->b_addr);
+ if (bp->b_maps != &bp->__b_map)
+ free(bp->b_maps);
+ kmem_zone_free(xfs_buf_zone, bp);
+ }
+}
- if ((bp != NULL) && (bp->b_flags & LIBXFS_B_DIRTY))
- libxfs_writebufr(bp);
+/*
+ * When a buffer is marked dirty, the error is cleared. Hence if we are trying
+ * to flush a buffer prior to cache reclaim that has an error on it, it means
+ * we've already tried to flush it and it failed. Prevent repeated corruption
+ * errors from being reported by skipping such buffers - when the corruption is
+ * fixed the buffer will be marked dirty again and we can write it again.
+ */
+static int
+libxfs_bflush(
+ struct cache_node *node)
+{
+ struct xfs_buf *bp = (struct xfs_buf *)node;
+
+ if (!bp->b_error && bp->b_flags & LIBXFS_B_DIRTY)
+ return libxfs_writebufr(bp);
+ return bp->b_error;
}
void
libxfs_putbufr(xfs_buf_t *bp)
{
+ if (bp->b_flags & LIBXFS_B_DIRTY)
+ libxfs_writebufr(bp);
libxfs_brelse((struct cache_node *)bp);
}
}
struct cache_operations libxfs_bcache_operations = {
- /* .hash */ libxfs_bhash,
- /* .alloc */ libxfs_balloc,
- /* .flush */ libxfs_bflush,
- /* .relse */ libxfs_brelse,
- /* .compare */ libxfs_bcompare,
- /* .bulkrelse */libxfs_bulkrelse
+ .hash = libxfs_bhash,
+ .alloc = libxfs_balloc,
+ .flush = libxfs_bflush,
+ .relse = libxfs_brelse,
+ .compare = libxfs_bcompare,
+ .bulkrelse = libxfs_bulkrelse
};
* Inode cache stubs.
*/
+kmem_zone_t *xfs_inode_zone;
extern kmem_zone_t *xfs_ili_zone;
-extern kmem_zone_t *xfs_inode_zone;
+
+/*
+ * If there are inline format data / attr forks attached to this inode,
+ * make sure they're not corrupt.
+ */
+bool
+libxfs_inode_verify_forks(
+ struct xfs_inode *ip,
+ struct xfs_ifork_ops *ops)
+{
+ struct xfs_ifork *ifp;
+ xfs_failaddr_t fa;
+
+ if (!ops)
+ return true;
+
+ fa = xfs_ifork_verify_data(ip, ops);
+ if (fa) {
+ ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+ xfs_inode_verifier_error(ip, -EFSCORRUPTED, "data fork",
+ ifp->if_u1.if_data, ifp->if_bytes, fa);
+ return false;
+ }
+
+ fa = xfs_ifork_verify_attr(ip, ops);
+ if (fa) {
+ ifp = XFS_IFORK_PTR(ip, XFS_ATTR_FORK);
+ xfs_inode_verifier_error(ip, -EFSCORRUPTED, "attr fork",
+ ifp ? ifp->if_u1.if_data : NULL,
+ ifp ? ifp->if_bytes : 0, fa);
+ return false;
+ }
+ return true;
+}
int
-libxfs_iget(xfs_mount_t *mp, xfs_trans_t *tp, xfs_ino_t ino, uint lock_flags,
- xfs_inode_t **ipp, xfs_daddr_t bno)
+libxfs_iget(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ xfs_ino_t ino,
+ uint lock_flags,
+ struct xfs_inode **ipp,
+ struct xfs_ifork_ops *ifork_ops)
{
- xfs_inode_t *ip;
- int error = 0;
+ struct xfs_inode *ip;
+ int error = 0;
ip = kmem_zone_zalloc(xfs_inode_zone, 0);
if (!ip)
- return ENOMEM;
+ return -ENOMEM;
ip->i_ino = ino;
ip->i_mount = mp;
- error = xfs_iread(mp, tp, ip, bno);
+ error = xfs_iread(mp, tp, ip, 0);
if (error) {
kmem_zone_free(xfs_inode_zone, ip);
*ipp = NULL;
return error;
}
+ if (!libxfs_inode_verify_forks(ip, ifork_ops)) {
+ libxfs_iput(ip);
+ return -EFSCORRUPTED;
+ }
+
+ /*
+ * set up the inode ops structure that the libxfs code relies on
+ */
+ if (XFS_ISDIR(ip))
+ ip->d_ops = mp->m_dir_inode_ops;
+ else
+ ip->d_ops = mp->m_nondir_inode_ops;
+
*ipp = ip;
return 0;
}
static void
libxfs_idestroy(xfs_inode_t *ip)
{
- switch (ip->i_d.di_mode & S_IFMT) {
+ switch (VFS_I(ip)->i_mode & S_IFMT) {
case S_IFREG:
case S_IFDIR:
case S_IFLNK:
}
if (ip->i_afp)
libxfs_idestroy_fork(ip, XFS_ATTR_FORK);
+ if (ip->i_cowfp)
+ xfs_idestroy_fork(ip, XFS_COW_FORK);
}
void
-libxfs_iput(xfs_inode_t *ip, uint lock_flags)
+libxfs_iput(xfs_inode_t *ip)
{
if (ip->i_itemp)
kmem_zone_free(xfs_ili_zone, ip->i_itemp);