// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
 */

#include "libxfs_priv.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode_buf.h"
#include "xfs_inode_fork.h"
#include "xfs_inode.h"
#include "xfs_trans.h"

#include "libxfs.h"		/* for LIBXFS_EXIT_ON_FAILURE */
/*
 * Important design/architecture note:
 *
 * The userspace code that uses the buffer cache is much less constrained than
 * the kernel code. The userspace code is pretty nasty in places, especially
 * when it comes to buffer error handling. Very little of the userspace code
 * outside libxfs clears bp->b_error - very little code even checks it - so the
 * libxfs code is tripping on stale errors left by the userspace code.
 *
 * We can't clear errors or zero buffer contents in libxfs_getbuf_* like we do
 * in the kernel, because those functions are used by the libxfs_readbuf_*
 * functions and hence need to leave the buffers unchanged on cache hits. This
 * is actually the only way to gather a write error from a libxfs_writebuf()
 * call - you need to get the buffer again so you can check the bp->b_error
 * field - assuming that the buffer is still in the cache when you check, that
 * is.
 *
 * This is very different to the kernel code, which does not release buffers
 * on a write so we can wait on IO and check errors. The kernel buffer cache
 * also guarantees a buffer of a known initial state from xfs_buf_get() even
 * on a cache hit.
 *
 * IOWs, userspace is behaving quite differently to the kernel and as a result
 * it leaks errors from reads, invalidations and writes through
 * libxfs_getbuf/libxfs_readbuf.
 *
 * The result of this is that until the userspace code outside libxfs is
 * cleaned up, functions that release buffers from userspace control (i.e.
 * libxfs_writebuf/libxfs_putbuf) need to zero bp->b_error to prevent
 * propagation of stale errors into future buffer operations.
 */
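
/*
 * For illustration, the flow the note above implies for a caller that wants
 * to observe a write failure. This is a hypothetical usage sketch using the
 * real API names from this file, not code that exists here:
 *
 *	bp = libxfs_getbuf(btp, blkno, len);
 *	... modify bp->b_addr ...
 *	libxfs_writebuf(bp, 0);			// marks dirty, releases bp
 *	bp = libxfs_getbuf(btp, blkno, len);	// re-lookup to see the result
 *	if (bp->b_error)
 *		... the earlier write failed ...
 *	libxfs_putbuf(bp);
 *
 * The check only works if the buffer is still cached at the time of the
 * second lookup, which is exactly the fragility described above.
 */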
#define BDSTRAT_SIZE	(256 * 1024)

#define IO_BCOMPARE_CHECK

/* XXX: (dgc) Propagate errors, only exit if fail-on-error flag set */
int
libxfs_device_zero(struct xfs_buftarg *btp, xfs_daddr_t start, uint len)
{
	xfs_off_t	start_offset, end_offset, offset;
	ssize_t		zsize, bytes;
	char		*z;
	int		fd;

	zsize = min(BDSTRAT_SIZE, BBTOB(len));
	if ((z = memalign(libxfs_device_alignment(), zsize)) == NULL) {
		fprintf(stderr,
			_("%s: %s can't memalign %d bytes: %s\n"),
			progname, __FUNCTION__, (int)zsize, strerror(errno));
		exit(1);
	}
	memset(z, 0, zsize);

	fd = libxfs_device_to_fd(btp->dev);
	start_offset = LIBXFS_BBTOOFF64(start);

	if ((lseek(fd, start_offset, SEEK_SET)) < 0) {
		fprintf(stderr, _("%s: %s seek to offset %llu failed: %s\n"),
			progname, __FUNCTION__,
			(unsigned long long)start_offset, strerror(errno));
		exit(1);
	}

	end_offset = LIBXFS_BBTOOFF64(start + len) - start_offset;
	for (offset = 0; offset < end_offset; ) {
		bytes = min((ssize_t)(end_offset - offset), zsize);
		if ((bytes = write(fd, z, bytes)) < 0) {
			fprintf(stderr, _("%s: %s write failed: %s\n"),
				progname, __FUNCTION__, strerror(errno));
			exit(1);
		} else if (bytes == 0) {
			fprintf(stderr, _("%s: %s not progressing?\n"),
				progname, __FUNCTION__);
			exit(1);
		}
		offset += bytes;
	}
	free(z);
	return 0;
}
static void unmount_record(void *p)
{
	xlog_op_header_t	*op = (xlog_op_header_t *)p;
	/* the data section must be 32 bit size aligned */
	struct {
		uint16_t magic;
		uint16_t pad1;
		uint32_t pad2;		/* may as well make it 64 bits */
	} magic = { XLOG_UNMOUNT_TYPE, 0, 0 };

	memset(p, 0, BBSIZE);
	/* dummy tid to mark this as written from userspace */
	op->oh_tid = cpu_to_be32(0xb0c0d0d0);
	op->oh_len = cpu_to_be32(sizeof(magic));
	op->oh_clientid = XFS_LOG;
	op->oh_flags = XLOG_UNMOUNT_TRANS;
	op->oh_res2 = 0;

	/* and the data for this op */
	memcpy((char *)p + sizeof(xlog_op_header_t), &magic, sizeof(magic));
}
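
/*
 * Resulting block layout - a sketch derived from the code above, not an
 * on-disk specification: a single BBSIZE (512 byte) block containing
 *
 *	[xlog_op_header_t][u16 XLOG_UNMOUNT_TYPE][u16 pad][u32 pad][zeros...]
 *
 * The 8-byte magic payload keeps the op data 32-bit size aligned, and the
 * leading memset() guarantees the trailing bytes are zero.
 */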
static char *next(
	char		*ptr,
	int		offset,
	void		*private)
{
	struct xfs_buf	*buf = (struct xfs_buf *)private;

	if (buf &&
	    (buf->b_bcount < (int)(ptr - (char *)buf->b_addr) + offset))
		abort();

	return ptr + offset;
}
/*
 * Format the log. The caller provides either a buftarg which is used to
 * access the log via buffers or a direct pointer to a buffer that
 * encapsulates the entire log.
 */
int
libxfs_log_clear(
	struct xfs_buftarg	*btp,
	char			*dptr,
	xfs_daddr_t		start,
	uint			length,		/* basic blocks */
	uuid_t			*fs_uuid,
	int			version,
	int			sunit,		/* bytes */
	int			fmt,
	int			cycle,
	bool			max)
{
	struct xfs_buf		*bp = NULL;
	int			len;
	xfs_lsn_t		lsn;
	xfs_lsn_t		tail_lsn;
	xfs_daddr_t		blk;
	xfs_daddr_t		end_blk;
	char			*ptr;

	if (((btp && dptr) || (!btp && !dptr)) ||
	    (btp && !btp->dev) || !fs_uuid)
		return -EINVAL;

	/* first zero the log */
	if (btp)
		libxfs_device_zero(btp, start, length);
	else
		memset(dptr, 0, BBTOB(length));

	/*
	 * Initialize the log record length and LSNs. XLOG_INIT_CYCLE is a
	 * special reset case where we only write a single record where the
	 * lsn and tail_lsn match. Otherwise, the record lsn starts at block 0
	 * of the specified cycle and points tail_lsn at the last record of
	 * the previous cycle.
	 */
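	/*
	 * Worked example, assuming the usual XFS encoding where
	 * xlog_assign_lsn(cycle, block) packs the cycle into the high 32 bits
	 * and the block offset into the low 32 bits: clearing to cycle 3 with
	 * a 2-block record on a 1024-block log gives
	 * lsn = (3 << 32) | 0 and tail_lsn = (2 << 32) | 1022, i.e. the head
	 * sits at block 0 of cycle 3 and the tail points at the last record
	 * of cycle 2.
	 */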
	len = ((version == 2) && sunit) ? BTOBB(sunit) : 2;
	len = max(len, 2);
	lsn = xlog_assign_lsn(cycle, 0);
	if (cycle == XLOG_INIT_CYCLE)
		tail_lsn = lsn;
	else
		tail_lsn = xlog_assign_lsn(cycle - 1, length - len);

	/* write out the first log record */
	ptr = dptr;
	if (btp) {
		bp = libxfs_getbufr(btp, start, len);
		ptr = bp->b_addr;
	}
	libxfs_log_header(ptr, fs_uuid, version, sunit, fmt, lsn, tail_lsn,
			  next, bp);
	if (bp) {
		bp->b_flags |= LIBXFS_B_DIRTY;
		libxfs_putbufr(bp);
	}

	/*
	 * There's nothing else to do if this is a log reset. The kernel
	 * detects the rest of the log is zeroed and starts at cycle 1.
	 */
	if (cycle == XLOG_INIT_CYCLE)
		return 0;

	/*
	 * Bump the record size for a full log format if the caller allows it.
	 * This is primarily for performance reasons and most callers don't
	 * care about record size since the log is clean after we're done.
	 */
	if (max)
		len = BTOBB(BDSTRAT_SIZE);

	/*
	 * Otherwise, fill everything beyond the initial record with records
	 * of the previous cycle so the kernel head/tail detection works
	 * correctly.
	 *
	 * We don't particularly care about the record size or content here.
	 * It's only important that the headers are in place such that the
	 * kernel finds 1.) a clean log and 2.) the correct current cycle
	 * value. Therefore, bump up the record size to the max to use larger
	 * I/Os and improve performance.
	 */
	cycle--;
	blk = start + len;
	if (dptr)
		dptr += BBTOB(len);
	end_blk = start + length;

	len = min(end_blk - blk, len);
	while (blk < end_blk) {
		lsn = xlog_assign_lsn(cycle, blk - start);
		tail_lsn = xlog_assign_lsn(cycle, blk - start - len);

		ptr = dptr;
		if (btp) {
			bp = libxfs_getbufr(btp, blk, len);
			ptr = bp->b_addr;
		}
		/*
		 * Note: pass the full buffer length as the sunit to
		 * initialize the entire buffer.
		 */
		libxfs_log_header(ptr, fs_uuid, version, BBTOB(len), fmt, lsn,
				  tail_lsn, next, bp);
		if (bp) {
			bp->b_flags |= LIBXFS_B_DIRTY;
			libxfs_putbufr(bp);
		}

		blk += len;
		if (dptr)
			dptr += BBTOB(len);
		len = min(end_blk - blk, len);
	}

	return 0;
}
int
libxfs_log_header(
	char			*caddr,
	uuid_t			*fs_uuid,
	int			version,
	int			sunit,
	int			fmt,
	xfs_lsn_t		lsn,
	xfs_lsn_t		tail_lsn,
	libxfs_get_block_t	*nextfunc,
	void			*private)
{
	xlog_rec_header_t	*head = (xlog_rec_header_t *)caddr;
	char			*p = caddr;
	__be32			cycle_lsn;
	int			i, len;
	int			hdrs = 1;

	if (lsn == NULLCOMMITLSN)
		lsn = xlog_assign_lsn(XLOG_INIT_CYCLE, 0);
	if (tail_lsn == NULLCOMMITLSN)
		tail_lsn = lsn;

	len = ((version == 2) && sunit) ? BTOBB(sunit) : 1;

	memset(p, 0, BBSIZE);
	head->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM);
	head->h_cycle = cpu_to_be32(CYCLE_LSN(lsn));
	head->h_version = cpu_to_be32(version);
	head->h_crc = cpu_to_le32(0);
	head->h_prev_block = cpu_to_be32(-1);
	head->h_num_logops = cpu_to_be32(1);
	head->h_fmt = cpu_to_be32(fmt);
	head->h_size = cpu_to_be32(max(sunit, XLOG_BIG_RECORD_BSIZE));

	head->h_lsn = cpu_to_be64(lsn);
	head->h_tail_lsn = cpu_to_be64(tail_lsn);

	memcpy(&head->h_fs_uuid, fs_uuid, sizeof(uuid_t));

	/*
	 * The kernel expects to see either a log record header magic value or
	 * the LSN cycle at the top of every log block. The first word of each
	 * non-header block is copied to the record headers and replaced with
	 * the cycle value (see xlog_[un]pack_data() and xlog_get_cycle() for
	 * details).
	 *
	 * Even though we only ever write an unmount record (one block), we
	 * support writing log records up to the max log buffer size of 256k
	 * to improve log format performance. This means a record can require
	 * up to 8 headers (1 rec. header + 7 ext. headers) for the packed
	 * cycle data (each header supports 32k of data).
	 */
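	/*
	 * For example, using the numbers from the comment above: a 256k
	 * record with 32k of cycle data per header needs 256k / 32k = 8
	 * headers, i.e. 1 record header plus 7 extended headers, while a 64k
	 * stripe unit needs only 64k / 32k = 2.
	 */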
	cycle_lsn = CYCLE_LSN_DISK(head->h_lsn);
	if (version == 2 && sunit > XLOG_HEADER_CYCLE_SIZE) {
		hdrs = sunit / XLOG_HEADER_CYCLE_SIZE;
		if (sunit % XLOG_HEADER_CYCLE_SIZE)
			hdrs++;
	}

	/*
	 * A fixed number of extended headers is expected based on h_size. If
	 * required, format those now so the unmount record is located
	 * correctly.
	 *
	 * Since we only write an unmount record, we only need one h_cycle_data
	 * entry for the unmount record block. The subsequent record data
	 * blocks are zeroed, which means we can stamp them directly with the
	 * cycle and zero the rest of the cycle data in the extended headers.
	 */
	for (i = 1; i < hdrs; i++) {
		p = nextfunc(p, BBSIZE, private);
		memset(p, 0, BBSIZE);
		/* xlog_rec_ext_header.xh_cycle */
		*(__be32 *)p = cycle_lsn;
	}

	/*
	 * The total length is the max of the stripe unit or 2 basic block
	 * minimum (1 hdr blk + 1 data blk). The record length is the total
	 * minus however many header blocks are required.
	 */
	head->h_len = cpu_to_be32(max(BBTOB(2), sunit) - hdrs * BBSIZE);

	/*
	 * Write out the unmount record, pack the first word into the record
	 * header and stamp the block with the cycle.
	 */
	p = nextfunc(p, BBSIZE, private);
	unmount_record(p);

	head->h_cycle_data[0] = *(__be32 *)p;
	*(__be32 *)p = cycle_lsn;

	/*
	 * Finally, zero all remaining blocks in the record and stamp each
	 * with the cycle. We don't need to pack any of these blocks because
	 * the cycle data in the headers has already been zeroed.
	 */
	len = max(len, hdrs + 1);
	for (i = hdrs + 1; i < len; i++) {
		p = nextfunc(p, BBSIZE, private);
		memset(p, 0, BBSIZE);
		*(__be32 *)p = cycle_lsn;
	}

	return BBTOB(len);
}
/*
 * Simple I/O (buffer cache) interface
 */

#ifdef XFS_BUF_TRACING

#undef libxfs_readbuf
#undef libxfs_readbuf_map
#undef libxfs_writebuf
#undef libxfs_getbuf
#undef libxfs_getbuf_map
#undef libxfs_getbuf_flags
#undef libxfs_putbuf

xfs_buf_t	*libxfs_readbuf(struct xfs_buftarg *, xfs_daddr_t, int, int,
				const struct xfs_buf_ops *);
xfs_buf_t	*libxfs_readbuf_map(struct xfs_buftarg *, struct xfs_buf_map *,
				int, int, const struct xfs_buf_ops *);
int		libxfs_writebuf(xfs_buf_t *, int);
xfs_buf_t	*libxfs_getbuf(struct xfs_buftarg *, xfs_daddr_t, int);
xfs_buf_t	*libxfs_getbuf_map(struct xfs_buftarg *, struct xfs_buf_map *,
				int, int);
xfs_buf_t	*libxfs_getbuf_flags(struct xfs_buftarg *, xfs_daddr_t, int,
				unsigned int);
void		libxfs_putbuf(xfs_buf_t *);
#define __add_trace(bp, func, file, line)	\
do {						\
	if (bp) {				\
		(bp)->b_func = (func);		\
		(bp)->b_file = (file);		\
		(bp)->b_line = (line);		\
	}					\
} while (0)
xfs_buf_t *
libxfs_trace_readbuf(const char *func, const char *file, int line,
		struct xfs_buftarg *btp, xfs_daddr_t blkno, int len, int flags,
		const struct xfs_buf_ops *ops)
{
	xfs_buf_t	*bp = libxfs_readbuf(btp, blkno, len, flags, ops);
	__add_trace(bp, func, file, line);
	return bp;
}

xfs_buf_t *
libxfs_trace_readbuf_map(const char *func, const char *file, int line,
		struct xfs_buftarg *btp, struct xfs_buf_map *map, int nmaps,
		int flags, const struct xfs_buf_ops *ops)
{
	xfs_buf_t	*bp = libxfs_readbuf_map(btp, map, nmaps, flags, ops);
	__add_trace(bp, func, file, line);
	return bp;
}

int
libxfs_trace_writebuf(const char *func, const char *file, int line,
		xfs_buf_t *bp, int flags)
{
	__add_trace(bp, func, file, line);
	return libxfs_writebuf(bp, flags);
}

xfs_buf_t *
libxfs_trace_getbuf(const char *func, const char *file, int line,
		struct xfs_buftarg *btp, xfs_daddr_t blkno, int len)
{
	xfs_buf_t	*bp = libxfs_getbuf(btp, blkno, len);
	__add_trace(bp, func, file, line);
	return bp;
}

xfs_buf_t *
libxfs_trace_getbuf_map(const char *func, const char *file, int line,
		struct xfs_buftarg *btp, struct xfs_buf_map *map, int nmaps,
		int flags)
{
	xfs_buf_t	*bp = libxfs_getbuf_map(btp, map, nmaps, flags);
	__add_trace(bp, func, file, line);
	return bp;
}

xfs_buf_t *
libxfs_trace_getbuf_flags(const char *func, const char *file, int line,
		struct xfs_buftarg *btp, xfs_daddr_t blkno, int len,
		unsigned int flags)
{
	xfs_buf_t	*bp = libxfs_getbuf_flags(btp, blkno, len, flags);
	__add_trace(bp, func, file, line);
	return bp;
}

void
libxfs_trace_putbuf(const char *func, const char *file, int line,
		xfs_buf_t *bp)
{
	__add_trace(bp, func, file, line);
	libxfs_putbuf(bp);
}

#endif	/* XFS_BUF_TRACING */
xfs_buf_t *
libxfs_getsb(xfs_mount_t *mp, int flags)
{
	return libxfs_readbuf(mp->m_ddev_targp, XFS_SB_DADDR,
			XFS_FSS_TO_BB(mp, 1), flags, &xfs_sb_buf_ops);
}

kmem_zone_t			*xfs_buf_zone;

static struct cache_mru		xfs_buf_freelist =
	{{&xfs_buf_freelist.cm_list, &xfs_buf_freelist.cm_list},
	 0, PTHREAD_MUTEX_INITIALIZER };
/*
 * The bufkey is used to pass the new buffer information to the cache object
 * allocation routine. Because discontiguous buffers need to pass different
 * information, we need fields to pass that information. The blkno and bblen
 * are needed for the initial cache entry lookup (i.e. for bcompare), so we
 * use a non-null map/nmaps to switch to discontiguous buffer initialisation
 * instead of a contiguous buffer.
 */
struct xfs_bufkey {
	struct xfs_buftarg	*buftarg;
	xfs_daddr_t		blkno;
	unsigned int		bblen;
	struct xfs_buf_map	*map;
	int			nmaps;
};
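
/*
 * Usage sketch, shown here for illustration only - it mirrors
 * __libxfs_getbuf_map() further down. A contiguous lookup fills in
 * buftarg/blkno/bblen and leaves map/nmaps zeroed; a discontiguous lookup
 * also sets map/nmaps so libxfs_balloc() routes allocation to
 * libxfs_getbufr_map():
 *
 *	struct xfs_bufkey key = {NULL};
 *
 *	key.buftarg = btp;
 *	key.blkno = map[0].bm_bn;
 *	key.bblen = total_bblen;	// sum of map[i].bm_len
 *	key.map = map;			// stays NULL for contiguous buffers
 *	key.nmaps = nmaps;		// stays 0 for contiguous buffers
 */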
/*  2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */
#define GOLDEN_RATIO_PRIME	0x9e37fffffffc0001UL
#define CACHE_LINE_SIZE		64
static unsigned int
libxfs_bhash(cache_key_t key, unsigned int hashsize, unsigned int hashshift)
{
	uint64_t	hashval = ((struct xfs_bufkey *)key)->blkno;
	uint64_t	tmp;

	tmp = hashval ^ (GOLDEN_RATIO_PRIME + hashval) / CACHE_LINE_SIZE;
	tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> hashshift);
	return tmp % hashsize;
}
static int
libxfs_bcompare(struct cache_node *node, cache_key_t key)
{
	struct xfs_buf		*bp = (struct xfs_buf *)node;
	struct xfs_bufkey	*bkey = (struct xfs_bufkey *)key;

	if (bp->b_target->dev == bkey->buftarg->dev &&
	    bp->b_bn == bkey->blkno) {
		if (bp->b_bcount == BBTOB(bkey->bblen))
			return CACHE_HIT;
#ifdef IO_BCOMPARE_CHECK
		if (!(libxfs_bcache->c_flags & CACHE_MISCOMPARE_PURGE)) {
			fprintf(stderr,
	"%lx: Badness in key lookup (length)\n"
	"bp=(bno 0x%llx, len %u bytes) key=(bno 0x%llx, len %u bytes)\n",
				pthread_self(),
				(unsigned long long)bp->b_bn, (int)bp->b_bcount,
				(unsigned long long)bkey->blkno,
				BBTOB(bkey->bblen));
		}
#endif
		return CACHE_PURGE;
	}
	return CACHE_MISS;
}
static void
__initbuf(xfs_buf_t *bp, struct xfs_buftarg *btp, xfs_daddr_t bno,
		unsigned int bytes)
{
	bp->b_flags = 0;
	bp->b_bn = bno;
	bp->b_bcount = bytes;
	bp->b_length = BTOBB(bytes);
	bp->b_target = btp;
	bp->b_error = 0;
	if (!bp->b_addr)
		bp->b_addr = memalign(libxfs_device_alignment(), bytes);
	if (!bp->b_addr) {
		fprintf(stderr,
			_("%s: %s can't memalign %u bytes: %s\n"),
			progname, __FUNCTION__, bytes,
			strerror(errno));
		exit(1);
	}
	memset(bp->b_addr, 0, bytes);
#ifdef XFS_BUF_TRACING
	list_head_init(&bp->b_lock_list);
#endif
	pthread_mutex_init(&bp->b_lock, NULL);
	bp->b_holder = 0;
	bp->b_recur = 0;
	bp->b_ops = NULL;

	if (!bp->b_maps) {
		bp->b_nmaps = 1;
		bp->b_maps = &bp->__b_map;
		bp->b_maps[0].bm_bn = bp->b_bn;
		bp->b_maps[0].bm_len = bp->b_length;
	}
}
static void
libxfs_initbuf(xfs_buf_t *bp, struct xfs_buftarg *btp, xfs_daddr_t bno,
		unsigned int bytes)
{
	__initbuf(bp, btp, bno, bytes);
}
static void
libxfs_initbuf_map(xfs_buf_t *bp, struct xfs_buftarg *btp,
		struct xfs_buf_map *map, int nmaps)
{
	unsigned int	bytes = 0;
	int		i;

	bytes = sizeof(struct xfs_buf_map) * nmaps;
	bp->b_maps = malloc(bytes);
	if (bp->b_maps == NULL) {
		fprintf(stderr,
			_("%s: %s can't malloc %u bytes: %s\n"),
			progname, __FUNCTION__, bytes,
			strerror(errno));
		exit(1);
	}
	bp->b_nmaps = nmaps;

	bytes = 0;
	for ( i = 0; i < nmaps; i++) {
		bp->b_maps[i].bm_bn = map[i].bm_bn;
		bp->b_maps[i].bm_len = map[i].bm_len;
		bytes += BBTOB(map[i].bm_len);
	}

	__initbuf(bp, btp, map[0].bm_bn, bytes);
	bp->b_flags |= LIBXFS_B_DISCONTIG;
}
static xfs_buf_t *
__libxfs_getbufr(int blen)
{
	xfs_buf_t	*bp;

	/*
	 * first look for a buffer that can be used as-is,
	 * if one cannot be found, see if there is a buffer,
	 * and if so, free its buffer and set b_addr to NULL
	 * before calling libxfs_initbuf.
	 */
	pthread_mutex_lock(&xfs_buf_freelist.cm_mutex);
	if (!list_empty(&xfs_buf_freelist.cm_list)) {
		list_for_each_entry(bp, &xfs_buf_freelist.cm_list, b_node.cn_mru) {
			if (bp->b_bcount == blen) {
				list_del_init(&bp->b_node.cn_mru);
				break;
			}
		}
		if (&bp->b_node.cn_mru == &xfs_buf_freelist.cm_list) {
			bp = list_entry(xfs_buf_freelist.cm_list.next,
					xfs_buf_t, b_node.cn_mru);
			list_del_init(&bp->b_node.cn_mru);
			free(bp->b_addr);
			bp->b_addr = NULL;
			if (bp->b_maps != &bp->__b_map)
				free(bp->b_maps);
			bp->b_maps = NULL;
		}
	} else
		bp = kmem_zone_zalloc(xfs_buf_zone, 0);
	pthread_mutex_unlock(&xfs_buf_freelist.cm_mutex);
	bp->b_ops = NULL;
	if (bp->b_flags & LIBXFS_B_DIRTY)
		fprintf(stderr, "found dirty buffer (bulk) on free list!");

	return bp;
}
xfs_buf_t *
libxfs_getbufr(struct xfs_buftarg *btp, xfs_daddr_t blkno, int bblen)
{
	xfs_buf_t	*bp;
	int		blen = BBTOB(bblen);

	bp = __libxfs_getbufr(blen);
	if (bp)
		libxfs_initbuf(bp, btp, blkno, blen);
#ifdef IO_DEBUG
	printf("%lx: %s: allocated %u bytes buffer, key=0x%llx(0x%llx), %p\n",
		pthread_self(), __FUNCTION__, blen,
		(long long)LIBXFS_BBTOOFF64(blkno), (long long)blkno, bp);
#endif

	return bp;
}
static xfs_buf_t *
libxfs_getbufr_map(struct xfs_buftarg *btp, xfs_daddr_t blkno, int bblen,
		struct xfs_buf_map *map, int nmaps)
{
	xfs_buf_t	*bp;
	int		blen = BBTOB(bblen);

	if (!map || !nmaps) {
		fprintf(stderr,
			_("%s: %s invalid map %p or nmaps %d\n"),
			progname, __FUNCTION__, map, nmaps);
		exit(1);
	}

	if (blkno != map[0].bm_bn) {
		fprintf(stderr,
			_("%s: %s map blkno 0x%llx doesn't match key 0x%llx\n"),
			progname, __FUNCTION__, (long long)map[0].bm_bn,
			(long long)blkno);
		exit(1);
	}

	bp = __libxfs_getbufr(blen);
	if (bp)
		libxfs_initbuf_map(bp, btp, map, nmaps);
#ifdef IO_DEBUG
	printf("%lx: %s: allocated %u bytes buffer, key=0x%llx(0x%llx), %p\n",
		pthread_self(), __FUNCTION__, blen,
		(long long)LIBXFS_BBTOOFF64(blkno), (long long)blkno, bp);
#endif

	return bp;
}
#ifdef XFS_BUF_TRACING
struct list_head	lock_buf_list = {&lock_buf_list, &lock_buf_list};
int			lock_buf_count = 0;
#endif

static struct xfs_buf *
__cache_lookup(struct xfs_bufkey *key, unsigned int flags)
{
	struct xfs_buf	*bp;

	cache_node_get(libxfs_bcache, key, (struct cache_node **)&bp);
	if (!bp)
		return NULL;

	if (use_xfs_buf_lock) {
		int		ret;

		ret = pthread_mutex_trylock(&bp->b_lock);
		if (ret) {
			ASSERT(ret == EAGAIN);
			if (flags & LIBXFS_GETBUF_TRYLOCK)
				goto out_put;

			if (pthread_equal(bp->b_holder, pthread_self())) {
				fprintf(stderr,
	_("Warning: recursive buffer locking at block %" PRIu64 " detected\n"),
					key->blkno);
				bp->b_recur++;
				return bp;
			} else {
				pthread_mutex_lock(&bp->b_lock);
			}
		}

		bp->b_holder = pthread_self();
	}

	cache_node_set_priority(libxfs_bcache, (struct cache_node *)bp,
			cache_node_get_priority((struct cache_node *)bp) -
						CACHE_PREFETCH_PRIORITY);
#ifdef XFS_BUF_TRACING
	pthread_mutex_lock(&libxfs_bcache->c_mutex);
	lock_buf_count++;
	list_add(&bp->b_lock_list, &lock_buf_list);
	pthread_mutex_unlock(&libxfs_bcache->c_mutex);
#endif
#ifdef IO_DEBUG
	printf("%lx %s: hit buffer %p for bno = 0x%llx/0x%llx\n",
		pthread_self(), __FUNCTION__,
		bp, bp->b_bn, (long long)LIBXFS_BBTOOFF64(key->blkno));
#endif

	return bp;
out_put:
	cache_node_put(libxfs_bcache, (struct cache_node *)bp);
	return NULL;
}
struct xfs_buf *
libxfs_getbuf_flags(struct xfs_buftarg *btp, xfs_daddr_t blkno, int len,
		unsigned int flags)
{
	struct xfs_bufkey key = {NULL};

	key.buftarg = btp;
	key.blkno = blkno;
	key.bblen = len;

	return __cache_lookup(&key, flags);
}
/*
 * Clean the buffer flags for libxfs_getbuf*(), which wants to return
 * an unused buffer with clean state. This prevents CRC errors on a
 * re-read of a corrupt block that was prefetched and freed. This
 * can happen with a massively corrupt directory that is discarded,
 * but whose blocks are then recycled into expanding lost+found.
 *
 * Note however that if the buffer's dirty (prefetch calls getbuf)
 * we'll leave the state alone because we don't want to discard blocks
 * that have been fixed.
 */
static void
reset_buf_state(
	struct xfs_buf	*bp)
{
	if (bp && !(bp->b_flags & LIBXFS_B_DIRTY))
		bp->b_flags &= ~(LIBXFS_B_UNCHECKED | LIBXFS_B_STALE |
				LIBXFS_B_UPTODATE);
}

struct xfs_buf *
libxfs_getbuf(struct xfs_buftarg *btp, xfs_daddr_t blkno, int len)
{
	struct xfs_buf	*bp;

	bp = libxfs_getbuf_flags(btp, blkno, len, 0);
	reset_buf_state(bp);
	return bp;
}
*
812 __libxfs_getbuf_map(struct xfs_buftarg
*btp
, struct xfs_buf_map
*map
,
813 int nmaps
, int flags
)
815 struct xfs_bufkey key
= {NULL
};
819 return libxfs_getbuf_flags(btp
, map
[0].bm_bn
, map
[0].bm_len
,
823 key
.blkno
= map
[0].bm_bn
;
824 for (i
= 0; i
< nmaps
; i
++) {
825 key
.bblen
+= map
[i
].bm_len
;
830 return __cache_lookup(&key
, flags
);
834 libxfs_getbuf_map(struct xfs_buftarg
*btp
, struct xfs_buf_map
*map
,
835 int nmaps
, int flags
)
839 bp
= __libxfs_getbuf_map(btp
, map
, nmaps
, flags
);
void
libxfs_putbuf(xfs_buf_t *bp)
{
	/*
	 * ensure that any errors on this use of the buffer don't carry
	 * over to the next user.
	 */
	bp->b_error = 0;

#ifdef XFS_BUF_TRACING
	pthread_mutex_lock(&libxfs_bcache->c_mutex);
	lock_buf_count--;
	ASSERT(lock_buf_count >= 0);
	list_del_init(&bp->b_lock_list);
	pthread_mutex_unlock(&libxfs_bcache->c_mutex);
#endif
	if (use_xfs_buf_lock) {
		if (bp->b_recur) {
			bp->b_recur--;
		} else {
			bp->b_holder = 0;
			pthread_mutex_unlock(&bp->b_lock);
		}
	}

	cache_node_put(libxfs_bcache, (struct cache_node *)bp);
}
void
libxfs_purgebuf(xfs_buf_t *bp)
{
	struct xfs_bufkey key = {NULL};

	key.buftarg = bp->b_target;
	key.blkno = bp->b_bn;
	key.bblen = bp->b_length;

	cache_node_purge(libxfs_bcache, &key, (struct cache_node *)bp);
}
static struct cache_node *
libxfs_balloc(cache_key_t key)
{
	struct xfs_bufkey *bufkey = (struct xfs_bufkey *)key;

	if (bufkey->map)
		return (struct cache_node *)
		       libxfs_getbufr_map(bufkey->buftarg,
					bufkey->blkno, bufkey->bblen,
					bufkey->map, bufkey->nmaps);
	return (struct cache_node *)libxfs_getbufr(bufkey->buftarg,
					bufkey->blkno, bufkey->bblen);
}
static int
__read_buf(int fd, void *buf, int len, off64_t offset, int flags)
{
	int	sts;

	sts = pread(fd, buf, len, offset);
	if (sts < 0) {
		int error = errno;
		fprintf(stderr, _("%s: read failed: %s\n"),
			progname, strerror(error));
		if (flags & LIBXFS_EXIT_ON_FAILURE)
			exit(1);
		return -error;
	} else if (sts != len) {
		fprintf(stderr, _("%s: error - read only %d of %d bytes\n"),
			progname, sts, len);
		if (flags & LIBXFS_EXIT_ON_FAILURE)
			exit(1);
		return -EIO;
	}
	return 0;
}
int
libxfs_readbufr(struct xfs_buftarg *btp, xfs_daddr_t blkno, xfs_buf_t *bp,
		int len, int flags)
{
	int	fd = libxfs_device_to_fd(btp->dev);
	int	bytes = BBTOB(len);
	int	error;

	ASSERT(BBTOB(len) <= bp->b_bcount);

	error = __read_buf(fd, bp->b_addr, bytes, LIBXFS_BBTOOFF64(blkno), flags);
	if (!error &&
	    bp->b_target->dev == btp->dev &&
	    bp->b_bn == blkno &&
	    bp->b_bcount == bytes)
		bp->b_flags |= LIBXFS_B_UPTODATE;
#ifdef IO_DEBUG
	printf("%lx: %s: read %u bytes, error %d, blkno=0x%llx(0x%llx), %p\n",
		pthread_self(), __FUNCTION__, bytes, error,
		(long long)LIBXFS_BBTOOFF64(blkno), (long long)blkno, bp);
#endif
	return error;
}
void
libxfs_readbuf_verify(struct xfs_buf *bp, const struct xfs_buf_ops *ops)
{
	if (!ops)
		return;
	bp->b_ops = ops;
	bp->b_ops->verify_read(bp);
	bp->b_flags &= ~LIBXFS_B_UNCHECKED;
}
xfs_buf_t *
libxfs_readbuf(struct xfs_buftarg *btp, xfs_daddr_t blkno, int len, int flags,
		const struct xfs_buf_ops *ops)
{
	xfs_buf_t	*bp;
	int		error;

	bp = libxfs_getbuf_flags(btp, blkno, len, 0);
	if (!bp)
		return NULL;

	/*
	 * if the buffer was prefetched, it is likely that it was not
	 * validated. Hence if we are supplied an ops function and the buffer
	 * is marked as unchecked, we need to validate it now.
	 *
	 * We do this verification even if the buffer is dirty - the
	 * verification is almost certainly going to fail the CRC check in
	 * this case as a dirty buffer has not had the CRC recalculated.
	 * However, we should not be dirtying unchecked buffers and therefore
	 * failing it here because it's dirty and unchecked indicates we've
	 * screwed up somewhere else.
	 */
	bp->b_error = 0;
	if ((bp->b_flags & (LIBXFS_B_UPTODATE|LIBXFS_B_DIRTY))) {
		if (bp->b_flags & LIBXFS_B_UNCHECKED)
			libxfs_readbuf_verify(bp, ops);
		return bp;
	}

	/*
	 * Set the ops on a cache miss (i.e. first physical read) as the
	 * verifier may change the ops to match the type of buffer it
	 * contains. A cache hit might reset the verifier to the original type
	 * if we set it again, but it won't get called again and set to match
	 * the buffer contents. *cough* xfs_da_node_buf_ops *cough*.
	 */
	bp->b_ops = ops;
	error = libxfs_readbufr(btp, blkno, bp, len, flags);
	if (error)
		bp->b_error = error;
	else
		libxfs_readbuf_verify(bp, ops);
	return bp;
}
int
libxfs_readbufr_map(struct xfs_buftarg *btp, struct xfs_buf *bp, int flags)
{
	int	fd;
	int	error = 0;
	char	*buf;
	int	i;

	fd = libxfs_device_to_fd(btp->dev);
	buf = bp->b_addr;
	for (i = 0; i < bp->b_nmaps; i++) {
		off64_t	offset = LIBXFS_BBTOOFF64(bp->b_maps[i].bm_bn);
		int	len = BBTOB(bp->b_maps[i].bm_len);

		error = __read_buf(fd, buf, len, offset, flags);
		if (error) {
			bp->b_error = error;
			break;
		}
		buf += len;
	}

	if (!error)
		bp->b_flags |= LIBXFS_B_UPTODATE;
#ifdef IO_DEBUG
	printf("%lx: %s: read %lu bytes, error %d, blkno=%llu(%llu), %p\n",
		pthread_self(), __FUNCTION__, buf - (char *)bp->b_addr, error,
		(long long)LIBXFS_BBTOOFF64(bp->b_bn), (long long)bp->b_bn, bp);
#endif
	return error;
}
struct xfs_buf *
libxfs_readbuf_map(struct xfs_buftarg *btp, struct xfs_buf_map *map, int nmaps,
		int flags, const struct xfs_buf_ops *ops)
{
	struct xfs_buf	*bp;
	int		error = 0;

	if (nmaps == 1)
		return libxfs_readbuf(btp, map[0].bm_bn, map[0].bm_len,
					flags, ops);

	bp = __libxfs_getbuf_map(btp, map, nmaps, 0);
	if (!bp)
		return NULL;

	bp->b_error = 0;
	if ((bp->b_flags & (LIBXFS_B_UPTODATE|LIBXFS_B_DIRTY))) {
		if (bp->b_flags & LIBXFS_B_UNCHECKED)
			libxfs_readbuf_verify(bp, ops);
		return bp;
	}

	bp->b_ops = ops;
	error = libxfs_readbufr_map(btp, bp, flags);
	if (!error)
		libxfs_readbuf_verify(bp, ops);

#ifdef IO_DEBUGX
	printf("%lx: %s: read %lu bytes, error %d, blkno=%llu(%llu), %p\n",
		pthread_self(), __FUNCTION__, buf - (char *)bp->b_addr, error,
		(long long)LIBXFS_BBTOOFF64(bp->b_bn), (long long)bp->b_bn, bp);
#endif
	return bp;
}
static int
__write_buf(int fd, void *buf, int len, off64_t offset, int flags)
{
	int	sts;

	sts = pwrite(fd, buf, len, offset);
	if (sts < 0) {
		int error = errno;
		fprintf(stderr, _("%s: pwrite failed: %s\n"),
			progname, strerror(error));
		if (flags & LIBXFS_B_EXIT)
			exit(1);
		return -error;
	} else if (sts != len) {
		fprintf(stderr, _("%s: error - pwrite only %d of %d bytes\n"),
			progname, sts, len);
		if (flags & LIBXFS_B_EXIT)
			exit(1);
		return -EIO;
	}
	return 0;
}
int
libxfs_writebufr(xfs_buf_t *bp)
{
	int	fd = libxfs_device_to_fd(bp->b_target->dev);

	/*
	 * we never write buffers that are marked stale. This indicates they
	 * contain data that has been invalidated, and even if the buffer is
	 * dirty it must *never* be written. Verifiers are wonderful for
	 * finding bugs like this. Make sure the error is obvious as to the
	 * cause.
	 */
	if (bp->b_flags & LIBXFS_B_STALE) {
		bp->b_error = -ESTALE;
		return bp->b_error;
	}

	/*
	 * clear any pre-existing error status on the buffer. This can occur
	 * if the buffer is corrupt on disk and the repair process doesn't
	 * clear the error before fixing and writing it back.
	 */
	bp->b_error = 0;
	if (bp->b_ops) {
		bp->b_ops->verify_write(bp);
		if (bp->b_error) {
			fprintf(stderr,
	_("%s: write verifier failed on %s bno 0x%llx/0x%x\n"),
				__func__, bp->b_ops->name,
				(long long)bp->b_bn, bp->b_bcount);
			return bp->b_error;
		}
	}

	if (!(bp->b_flags & LIBXFS_B_DISCONTIG)) {
		bp->b_error = __write_buf(fd, bp->b_addr, bp->b_bcount,
				    LIBXFS_BBTOOFF64(bp->b_bn), bp->b_flags);
	} else {
		int	i;
		void	*buf = bp->b_addr;

		for (i = 0; i < bp->b_nmaps; i++) {
			off64_t	offset = LIBXFS_BBTOOFF64(bp->b_maps[i].bm_bn);
			int	len = BBTOB(bp->b_maps[i].bm_len);

			bp->b_error = __write_buf(fd, buf, len, offset,
						  bp->b_flags);
			if (bp->b_error)
				break;
			buf += len;
		}
	}

#ifdef IO_DEBUG
	printf("%lx: %s: wrote %u bytes, blkno=%llu(%llu), %p, error %d\n",
		pthread_self(), __FUNCTION__, bp->b_bcount,
		(long long)LIBXFS_BBTOOFF64(bp->b_bn),
		(long long)bp->b_bn, bp, bp->b_error);
#endif
	if (!bp->b_error) {
		bp->b_flags |= LIBXFS_B_UPTODATE;
		bp->b_flags &= ~(LIBXFS_B_DIRTY | LIBXFS_B_EXIT |
				 LIBXFS_B_UNCHECKED);
	}
	return bp->b_error;
}
int
libxfs_writebuf_int(xfs_buf_t *bp, int flags)
{
	/*
	 * Clear any error hanging over from reading the buffer. This prevents
	 * subsequent reads after this write from seeing stale errors.
	 */
	bp->b_error = 0;
	bp->b_flags &= ~LIBXFS_B_STALE;
	bp->b_flags |= (LIBXFS_B_DIRTY | flags);
	return 0;
}
int
libxfs_writebuf(xfs_buf_t *bp, int flags)
{
#ifdef IO_DEBUG
	printf("%lx: %s: dirty blkno=%llu(%llu)\n",
		pthread_self(), __FUNCTION__,
		(long long)LIBXFS_BBTOOFF64(bp->b_bn),
		(long long)bp->b_bn);
#endif
	/*
	 * Clear any error hanging over from reading the buffer. This prevents
	 * subsequent reads after this write from seeing stale errors.
	 */
	bp->b_error = 0;
	bp->b_flags &= ~LIBXFS_B_STALE;
	bp->b_flags |= (LIBXFS_B_DIRTY | flags);
	libxfs_putbuf(bp);
	return 0;
}
int
libxfs_iomove(xfs_buf_t *bp, uint boff, int len, void *data, int flags)
{
#ifdef XFS_BUF_TRACING
	if (boff + len > bp->b_bcount) {
		printf("Badness, iomove out of range!\n"
			"bp=(bno 0x%llx, bytes %u) range=(boff %u, bytes %u)\n",
			(long long)bp->b_bn, bp->b_bcount, boff, len);
		abort();
	}
#endif
	switch (flags) {
	case LIBXFS_BZERO:
		memset(bp->b_addr + boff, 0, len);
		break;
	case LIBXFS_BREAD:
		memcpy(data, bp->b_addr + boff, len);
		break;
	case LIBXFS_BWRITE:
		memcpy(bp->b_addr + boff, data, len);
		break;
	}
	return 0;
}
static void
libxfs_brelse(
	struct cache_node	*node)
{
	struct xfs_buf		*bp = (struct xfs_buf *)node;

	if (!bp)
		return;
	if (bp->b_flags & LIBXFS_B_DIRTY)
		fprintf(stderr,
			"releasing dirty buffer to free list!");

	pthread_mutex_lock(&xfs_buf_freelist.cm_mutex);
	list_add(&bp->b_node.cn_mru, &xfs_buf_freelist.cm_list);
	pthread_mutex_unlock(&xfs_buf_freelist.cm_mutex);
}
static unsigned int
libxfs_bulkrelse(
	struct cache		*cache,
	struct list_head	*list)
{
	xfs_buf_t	*bp;
	int		count = 0;

	if (list_empty(list))
		return 0;

	list_for_each_entry(bp, list, b_node.cn_mru) {
		if (bp->b_flags & LIBXFS_B_DIRTY)
			fprintf(stderr,
				"releasing dirty buffer (bulk) to free list!");
		count++;
	}

	pthread_mutex_lock(&xfs_buf_freelist.cm_mutex);
	list_splice(list, &xfs_buf_freelist.cm_list);
	pthread_mutex_unlock(&xfs_buf_freelist.cm_mutex);

	return count;
}
/*
 * Free everything from the xfs_buf_freelist MRU, used at final teardown
 */
void
libxfs_bcache_free(void)
{
	struct list_head	*cm_list;
	xfs_buf_t		*bp, *next;

	cm_list = &xfs_buf_freelist.cm_list;
	list_for_each_entry_safe(bp, next, cm_list, b_node.cn_mru) {
		free(bp->b_addr);
		if (bp->b_maps != &bp->__b_map)
			free(bp->b_maps);
		kmem_zone_free(xfs_buf_zone, bp);
	}
}
/*
 * When a buffer is marked dirty, the error is cleared. Hence if we are trying
 * to flush a buffer prior to cache reclaim that has an error on it, it means
 * we've already tried to flush it and it failed. Prevent repeated corruption
 * errors from being reported by skipping such buffers - when the corruption
 * is fixed the buffer will be marked dirty again and we can write it again.
 */
static int
libxfs_bflush(
	struct cache_node	*node)
{
	struct xfs_buf		*bp = (struct xfs_buf *)node;

	if (!bp->b_error && bp->b_flags & LIBXFS_B_DIRTY)
		return libxfs_writebufr(bp);
	return bp->b_error;
}
void
libxfs_putbufr(xfs_buf_t *bp)
{
	if (bp->b_flags & LIBXFS_B_DIRTY)
		libxfs_writebufr(bp);
	libxfs_brelse((struct cache_node *)bp);
}
void
libxfs_bcache_purge(void)
{
	cache_purge(libxfs_bcache);
}

void
libxfs_bcache_flush(void)
{
	cache_flush(libxfs_bcache);
}

int
libxfs_bcache_overflowed(void)
{
	return cache_overflowed(libxfs_bcache);
}

struct cache_operations libxfs_bcache_operations = {
	.hash		= libxfs_bhash,
	.alloc		= libxfs_balloc,
	.flush		= libxfs_bflush,
	.relse		= libxfs_brelse,
	.compare	= libxfs_bcompare,
	.bulkrelse	= libxfs_bulkrelse
};
/*
 * Inode cache stubs.
 */

kmem_zone_t		*xfs_inode_zone;
extern kmem_zone_t	*xfs_ili_zone;

/*
 * If there are inline format data / attr forks attached to this inode,
 * make sure they're not corrupt.
 */
bool
libxfs_inode_verify_forks(
	struct xfs_inode	*ip,
	struct xfs_ifork_ops	*ops)
{
	struct xfs_ifork	*ifp;
	xfs_failaddr_t		fa;

	if (!ops)
		return true;

	fa = xfs_ifork_verify_data(ip, ops);
	if (fa) {
		ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
		xfs_inode_verifier_error(ip, -EFSCORRUPTED, "data fork",
				ifp->if_u1.if_data, ifp->if_bytes, fa);
		return false;
	}

	fa = xfs_ifork_verify_attr(ip, ops);
	if (fa) {
		ifp = XFS_IFORK_PTR(ip, XFS_ATTR_FORK);
		xfs_inode_verifier_error(ip, -EFSCORRUPTED, "attr fork",
				ifp ? ifp->if_u1.if_data : NULL,
				ifp ? ifp->if_bytes : 0, fa);
		return false;
	}

	return true;
}
int
libxfs_iget(
	struct xfs_mount	*mp,
	struct xfs_trans	*tp,
	xfs_ino_t		ino,
	uint			lock_flags,
	struct xfs_inode	**ipp,
	struct xfs_ifork_ops	*ifork_ops)
{
	struct xfs_inode	*ip;
	int			error = 0;

	ip = kmem_zone_zalloc(xfs_inode_zone, 0);
	if (!ip)
		return -ENOMEM;

	ip->i_ino = ino;
	ip->i_mount = mp;
	error = xfs_iread(mp, tp, ip, 0);
	if (error) {
		kmem_zone_free(xfs_inode_zone, ip);
		*ipp = NULL;
		return error;
	}

	if (!libxfs_inode_verify_forks(ip, ifork_ops)) {
		libxfs_irele(ip);
		return -EFSCORRUPTED;
	}

	/*
	 * set up the inode ops structure that the libxfs code relies on
	 */
	if (S_ISDIR(VFS_I(ip)->i_mode))
		ip->d_ops = mp->m_dir_inode_ops;
	else
		ip->d_ops = mp->m_nondir_inode_ops;

	*ipp = ip;
	return 0;
}
static void
libxfs_idestroy(xfs_inode_t *ip)
{
	switch (VFS_I(ip)->i_mode & S_IFMT) {
		case S_IFREG:
		case S_IFDIR:
		case S_IFLNK:
			libxfs_idestroy_fork(ip, XFS_DATA_FORK);
			break;
	}
	if (ip->i_afp)
		libxfs_idestroy_fork(ip, XFS_ATTR_FORK);
	if (ip->i_cowfp)
		xfs_idestroy_fork(ip, XFS_COW_FORK);
}

void
libxfs_irele(
	struct xfs_inode	*ip)
{
	if (ip->i_itemp)
		kmem_zone_free(xfs_ili_zone, ip->i_itemp);
	ip->i_itemp = NULL;
	libxfs_idestroy(ip);
	kmem_zone_free(xfs_inode_zone, ip);
}