// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
8 #include "libxfs_priv.h"
11 #include "xfs_shared.h"
12 #include "xfs_format.h"
13 #include "xfs_log_format.h"
14 #include "xfs_trans_resv.h"
15 #include "xfs_mount.h"
16 #include "xfs_inode_buf.h"
17 #include "xfs_inode_fork.h"
18 #include "xfs_inode.h"
19 #include "xfs_trans.h"
21 #include "libxfs.h" /* for LIBXFS_EXIT_ON_FAILURE */
24 * Important design/architecture note:
26 * The userspace code that uses the buffer cache is much less constrained than
27 * the kernel code. The userspace code is pretty nasty in places, especially
28 * when it comes to buffer error handling. Very little of the userspace code
29 * outside libxfs clears bp->b_error - very little code even checks it - so the
30 * libxfs code is tripping on stale errors left by the userspace code.
32 * We can't clear errors or zero buffer contents in libxfs_getbuf-* like we do
33 * in the kernel, because those functions are used by the libxfs_readbuf_*
34 * functions and hence need to leave the buffers unchanged on cache hits. This
35 * is actually the only way to gather a write error from a libxfs_writebuf()
36 * call - you need to get the buffer again so you can check bp->b_error field -
37 * assuming that the buffer is still in the cache when you check, that is.
39 * This is very different to the kernel code which does not release buffers on a
40 * write so we can wait on IO and check errors. The kernel buffer cache also
41 * guarantees a buffer of a known initial state from xfs_buf_get() even on a
44 * IOWs, userspace is behaving quite differently to the kernel and as a result
45 * it leaks errors from reads, invalidations and writes through
46 * libxfs_getbuf/libxfs_readbuf.
48 * The result of this is that until the userspace code outside libxfs is cleaned
49 * up, functions that release buffers from userspace control (i.e
50 * libxfs_writebuf/libxfs_putbuf) need to zero bp->b_error to prevent
51 * propagation of stale errors into future buffer operations.
54 #define BDSTRAT_SIZE (256 * 1024)
56 #define IO_BCOMPARE_CHECK
58 /* XXX: (dgc) Propagate errors, only exit if fail-on-error flag set */
60 libxfs_device_zero(struct xfs_buftarg
*btp
, xfs_daddr_t start
, uint len
)
62 xfs_off_t start_offset
, end_offset
, offset
;
67 zsize
= min(BDSTRAT_SIZE
, BBTOB(len
));
68 if ((z
= memalign(libxfs_device_alignment(), zsize
)) == NULL
) {
70 _("%s: %s can't memalign %d bytes: %s\n"),
71 progname
, __FUNCTION__
, (int)zsize
, strerror(errno
));
76 fd
= libxfs_device_to_fd(btp
->dev
);
77 start_offset
= LIBXFS_BBTOOFF64(start
);
79 if ((lseek(fd
, start_offset
, SEEK_SET
)) < 0) {
80 fprintf(stderr
, _("%s: %s seek to offset %llu failed: %s\n"),
81 progname
, __FUNCTION__
,
82 (unsigned long long)start_offset
, strerror(errno
));
86 end_offset
= LIBXFS_BBTOOFF64(start
+ len
) - start_offset
;
87 for (offset
= 0; offset
< end_offset
; ) {
88 bytes
= min((ssize_t
)(end_offset
- offset
), zsize
);
89 if ((bytes
= write(fd
, z
, bytes
)) < 0) {
90 fprintf(stderr
, _("%s: %s write failed: %s\n"),
91 progname
, __FUNCTION__
, strerror(errno
));
93 } else if (bytes
== 0) {
94 fprintf(stderr
, _("%s: %s not progressing?\n"),
95 progname
, __FUNCTION__
);
104 static void unmount_record(void *p
)
106 xlog_op_header_t
*op
= (xlog_op_header_t
*)p
;
107 /* the data section must be 32 bit size aligned */
111 uint32_t pad2
; /* may as well make it 64 bits */
112 } magic
= { XLOG_UNMOUNT_TYPE
, 0, 0 };
114 memset(p
, 0, BBSIZE
);
115 /* dummy tid to mark this as written from userspace */
116 op
->oh_tid
= cpu_to_be32(0xb0c0d0d0);
117 op
->oh_len
= cpu_to_be32(sizeof(magic
));
118 op
->oh_clientid
= XFS_LOG
;
119 op
->oh_flags
= XLOG_UNMOUNT_TRANS
;
122 /* and the data for this op */
123 memcpy((char *)p
+ sizeof(xlog_op_header_t
), &magic
, sizeof(magic
));
131 struct xfs_buf
*buf
= (struct xfs_buf
*)private;
134 (buf
->b_bcount
< (int)(ptr
- (char *)buf
->b_addr
) + offset
))
141 * Format the log. The caller provides either a buftarg which is used to access
142 * the log via buffers or a direct pointer to a buffer that encapsulates the
147 struct xfs_buftarg
*btp
,
150 uint length
, /* basic blocks */
153 int sunit
, /* bytes */
158 struct xfs_buf
*bp
= NULL
;
166 if (((btp
&& dptr
) || (!btp
&& !dptr
)) ||
167 (btp
&& !btp
->dev
) || !fs_uuid
)
170 /* first zero the log */
172 libxfs_device_zero(btp
, start
, length
);
174 memset(dptr
, 0, BBTOB(length
));
177 * Initialize the log record length and LSNs. XLOG_INIT_CYCLE is a
178 * special reset case where we only write a single record where the lsn
179 * and tail_lsn match. Otherwise, the record lsn starts at block 0 of
180 * the specified cycle and points tail_lsn at the last record of the
183 len
= ((version
== 2) && sunit
) ? BTOBB(sunit
) : 2;
185 lsn
= xlog_assign_lsn(cycle
, 0);
186 if (cycle
== XLOG_INIT_CYCLE
)
189 tail_lsn
= xlog_assign_lsn(cycle
- 1, length
- len
);
191 /* write out the first log record */
194 bp
= libxfs_getbufr(btp
, start
, len
);
197 libxfs_log_header(ptr
, fs_uuid
, version
, sunit
, fmt
, lsn
, tail_lsn
,
200 bp
->b_flags
|= LIBXFS_B_DIRTY
;
205 * There's nothing else to do if this is a log reset. The kernel detects
206 * the rest of the log is zeroed and starts at cycle 1.
208 if (cycle
== XLOG_INIT_CYCLE
)
212 * Bump the record size for a full log format if the caller allows it.
213 * This is primarily for performance reasons and most callers don't care
214 * about record size since the log is clean after we're done.
217 len
= BTOBB(BDSTRAT_SIZE
);
220 * Otherwise, fill everything beyond the initial record with records of
221 * the previous cycle so the kernel head/tail detection works correctly.
223 * We don't particularly care about the record size or content here.
224 * It's only important that the headers are in place such that the
225 * kernel finds 1.) a clean log and 2.) the correct current cycle value.
226 * Therefore, bump up the record size to the max to use larger I/Os and
227 * improve performance.
233 end_blk
= start
+ length
;
235 len
= min(end_blk
- blk
, len
);
236 while (blk
< end_blk
) {
237 lsn
= xlog_assign_lsn(cycle
, blk
- start
);
238 tail_lsn
= xlog_assign_lsn(cycle
, blk
- start
- len
);
242 bp
= libxfs_getbufr(btp
, blk
, len
);
246 * Note: pass the full buffer length as the sunit to initialize
249 libxfs_log_header(ptr
, fs_uuid
, version
, BBTOB(len
), fmt
, lsn
,
252 bp
->b_flags
|= LIBXFS_B_DIRTY
;
259 len
= min(end_blk
- blk
, len
);
274 libxfs_get_block_t
*nextfunc
,
277 xlog_rec_header_t
*head
= (xlog_rec_header_t
*)caddr
;
283 if (lsn
== NULLCOMMITLSN
)
284 lsn
= xlog_assign_lsn(XLOG_INIT_CYCLE
, 0);
285 if (tail_lsn
== NULLCOMMITLSN
)
288 len
= ((version
== 2) && sunit
) ? BTOBB(sunit
) : 1;
290 memset(p
, 0, BBSIZE
);
291 head
->h_magicno
= cpu_to_be32(XLOG_HEADER_MAGIC_NUM
);
292 head
->h_cycle
= cpu_to_be32(CYCLE_LSN(lsn
));
293 head
->h_version
= cpu_to_be32(version
);
294 head
->h_crc
= cpu_to_le32(0);
295 head
->h_prev_block
= cpu_to_be32(-1);
296 head
->h_num_logops
= cpu_to_be32(1);
297 head
->h_fmt
= cpu_to_be32(fmt
);
298 head
->h_size
= cpu_to_be32(max(sunit
, XLOG_BIG_RECORD_BSIZE
));
300 head
->h_lsn
= cpu_to_be64(lsn
);
301 head
->h_tail_lsn
= cpu_to_be64(tail_lsn
);
303 memcpy(&head
->h_fs_uuid
, fs_uuid
, sizeof(uuid_t
));
306 * The kernel expects to see either a log record header magic value or
307 * the LSN cycle at the top of every log block. The first word of each
308 * non-header block is copied to the record headers and replaced with
309 * the cycle value (see xlog_[un]pack_data() and xlog_get_cycle() for
312 * Even though we only ever write an unmount record (one block), we
313 * support writing log records up to the max log buffer size of 256k to
314 * improve log format performance. This means a record can require up
315 * to 8 headers (1 rec. header + 7 ext. headers) for the packed cycle
316 * data (each header supports 32k of data).
318 cycle_lsn
= CYCLE_LSN_DISK(head
->h_lsn
);
319 if (version
== 2 && sunit
> XLOG_HEADER_CYCLE_SIZE
) {
320 hdrs
= sunit
/ XLOG_HEADER_CYCLE_SIZE
;
321 if (sunit
% XLOG_HEADER_CYCLE_SIZE
)
326 * A fixed number of extended headers is expected based on h_size. If
327 * required, format those now so the unmount record is located
330 * Since we only write an unmount record, we only need one h_cycle_data
331 * entry for the unmount record block. The subsequent record data
332 * blocks are zeroed, which means we can stamp them directly with the
333 * cycle and zero the rest of the cycle data in the extended headers.
336 for (i
= 1; i
< hdrs
; i
++) {
337 p
= nextfunc(p
, BBSIZE
, private);
338 memset(p
, 0, BBSIZE
);
339 /* xlog_rec_ext_header.xh_cycle */
340 *(__be32
*)p
= cycle_lsn
;
345 * The total length is the max of the stripe unit or 2 basic block
346 * minimum (1 hdr blk + 1 data blk). The record length is the total
347 * minus however many header blocks are required.
349 head
->h_len
= cpu_to_be32(max(BBTOB(2), sunit
) - hdrs
* BBSIZE
);
352 * Write out the unmount record, pack the first word into the record
353 * header and stamp the block with the cycle.
355 p
= nextfunc(p
, BBSIZE
, private);
358 head
->h_cycle_data
[0] = *(__be32
*)p
;
359 *(__be32
*)p
= cycle_lsn
;
362 * Finally, zero all remaining blocks in the record and stamp each with
363 * the cycle. We don't need to pack any of these blocks because the
364 * cycle data in the headers has already been zeroed.
366 len
= max(len
, hdrs
+ 1);
367 for (i
= hdrs
+ 1; i
< len
; i
++) {
368 p
= nextfunc(p
, BBSIZE
, private);
369 memset(p
, 0, BBSIZE
);
370 *(__be32
*)p
= cycle_lsn
;
377 * Simple I/O (buffer cache) interface
#ifdef XFS_BUF_TRACING

#undef libxfs_readbuf
#undef libxfs_readbuf_map
#undef libxfs_writebuf
#undef libxfs_getbuf
#undef libxfs_getbuf_map
#undef libxfs_getbuf_flags
#undef libxfs_putbuf

xfs_buf_t	*libxfs_readbuf(struct xfs_buftarg *, xfs_daddr_t, int, int,
				const struct xfs_buf_ops *);
xfs_buf_t	*libxfs_readbuf_map(struct xfs_buftarg *, struct xfs_buf_map *,
				int, int, const struct xfs_buf_ops *);
int		libxfs_writebuf(xfs_buf_t *, int);
xfs_buf_t	*libxfs_getbuf(struct xfs_buftarg *, xfs_daddr_t, int);
xfs_buf_t	*libxfs_getbuf_map(struct xfs_buftarg *, struct xfs_buf_map *,
				int, int);
xfs_buf_t	*libxfs_getbuf_flags(struct xfs_buftarg *, xfs_daddr_t, int,
				unsigned int);
void		libxfs_putbuf (xfs_buf_t *);

/* record the call site on the buffer for later diagnosis */
#define __add_trace(bp, func, file, line)	\
do {						\
	if (bp) {				\
		(bp)->b_func = (func);		\
		(bp)->b_file = (file);		\
		(bp)->b_line = (line);		\
	}					\
} while (0)

xfs_buf_t *
libxfs_trace_readbuf(const char *func, const char *file, int line,
		struct xfs_buftarg *btp, xfs_daddr_t blkno, int len, int flags,
		const struct xfs_buf_ops *ops)
{
	xfs_buf_t	*bp = libxfs_readbuf(btp, blkno, len, flags, ops);
	__add_trace(bp, func, file, line);
	return bp;
}

xfs_buf_t *
libxfs_trace_readbuf_map(const char *func, const char *file, int line,
		struct xfs_buftarg *btp, struct xfs_buf_map *map, int nmaps,
		int flags, const struct xfs_buf_ops *ops)
{
	xfs_buf_t	*bp = libxfs_readbuf_map(btp, map, nmaps, flags, ops);
	__add_trace(bp, func, file, line);
	return bp;
}

int
libxfs_trace_writebuf(const char *func, const char *file, int line,
		xfs_buf_t *bp, int flags)
{
	__add_trace(bp, func, file, line);
	return libxfs_writebuf(bp, flags);
}

xfs_buf_t *
libxfs_trace_getbuf(const char *func, const char *file, int line,
		struct xfs_buftarg *btp, xfs_daddr_t blkno, int len)
{
	xfs_buf_t	*bp = libxfs_getbuf(btp, blkno, len);
	__add_trace(bp, func, file, line);
	return bp;
}

xfs_buf_t *
libxfs_trace_getbuf_map(const char *func, const char *file, int line,
		struct xfs_buftarg *btp, struct xfs_buf_map *map, int nmaps,
		int flags)
{
	xfs_buf_t	*bp = libxfs_getbuf_map(btp, map, nmaps, flags);
	__add_trace(bp, func, file, line);
	return bp;
}

xfs_buf_t *
libxfs_trace_getbuf_flags(const char *func, const char *file, int line,
		struct xfs_buftarg *btp, xfs_daddr_t blkno, int len,
		unsigned int flags)
{
	xfs_buf_t	*bp = libxfs_getbuf_flags(btp, blkno, len, flags);
	__add_trace(bp, func, file, line);
	return bp;
}

void
libxfs_trace_putbuf(const char *func, const char *file, int line, xfs_buf_t *bp)
{
	__add_trace(bp, func, file, line);
	libxfs_putbuf(bp);
}

#endif	/* XFS_BUF_TRACING */
479 libxfs_getsb(xfs_mount_t
*mp
, int flags
)
481 return libxfs_readbuf(mp
->m_ddev_targp
, XFS_SB_DADDR
,
482 XFS_FSS_TO_BB(mp
, 1), flags
, &xfs_sb_buf_ops
);
485 kmem_zone_t
*xfs_buf_zone
;
487 static struct cache_mru xfs_buf_freelist
=
488 {{&xfs_buf_freelist
.cm_list
, &xfs_buf_freelist
.cm_list
},
489 0, PTHREAD_MUTEX_INITIALIZER
};
492 * The bufkey is used to pass the new buffer information to the cache object
493 * allocation routine. Because discontiguous buffers need to pass different
494 * information, we need fields to pass that information. However, because the
495 * blkno and bblen is needed for the initial cache entry lookup (i.e. for
496 * bcompare) the fact that the map/nmaps is non-null to switch to discontiguous
497 * buffer initialisation instead of a contiguous buffer.
500 struct xfs_buftarg
*buftarg
;
503 struct xfs_buf_map
*map
;
507 /* 2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */
508 #define GOLDEN_RATIO_PRIME 0x9e37fffffffc0001UL
509 #define CACHE_LINE_SIZE 64
511 libxfs_bhash(cache_key_t key
, unsigned int hashsize
, unsigned int hashshift
)
513 uint64_t hashval
= ((struct xfs_bufkey
*)key
)->blkno
;
516 tmp
= hashval
^ (GOLDEN_RATIO_PRIME
+ hashval
) / CACHE_LINE_SIZE
;
517 tmp
= tmp
^ ((tmp
^ GOLDEN_RATIO_PRIME
) >> hashshift
);
518 return tmp
% hashsize
;
522 libxfs_bcompare(struct cache_node
*node
, cache_key_t key
)
524 struct xfs_buf
*bp
= (struct xfs_buf
*)node
;
525 struct xfs_bufkey
*bkey
= (struct xfs_bufkey
*)key
;
527 if (bp
->b_target
->dev
== bkey
->buftarg
->dev
&&
528 bp
->b_bn
== bkey
->blkno
) {
529 if (bp
->b_bcount
== BBTOB(bkey
->bblen
))
531 #ifdef IO_BCOMPARE_CHECK
532 if (!(libxfs_bcache
->c_flags
& CACHE_MISCOMPARE_PURGE
)) {
534 "%lx: Badness in key lookup (length)\n"
535 "bp=(bno 0x%llx, len %u bytes) key=(bno 0x%llx, len %u bytes)\n",
537 (unsigned long long)bp
->b_bn
, (int)bp
->b_bcount
,
538 (unsigned long long)bkey
->blkno
,
548 libxfs_bprint(xfs_buf_t
*bp
)
550 fprintf(stderr
, "Buffer %p blkno=%llu bytes=%u flags=0x%x count=%u\n",
551 bp
, (unsigned long long)bp
->b_bn
, (unsigned)bp
->b_bcount
,
552 bp
->b_flags
, bp
->b_node
.cn_count
);
556 __initbuf(xfs_buf_t
*bp
, struct xfs_buftarg
*btp
, xfs_daddr_t bno
,
561 bp
->b_bcount
= bytes
;
562 bp
->b_length
= BTOBB(bytes
);
566 bp
->b_addr
= memalign(libxfs_device_alignment(), bytes
);
569 _("%s: %s can't memalign %u bytes: %s\n"),
570 progname
, __FUNCTION__
, bytes
,
574 memset(bp
->b_addr
, 0, bytes
);
575 #ifdef XFS_BUF_TRACING
576 list_head_init(&bp
->b_lock_list
);
578 pthread_mutex_init(&bp
->b_lock
, NULL
);
585 bp
->b_maps
= &bp
->__b_map
;
586 bp
->b_maps
[0].bm_bn
= bp
->b_bn
;
587 bp
->b_maps
[0].bm_len
= bp
->b_length
;
592 libxfs_initbuf(xfs_buf_t
*bp
, struct xfs_buftarg
*btp
, xfs_daddr_t bno
,
595 __initbuf(bp
, btp
, bno
, bytes
);
599 libxfs_initbuf_map(xfs_buf_t
*bp
, struct xfs_buftarg
*btp
,
600 struct xfs_buf_map
*map
, int nmaps
)
602 unsigned int bytes
= 0;
605 bytes
= sizeof(struct xfs_buf_map
) * nmaps
;
606 bp
->b_maps
= malloc(bytes
);
609 _("%s: %s can't malloc %u bytes: %s\n"),
610 progname
, __FUNCTION__
, bytes
,
617 for ( i
= 0; i
< nmaps
; i
++) {
618 bp
->b_maps
[i
].bm_bn
= map
[i
].bm_bn
;
619 bp
->b_maps
[i
].bm_len
= map
[i
].bm_len
;
620 bytes
+= BBTOB(map
[i
].bm_len
);
623 __initbuf(bp
, btp
, map
[0].bm_bn
, bytes
);
624 bp
->b_flags
|= LIBXFS_B_DISCONTIG
;
628 __libxfs_getbufr(int blen
)
633 * first look for a buffer that can be used as-is,
634 * if one cannot be found, see if there is a buffer,
635 * and if so, free its buffer and set b_addr to NULL
636 * before calling libxfs_initbuf.
638 pthread_mutex_lock(&xfs_buf_freelist
.cm_mutex
);
639 if (!list_empty(&xfs_buf_freelist
.cm_list
)) {
640 list_for_each_entry(bp
, &xfs_buf_freelist
.cm_list
, b_node
.cn_mru
) {
641 if (bp
->b_bcount
== blen
) {
642 list_del_init(&bp
->b_node
.cn_mru
);
646 if (&bp
->b_node
.cn_mru
== &xfs_buf_freelist
.cm_list
) {
647 bp
= list_entry(xfs_buf_freelist
.cm_list
.next
,
648 xfs_buf_t
, b_node
.cn_mru
);
649 list_del_init(&bp
->b_node
.cn_mru
);
652 if (bp
->b_maps
!= &bp
->__b_map
)
657 bp
= kmem_zone_zalloc(xfs_buf_zone
, 0);
658 pthread_mutex_unlock(&xfs_buf_freelist
.cm_mutex
);
660 if (bp
->b_flags
& LIBXFS_B_DIRTY
)
661 fprintf(stderr
, "found dirty buffer (bulk) on free list!");
667 libxfs_getbufr(struct xfs_buftarg
*btp
, xfs_daddr_t blkno
, int bblen
)
670 int blen
= BBTOB(bblen
);
672 bp
=__libxfs_getbufr(blen
);
674 libxfs_initbuf(bp
, btp
, blkno
, blen
);
676 printf("%lx: %s: allocated %u bytes buffer, key=0x%llx(0x%llx), %p\n",
677 pthread_self(), __FUNCTION__
, blen
,
678 (long long)LIBXFS_BBTOOFF64(blkno
), (long long)blkno
, bp
);
685 libxfs_getbufr_map(struct xfs_buftarg
*btp
, xfs_daddr_t blkno
, int bblen
,
686 struct xfs_buf_map
*map
, int nmaps
)
689 int blen
= BBTOB(bblen
);
691 if (!map
|| !nmaps
) {
693 _("%s: %s invalid map %p or nmaps %d\n"),
694 progname
, __FUNCTION__
, map
, nmaps
);
698 if (blkno
!= map
[0].bm_bn
) {
700 _("%s: %s map blkno 0x%llx doesn't match key 0x%llx\n"),
701 progname
, __FUNCTION__
, (long long)map
[0].bm_bn
,
706 bp
=__libxfs_getbufr(blen
);
708 libxfs_initbuf_map(bp
, btp
, map
, nmaps
);
710 printf("%lx: %s: allocated %u bytes buffer, key=0x%llx(0x%llx), %p\n",
711 pthread_self(), __FUNCTION__
, blen
,
712 (long long)LIBXFS_BBTOOFF64(blkno
), (long long)blkno
, bp
);
#ifdef XFS_BUF_TRACING
/* list and count of currently-locked buffers, for trace diagnostics */
struct list_head	lock_buf_list = {&lock_buf_list, &lock_buf_list};
int			lock_buf_count = 0;
#endif

extern int	use_xfs_buf_lock;
725 static struct xfs_buf
*
726 __cache_lookup(struct xfs_bufkey
*key
, unsigned int flags
)
730 cache_node_get(libxfs_bcache
, key
, (struct cache_node
**)&bp
);
734 if (use_xfs_buf_lock
) {
737 ret
= pthread_mutex_trylock(&bp
->b_lock
);
739 ASSERT(ret
== EAGAIN
);
740 if (flags
& LIBXFS_GETBUF_TRYLOCK
)
743 if (pthread_equal(bp
->b_holder
, pthread_self())) {
745 _("Warning: recursive buffer locking at block %" PRIu64
" detected\n"),
750 pthread_mutex_lock(&bp
->b_lock
);
754 bp
->b_holder
= pthread_self();
757 cache_node_set_priority(libxfs_bcache
, (struct cache_node
*)bp
,
758 cache_node_get_priority((struct cache_node
*)bp
) -
759 CACHE_PREFETCH_PRIORITY
);
760 #ifdef XFS_BUF_TRACING
761 pthread_mutex_lock(&libxfs_bcache
->c_mutex
);
763 list_add(&bp
->b_lock_list
, &lock_buf_list
);
764 pthread_mutex_unlock(&libxfs_bcache
->c_mutex
);
767 printf("%lx %s: hit buffer %p for bno = 0x%llx/0x%llx\n",
768 pthread_self(), __FUNCTION__
,
769 bp
, bp
->b_bn
, (long long)LIBXFS_BBTOOFF64(key
->blkno
));
774 cache_node_put(libxfs_bcache
, (struct cache_node
*)bp
);
779 libxfs_getbuf_flags(struct xfs_buftarg
*btp
, xfs_daddr_t blkno
, int len
,
782 struct xfs_bufkey key
= {NULL
};
788 return __cache_lookup(&key
, flags
);
792 * Clean the buffer flags for libxfs_getbuf*(), which wants to return
793 * an unused buffer with clean state. This prevents CRC errors on a
794 * re-read of a corrupt block that was prefetched and freed. This
795 * can happen with a massively corrupt directory that is discarded,
796 * but whose blocks are then recycled into expanding lost+found.
798 * Note however that if the buffer's dirty (prefetch calls getbuf)
799 * we'll leave the state alone because we don't want to discard blocks
800 * that have been fixed.
806 if (bp
&& !(bp
->b_flags
& LIBXFS_B_DIRTY
))
807 bp
->b_flags
&= ~(LIBXFS_B_UNCHECKED
| LIBXFS_B_STALE
|
812 libxfs_getbuf(struct xfs_buftarg
*btp
, xfs_daddr_t blkno
, int len
)
816 bp
= libxfs_getbuf_flags(btp
, blkno
, len
, 0);
821 static struct xfs_buf
*
822 __libxfs_getbuf_map(struct xfs_buftarg
*btp
, struct xfs_buf_map
*map
,
823 int nmaps
, int flags
)
825 struct xfs_bufkey key
= {NULL
};
829 return libxfs_getbuf_flags(btp
, map
[0].bm_bn
, map
[0].bm_len
,
833 key
.blkno
= map
[0].bm_bn
;
834 for (i
= 0; i
< nmaps
; i
++) {
835 key
.bblen
+= map
[i
].bm_len
;
840 return __cache_lookup(&key
, flags
);
/* Get a (possibly discontiguous) buffer with clean (reset) state. */
struct xfs_buf *
libxfs_getbuf_map(struct xfs_buftarg *btp, struct xfs_buf_map *map,
		int nmaps, int flags)
{
	struct xfs_buf	*bp;

	bp = __libxfs_getbuf_map(btp, map, nmaps, flags);
	reset_buf_state(bp);
	return bp;
}
855 libxfs_putbuf(xfs_buf_t
*bp
)
858 * ensure that any errors on this use of the buffer don't carry
859 * over to the next user.
863 #ifdef XFS_BUF_TRACING
864 pthread_mutex_lock(&libxfs_bcache
->c_mutex
);
866 ASSERT(lock_buf_count
>= 0);
867 list_del_init(&bp
->b_lock_list
);
868 pthread_mutex_unlock(&libxfs_bcache
->c_mutex
);
870 if (use_xfs_buf_lock
) {
875 pthread_mutex_unlock(&bp
->b_lock
);
879 cache_node_put(libxfs_bcache
, (struct cache_node
*)bp
);
883 libxfs_purgebuf(xfs_buf_t
*bp
)
885 struct xfs_bufkey key
= {NULL
};
887 key
.buftarg
= bp
->b_target
;
888 key
.blkno
= bp
->b_bn
;
889 key
.bblen
= bp
->b_length
;
891 cache_node_purge(libxfs_bcache
, &key
, (struct cache_node
*)bp
);
894 static struct cache_node
*
895 libxfs_balloc(cache_key_t key
)
897 struct xfs_bufkey
*bufkey
= (struct xfs_bufkey
*)key
;
900 return (struct cache_node
*)
901 libxfs_getbufr_map(bufkey
->buftarg
,
902 bufkey
->blkno
, bufkey
->bblen
,
903 bufkey
->map
, bufkey
->nmaps
);
904 return (struct cache_node
*)libxfs_getbufr(bufkey
->buftarg
,
905 bufkey
->blkno
, bufkey
->bblen
);
910 __read_buf(int fd
, void *buf
, int len
, off64_t offset
, int flags
)
914 sts
= pread(fd
, buf
, len
, offset
);
917 fprintf(stderr
, _("%s: read failed: %s\n"),
918 progname
, strerror(error
));
919 if (flags
& LIBXFS_EXIT_ON_FAILURE
)
922 } else if (sts
!= len
) {
923 fprintf(stderr
, _("%s: error - read only %d of %d bytes\n"),
925 if (flags
& LIBXFS_EXIT_ON_FAILURE
)
933 libxfs_readbufr(struct xfs_buftarg
*btp
, xfs_daddr_t blkno
, xfs_buf_t
*bp
,
936 int fd
= libxfs_device_to_fd(btp
->dev
);
937 int bytes
= BBTOB(len
);
940 ASSERT(BBTOB(len
) <= bp
->b_bcount
);
942 error
= __read_buf(fd
, bp
->b_addr
, bytes
, LIBXFS_BBTOOFF64(blkno
), flags
);
944 bp
->b_target
->dev
== btp
->dev
&&
946 bp
->b_bcount
== bytes
)
947 bp
->b_flags
|= LIBXFS_B_UPTODATE
;
949 printf("%lx: %s: read %u bytes, error %d, blkno=0x%llx(0x%llx), %p\n",
950 pthread_self(), __FUNCTION__
, bytes
, error
,
951 (long long)LIBXFS_BBTOOFF64(blkno
), (long long)blkno
, bp
);
957 libxfs_readbuf_verify(struct xfs_buf
*bp
, const struct xfs_buf_ops
*ops
)
962 bp
->b_ops
->verify_read(bp
);
963 bp
->b_flags
&= ~LIBXFS_B_UNCHECKED
;
968 libxfs_readbuf(struct xfs_buftarg
*btp
, xfs_daddr_t blkno
, int len
, int flags
,
969 const struct xfs_buf_ops
*ops
)
974 bp
= libxfs_getbuf_flags(btp
, blkno
, len
, 0);
979 * if the buffer was prefetched, it is likely that it was not validated.
980 * Hence if we are supplied an ops function and the buffer is marked as
981 * unchecked, we need to validate it now.
983 * We do this verification even if the buffer is dirty - the
984 * verification is almost certainly going to fail the CRC check in this
985 * case as a dirty buffer has not had the CRC recalculated. However, we
986 * should not be dirtying unchecked buffers and therefore failing it
987 * here because it's dirty and unchecked indicates we've screwed up
991 if ((bp
->b_flags
& (LIBXFS_B_UPTODATE
|LIBXFS_B_DIRTY
))) {
992 if (bp
->b_flags
& LIBXFS_B_UNCHECKED
)
993 libxfs_readbuf_verify(bp
, ops
);
998 * Set the ops on a cache miss (i.e. first physical read) as the
999 * verifier may change the ops to match the type of buffer it contains.
1000 * A cache hit might reset the verifier to the original type if we set
1001 * it again, but it won't get called again and set to match the buffer
1002 * contents. *cough* xfs_da_node_buf_ops *cough*.
1004 error
= libxfs_readbufr(btp
, blkno
, bp
, len
, flags
);
1006 bp
->b_error
= error
;
1008 libxfs_readbuf_verify(bp
, ops
);
1013 libxfs_readbufr_map(struct xfs_buftarg
*btp
, struct xfs_buf
*bp
, int flags
)
1020 fd
= libxfs_device_to_fd(btp
->dev
);
1022 for (i
= 0; i
< bp
->b_nmaps
; i
++) {
1023 off64_t offset
= LIBXFS_BBTOOFF64(bp
->b_maps
[i
].bm_bn
);
1024 int len
= BBTOB(bp
->b_maps
[i
].bm_len
);
1026 error
= __read_buf(fd
, buf
, len
, offset
, flags
);
1028 bp
->b_error
= error
;
1035 bp
->b_flags
|= LIBXFS_B_UPTODATE
;
1037 printf("%lx: %s: read %lu bytes, error %d, blkno=%llu(%llu), %p\n",
1038 pthread_self(), __FUNCTION__
, buf
- (char *)bp
->b_addr
, error
,
1039 (long long)LIBXFS_BBTOOFF64(bp
->b_bn
), (long long)bp
->b_bn
, bp
);
1045 libxfs_readbuf_map(struct xfs_buftarg
*btp
, struct xfs_buf_map
*map
, int nmaps
,
1046 int flags
, const struct xfs_buf_ops
*ops
)
1052 return libxfs_readbuf(btp
, map
[0].bm_bn
, map
[0].bm_len
,
1055 bp
= __libxfs_getbuf_map(btp
, map
, nmaps
, 0);
1060 if ((bp
->b_flags
& (LIBXFS_B_UPTODATE
|LIBXFS_B_DIRTY
))) {
1061 if (bp
->b_flags
& LIBXFS_B_UNCHECKED
)
1062 libxfs_readbuf_verify(bp
, ops
);
1065 error
= libxfs_readbufr_map(btp
, bp
, flags
);
1067 libxfs_readbuf_verify(bp
, ops
);
1070 printf("%lx: %s: read %lu bytes, error %d, blkno=%llu(%llu), %p\n",
1071 pthread_self(), __FUNCTION__
, buf
- (char *)bp
->b_addr
, error
,
1072 (long long)LIBXFS_BBTOOFF64(bp
->b_bn
), (long long)bp
->b_bn
, bp
);
1078 __write_buf(int fd
, void *buf
, int len
, off64_t offset
, int flags
)
1082 sts
= pwrite(fd
, buf
, len
, offset
);
1085 fprintf(stderr
, _("%s: pwrite failed: %s\n"),
1086 progname
, strerror(error
));
1087 if (flags
& LIBXFS_B_EXIT
)
1090 } else if (sts
!= len
) {
1091 fprintf(stderr
, _("%s: error - pwrite only %d of %d bytes\n"),
1092 progname
, sts
, len
);
1093 if (flags
& LIBXFS_B_EXIT
)
1101 libxfs_writebufr(xfs_buf_t
*bp
)
1103 int fd
= libxfs_device_to_fd(bp
->b_target
->dev
);
1106 * we never write buffers that are marked stale. This indicates they
1107 * contain data that has been invalidated, and even if the buffer is
1108 * dirty it must *never* be written. Verifiers are wonderful for finding
1109 * bugs like this. Make sure the error is obvious as to the cause.
1111 if (bp
->b_flags
& LIBXFS_B_STALE
) {
1112 bp
->b_error
= -ESTALE
;
1117 * clear any pre-existing error status on the buffer. This can occur if
1118 * the buffer is corrupt on disk and the repair process doesn't clear
1119 * the error before fixing and writing it back.
1123 bp
->b_ops
->verify_write(bp
);
1126 _("%s: write verifer failed on %s bno 0x%llx/0x%x\n"),
1127 __func__
, bp
->b_ops
->name
,
1128 (long long)bp
->b_bn
, bp
->b_bcount
);
1133 if (!(bp
->b_flags
& LIBXFS_B_DISCONTIG
)) {
1134 bp
->b_error
= __write_buf(fd
, bp
->b_addr
, bp
->b_bcount
,
1135 LIBXFS_BBTOOFF64(bp
->b_bn
), bp
->b_flags
);
1138 void *buf
= bp
->b_addr
;
1140 for (i
= 0; i
< bp
->b_nmaps
; i
++) {
1141 off64_t offset
= LIBXFS_BBTOOFF64(bp
->b_maps
[i
].bm_bn
);
1142 int len
= BBTOB(bp
->b_maps
[i
].bm_len
);
1144 bp
->b_error
= __write_buf(fd
, buf
, len
, offset
,
1153 printf("%lx: %s: wrote %u bytes, blkno=%llu(%llu), %p, error %d\n",
1154 pthread_self(), __FUNCTION__
, bp
->b_bcount
,
1155 (long long)LIBXFS_BBTOOFF64(bp
->b_bn
),
1156 (long long)bp
->b_bn
, bp
, bp
->b_error
);
1159 bp
->b_flags
|= LIBXFS_B_UPTODATE
;
1160 bp
->b_flags
&= ~(LIBXFS_B_DIRTY
| LIBXFS_B_EXIT
|
1161 LIBXFS_B_UNCHECKED
);
1167 libxfs_writebuf_int(xfs_buf_t
*bp
, int flags
)
1170 * Clear any error hanging over from reading the buffer. This prevents
1171 * subsequent reads after this write from seeing stale errors.
1174 bp
->b_flags
&= ~LIBXFS_B_STALE
;
1175 bp
->b_flags
|= (LIBXFS_B_DIRTY
| flags
);
1180 libxfs_writebuf(xfs_buf_t
*bp
, int flags
)
1183 printf("%lx: %s: dirty blkno=%llu(%llu)\n",
1184 pthread_self(), __FUNCTION__
,
1185 (long long)LIBXFS_BBTOOFF64(bp
->b_bn
),
1186 (long long)bp
->b_bn
);
1189 * Clear any error hanging over from reading the buffer. This prevents
1190 * subsequent reads after this write from seeing stale errors.
1193 bp
->b_flags
&= ~LIBXFS_B_STALE
;
1194 bp
->b_flags
|= (LIBXFS_B_DIRTY
| flags
);
1200 libxfs_iomove(xfs_buf_t
*bp
, uint boff
, int len
, void *data
, int flags
)
1203 if (boff
+ len
> bp
->b_bcount
) {
1204 printf("Badness, iomove out of range!\n"
1205 "bp=(bno 0x%llx, bytes %u) range=(boff %u, bytes %u)\n",
1206 (long long)bp
->b_bn
, bp
->b_bcount
, boff
, len
);
1212 memset(bp
->b_addr
+ boff
, 0, len
);
1215 memcpy(data
, bp
->b_addr
+ boff
, len
);
1218 memcpy(bp
->b_addr
+ boff
, data
, len
);
1225 struct cache_node
*node
)
1227 struct xfs_buf
*bp
= (struct xfs_buf
*)node
;
1231 if (bp
->b_flags
& LIBXFS_B_DIRTY
)
1233 "releasing dirty buffer to free list!");
1235 pthread_mutex_lock(&xfs_buf_freelist
.cm_mutex
);
1236 list_add(&bp
->b_node
.cn_mru
, &xfs_buf_freelist
.cm_list
);
1237 pthread_mutex_unlock(&xfs_buf_freelist
.cm_mutex
);
1242 struct cache
*cache
,
1243 struct list_head
*list
)
1248 if (list_empty(list
))
1251 list_for_each_entry(bp
, list
, b_node
.cn_mru
) {
1252 if (bp
->b_flags
& LIBXFS_B_DIRTY
)
1254 "releasing dirty buffer (bulk) to free list!");
1258 pthread_mutex_lock(&xfs_buf_freelist
.cm_mutex
);
1259 list_splice(list
, &xfs_buf_freelist
.cm_list
);
1260 pthread_mutex_unlock(&xfs_buf_freelist
.cm_mutex
);
1266 * Free everything from the xfs_buf_freelist MRU, used at final teardown
1269 libxfs_bcache_free(void)
1271 struct list_head
*cm_list
;
1272 xfs_buf_t
*bp
, *next
;
1274 cm_list
= &xfs_buf_freelist
.cm_list
;
1275 list_for_each_entry_safe(bp
, next
, cm_list
, b_node
.cn_mru
) {
1277 if (bp
->b_maps
!= &bp
->__b_map
)
1279 kmem_zone_free(xfs_buf_zone
, bp
);
1284 * When a buffer is marked dirty, the error is cleared. Hence if we are trying
1285 * to flush a buffer prior to cache reclaim that has an error on it it means
1286 * we've already tried to flush it and it failed. Prevent repeated corruption
1287 * errors from being reported by skipping such buffers - when the corruption is
1288 * fixed the buffer will be marked dirty again and we can write it again.
1292 struct cache_node
*node
)
1294 struct xfs_buf
*bp
= (struct xfs_buf
*)node
;
1296 if (!bp
->b_error
&& bp
->b_flags
& LIBXFS_B_DIRTY
)
1297 return libxfs_writebufr(bp
);
1302 libxfs_putbufr(xfs_buf_t
*bp
)
1304 if (bp
->b_flags
& LIBXFS_B_DIRTY
)
1305 libxfs_writebufr(bp
);
1306 libxfs_brelse((struct cache_node
*)bp
);
1311 libxfs_bcache_purge(void)
1313 cache_purge(libxfs_bcache
);
1317 libxfs_bcache_flush(void)
1319 cache_flush(libxfs_bcache
);
1323 libxfs_bcache_overflowed(void)
1325 return cache_overflowed(libxfs_bcache
);
1328 struct cache_operations libxfs_bcache_operations
= {
1329 .hash
= libxfs_bhash
,
1330 .alloc
= libxfs_balloc
,
1331 .flush
= libxfs_bflush
,
1332 .relse
= libxfs_brelse
,
1333 .compare
= libxfs_bcompare
,
1334 .bulkrelse
= libxfs_bulkrelse
1339 * Inode cache stubs.
1342 kmem_zone_t
*xfs_inode_zone
;
1343 extern kmem_zone_t
*xfs_ili_zone
;
1346 * If there are inline format data / attr forks attached to this inode,
1347 * make sure they're not corrupt.
1350 libxfs_inode_verify_forks(
1351 struct xfs_inode
*ip
,
1352 struct xfs_ifork_ops
*ops
)
1354 struct xfs_ifork
*ifp
;
1360 fa
= xfs_ifork_verify_data(ip
, ops
);
1362 ifp
= XFS_IFORK_PTR(ip
, XFS_DATA_FORK
);
1363 xfs_inode_verifier_error(ip
, -EFSCORRUPTED
, "data fork",
1364 ifp
->if_u1
.if_data
, ifp
->if_bytes
, fa
);
1368 fa
= xfs_ifork_verify_attr(ip
, ops
);
1370 ifp
= XFS_IFORK_PTR(ip
, XFS_ATTR_FORK
);
1371 xfs_inode_verifier_error(ip
, -EFSCORRUPTED
, "attr fork",
1372 ifp
? ifp
->if_u1
.if_data
: NULL
,
1373 ifp
? ifp
->if_bytes
: 0, fa
);
1381 struct xfs_mount
*mp
,
1382 struct xfs_trans
*tp
,
1385 struct xfs_inode
**ipp
,
1386 struct xfs_ifork_ops
*ifork_ops
)
1388 struct xfs_inode
*ip
;
1391 ip
= kmem_zone_zalloc(xfs_inode_zone
, 0);
1397 error
= xfs_iread(mp
, tp
, ip
, 0);
1399 kmem_zone_free(xfs_inode_zone
, ip
);
1404 if (!libxfs_inode_verify_forks(ip
, ifork_ops
)) {
1406 return -EFSCORRUPTED
;
1410 * set up the inode ops structure that the libxfs code relies on
1413 ip
->d_ops
= mp
->m_dir_inode_ops
;
1415 ip
->d_ops
= mp
->m_nondir_inode_ops
;
1422 libxfs_idestroy(xfs_inode_t
*ip
)
1424 switch (VFS_I(ip
)->i_mode
& S_IFMT
) {
1428 libxfs_idestroy_fork(ip
, XFS_DATA_FORK
);
1432 libxfs_idestroy_fork(ip
, XFS_ATTR_FORK
);
1434 xfs_idestroy_fork(ip
, XFS_COW_FORK
);
1439 struct xfs_inode
*ip
)
1442 kmem_zone_free(xfs_ili_zone
, ip
->i_itemp
);
1444 libxfs_idestroy(ip
);
1445 kmem_zone_free(xfs_inode_zone
, ip
);