// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
8 #include "libxfs_priv.h"
11 #include "xfs_shared.h"
12 #include "xfs_format.h"
13 #include "xfs_log_format.h"
14 #include "xfs_trans_resv.h"
15 #include "xfs_mount.h"
16 #include "xfs_inode_buf.h"
17 #include "xfs_inode_fork.h"
18 #include "xfs_inode.h"
19 #include "xfs_trans.h"
20 #include "libfrog/platform.h"
/* Forward declaration: release a cache node's buffer back to the free list. */
static void libxfs_brelse(struct cache_node *node);
/*
 * Important design/architecture note:
 *
 * The userspace code that uses the buffer cache is much less constrained than
 * the kernel code. The userspace code is pretty nasty in places, especially
 * when it comes to buffer error handling. Very little of the userspace code
 * outside libxfs clears bp->b_error - very little code even checks it - so the
 * libxfs code is tripping on stale errors left by the userspace code.
 *
 * We can't clear errors or zero buffer contents in libxfs_buf_get-* like we do
 * in the kernel, because those functions are used by the libxfs_readbuf_*
 * functions and hence need to leave the buffers unchanged on cache hits. This
 * is actually the only way to gather a write error from a libxfs_writebuf()
 * call - you need to get the buffer again so you can check bp->b_error field -
 * assuming that the buffer is still in the cache when you check, that is.
 *
 * This is very different to the kernel code which does not release buffers on a
 * write so we can wait on IO and check errors. The kernel buffer cache also
 * guarantees a buffer of a known initial state from xfs_buf_get() even on a
 * cache hit.
 *
 * IOWs, userspace is behaving quite differently to the kernel and as a result
 * it leaks errors from reads, invalidations and writes through
 * libxfs_buf_get/libxfs_buf_read.
 *
 * The result of this is that until the userspace code outside libxfs is cleaned
 * up, functions that release buffers from userspace control (i.e
 * libxfs_writebuf/libxfs_buf_relse) need to zero bp->b_error to prevent
 * propagation of stale errors into future buffer operations.
 */
/* Maximum I/O chunk used when zeroing a device range by writing. */
#define BDSTRAT_SIZE	(256 * 1024)

/* Enable verbose diagnostics on buffer-cache key miscompares. */
#define IO_BCOMPARE_CHECK

/* XXX: (dgc) Propagate errors, only exit if fail-on-error flag set */
63 libxfs_device_zero(struct xfs_buftarg
*btp
, xfs_daddr_t start
, uint len
)
65 int fd
= btp
->bt_bdev_fd
;
66 xfs_off_t start_offset
, end_offset
, offset
;
72 start_offset
= LIBXFS_BBTOOFF64(start
);
74 /* try to use special zeroing methods, fall back to writes if needed */
75 len_bytes
= LIBXFS_BBTOOFF64(len
);
76 error
= platform_zero_range(fd
, start_offset
, len_bytes
);
78 xfs_buftarg_trip_write(btp
);
82 zsize
= min(BDSTRAT_SIZE
, BBTOB(len
));
83 if ((z
= memalign(libxfs_device_alignment(), zsize
)) == NULL
) {
85 _("%s: %s can't memalign %d bytes: %s\n"),
86 progname
, __FUNCTION__
, (int)zsize
, strerror(errno
));
91 if ((lseek(fd
, start_offset
, SEEK_SET
)) < 0) {
92 fprintf(stderr
, _("%s: %s seek to offset %llu failed: %s\n"),
93 progname
, __FUNCTION__
,
94 (unsigned long long)start_offset
, strerror(errno
));
98 end_offset
= LIBXFS_BBTOOFF64(start
+ len
) - start_offset
;
99 for (offset
= 0; offset
< end_offset
; ) {
100 bytes
= min((ssize_t
)(end_offset
- offset
), zsize
);
101 if ((bytes
= write(fd
, z
, bytes
)) < 0) {
102 fprintf(stderr
, _("%s: %s write failed: %s\n"),
103 progname
, __FUNCTION__
, strerror(errno
));
105 } else if (bytes
== 0) {
106 fprintf(stderr
, _("%s: %s not progressing?\n"),
107 progname
, __FUNCTION__
);
110 xfs_buftarg_trip_write(btp
);
117 static void unmount_record(void *p
)
119 xlog_op_header_t
*op
= (xlog_op_header_t
*)p
;
120 /* the data section must be 32 bit size aligned */
124 uint32_t pad2
; /* may as well make it 64 bits */
125 } magic
= { XLOG_UNMOUNT_TYPE
, 0, 0 };
127 memset(p
, 0, BBSIZE
);
128 /* dummy tid to mark this as written from userspace */
129 op
->oh_tid
= cpu_to_be32(0xb0c0d0d0);
130 op
->oh_len
= cpu_to_be32(sizeof(magic
));
131 op
->oh_clientid
= XFS_LOG
;
132 op
->oh_flags
= XLOG_UNMOUNT_TRANS
;
135 /* and the data for this op */
136 memcpy((char *)p
+ sizeof(xlog_op_header_t
), &magic
, sizeof(magic
));
144 struct xfs_buf
*buf
= (struct xfs_buf
*)private;
147 (BBTOB(buf
->b_length
) < (int)(ptr
- (char *)buf
->b_addr
) + offset
))
155 struct xfs_mount
*mp
)
159 libxfs_buf_read(mp
->m_ddev_targp
, XFS_SB_DADDR
, XFS_FSS_TO_BB(mp
, 1),
160 0, &bp
, &xfs_sb_buf_ops
);
164 struct kmem_cache
*xfs_buf_cache
;
166 static struct cache_mru xfs_buf_freelist
=
167 {{&xfs_buf_freelist
.cm_list
, &xfs_buf_freelist
.cm_list
},
168 0, PTHREAD_MUTEX_INITIALIZER
};
171 * The bufkey is used to pass the new buffer information to the cache object
172 * allocation routine. Because discontiguous buffers need to pass different
173 * information, we need fields to pass that information. However, because the
174 * blkno and bblen is needed for the initial cache entry lookup (i.e. for
175 * bcompare) the fact that the map/nmaps is non-null to switch to discontiguous
176 * buffer initialisation instead of a contiguous buffer.
179 struct xfs_buftarg
*buftarg
;
182 struct xfs_buf_map
*map
;
186 /* 2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */
187 #define GOLDEN_RATIO_PRIME 0x9e37fffffffc0001UL
188 #define CACHE_LINE_SIZE 64
190 libxfs_bhash(cache_key_t key
, unsigned int hashsize
, unsigned int hashshift
)
192 uint64_t hashval
= ((struct xfs_bufkey
*)key
)->blkno
;
195 tmp
= hashval
^ (GOLDEN_RATIO_PRIME
+ hashval
) / CACHE_LINE_SIZE
;
196 tmp
= tmp
^ ((tmp
^ GOLDEN_RATIO_PRIME
) >> hashshift
);
197 return tmp
% hashsize
;
201 libxfs_bcompare(struct cache_node
*node
, cache_key_t key
)
203 struct xfs_buf
*bp
= container_of(node
, struct xfs_buf
,
205 struct xfs_bufkey
*bkey
= (struct xfs_bufkey
*)key
;
207 if (bp
->b_target
->bt_bdev
== bkey
->buftarg
->bt_bdev
&&
208 bp
->b_cache_key
== bkey
->blkno
) {
209 if (bp
->b_length
== bkey
->bblen
)
211 #ifdef IO_BCOMPARE_CHECK
212 if (!(libxfs_bcache
->c_flags
& CACHE_MISCOMPARE_PURGE
)) {
214 "%lx: Badness in key lookup (length)\n"
215 "bp=(bno 0x%llx, len %u bytes) key=(bno 0x%llx, len %u bytes)\n",
217 (unsigned long long)xfs_buf_daddr(bp
),
219 (unsigned long long)bkey
->blkno
,
229 __initbuf(struct xfs_buf
*bp
, struct xfs_buftarg
*btp
, xfs_daddr_t bno
,
233 bp
->b_cache_key
= bno
;
234 bp
->b_length
= BTOBB(bytes
);
236 bp
->b_mount
= btp
->bt_mount
;
239 bp
->b_addr
= memalign(libxfs_device_alignment(), bytes
);
242 _("%s: %s can't memalign %u bytes: %s\n"),
243 progname
, __FUNCTION__
, bytes
,
247 memset(bp
->b_addr
, 0, bytes
);
248 pthread_mutex_init(&bp
->b_lock
, NULL
);
252 INIT_LIST_HEAD(&bp
->b_li_list
);
255 bp
->b_maps
= &bp
->__b_map
;
257 if (bp
->b_maps
== &bp
->__b_map
) {
259 bp
->b_maps
[0].bm_bn
= bno
;
260 bp
->b_maps
[0].bm_len
= bp
->b_length
;
265 libxfs_initbuf(struct xfs_buf
*bp
, struct xfs_buftarg
*btp
, xfs_daddr_t bno
,
268 __initbuf(bp
, btp
, bno
, bytes
);
272 libxfs_initbuf_map(struct xfs_buf
*bp
, struct xfs_buftarg
*btp
,
273 struct xfs_buf_map
*map
, int nmaps
)
275 unsigned int bytes
= 0;
278 bytes
= sizeof(struct xfs_buf_map
) * nmaps
;
279 bp
->b_maps
= malloc(bytes
);
282 _("%s: %s can't malloc %u bytes: %s\n"),
283 progname
, __FUNCTION__
, bytes
,
290 for ( i
= 0; i
< nmaps
; i
++) {
291 bp
->b_maps
[i
].bm_bn
= map
[i
].bm_bn
;
292 bp
->b_maps
[i
].bm_len
= map
[i
].bm_len
;
293 bytes
+= BBTOB(map
[i
].bm_len
);
296 __initbuf(bp
, btp
, map
[0].bm_bn
, bytes
);
297 bp
->b_flags
|= LIBXFS_B_DISCONTIG
;
300 static struct xfs_buf
*
301 __libxfs_getbufr(int blen
)
306 * first look for a buffer that can be used as-is,
307 * if one cannot be found, see if there is a buffer,
308 * and if so, free its buffer and set b_addr to NULL
309 * before calling libxfs_initbuf.
311 pthread_mutex_lock(&xfs_buf_freelist
.cm_mutex
);
312 if (!list_empty(&xfs_buf_freelist
.cm_list
)) {
313 list_for_each_entry(bp
, &xfs_buf_freelist
.cm_list
, b_node
.cn_mru
) {
314 if (bp
->b_length
== BTOBB(blen
)) {
315 list_del_init(&bp
->b_node
.cn_mru
);
319 if (&bp
->b_node
.cn_mru
== &xfs_buf_freelist
.cm_list
) {
320 bp
= list_entry(xfs_buf_freelist
.cm_list
.next
,
321 struct xfs_buf
, b_node
.cn_mru
);
322 list_del_init(&bp
->b_node
.cn_mru
);
325 if (bp
->b_maps
!= &bp
->__b_map
)
330 bp
= kmem_cache_zalloc(xfs_buf_cache
, 0);
331 pthread_mutex_unlock(&xfs_buf_freelist
.cm_mutex
);
333 if (bp
->b_flags
& LIBXFS_B_DIRTY
)
334 fprintf(stderr
, "found dirty buffer (bulk) on free list!\n");
339 static struct xfs_buf
*
340 libxfs_getbufr(struct xfs_buftarg
*btp
, xfs_daddr_t blkno
, int bblen
)
343 int blen
= BBTOB(bblen
);
345 bp
=__libxfs_getbufr(blen
);
347 libxfs_initbuf(bp
, btp
, blkno
, blen
);
351 static struct xfs_buf
*
352 libxfs_getbufr_map(struct xfs_buftarg
*btp
, xfs_daddr_t blkno
, int bblen
,
353 struct xfs_buf_map
*map
, int nmaps
)
356 int blen
= BBTOB(bblen
);
358 if (!map
|| !nmaps
) {
360 _("%s: %s invalid map %p or nmaps %d\n"),
361 progname
, __FUNCTION__
, map
, nmaps
);
365 if (blkno
!= map
[0].bm_bn
) {
367 _("%s: %s map blkno 0x%llx doesn't match key 0x%llx\n"),
368 progname
, __FUNCTION__
, (long long)map
[0].bm_bn
,
373 bp
=__libxfs_getbufr(blen
);
375 libxfs_initbuf_map(bp
, btp
, map
, nmaps
);
383 if (use_xfs_buf_lock
)
384 pthread_mutex_lock(&bp
->b_lock
);
391 if (use_xfs_buf_lock
)
392 pthread_mutex_unlock(&bp
->b_lock
);
397 struct xfs_bufkey
*key
,
399 struct xfs_buf
**bpp
)
401 struct cache_node
*cn
= NULL
;
406 cache_node_get(libxfs_bcache
, key
, &cn
);
409 bp
= container_of(cn
, struct xfs_buf
, b_node
);
411 if (use_xfs_buf_lock
) {
414 ret
= pthread_mutex_trylock(&bp
->b_lock
);
416 ASSERT(ret
== EAGAIN
);
417 if (flags
& LIBXFS_GETBUF_TRYLOCK
) {
418 cache_node_put(libxfs_bcache
, cn
);
422 if (pthread_equal(bp
->b_holder
, pthread_self())) {
424 _("Warning: recursive buffer locking at block %" PRIu64
" detected\n"),
430 pthread_mutex_lock(&bp
->b_lock
);
434 bp
->b_holder
= pthread_self();
437 cache_node_set_priority(libxfs_bcache
, cn
,
438 cache_node_get_priority(cn
) - CACHE_PREFETCH_PRIORITY
);
445 struct xfs_buftarg
*btp
,
449 struct xfs_buf
**bpp
)
451 struct xfs_bufkey key
= {NULL
};
458 ret
= __cache_lookup(&key
, flags
, bpp
);
462 if (btp
== btp
->bt_mount
->m_ddev_targp
) {
463 (*bpp
)->b_pag
= xfs_perag_get(btp
->bt_mount
,
464 xfs_daddr_to_agno(btp
->bt_mount
, blkno
));
471 * Clean the buffer flags for libxfs_getbuf*(), which wants to return
472 * an unused buffer with clean state. This prevents CRC errors on a
473 * re-read of a corrupt block that was prefetched and freed. This
474 * can happen with a massively corrupt directory that is discarded,
475 * but whose blocks are then recycled into expanding lost+found.
477 * Note however that if the buffer's dirty (prefetch calls getbuf)
478 * we'll leave the state alone because we don't want to discard blocks
479 * that have been fixed.
485 if (bp
&& !(bp
->b_flags
& LIBXFS_B_DIRTY
))
486 bp
->b_flags
&= ~(LIBXFS_B_UNCHECKED
| LIBXFS_B_STALE
|
491 __libxfs_buf_get_map(
492 struct xfs_buftarg
*btp
,
493 struct xfs_buf_map
*map
,
496 struct xfs_buf
**bpp
)
498 struct xfs_bufkey key
= {NULL
};
502 return libxfs_getbuf_flags(btp
, map
[0].bm_bn
, map
[0].bm_len
,
506 key
.blkno
= map
[0].bm_bn
;
507 for (i
= 0; i
< nmaps
; i
++) {
508 key
.bblen
+= map
[i
].bm_len
;
513 return __cache_lookup(&key
, flags
, bpp
);
518 struct xfs_buftarg
*btp
,
519 struct xfs_buf_map
*map
,
522 struct xfs_buf
**bpp
)
526 error
= __libxfs_buf_get_map(btp
, map
, nmaps
, flags
, bpp
);
530 reset_buf_state(*bpp
);
539 * ensure that any errors on this use of the buffer don't carry
540 * over to the next user.
543 if (use_xfs_buf_lock
) {
548 pthread_mutex_unlock(&bp
->b_lock
);
552 if (!list_empty(&bp
->b_node
.cn_hash
))
553 cache_node_put(libxfs_bcache
, &bp
->b_node
);
554 else if (--bp
->b_node
.cn_count
== 0) {
555 if (bp
->b_flags
& LIBXFS_B_DIRTY
)
557 libxfs_brelse(&bp
->b_node
);
561 static struct cache_node
*
565 struct xfs_bufkey
*bufkey
= (struct xfs_bufkey
*)key
;
569 bp
= libxfs_getbufr_map(bufkey
->buftarg
, bufkey
->blkno
,
570 bufkey
->bblen
, bufkey
->map
, bufkey
->nmaps
);
572 bp
= libxfs_getbufr(bufkey
->buftarg
, bufkey
->blkno
,
579 __read_buf(int fd
, void *buf
, int len
, off64_t offset
, int flags
)
583 sts
= pread(fd
, buf
, len
, offset
);
586 fprintf(stderr
, _("%s: read failed: %s\n"),
587 progname
, strerror(error
));
589 } else if (sts
!= len
) {
590 fprintf(stderr
, _("%s: error - read only %d of %d bytes\n"),
598 libxfs_readbufr(struct xfs_buftarg
*btp
, xfs_daddr_t blkno
, struct xfs_buf
*bp
,
601 int fd
= btp
->bt_bdev_fd
;
602 int bytes
= BBTOB(len
);
605 ASSERT(len
<= bp
->b_length
);
607 error
= __read_buf(fd
, bp
->b_addr
, bytes
, LIBXFS_BBTOOFF64(blkno
), flags
);
609 bp
->b_target
->bt_bdev
== btp
->bt_bdev
&&
610 bp
->b_cache_key
== blkno
&&
612 bp
->b_flags
|= LIBXFS_B_UPTODATE
;
618 libxfs_readbuf_verify(
620 const struct xfs_buf_ops
*ops
)
626 bp
->b_ops
->verify_read(bp
);
627 bp
->b_flags
&= ~LIBXFS_B_UNCHECKED
;
632 libxfs_readbufr_map(struct xfs_buftarg
*btp
, struct xfs_buf
*bp
, int flags
)
634 int fd
= btp
->bt_bdev_fd
;
640 for (i
= 0; i
< bp
->b_nmaps
; i
++) {
641 off64_t offset
= LIBXFS_BBTOOFF64(bp
->b_maps
[i
].bm_bn
);
642 int len
= BBTOB(bp
->b_maps
[i
].bm_len
);
644 error
= __read_buf(fd
, buf
, len
, offset
, flags
);
653 bp
->b_flags
|= LIBXFS_B_UPTODATE
;
659 struct xfs_buftarg
*btp
,
660 struct xfs_buf_map
*map
,
663 struct xfs_buf
**bpp
,
664 const struct xfs_buf_ops
*ops
)
667 bool salvage
= flags
& LIBXFS_READBUF_SALVAGE
;
672 error
= libxfs_getbuf_flags(btp
, map
[0].bm_bn
, map
[0].bm_len
,
675 error
= __libxfs_buf_get_map(btp
, map
, nmaps
, 0, &bp
);
680 * If the buffer was prefetched, it is likely that it was not validated.
681 * Hence if we are supplied an ops function and the buffer is marked as
682 * unchecked, we need to validate it now.
684 * We do this verification even if the buffer is dirty - the
685 * verification is almost certainly going to fail the CRC check in this
686 * case as a dirty buffer has not had the CRC recalculated. However, we
687 * should not be dirtying unchecked buffers and therefore failing it
688 * here because it's dirty and unchecked indicates we've screwed up
691 * Note that if the caller passes in LIBXFS_READBUF_SALVAGE, that means
692 * they want the buffer even if it fails verification.
695 if (bp
->b_flags
& (LIBXFS_B_UPTODATE
| LIBXFS_B_DIRTY
)) {
696 if (bp
->b_flags
& LIBXFS_B_UNCHECKED
)
697 error
= libxfs_readbuf_verify(bp
, ops
);
698 if (error
&& !salvage
)
704 * Set the ops on a cache miss (i.e. first physical read) as the
705 * verifier may change the ops to match the type of buffer it contains.
706 * A cache hit might reset the verifier to the original type if we set
707 * it again, but it won't get called again and set to match the buffer
708 * contents. *cough* xfs_da_node_buf_ops *cough*.
711 error
= libxfs_readbufr(btp
, map
[0].bm_bn
, bp
, map
[0].bm_len
,
714 error
= libxfs_readbufr_map(btp
, bp
, flags
);
718 error
= libxfs_readbuf_verify(bp
, ops
);
719 if (error
&& !salvage
)
726 libxfs_buf_relse(bp
);
730 /* Allocate a raw uncached buffer. */
731 static inline struct xfs_buf
*
732 libxfs_getbufr_uncached(
733 struct xfs_buftarg
*targ
,
739 bp
= libxfs_getbufr(targ
, daddr
, bblen
);
743 INIT_LIST_HEAD(&bp
->b_node
.cn_hash
);
744 bp
->b_node
.cn_count
= 1;
749 * Allocate an uncached buffer that points nowhere. The refcount will be 1,
750 * and the cache node hash list will be empty to indicate that it's uncached.
753 libxfs_buf_get_uncached(
754 struct xfs_buftarg
*targ
,
757 struct xfs_buf
**bpp
)
759 *bpp
= libxfs_getbufr_uncached(targ
, XFS_BUF_DADDR_NULL
, bblen
);
760 return *bpp
!= NULL
? 0 : -ENOMEM
;
764 * Allocate and read an uncached buffer. The refcount will be 1, and the cache
765 * node hash list will be empty to indicate that it's uncached.
768 libxfs_buf_read_uncached(
769 struct xfs_buftarg
*targ
,
773 struct xfs_buf
**bpp
,
774 const struct xfs_buf_ops
*ops
)
780 bp
= libxfs_getbufr_uncached(targ
, daddr
, bblen
);
784 error
= libxfs_readbufr(targ
, daddr
, bp
, bblen
, flags
);
788 error
= libxfs_readbuf_verify(bp
, ops
);
795 libxfs_buf_relse(bp
);
800 __write_buf(int fd
, void *buf
, int len
, off64_t offset
, int flags
)
804 sts
= pwrite(fd
, buf
, len
, offset
);
807 fprintf(stderr
, _("%s: pwrite failed: %s\n"),
808 progname
, strerror(error
));
810 } else if (sts
!= len
) {
811 fprintf(stderr
, _("%s: error - pwrite only %d of %d bytes\n"),
822 int fd
= bp
->b_target
->bt_bdev_fd
;
825 * we never write buffers that are marked stale. This indicates they
826 * contain data that has been invalidated, and even if the buffer is
827 * dirty it must *never* be written. Verifiers are wonderful for finding
828 * bugs like this. Make sure the error is obvious as to the cause.
830 if (bp
->b_flags
& LIBXFS_B_STALE
) {
831 bp
->b_error
= -ESTALE
;
835 /* Trigger the writeback hook if there is one. */
836 if (bp
->b_mount
->m_buf_writeback_fn
)
837 bp
->b_mount
->m_buf_writeback_fn(bp
);
840 * clear any pre-existing error status on the buffer. This can occur if
841 * the buffer is corrupt on disk and the repair process doesn't clear
842 * the error before fixing and writing it back.
846 bp
->b_ops
->verify_write(bp
);
849 _("%s: write verifier failed on %s bno 0x%llx/0x%x\n"),
850 __func__
, bp
->b_ops
->name
,
851 (unsigned long long)xfs_buf_daddr(bp
),
857 if (!(bp
->b_flags
& LIBXFS_B_DISCONTIG
)) {
858 bp
->b_error
= __write_buf(fd
, bp
->b_addr
, BBTOB(bp
->b_length
),
859 LIBXFS_BBTOOFF64(xfs_buf_daddr(bp
)),
863 void *buf
= bp
->b_addr
;
865 for (i
= 0; i
< bp
->b_nmaps
; i
++) {
866 off64_t offset
= LIBXFS_BBTOOFF64(bp
->b_maps
[i
].bm_bn
);
867 int len
= BBTOB(bp
->b_maps
[i
].bm_len
);
869 bp
->b_error
= __write_buf(fd
, buf
, len
, offset
,
879 _("%s: write failed on %s bno 0x%llx/0x%x, err=%d\n"),
880 __func__
, bp
->b_ops
? bp
->b_ops
->name
: "(unknown)",
881 (unsigned long long)xfs_buf_daddr(bp
),
882 bp
->b_length
, -bp
->b_error
);
884 bp
->b_flags
|= LIBXFS_B_UPTODATE
;
885 bp
->b_flags
&= ~(LIBXFS_B_DIRTY
| LIBXFS_B_UNCHECKED
);
886 xfs_buftarg_trip_write(bp
->b_target
);
892 * Mark a buffer dirty. The dirty data will be written out when the cache
893 * is flushed (or at release time if the buffer is uncached).
896 libxfs_buf_mark_dirty(
900 * Clear any error hanging over from reading the buffer. This prevents
901 * subsequent reads after this write from seeing stale errors.
904 bp
->b_flags
&= ~LIBXFS_B_STALE
;
905 bp
->b_flags
|= LIBXFS_B_DIRTY
;
908 /* Prepare a buffer to be sent to the MRU list. */
910 libxfs_buf_prepare_mru(
914 xfs_perag_put(bp
->b_pag
);
917 if (!(bp
->b_flags
& LIBXFS_B_DIRTY
))
920 /* Complain about (and remember) dropping dirty buffers. */
921 fprintf(stderr
, _("%s: Releasing dirty buffer to free list!\n"),
924 if (bp
->b_error
== -EFSCORRUPTED
)
925 bp
->b_target
->flags
|= XFS_BUFTARG_CORRUPT_WRITE
;
926 bp
->b_target
->flags
|= XFS_BUFTARG_LOST_WRITE
;
931 struct cache_node
*node
)
933 struct xfs_buf
*bp
= container_of(node
, struct xfs_buf
,
938 libxfs_buf_prepare_mru(bp
);
940 pthread_mutex_lock(&xfs_buf_freelist
.cm_mutex
);
941 list_add(&bp
->b_node
.cn_mru
, &xfs_buf_freelist
.cm_list
);
942 pthread_mutex_unlock(&xfs_buf_freelist
.cm_mutex
);
948 struct list_head
*list
)
953 if (list_empty(list
))
956 list_for_each_entry(bp
, list
, b_node
.cn_mru
) {
957 libxfs_buf_prepare_mru(bp
);
961 pthread_mutex_lock(&xfs_buf_freelist
.cm_mutex
);
962 list_splice(list
, &xfs_buf_freelist
.cm_list
);
963 pthread_mutex_unlock(&xfs_buf_freelist
.cm_mutex
);
969 * Free everything from the xfs_buf_freelist MRU, used at final teardown
972 libxfs_bcache_free(void)
974 struct list_head
*cm_list
;
975 struct xfs_buf
*bp
, *next
;
977 cm_list
= &xfs_buf_freelist
.cm_list
;
978 list_for_each_entry_safe(bp
, next
, cm_list
, b_node
.cn_mru
) {
980 if (bp
->b_maps
!= &bp
->__b_map
)
982 kmem_cache_free(xfs_buf_cache
, bp
);
987 * When a buffer is marked dirty, the error is cleared. Hence if we are trying
988 * to flush a buffer prior to cache reclaim that has an error on it it means
989 * we've already tried to flush it and it failed. Prevent repeated corruption
990 * errors from being reported by skipping such buffers - when the corruption is
991 * fixed the buffer will be marked dirty again and we can write it again.
995 struct cache_node
*node
)
997 struct xfs_buf
*bp
= container_of(node
, struct xfs_buf
,
1000 if (!bp
->b_error
&& bp
->b_flags
& LIBXFS_B_DIRTY
)
1001 return libxfs_bwrite(bp
);
1006 libxfs_bcache_purge(void)
1008 cache_purge(libxfs_bcache
);
1012 libxfs_bcache_flush(void)
1014 cache_flush(libxfs_bcache
);
1018 libxfs_bcache_overflowed(void)
1020 return cache_overflowed(libxfs_bcache
);
1023 struct cache_operations libxfs_bcache_operations
= {
1024 .hash
= libxfs_bhash
,
1025 .alloc
= libxfs_balloc
,
1026 .flush
= libxfs_bflush
,
1027 .relse
= libxfs_brelse
,
1028 .compare
= libxfs_bcompare
,
1029 .bulkrelse
= libxfs_bulkrelse
1033 * Verify an on-disk magic value against the magic value specified in the
1034 * verifier structure. The verifier magic is in disk byte order so the caller is
1035 * expected to pass the value directly from disk.
1042 struct xfs_mount
*mp
= bp
->b_mount
;
1045 idx
= xfs_has_crc(mp
);
1046 if (unlikely(WARN_ON(!bp
->b_ops
|| !bp
->b_ops
->magic
[idx
])))
1048 return dmagic
== bp
->b_ops
->magic
[idx
];
1052 * Verify an on-disk magic value against the magic value specified in the
1053 * verifier structure. The verifier magic is in disk byte order so the caller is
1054 * expected to pass the value directly from disk.
1061 struct xfs_mount
*mp
= bp
->b_mount
;
1064 idx
= xfs_has_crc(mp
);
1065 if (unlikely(WARN_ON(!bp
->b_ops
|| !bp
->b_ops
->magic16
[idx
])))
1067 return dmagic
== bp
->b_ops
->magic16
[idx
];
/*
 * Inode cache stubs.
 */

struct kmem_cache		*xfs_inode_cache;
extern struct kmem_cache	*xfs_ili_cache;
1079 struct xfs_mount
*mp
,
1080 struct xfs_trans
*tp
,
1083 struct xfs_inode
**ipp
)
1085 struct xfs_inode
*ip
;
1087 struct xfs_perag
*pag
;
1090 /* reject inode numbers outside existing AGs */
1091 if (!ino
|| XFS_INO_TO_AGNO(mp
, ino
) >= mp
->m_sb
.sb_agcount
)
1094 ip
= kmem_cache_zalloc(xfs_inode_cache
, 0);
1098 VFS_I(ip
)->i_count
= 1;
1101 ip
->i_af
.if_format
= XFS_DINODE_FMT_EXTENTS
;
1102 spin_lock_init(&VFS_I(ip
)->i_lock
);
1104 pag
= xfs_perag_get(mp
, XFS_INO_TO_AGNO(mp
, ip
->i_ino
));
1105 error
= xfs_imap(pag
, tp
, ip
->i_ino
, &ip
->i_imap
, 0);
1111 error
= xfs_imap_to_bp(mp
, tp
, &ip
->i_imap
, &bp
);
1115 error
= xfs_inode_from_disk(ip
,
1116 xfs_buf_offset(bp
, ip
->i_imap
.im_boffset
));
1118 xfs_buf_set_ref(bp
, XFS_INO_REF
);
1119 xfs_trans_brelse(tp
, bp
);
1128 kmem_cache_free(xfs_inode_cache
, ip
);
1134 libxfs_idestroy(xfs_inode_t
*ip
)
1136 switch (VFS_I(ip
)->i_mode
& S_IFMT
) {
1140 libxfs_idestroy_fork(&ip
->i_df
);
1144 libxfs_ifork_zap_attr(ip
);
1147 libxfs_idestroy_fork(ip
->i_cowfp
);
1148 kmem_cache_free(xfs_ifork_cache
, ip
->i_cowfp
);
1154 struct xfs_inode
*ip
)
1156 VFS_I(ip
)->i_count
--;
1158 if (VFS_I(ip
)->i_count
== 0) {
1159 ASSERT(ip
->i_itemp
== NULL
);
1160 libxfs_idestroy(ip
);
1161 kmem_cache_free(xfs_inode_cache
, ip
);
1166 * Flush everything dirty in the kernel and disk write caches to stable media.
1167 * Returns 0 for success or a negative error code.
1170 libxfs_blkdev_issue_flush(
1171 struct xfs_buftarg
*btp
)
1175 if (btp
->bt_bdev
== 0)
1178 ret
= platform_flush_device(btp
->bt_bdev_fd
, btp
->bt_bdev
);
1179 return ret
? -errno
: 0;
1183 * Write out a buffer list synchronously.
1185 * This will take the @buffer_list, write all buffers out and wait for I/O
1186 * completion on all of the buffers. @buffer_list is consumed by the function,
1187 * so callers must have some other way of tracking buffers if they require such
1191 xfs_buf_delwri_submit(
1192 struct list_head
*buffer_list
)
1194 struct xfs_buf
*bp
, *n
;
1195 int error
= 0, error2
;
1197 list_for_each_entry_safe(bp
, n
, buffer_list
, b_list
) {
1198 list_del_init(&bp
->b_list
);
1199 error2
= libxfs_bwrite(bp
);
1202 libxfs_buf_relse(bp
);
1209 * Cancel a delayed write list.
1211 * Remove each buffer from the list, clear the delwri queue flag and drop the
1212 * associated buffer reference.
1215 xfs_buf_delwri_cancel(
1216 struct list_head
*list
)
1220 while (!list_empty(list
)) {
1221 bp
= list_first_entry(list
, struct xfs_buf
, b_list
);
1223 list_del_init(&bp
->b_list
);
1224 libxfs_buf_relse(bp
);
1229 * Format the log. The caller provides either a buftarg which is used to access
1230 * the log via buffers or a direct pointer to a buffer that encapsulates the
1235 struct xfs_buftarg
*btp
,
1238 uint length
, /* basic blocks */
1241 int sunit
, /* bytes */
1246 struct xfs_buf
*bp
= NULL
;
1251 xfs_daddr_t end_blk
;
1254 if (((btp
&& dptr
) || (!btp
&& !dptr
)) ||
1255 (btp
&& !btp
->bt_bdev
) || !fs_uuid
)
1258 /* first zero the log */
1260 libxfs_device_zero(btp
, start
, length
);
1262 memset(dptr
, 0, BBTOB(length
));
1265 * Initialize the log record length and LSNs. XLOG_INIT_CYCLE is a
1266 * special reset case where we only write a single record where the lsn
1267 * and tail_lsn match. Otherwise, the record lsn starts at block 0 of
1268 * the specified cycle and points tail_lsn at the last record of the
1271 len
= ((version
== 2) && sunit
) ? BTOBB(sunit
) : 2;
1273 lsn
= xlog_assign_lsn(cycle
, 0);
1274 if (cycle
== XLOG_INIT_CYCLE
)
1277 tail_lsn
= xlog_assign_lsn(cycle
- 1, length
- len
);
1279 /* write out the first log record */
1282 bp
= libxfs_getbufr_uncached(btp
, start
, len
);
1285 libxfs_log_header(ptr
, fs_uuid
, version
, sunit
, fmt
, lsn
, tail_lsn
,
1288 libxfs_buf_mark_dirty(bp
);
1289 libxfs_buf_relse(bp
);
1293 * There's nothing else to do if this is a log reset. The kernel detects
1294 * the rest of the log is zeroed and starts at cycle 1.
1296 if (cycle
== XLOG_INIT_CYCLE
)
1300 * Bump the record size for a full log format if the caller allows it.
1301 * This is primarily for performance reasons and most callers don't care
1302 * about record size since the log is clean after we're done.
1305 len
= BTOBB(BDSTRAT_SIZE
);
1308 * Otherwise, fill everything beyond the initial record with records of
1309 * the previous cycle so the kernel head/tail detection works correctly.
1311 * We don't particularly care about the record size or content here.
1312 * It's only important that the headers are in place such that the
1313 * kernel finds 1.) a clean log and 2.) the correct current cycle value.
1314 * Therefore, bump up the record size to the max to use larger I/Os and
1315 * improve performance.
1321 end_blk
= start
+ length
;
1323 len
= min(end_blk
- blk
, len
);
1324 while (blk
< end_blk
) {
1325 lsn
= xlog_assign_lsn(cycle
, blk
- start
);
1326 tail_lsn
= xlog_assign_lsn(cycle
, blk
- start
- len
);
1330 bp
= libxfs_getbufr_uncached(btp
, blk
, len
);
1334 * Note: pass the full buffer length as the sunit to initialize
1335 * the entire buffer.
1337 libxfs_log_header(ptr
, fs_uuid
, version
, BBTOB(len
), fmt
, lsn
,
1338 tail_lsn
, next
, bp
);
1340 libxfs_buf_mark_dirty(bp
);
1341 libxfs_buf_relse(bp
);
1347 len
= min(end_blk
- blk
, len
);
1362 libxfs_get_block_t
*nextfunc
,
1365 xlog_rec_header_t
*head
= (xlog_rec_header_t
*)caddr
;
1371 if (lsn
== NULLCOMMITLSN
)
1372 lsn
= xlog_assign_lsn(XLOG_INIT_CYCLE
, 0);
1373 if (tail_lsn
== NULLCOMMITLSN
)
1376 len
= ((version
== 2) && sunit
) ? BTOBB(sunit
) : 1;
1378 memset(p
, 0, BBSIZE
);
1379 head
->h_magicno
= cpu_to_be32(XLOG_HEADER_MAGIC_NUM
);
1380 head
->h_cycle
= cpu_to_be32(CYCLE_LSN(lsn
));
1381 head
->h_version
= cpu_to_be32(version
);
1382 head
->h_crc
= cpu_to_le32(0);
1383 head
->h_prev_block
= cpu_to_be32(-1);
1384 head
->h_num_logops
= cpu_to_be32(1);
1385 head
->h_fmt
= cpu_to_be32(fmt
);
1386 head
->h_size
= cpu_to_be32(max(sunit
, XLOG_BIG_RECORD_BSIZE
));
1388 head
->h_lsn
= cpu_to_be64(lsn
);
1389 head
->h_tail_lsn
= cpu_to_be64(tail_lsn
);
1391 memcpy(&head
->h_fs_uuid
, fs_uuid
, sizeof(uuid_t
));
1394 * The kernel expects to see either a log record header magic value or
1395 * the LSN cycle at the top of every log block. The first word of each
1396 * non-header block is copied to the record headers and replaced with
1397 * the cycle value (see xlog_[un]pack_data() and xlog_get_cycle() for
1400 * Even though we only ever write an unmount record (one block), we
1401 * support writing log records up to the max log buffer size of 256k to
1402 * improve log format performance. This means a record can require up
1403 * to 8 headers (1 rec. header + 7 ext. headers) for the packed cycle
1404 * data (each header supports 32k of data).
1406 cycle_lsn
= CYCLE_LSN_DISK(head
->h_lsn
);
1407 if (version
== 2 && sunit
> XLOG_HEADER_CYCLE_SIZE
) {
1408 hdrs
= sunit
/ XLOG_HEADER_CYCLE_SIZE
;
1409 if (sunit
% XLOG_HEADER_CYCLE_SIZE
)
1414 * A fixed number of extended headers is expected based on h_size. If
1415 * required, format those now so the unmount record is located
1418 * Since we only write an unmount record, we only need one h_cycle_data
1419 * entry for the unmount record block. The subsequent record data
1420 * blocks are zeroed, which means we can stamp them directly with the
1421 * cycle and zero the rest of the cycle data in the extended headers.
1424 for (i
= 1; i
< hdrs
; i
++) {
1425 p
= nextfunc(p
, BBSIZE
, private);
1426 memset(p
, 0, BBSIZE
);
1427 /* xlog_rec_ext_header.xh_cycle */
1428 *(__be32
*)p
= cycle_lsn
;
1433 * The total length is the max of the stripe unit or 2 basic block
1434 * minimum (1 hdr blk + 1 data blk). The record length is the total
1435 * minus however many header blocks are required.
1437 head
->h_len
= cpu_to_be32(max(BBTOB(2), sunit
) - hdrs
* BBSIZE
);
1440 * Write out the unmount record, pack the first word into the record
1441 * header and stamp the block with the cycle.
1443 p
= nextfunc(p
, BBSIZE
, private);
1446 head
->h_cycle_data
[0] = *(__be32
*)p
;
1447 *(__be32
*)p
= cycle_lsn
;
1450 * Finally, zero all remaining blocks in the record and stamp each with
1451 * the cycle. We don't need to pack any of these blocks because the
1452 * cycle data in the headers has already been zeroed.
1454 len
= max(len
, hdrs
+ 1);
1455 for (i
= hdrs
+ 1; i
< len
; i
++) {
1456 p
= nextfunc(p
, BBSIZE
, private);
1457 memset(p
, 0, BBSIZE
);
1458 *(__be32
*)p
= cycle_lsn
;
1465 libxfs_buf_set_priority(
1469 cache_node_set_priority(libxfs_bcache
, &bp
->b_node
, priority
);
1473 libxfs_buf_priority(
1476 return cache_node_get_priority(&bp
->b_node
);
1480 * Log a message about and stale a buffer that a caller has decided is corrupt.
1482 * This function should be called for the kinds of metadata corruption that
1483 * cannot be detect from a verifier, such as incorrect inter-block relationship
1484 * data. Do /not/ call this function from a verifier function.
1486 * The buffer must be XBF_DONE prior to the call. Afterwards, the buffer will
1487 * be marked stale, but b_error will not be set. The caller is responsible for
1488 * releasing the buffer or fixing it.
1491 __xfs_buf_mark_corrupt(
1495 ASSERT(bp
->b_flags
& XBF_DONE
);
1497 xfs_buf_corruption_error(bp
, fa
);