libxfs/rdwr.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Copyright (c) 2000-2006 Silicon Graphics, Inc.
   4  * All Rights Reserved.
   5  */
   6
   7
   8 #include "libxfs_priv.h"
   9 #include "init.h"
  10 #include "xfs_fs.h"
  11 #include "xfs_shared.h"
  12 #include "xfs_format.h"
  13 #include "xfs_log_format.h"
  14 #include "xfs_trans_resv.h"
  15 #include "xfs_mount.h"
  16 #include "xfs_inode_buf.h"
  17 #include "xfs_inode_fork.h"
  18 #include "xfs_inode.h"
  19 #include "xfs_trans.h"
  20 #include "libfrog/platform.h"
  21
  22 #include "libxfs.h"
  23
  24 static void libxfs_brelse(struct cache_node *node);
  25
  26 /*
  27  * Important design/architecture note:
  28  *
  29  * The userspace code that uses the buffer cache is much less constrained than
  30  * the kernel code. The userspace code is pretty nasty in places, especially
  31  * when it comes to buffer error handling.  Very little of the userspace code
  32  * outside libxfs clears bp->b_error - very little code even checks it - so the
  33  * libxfs code is tripping on stale errors left by the userspace code.
  34  *
  35  * We can't clear errors or zero buffer contents in libxfs_buf_get-* like we do
  36  * in the kernel, because those functions are used by the libxfs_readbuf_*
  37  * functions and hence need to leave the buffers unchanged on cache hits. This
  38  * is actually the only way to gather a write error from a libxfs_writebuf()
  39  * call - you need to get the buffer again so you can check bp->b_error field -
  40  * assuming that the buffer is still in the cache when you check, that is.
  41  *
  42  * This is very different to the kernel code which does not release buffers on a
  43  * write so we can wait on IO and check errors. The kernel buffer cache also
  44  * guarantees a buffer of a known initial state from xfs_buf_get() even on a
  45  * cache hit.
  46  *
  47  * IOWs, userspace is behaving quite differently to the kernel and as a result
  48  * it leaks errors from reads, invalidations and writes through
  49  * libxfs_buf_get/libxfs_buf_read.
  50  *
  51  * The result of this is that until the userspace code outside libxfs is cleaned
  52  * up, functions that release buffers from userspace control (i.e
  53  * libxfs_writebuf/libxfs_buf_relse) need to zero bp->b_error to prevent
  54  * propagation of stale errors into future buffer operations.
  55  */
  56
  57 #define BDSTRAT_SIZE    (256 * 1024)
  58
  59 #define IO_BCOMPARE_CHECK
  60
  61 /* XXX: (dgc) Propagate errors, only exit if fail-on-error flag set */
  62 int
  63 libxfs_device_zero(struct xfs_buftarg *btp, xfs_daddr_t start, uint len)
  64 {
  65         xfs_off_t       start_offset, end_offset, offset;
  66         ssize_t         zsize, bytes;
  67         size_t          len_bytes;
  68         char            *z;
  69         int             error, fd;
  70
  71         fd = libxfs_device_to_fd(btp->bt_bdev);
  72         start_offset = LIBXFS_BBTOOFF64(start);
  73
  74         /* try to use special zeroing methods, fall back to writes if needed */
  75         len_bytes = LIBXFS_BBTOOFF64(len);
  76         error = platform_zero_range(fd, start_offset, len_bytes);
  77         if (!error) {
  78                 xfs_buftarg_trip_write(btp);
  79                 return 0;
  80         }
  81
  82         zsize = min(BDSTRAT_SIZE, BBTOB(len));
  83         if ((z = memalign(libxfs_device_alignment(), zsize)) == NULL) {
  84                 fprintf(stderr,
  85                         _("%s: %s can't memalign %d bytes: %s\n"),
  86                         progname, __FUNCTION__, (int)zsize, strerror(errno));
  87                 exit(1);
  88         }
  89         memset(z, 0, zsize);
  90
  91         if ((lseek(fd, start_offset, SEEK_SET)) < 0) {
  92                 fprintf(stderr, _("%s: %s seek to offset %llu failed: %s\n"),
  93                         progname, __FUNCTION__,
  94                         (unsigned long long)start_offset, strerror(errno));
  95                 exit(1);
  96         }
  97
  98         end_offset = LIBXFS_BBTOOFF64(start + len) - start_offset;
  99         for (offset = 0; offset < end_offset; ) {
 100                 bytes = min((ssize_t)(end_offset - offset), zsize);
 101                 if ((bytes = write(fd, z, bytes)) < 0) {
 102                         fprintf(stderr, _("%s: %s write failed: %s\n"),
 103                                 progname, __FUNCTION__, strerror(errno));
 104                         exit(1);
 105                 } else if (bytes == 0) {
 106                         fprintf(stderr, _("%s: %s not progressing?\n"),
 107                                 progname, __FUNCTION__);
 108                         exit(1);
 109                 }
 110                 xfs_buftarg_trip_write(btp);
 111                 offset += bytes;
 112         }
 113         free(z);
 114         return 0;
 115 }
 116
 117 static void unmount_record(void *p)
 118 {
 119         xlog_op_header_t        *op = (xlog_op_header_t *)p;
 120         /* the data section must be 32 bit size aligned */
 121         struct {
 122             uint16_t magic;
 123             uint16_t pad1;
 124             uint32_t pad2; /* may as well make it 64 bits */
 125         } magic = { XLOG_UNMOUNT_TYPE, 0, 0 };
 126
 127         memset(p, 0, BBSIZE);
 128         /* dummy tid to mark this as written from userspace */
 129         op->oh_tid = cpu_to_be32(0xb0c0d0d0);
 130         op->oh_len = cpu_to_be32(sizeof(magic));
 131         op->oh_clientid = XFS_LOG;
 132         op->oh_flags = XLOG_UNMOUNT_TRANS;
 133         op->oh_res2 = 0;
 134
 135         /* and the data for this op */
 136         memcpy((char *)p + sizeof(xlog_op_header_t), &magic, sizeof(magic));
 137 }
 138
 139 static char *next(
 140         char            *ptr,
 141         int             offset,
 142         void            *private)
 143 {
 144         struct xfs_buf  *buf = (struct xfs_buf *)private;
 145
 146         if (buf &&
 147             (BBTOB(buf->b_length) < (int)(ptr - (char *)buf->b_addr) + offset))
 148                 abort();
 149
 150         return ptr + offset;
 151 }
 152
 153 struct xfs_buf *
 154 libxfs_getsb(
 155         struct xfs_mount        *mp)
 156 {
 157         struct xfs_buf          *bp;
 158
 159         libxfs_buf_read(mp->m_ddev_targp, XFS_SB_DADDR, XFS_FSS_TO_BB(mp, 1),
 160                         0, &bp, &xfs_sb_buf_ops);
 161         return bp;
 162 }
 163
 164 struct kmem_cache                       *xfs_buf_cache;
 165
 166 static struct cache_mru         xfs_buf_freelist =
 167         {{&xfs_buf_freelist.cm_list, &xfs_buf_freelist.cm_list},
 168          0, PTHREAD_MUTEX_INITIALIZER };
 169
 170 /*
 171  * The bufkey is used to pass the new buffer information to the cache object
 172  * allocation routine. Because discontiguous buffers need to pass different
 173  * information, we need fields to pass that information. However, because the
 174  * blkno and bblen is needed for the initial cache entry lookup (i.e. for
 175  * bcompare) the fact that the map/nmaps is non-null to switch to discontiguous
 176  * buffer initialisation instead of a contiguous buffer.
 177  */
 178 struct xfs_bufkey {
 179         struct xfs_buftarg      *buftarg;
 180         xfs_daddr_t             blkno;
 181         unsigned int            bblen;
 182         struct xfs_buf_map      *map;
 183         int                     nmaps;
 184 };
 185
 186 /*  2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */
 187 #define GOLDEN_RATIO_PRIME      0x9e37fffffffc0001UL
 188 #define CACHE_LINE_SIZE         64
 189 static unsigned int
 190 libxfs_bhash(cache_key_t key, unsigned int hashsize, unsigned int hashshift)
 191 {
 192         uint64_t        hashval = ((struct xfs_bufkey *)key)->blkno;
 193         uint64_t        tmp;
 194
 195         tmp = hashval ^ (GOLDEN_RATIO_PRIME + hashval) / CACHE_LINE_SIZE;
 196         tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> hashshift);
 197         return tmp % hashsize;
 198 }
 199
 200 static int
 201 libxfs_bcompare(struct cache_node *node, cache_key_t key)
 202 {
 203         struct xfs_buf          *bp = container_of(node, struct xfs_buf,
 204                                                    b_node);
 205         struct xfs_bufkey       *bkey = (struct xfs_bufkey *)key;
 206
 207         if (bp->b_target->bt_bdev == bkey->buftarg->bt_bdev &&
 208             bp->b_cache_key == bkey->blkno) {
 209                 if (bp->b_length == bkey->bblen)
 210                         return CACHE_HIT;
 211 #ifdef IO_BCOMPARE_CHECK
 212                 if (!(libxfs_bcache->c_flags & CACHE_MISCOMPARE_PURGE)) {
 213                         fprintf(stderr,
 214         "%lx: Badness in key lookup (length)\n"
 215         "bp=(bno 0x%llx, len %u bytes) key=(bno 0x%llx, len %u bytes)\n",
 216                                 pthread_self(),
 217                                 (unsigned long long)xfs_buf_daddr(bp),
 218                                 BBTOB(bp->b_length),
 219                                 (unsigned long long)bkey->blkno,
 220                                 BBTOB(bkey->bblen));
 221                 }
 222 #endif
 223                 return CACHE_PURGE;
 224         }
 225         return CACHE_MISS;
 226 }
 227
 228 static void
 229 __initbuf(struct xfs_buf *bp, struct xfs_buftarg *btp, xfs_daddr_t bno,
 230                 unsigned int bytes)
 231 {
 232         bp->b_flags = 0;
 233         bp->b_cache_key = bno;
 234         bp->b_length = BTOBB(bytes);
 235         bp->b_target = btp;
 236         bp->b_mount = btp->bt_mount;
 237         bp->b_error = 0;
 238         if (!bp->b_addr)
 239                 bp->b_addr = memalign(libxfs_device_alignment(), bytes);
 240         if (!bp->b_addr) {
 241                 fprintf(stderr,
 242                         _("%s: %s can't memalign %u bytes: %s\n"),
 243                         progname, __FUNCTION__, bytes,
 244                         strerror(errno));
 245                 exit(1);
 246         }
 247         memset(bp->b_addr, 0, bytes);
 248         pthread_mutex_init(&bp->b_lock, NULL);
 249         bp->b_holder = 0;
 250         bp->b_recur = 0;
 251         bp->b_ops = NULL;
 252         INIT_LIST_HEAD(&bp->b_li_list);
 253
 254         if (!bp->b_maps)
 255                 bp->b_maps = &bp->__b_map;
 256
 257         if (bp->b_maps == &bp->__b_map) {
 258                 bp->b_nmaps = 1;
 259                 bp->b_maps[0].bm_bn = bno;
 260                 bp->b_maps[0].bm_len = bp->b_length;
 261         }
 262 }
 263
 264 static void
 265 libxfs_initbuf(struct xfs_buf *bp, struct xfs_buftarg *btp, xfs_daddr_t bno,
 266                 unsigned int bytes)
 267 {
 268         __initbuf(bp, btp, bno, bytes);
 269 }
 270
 271 static void
 272 libxfs_initbuf_map(struct xfs_buf *bp, struct xfs_buftarg *btp,
 273                 struct xfs_buf_map *map, int nmaps)
 274 {
 275         unsigned int bytes = 0;
 276         int i;
 277
 278         bytes = sizeof(struct xfs_buf_map) * nmaps;
 279         bp->b_maps = malloc(bytes);
 280         if (!bp->b_maps) {
 281                 fprintf(stderr,
 282                         _("%s: %s can't malloc %u bytes: %s\n"),
 283                         progname, __FUNCTION__, bytes,
 284                         strerror(errno));
 285                 exit(1);
 286         }
 287         bp->b_nmaps = nmaps;
 288
 289         bytes = 0;
 290         for ( i = 0; i < nmaps; i++) {
 291                 bp->b_maps[i].bm_bn = map[i].bm_bn;
 292                 bp->b_maps[i].bm_len = map[i].bm_len;
 293                 bytes += BBTOB(map[i].bm_len);
 294         }
 295
 296         __initbuf(bp, btp, map[0].bm_bn, bytes);
 297         bp->b_flags |= LIBXFS_B_DISCONTIG;
 298 }
 299
 300 static struct xfs_buf *
 301 __libxfs_getbufr(int blen)
 302 {
 303         struct xfs_buf  *bp;
 304
 305         /*
 306          * first look for a buffer that can be used as-is,
 307          * if one cannot be found, see if there is a buffer,
 308          * and if so, free its buffer and set b_addr to NULL
 309          * before calling libxfs_initbuf.
 310          */
 311         pthread_mutex_lock(&xfs_buf_freelist.cm_mutex);
 312         if (!list_empty(&xfs_buf_freelist.cm_list)) {
 313                 list_for_each_entry(bp, &xfs_buf_freelist.cm_list, b_node.cn_mru) {
 314                         if (bp->b_length == BTOBB(blen)) {
 315                                 list_del_init(&bp->b_node.cn_mru);
 316                                 break;
 317                         }
 318                 }
 319                 if (&bp->b_node.cn_mru == &xfs_buf_freelist.cm_list) {
 320                         bp = list_entry(xfs_buf_freelist.cm_list.next,
 321                                         struct xfs_buf, b_node.cn_mru);
 322                         list_del_init(&bp->b_node.cn_mru);
 323                         free(bp->b_addr);
 324                         bp->b_addr = NULL;
 325                         if (bp->b_maps != &bp->__b_map)
 326                                 free(bp->b_maps);
 327                         bp->b_maps = NULL;
 328                 }
 329         } else
 330                 bp = kmem_cache_zalloc(xfs_buf_cache, 0);
 331         pthread_mutex_unlock(&xfs_buf_freelist.cm_mutex);
 332         bp->b_ops = NULL;
 333         if (bp->b_flags & LIBXFS_B_DIRTY)
 334                 fprintf(stderr, "found dirty buffer (bulk) on free list!\n");
 335
 336         return bp;
 337 }
 338
 339 static struct xfs_buf *
 340 libxfs_getbufr(struct xfs_buftarg *btp, xfs_daddr_t blkno, int bblen)
 341 {
 342         struct xfs_buf  *bp;
 343         int             blen = BBTOB(bblen);
 344
 345         bp =__libxfs_getbufr(blen);
 346         if (bp)
 347                 libxfs_initbuf(bp, btp, blkno, blen);
 348         return bp;
 349 }
 350
 351 static struct xfs_buf *
 352 libxfs_getbufr_map(struct xfs_buftarg *btp, xfs_daddr_t blkno, int bblen,
 353                 struct xfs_buf_map *map, int nmaps)
 354 {
 355         struct xfs_buf  *bp;
 356         int             blen = BBTOB(bblen);
 357
 358         if (!map || !nmaps) {
 359                 fprintf(stderr,
 360                         _("%s: %s invalid map %p or nmaps %d\n"),
 361                         progname, __FUNCTION__, map, nmaps);
 362                 exit(1);
 363         }
 364
 365         if (blkno != map[0].bm_bn) {
 366                 fprintf(stderr,
 367                         _("%s: %s map blkno 0x%llx doesn't match key 0x%llx\n"),
 368                         progname, __FUNCTION__, (long long)map[0].bm_bn,
 369                         (long long)blkno);
 370                 exit(1);
 371         }
 372
 373         bp =__libxfs_getbufr(blen);
 374         if (bp)
 375                 libxfs_initbuf_map(bp, btp, map, nmaps);
 376         return bp;
 377 }
 378
 379 void
 380 xfs_buf_lock(
 381         struct xfs_buf  *bp)
 382 {
 383         if (use_xfs_buf_lock)
 384                 pthread_mutex_lock(&bp->b_lock);
 385 }
 386
 387 static int
 388 __cache_lookup(
 389         struct xfs_bufkey       *key,
 390         unsigned int            flags,
 391         struct xfs_buf          **bpp)
 392 {
 393         struct cache_node       *cn = NULL;
 394         struct xfs_buf          *bp;
 395
 396         *bpp = NULL;
 397
 398         cache_node_get(libxfs_bcache, key, &cn);
 399         if (!cn)
 400                 return -ENOMEM;
 401         bp = container_of(cn, struct xfs_buf, b_node);
 402
 403         if (use_xfs_buf_lock) {
 404                 int             ret;
 405
 406                 ret = pthread_mutex_trylock(&bp->b_lock);
 407                 if (ret) {
 408                         ASSERT(ret == EAGAIN);
 409                         if (flags & LIBXFS_GETBUF_TRYLOCK) {
 410                                 cache_node_put(libxfs_bcache, cn);
 411                                 return -EAGAIN;
 412                         }
 413
 414                         if (pthread_equal(bp->b_holder, pthread_self())) {
 415                                 fprintf(stderr,
 416         _("Warning: recursive buffer locking at block %" PRIu64 " detected\n"),
 417                                         key->blkno);
 418                                 bp->b_recur++;
 419                                 *bpp = bp;
 420                                 return 0;
 421                         } else {
 422                                 pthread_mutex_lock(&bp->b_lock);
 423                         }
 424                 }
 425
 426                 bp->b_holder = pthread_self();
 427         }
 428
 429         cache_node_set_priority(libxfs_bcache, cn,
 430                         cache_node_get_priority(cn) - CACHE_PREFETCH_PRIORITY);
 431         *bpp = bp;
 432         return 0;
 433 }
 434
 435 static int
 436 libxfs_getbuf_flags(
 437         struct xfs_buftarg      *btp,
 438         xfs_daddr_t             blkno,
 439         int                     len,
 440         unsigned int            flags,
 441         struct xfs_buf          **bpp)
 442 {
 443         struct xfs_bufkey       key = {NULL};
 444         int                     ret;
 445
 446         key.buftarg = btp;
 447         key.blkno = blkno;
 448         key.bblen = len;
 449
 450         ret = __cache_lookup(&key, flags, bpp);
 451         if (ret)
 452                 return ret;
 453
 454         if (btp == btp->bt_mount->m_ddev_targp) {
 455                 (*bpp)->b_pag = xfs_perag_get(btp->bt_mount,
 456                                 xfs_daddr_to_agno(btp->bt_mount, blkno));
 457         }
 458
 459         return 0;
 460 }
 461
 462 /*
 463  * Clean the buffer flags for libxfs_getbuf*(), which wants to return
 464  * an unused buffer with clean state.  This prevents CRC errors on a
 465  * re-read of a corrupt block that was prefetched and freed.  This
 466  * can happen with a massively corrupt directory that is discarded,
 467  * but whose blocks are then recycled into expanding lost+found.
 468  *
 469  * Note however that if the buffer's dirty (prefetch calls getbuf)
 470  * we'll leave the state alone because we don't want to discard blocks
 471  * that have been fixed.
 472  */
 473 static void
 474 reset_buf_state(
 475         struct xfs_buf  *bp)
 476 {
 477         if (bp && !(bp->b_flags & LIBXFS_B_DIRTY))
 478                 bp->b_flags &= ~(LIBXFS_B_UNCHECKED | LIBXFS_B_STALE |
 479                                 LIBXFS_B_UPTODATE);
 480 }
 481
 482 static int
 483 __libxfs_buf_get_map(
 484         struct xfs_buftarg      *btp,
 485         struct xfs_buf_map      *map,
 486         int                     nmaps,
 487         int                     flags,
 488         struct xfs_buf          **bpp)
 489 {
 490         struct xfs_bufkey       key = {NULL};
 491         int                     i;
 492
 493         if (nmaps == 1)
 494                 return libxfs_getbuf_flags(btp, map[0].bm_bn, map[0].bm_len,
 495                                 flags, bpp);
 496
 497         key.buftarg = btp;
 498         key.blkno = map[0].bm_bn;
 499         for (i = 0; i < nmaps; i++) {
 500                 key.bblen += map[i].bm_len;
 501         }
 502         key.map = map;
 503         key.nmaps = nmaps;
 504
 505         return __cache_lookup(&key, flags, bpp);
 506 }
 507
 508 int
 509 libxfs_buf_get_map(
 510         struct xfs_buftarg      *btp,
 511         struct xfs_buf_map      *map,
 512         int                     nmaps,
 513         int                     flags,
 514         struct xfs_buf          **bpp)
 515 {
 516         int                     error;
 517
 518         error = __libxfs_buf_get_map(btp, map, nmaps, flags, bpp);
 519         if (error)
 520                 return error;
 521
 522         reset_buf_state(*bpp);
 523         return 0;
 524 }
 525
 526 void
 527 libxfs_buf_relse(
 528         struct xfs_buf  *bp)
 529 {
 530         /*
 531          * ensure that any errors on this use of the buffer don't carry
 532          * over to the next user.
 533          */
 534         bp->b_error = 0;
 535         if (use_xfs_buf_lock) {
 536                 if (bp->b_recur) {
 537                         bp->b_recur--;
 538                 } else {
 539                         bp->b_holder = 0;
 540                         pthread_mutex_unlock(&bp->b_lock);
 541                 }
 542         }
 543
 544         if (!list_empty(&bp->b_node.cn_hash))
 545                 cache_node_put(libxfs_bcache, &bp->b_node);
 546         else if (--bp->b_node.cn_count == 0) {
 547                 if (bp->b_flags & LIBXFS_B_DIRTY)
 548                         libxfs_bwrite(bp);
 549                 libxfs_brelse(&bp->b_node);
 550         }
 551 }
 552
 553 static struct cache_node *
 554 libxfs_balloc(
 555         cache_key_t             key)
 556 {
 557         struct xfs_bufkey       *bufkey = (struct xfs_bufkey *)key;
 558         struct xfs_buf          *bp;
 559
 560         if (bufkey->map)
 561                 bp = libxfs_getbufr_map(bufkey->buftarg, bufkey->blkno,
 562                                 bufkey->bblen, bufkey->map, bufkey->nmaps);
 563         else
 564                 bp = libxfs_getbufr(bufkey->buftarg, bufkey->blkno,
 565                                 bufkey->bblen);
 566         return &bp->b_node;
 567 }
 568
 569
 570 static int
 571 __read_buf(int fd, void *buf, int len, off64_t offset, int flags)
 572 {
 573         int     sts;
 574
 575         sts = pread(fd, buf, len, offset);
 576         if (sts < 0) {
 577                 int error = errno;
 578                 fprintf(stderr, _("%s: read failed: %s\n"),
 579                         progname, strerror(error));
 580                 return -error;
 581         } else if (sts != len) {
 582                 fprintf(stderr, _("%s: error - read only %d of %d bytes\n"),
 583                         progname, sts, len);
 584                 return -EIO;
 585         }
 586         return 0;
 587 }
 588
 589 int
 590 libxfs_readbufr(struct xfs_buftarg *btp, xfs_daddr_t blkno, struct xfs_buf *bp,
 591                 int len, int flags)
 592 {
 593         int     fd = libxfs_device_to_fd(btp->bt_bdev);
 594         int     bytes = BBTOB(len);
 595         int     error;
 596
 597         ASSERT(len <= bp->b_length);
 598
 599         error = __read_buf(fd, bp->b_addr, bytes, LIBXFS_BBTOOFF64(blkno), flags);
 600         if (!error &&
 601             bp->b_target->bt_bdev == btp->bt_bdev &&
 602             bp->b_cache_key == blkno &&
 603             bp->b_length == len)
 604                 bp->b_flags |= LIBXFS_B_UPTODATE;
 605         bp->b_error = error;
 606         return error;
 607 }
 608
 609 int
 610 libxfs_readbuf_verify(
 611         struct xfs_buf          *bp,
 612         const struct xfs_buf_ops *ops)
 613 {
 614         if (!ops)
 615                 return bp->b_error;
 616
 617         bp->b_ops = ops;
 618         bp->b_ops->verify_read(bp);
 619         bp->b_flags &= ~LIBXFS_B_UNCHECKED;
 620         return bp->b_error;
 621 }
 622
 623 int
 624 libxfs_readbufr_map(struct xfs_buftarg *btp, struct xfs_buf *bp, int flags)
 625 {
 626         int     fd;
 627         int     error = 0;
 628         void    *buf;
 629         int     i;
 630
 631         fd = libxfs_device_to_fd(btp->bt_bdev);
 632         buf = bp->b_addr;
 633         for (i = 0; i < bp->b_nmaps; i++) {
 634                 off64_t offset = LIBXFS_BBTOOFF64(bp->b_maps[i].bm_bn);
 635                 int len = BBTOB(bp->b_maps[i].bm_len);
 636
 637                 error = __read_buf(fd, buf, len, offset, flags);
 638                 if (error) {
 639                         bp->b_error = error;
 640                         break;
 641                 }
 642                 buf += len;
 643         }
 644
 645         if (!error)
 646                 bp->b_flags |= LIBXFS_B_UPTODATE;
 647         return error;
 648 }
 649
 650 int
 651 libxfs_buf_read_map(
 652         struct xfs_buftarg      *btp,
 653         struct xfs_buf_map      *map,
 654         int                     nmaps,
 655         int                     flags,
 656         struct xfs_buf          **bpp,
 657         const struct xfs_buf_ops *ops)
 658 {
 659         struct xfs_buf          *bp;
 660         bool                    salvage = flags & LIBXFS_READBUF_SALVAGE;
 661         int                     error = 0;
 662
 663         *bpp = NULL;
 664         if (nmaps == 1)
 665                 error = libxfs_getbuf_flags(btp, map[0].bm_bn, map[0].bm_len,
 666                                 0, &bp);
 667         else
 668                 error = __libxfs_buf_get_map(btp, map, nmaps, 0, &bp);
 669         if (error)
 670                 return error;
 671
 672         /*
 673          * If the buffer was prefetched, it is likely that it was not validated.
 674          * Hence if we are supplied an ops function and the buffer is marked as
 675          * unchecked, we need to validate it now.
 676          *
 677          * We do this verification even if the buffer is dirty - the
 678          * verification is almost certainly going to fail the CRC check in this
 679          * case as a dirty buffer has not had the CRC recalculated. However, we
 680          * should not be dirtying unchecked buffers and therefore failing it
 681          * here because it's dirty and unchecked indicates we've screwed up
 682          * somewhere else.
 683          *
 684          * Note that if the caller passes in LIBXFS_READBUF_SALVAGE, that means
 685          * they want the buffer even if it fails verification.
 686          */
 687         bp->b_error = 0;
 688         if (bp->b_flags & (LIBXFS_B_UPTODATE | LIBXFS_B_DIRTY)) {
 689                 if (bp->b_flags & LIBXFS_B_UNCHECKED)
 690                         error = libxfs_readbuf_verify(bp, ops);
 691                 if (error && !salvage)
 692                         goto err;
 693                 goto ok;
 694         }
 695
 696         /*
 697          * Set the ops on a cache miss (i.e. first physical read) as the
 698          * verifier may change the ops to match the type of buffer it contains.
 699          * A cache hit might reset the verifier to the original type if we set
 700          * it again, but it won't get called again and set to match the buffer
 701          * contents. *cough* xfs_da_node_buf_ops *cough*.
 702          */
 703         if (nmaps == 1)
 704                 error = libxfs_readbufr(btp, map[0].bm_bn, bp, map[0].bm_len,
 705                                 flags);
 706         else
 707                 error = libxfs_readbufr_map(btp, bp, flags);
 708         if (error)
 709                 goto err;
 710
 711         error = libxfs_readbuf_verify(bp, ops);
 712         if (error && !salvage)
 713                 goto err;
 714
 715 ok:
 716         *bpp = bp;
 717         return 0;
 718 err:
 719         libxfs_buf_relse(bp);
 720         return error;
 721 }
 722
 723 /* Allocate a raw uncached buffer. */
 724 static inline struct xfs_buf *
 725 libxfs_getbufr_uncached(
 726         struct xfs_buftarg      *targ,
 727         xfs_daddr_t             daddr,
 728         size_t                  bblen)
 729 {
 730         struct xfs_buf          *bp;
 731
 732         bp = libxfs_getbufr(targ, daddr, bblen);
 733         if (!bp)
 734                 return NULL;
 735
 736         INIT_LIST_HEAD(&bp->b_node.cn_hash);
 737         bp->b_node.cn_count = 1;
 738         return bp;
 739 }
 740
 741 /*
 742  * Allocate an uncached buffer that points nowhere.  The refcount will be 1,
 743  * and the cache node hash list will be empty to indicate that it's uncached.
 744  */
 745 int
 746 libxfs_buf_get_uncached(
 747         struct xfs_buftarg      *targ,
 748         size_t                  bblen,
 749         int                     flags,
 750         struct xfs_buf          **bpp)
 751 {
 752         *bpp = libxfs_getbufr_uncached(targ, XFS_BUF_DADDR_NULL, bblen);
 753         return *bpp != NULL ? 0 : -ENOMEM;
 754 }
 755
 756 /*
 757  * Allocate and read an uncached buffer.  The refcount will be 1, and the cache
 758  * node hash list will be empty to indicate that it's uncached.
 759  */
 760 int
 761 libxfs_buf_read_uncached(
 762         struct xfs_buftarg      *targ,
 763         xfs_daddr_t             daddr,
 764         size_t                  bblen,
 765         int                     flags,
 766         struct xfs_buf          **bpp,
 767         const struct xfs_buf_ops *ops)
 768 {
 769         struct xfs_buf          *bp;
 770         int                     error;
 771
 772         *bpp = NULL;
 773         bp = libxfs_getbufr_uncached(targ, daddr, bblen);
 774         if (!bp)
 775                 return -ENOMEM;
 776
 777         error = libxfs_readbufr(targ, daddr, bp, bblen, flags);
 778         if (error)
 779                 goto err;
 780
 781         error = libxfs_readbuf_verify(bp, ops);
 782         if (error)
 783                 goto err;
 784
 785         *bpp = bp;
 786         return 0;
 787 err:
 788         libxfs_buf_relse(bp);
 789         return error;
 790 }
 791
 792 static int
 793 __write_buf(int fd, void *buf, int len, off64_t offset, int flags)
 794 {
 795         int     sts;
 796
 797         sts = pwrite(fd, buf, len, offset);
 798         if (sts < 0) {
 799                 int error = errno;
 800                 fprintf(stderr, _("%s: pwrite failed: %s\n"),
 801                         progname, strerror(error));
 802                 return -error;
 803         } else if (sts != len) {
 804                 fprintf(stderr, _("%s: error - pwrite only %d of %d bytes\n"),
 805                         progname, sts, len);
 806                 return -EIO;
 807         }
 808         return 0;
 809 }
 810
 811 int
 812 libxfs_bwrite(
 813         struct xfs_buf  *bp)
 814 {
 815         int             fd = libxfs_device_to_fd(bp->b_target->bt_bdev);
 816
 817         /*
 818          * we never write buffers that are marked stale. This indicates they
 819          * contain data that has been invalidated, and even if the buffer is
 820          * dirty it must *never* be written. Verifiers are wonderful for finding
 821          * bugs like this. Make sure the error is obvious as to the cause.
 822          */
 823         if (bp->b_flags & LIBXFS_B_STALE) {
 824                 bp->b_error = -ESTALE;
 825                 return bp->b_error;
 826         }
 827
 828         /* Trigger the writeback hook if there is one. */
 829         if (bp->b_mount->m_buf_writeback_fn)
 830                 bp->b_mount->m_buf_writeback_fn(bp);
 831
 832         /*
 833          * clear any pre-existing error status on the buffer. This can occur if
 834          * the buffer is corrupt on disk and the repair process doesn't clear
 835          * the error before fixing and writing it back.
 836          */
 837         bp->b_error = 0;
 838         if (bp->b_ops) {
 839                 bp->b_ops->verify_write(bp);
 840                 if (bp->b_error) {
 841                         fprintf(stderr,
 842         _("%s: write verifier failed on %s bno 0x%llx/0x%x\n"),
 843                                 __func__, bp->b_ops->name,
 844                                 (unsigned long long)xfs_buf_daddr(bp),
 845                                 bp->b_length);
 846                         return bp->b_error;
 847                 }
 848         }
 849
 850         if (!(bp->b_flags & LIBXFS_B_DISCONTIG)) {
 851                 bp->b_error = __write_buf(fd, bp->b_addr, BBTOB(bp->b_length),
 852                                     LIBXFS_BBTOOFF64(xfs_buf_daddr(bp)),
 853                                     bp->b_flags);
 854         } else {
 855                 int     i;
 856                 void    *buf = bp->b_addr;
 857
 858                 for (i = 0; i < bp->b_nmaps; i++) {
 859                         off64_t offset = LIBXFS_BBTOOFF64(bp->b_maps[i].bm_bn);
 860                         int len = BBTOB(bp->b_maps[i].bm_len);
 861
 862                         bp->b_error = __write_buf(fd, buf, len, offset,
 863                                                   bp->b_flags);
 864                         if (bp->b_error)
 865                                 break;
 866                         buf += len;
 867                 }
 868         }
 869
 870         if (bp->b_error) {
 871                 fprintf(stderr,
 872         _("%s: write failed on %s bno 0x%llx/0x%x, err=%d\n"),
 873                         __func__, bp->b_ops ? bp->b_ops->name : "(unknown)",
 874                         (unsigned long long)xfs_buf_daddr(bp),
 875                         bp->b_length, -bp->b_error);
 876         } else {
 877                 bp->b_flags |= LIBXFS_B_UPTODATE;
 878                 bp->b_flags &= ~(LIBXFS_B_DIRTY | LIBXFS_B_UNCHECKED);
 879                 xfs_buftarg_trip_write(bp->b_target);
 880         }
 881         return bp->b_error;
 882 }
 883
 884 /*
 885  * Mark a buffer dirty.  The dirty data will be written out when the cache
 886  * is flushed (or at release time if the buffer is uncached).
 887  */
 888 void
 889 libxfs_buf_mark_dirty(
 890         struct xfs_buf  *bp)
 891 {
 892         /*
 893          * Clear any error hanging over from reading the buffer. This prevents
 894          * subsequent reads after this write from seeing stale errors.
 895          */
 896         bp->b_error = 0;
 897         bp->b_flags &= ~LIBXFS_B_STALE;
 898         bp->b_flags |= LIBXFS_B_DIRTY;
 899 }
 900
 901 /* Prepare a buffer to be sent to the MRU list. */
 902 static inline void
 903 libxfs_buf_prepare_mru(
 904         struct xfs_buf          *bp)
 905 {
 906         if (bp->b_pag)
 907                 xfs_perag_put(bp->b_pag);
 908         bp->b_pag = NULL;
 909
 910         if (!(bp->b_flags & LIBXFS_B_DIRTY))
 911                 return;
 912
 913         /* Complain about (and remember) dropping dirty buffers. */
 914         fprintf(stderr, _("%s: Releasing dirty buffer to free list!\n"),
 915                         progname);
 916
 917         if (bp->b_error == -EFSCORRUPTED)
 918                 bp->b_target->flags |= XFS_BUFTARG_CORRUPT_WRITE;
 919         bp->b_target->flags |= XFS_BUFTARG_LOST_WRITE;
 920 }
 921
 922 static void
 923 libxfs_brelse(
 924         struct cache_node       *node)
 925 {
 926         struct xfs_buf          *bp = container_of(node, struct xfs_buf,
 927                                                    b_node);
 928
 929         if (!bp)
 930                 return;
 931         libxfs_buf_prepare_mru(bp);
 932
 933         pthread_mutex_lock(&xfs_buf_freelist.cm_mutex);
 934         list_add(&bp->b_node.cn_mru, &xfs_buf_freelist.cm_list);
 935         pthread_mutex_unlock(&xfs_buf_freelist.cm_mutex);
 936 }
 937
 938 static unsigned int
 939 libxfs_bulkrelse(
 940         struct cache            *cache,
 941         struct list_head        *list)
 942 {
 943         struct xfs_buf          *bp;
 944         int                     count = 0;
 945
 946         if (list_empty(list))
 947                 return 0 ;
 948
 949         list_for_each_entry(bp, list, b_node.cn_mru) {
 950                 libxfs_buf_prepare_mru(bp);
 951                 count++;
 952         }
 953
 954         pthread_mutex_lock(&xfs_buf_freelist.cm_mutex);
 955         list_splice(list, &xfs_buf_freelist.cm_list);
 956         pthread_mutex_unlock(&xfs_buf_freelist.cm_mutex);
 957
 958         return count;
 959 }
 960
 961 /*
 962  * Free everything from the xfs_buf_freelist MRU, used at final teardown
 963  */
 964 void
 965 libxfs_bcache_free(void)
 966 {
 967         struct list_head        *cm_list;
 968         struct xfs_buf          *bp, *next;
 969
 970         cm_list = &xfs_buf_freelist.cm_list;
 971         list_for_each_entry_safe(bp, next, cm_list, b_node.cn_mru) {
 972                 free(bp->b_addr);
 973                 if (bp->b_maps != &bp->__b_map)
 974                         free(bp->b_maps);
 975                 kmem_cache_free(xfs_buf_cache, bp);
 976         }
 977 }
 978
 979 /*
 980  * When a buffer is marked dirty, the error is cleared. Hence if we are trying
 981  * to flush a buffer prior to cache reclaim that has an error on it it means
 982  * we've already tried to flush it and it failed. Prevent repeated corruption
 983  * errors from being reported by skipping such buffers - when the corruption is
 984  * fixed the buffer will be marked dirty again and we can write it again.
 985  */
 986 static int
 987 libxfs_bflush(
 988         struct cache_node       *node)
 989 {
 990         struct xfs_buf          *bp = container_of(node, struct xfs_buf,
 991                                                    b_node);
 992
 993         if (!bp->b_error && bp->b_flags & LIBXFS_B_DIRTY)
 994                 return libxfs_bwrite(bp);
 995         return bp->b_error;
 996 }
 997
 998 void
 999 libxfs_bcache_purge(void)
1000 {
1001         cache_purge(libxfs_bcache);
1002 }
1003
1004 void
1005 libxfs_bcache_flush(void)
1006 {
1007         cache_flush(libxfs_bcache);
1008 }
1009
1010 int
1011 libxfs_bcache_overflowed(void)
1012 {
1013         return cache_overflowed(libxfs_bcache);
1014 }
1015
1016 struct cache_operations libxfs_bcache_operations = {
1017         .hash           = libxfs_bhash,
1018         .alloc          = libxfs_balloc,
1019         .flush          = libxfs_bflush,
1020         .relse          = libxfs_brelse,
1021         .compare        = libxfs_bcompare,
1022         .bulkrelse      = libxfs_bulkrelse
1023 };
1024
1025 /*
1026  * Verify an on-disk magic value against the magic value specified in the
1027  * verifier structure. The verifier magic is in disk byte order so the caller is
1028  * expected to pass the value directly from disk.
1029  */
1030 bool
1031 xfs_verify_magic(
1032         struct xfs_buf          *bp,
1033         __be32                  dmagic)
1034 {
1035         struct xfs_mount        *mp = bp->b_mount;
1036         int                     idx;
1037
1038         idx = xfs_has_crc(mp);
1039         if (unlikely(WARN_ON(!bp->b_ops || !bp->b_ops->magic[idx])))
1040                 return false;
1041         return dmagic == bp->b_ops->magic[idx];
1042 }
1043
1044 /*
1045  * Verify an on-disk magic value against the magic value specified in the
1046  * verifier structure. The verifier magic is in disk byte order so the caller is
1047  * expected to pass the value directly from disk.
1048  */
1049 bool
1050 xfs_verify_magic16(
1051         struct xfs_buf          *bp,
1052         __be16                  dmagic)
1053 {
1054         struct xfs_mount        *mp = bp->b_mount;
1055         int                     idx;
1056
1057         idx = xfs_has_crc(mp);
1058         if (unlikely(WARN_ON(!bp->b_ops || !bp->b_ops->magic16[idx])))
1059                 return false;
1060         return dmagic == bp->b_ops->magic16[idx];
1061 }
1062
/*
 * Inode cache stubs.
 *
 * The routines that follow allocate a fresh in-core inode on every
 * libxfs_iget() and free it again once its reference count hits zero.
 */

/* Allocation cache backing struct xfs_inode; defined here. */
struct kmem_cache		*xfs_inode_cache;

/* Declared only; the definition lives outside this part of the file. */
extern struct kmem_cache	*xfs_ili_cache;
1069
1070 int
1071 libxfs_iget(
1072         struct xfs_mount        *mp,
1073         struct xfs_trans        *tp,
1074         xfs_ino_t               ino,
1075         uint                    lock_flags,
1076         struct xfs_inode        **ipp)
1077 {
1078         struct xfs_inode        *ip;
1079         struct xfs_buf          *bp;
1080         int                     error = 0;
1081
1082         ip = kmem_cache_zalloc(xfs_inode_cache, 0);
1083         if (!ip)
1084                 return -ENOMEM;
1085
1086         VFS_I(ip)->i_count = 1;
1087         ip->i_ino = ino;
1088         ip->i_mount = mp;
1089         ip->i_af.if_format = XFS_DINODE_FMT_EXTENTS;
1090         ip->i_df.if_present = 1;
1091         spin_lock_init(&VFS_I(ip)->i_lock);
1092
1093         error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, 0);
1094         if (error)
1095                 goto out_destroy;
1096
1097         error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp);
1098         if (error)
1099                 goto out_destroy;
1100
1101         error = xfs_inode_from_disk(ip,
1102                         xfs_buf_offset(bp, ip->i_imap.im_boffset));
1103         if (!error)
1104                 xfs_buf_set_ref(bp, XFS_INO_REF);
1105         xfs_trans_brelse(tp, bp);
1106
1107         if (error)
1108                 goto out_destroy;
1109
1110         *ipp = ip;
1111         return 0;
1112
1113 out_destroy:
1114         kmem_cache_free(xfs_inode_cache, ip);
1115         *ipp = NULL;
1116         return error;
1117 }
1118
1119 static void
1120 libxfs_idestroy(xfs_inode_t *ip)
1121 {
1122         switch (VFS_I(ip)->i_mode & S_IFMT) {
1123                 case S_IFREG:
1124                 case S_IFDIR:
1125                 case S_IFLNK:
1126                         libxfs_idestroy_fork(&ip->i_df);
1127                         break;
1128         }
1129         if (ip->i_af.if_present) {
1130                 libxfs_idestroy_fork(&ip->i_af);
1131                 libxfs_ifork_zap_attr(ip);
1132         }
1133         if (ip->i_cowfp) {
1134                 libxfs_idestroy_fork(ip->i_cowfp);
1135                 kmem_cache_free(xfs_ifork_cache, ip->i_cowfp);
1136         }
1137 }
1138
1139 void
1140 libxfs_irele(
1141         struct xfs_inode        *ip)
1142 {
1143         VFS_I(ip)->i_count--;
1144
1145         if (VFS_I(ip)->i_count == 0) {
1146                 ASSERT(ip->i_itemp == NULL);
1147                 libxfs_idestroy(ip);
1148                 kmem_cache_free(xfs_inode_cache, ip);
1149         }
1150 }
1151
1152 /*
1153  * Flush everything dirty in the kernel and disk write caches to stable media.
1154  * Returns 0 for success or a negative error code.
1155  */
1156 int
1157 libxfs_blkdev_issue_flush(
1158         struct xfs_buftarg      *btp)
1159 {
1160         int                     fd, ret;
1161
1162         if (btp->bt_bdev == 0)
1163                 return 0;
1164
1165         fd = libxfs_device_to_fd(btp->bt_bdev);
1166         ret = platform_flush_device(fd, btp->bt_bdev);
1167         return ret ? -errno : 0;
1168 }
1169
1170 /*
1171  * Write out a buffer list synchronously.
1172  *
1173  * This will take the @buffer_list, write all buffers out and wait for I/O
1174  * completion on all of the buffers. @buffer_list is consumed by the function,
1175  * so callers must have some other way of tracking buffers if they require such
1176  * functionality.
1177  */
1178 int
1179 xfs_buf_delwri_submit(
1180         struct list_head        *buffer_list)
1181 {
1182         struct xfs_buf          *bp, *n;
1183         int                     error = 0, error2;
1184
1185         list_for_each_entry_safe(bp, n, buffer_list, b_list) {
1186                 list_del_init(&bp->b_list);
1187                 error2 = libxfs_bwrite(bp);
1188                 if (!error)
1189                         error = error2;
1190                 libxfs_buf_relse(bp);
1191         }
1192
1193         return error;
1194 }
1195
1196 /*
1197  * Cancel a delayed write list.
1198  *
1199  * Remove each buffer from the list, clear the delwri queue flag and drop the
1200  * associated buffer reference.
1201  */
1202 void
1203 xfs_buf_delwri_cancel(
1204         struct list_head        *list)
1205 {
1206         struct xfs_buf          *bp;
1207
1208         while (!list_empty(list)) {
1209                 bp = list_first_entry(list, struct xfs_buf, b_list);
1210
1211                 list_del_init(&bp->b_list);
1212                 libxfs_buf_relse(bp);
1213         }
1214 }
1215
1216 /*
1217  * Format the log. The caller provides either a buftarg which is used to access
1218  * the log via buffers or a direct pointer to a buffer that encapsulates the
1219  * entire log.
1220  */
1221 int
1222 libxfs_log_clear(
1223         struct xfs_buftarg      *btp,
1224         char                    *dptr,
1225         xfs_daddr_t             start,
1226         uint                    length,         /* basic blocks */
1227         uuid_t                  *fs_uuid,
1228         int                     version,
1229         int                     sunit,          /* bytes */
1230         int                     fmt,
1231         int                     cycle,
1232         bool                    max)
1233 {
1234         struct xfs_buf          *bp = NULL;
1235         int                     len;
1236         xfs_lsn_t               lsn;
1237         xfs_lsn_t               tail_lsn;
1238         xfs_daddr_t             blk;
1239         xfs_daddr_t             end_blk;
1240         char                    *ptr;
1241
1242         if (((btp && dptr) || (!btp && !dptr)) ||
1243             (btp && !btp->bt_bdev) || !fs_uuid)
1244                 return -EINVAL;
1245
1246         /* first zero the log */
1247         if (btp)
1248                 libxfs_device_zero(btp, start, length);
1249         else
1250                 memset(dptr, 0, BBTOB(length));
1251
1252         /*
1253          * Initialize the log record length and LSNs. XLOG_INIT_CYCLE is a
1254          * special reset case where we only write a single record where the lsn
1255          * and tail_lsn match. Otherwise, the record lsn starts at block 0 of
1256          * the specified cycle and points tail_lsn at the last record of the
1257          * previous cycle.
1258          */
1259         len = ((version == 2) && sunit) ? BTOBB(sunit) : 2;
1260         len = max(len, 2);
1261         lsn = xlog_assign_lsn(cycle, 0);
1262         if (cycle == XLOG_INIT_CYCLE)
1263                 tail_lsn = lsn;
1264         else
1265                 tail_lsn = xlog_assign_lsn(cycle - 1, length - len);
1266
1267         /* write out the first log record */
1268         ptr = dptr;
1269         if (btp) {
1270                 bp = libxfs_getbufr_uncached(btp, start, len);
1271                 ptr = bp->b_addr;
1272         }
1273         libxfs_log_header(ptr, fs_uuid, version, sunit, fmt, lsn, tail_lsn,
1274                           next, bp);
1275         if (bp) {
1276                 libxfs_buf_mark_dirty(bp);
1277                 libxfs_buf_relse(bp);
1278         }
1279
1280         /*
1281          * There's nothing else to do if this is a log reset. The kernel detects
1282          * the rest of the log is zeroed and starts at cycle 1.
1283          */
1284         if (cycle == XLOG_INIT_CYCLE)
1285                 return 0;
1286
1287         /*
1288          * Bump the record size for a full log format if the caller allows it.
1289          * This is primarily for performance reasons and most callers don't care
1290          * about record size since the log is clean after we're done.
1291          */
1292         if (max)
1293                 len = BTOBB(BDSTRAT_SIZE);
1294
1295         /*
1296          * Otherwise, fill everything beyond the initial record with records of
1297          * the previous cycle so the kernel head/tail detection works correctly.
1298          *
1299          * We don't particularly care about the record size or content here.
1300          * It's only important that the headers are in place such that the
1301          * kernel finds 1.) a clean log and 2.) the correct current cycle value.
1302          * Therefore, bump up the record size to the max to use larger I/Os and
1303          * improve performance.
1304          */
1305         cycle--;
1306         blk = start + len;
1307         if (dptr)
1308                 dptr += BBTOB(len);
1309         end_blk = start + length;
1310
1311         len = min(end_blk - blk, len);
1312         while (blk < end_blk) {
1313                 lsn = xlog_assign_lsn(cycle, blk - start);
1314                 tail_lsn = xlog_assign_lsn(cycle, blk - start - len);
1315
1316                 ptr = dptr;
1317                 if (btp) {
1318                         bp = libxfs_getbufr_uncached(btp, blk, len);
1319                         ptr = bp->b_addr;
1320                 }
1321                 /*
1322                  * Note: pass the full buffer length as the sunit to initialize
1323                  * the entire buffer.
1324                  */
1325                 libxfs_log_header(ptr, fs_uuid, version, BBTOB(len), fmt, lsn,
1326                                   tail_lsn, next, bp);
1327                 if (bp) {
1328                         libxfs_buf_mark_dirty(bp);
1329                         libxfs_buf_relse(bp);
1330                 }
1331
1332                 blk += len;
1333                 if (dptr)
1334                         dptr += BBTOB(len);
1335                 len = min(end_blk - blk, len);
1336         }
1337
1338         return 0;
1339 }
1340
1341 int
1342 libxfs_log_header(
1343         char                    *caddr,
1344         uuid_t                  *fs_uuid,
1345         int                     version,
1346         int                     sunit,
1347         int                     fmt,
1348         xfs_lsn_t               lsn,
1349         xfs_lsn_t               tail_lsn,
1350         libxfs_get_block_t      *nextfunc,
1351         void                    *private)
1352 {
1353         xlog_rec_header_t       *head = (xlog_rec_header_t *)caddr;
1354         char                    *p = caddr;
1355         __be32                  cycle_lsn;
1356         int                     i, len;
1357         int                     hdrs = 1;
1358
1359         if (lsn == NULLCOMMITLSN)
1360                 lsn = xlog_assign_lsn(XLOG_INIT_CYCLE, 0);
1361         if (tail_lsn == NULLCOMMITLSN)
1362                 tail_lsn = lsn;
1363
1364         len = ((version == 2) && sunit) ? BTOBB(sunit) : 1;
1365
1366         memset(p, 0, BBSIZE);
1367         head->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM);
1368         head->h_cycle = cpu_to_be32(CYCLE_LSN(lsn));
1369         head->h_version = cpu_to_be32(version);
1370         head->h_crc = cpu_to_le32(0);
1371         head->h_prev_block = cpu_to_be32(-1);
1372         head->h_num_logops = cpu_to_be32(1);
1373         head->h_fmt = cpu_to_be32(fmt);
1374         head->h_size = cpu_to_be32(max(sunit, XLOG_BIG_RECORD_BSIZE));
1375
1376         head->h_lsn = cpu_to_be64(lsn);
1377         head->h_tail_lsn = cpu_to_be64(tail_lsn);
1378
1379         memcpy(&head->h_fs_uuid, fs_uuid, sizeof(uuid_t));
1380
1381         /*
1382          * The kernel expects to see either a log record header magic value or
1383          * the LSN cycle at the top of every log block. The first word of each
1384          * non-header block is copied to the record headers and replaced with
1385          * the cycle value (see xlog_[un]pack_data() and xlog_get_cycle() for
1386          * details).
1387          *
1388          * Even though we only ever write an unmount record (one block), we
1389          * support writing log records up to the max log buffer size of 256k to
1390          * improve log format performance. This means a record can require up
1391          * to 8 headers (1 rec. header + 7 ext. headers) for the packed cycle
1392          * data (each header supports 32k of data).
1393          */
1394         cycle_lsn = CYCLE_LSN_DISK(head->h_lsn);
1395         if (version == 2 && sunit > XLOG_HEADER_CYCLE_SIZE) {
1396                 hdrs = sunit / XLOG_HEADER_CYCLE_SIZE;
1397                 if (sunit % XLOG_HEADER_CYCLE_SIZE)
1398                         hdrs++;
1399         }
1400
1401         /*
1402          * A fixed number of extended headers is expected based on h_size. If
1403          * required, format those now so the unmount record is located
1404          * correctly.
1405          *
1406          * Since we only write an unmount record, we only need one h_cycle_data
1407          * entry for the unmount record block. The subsequent record data
1408          * blocks are zeroed, which means we can stamp them directly with the
1409          * cycle and zero the rest of the cycle data in the extended headers.
1410          */
1411         if (hdrs > 1) {
1412                 for (i = 1; i < hdrs; i++) {
1413                         p = nextfunc(p, BBSIZE, private);
1414                         memset(p, 0, BBSIZE);
1415                         /* xlog_rec_ext_header.xh_cycle */
1416                         *(__be32 *)p = cycle_lsn;
1417                 }
1418         }
1419
1420         /*
1421          * The total length is the max of the stripe unit or 2 basic block
1422          * minimum (1 hdr blk + 1 data blk). The record length is the total
1423          * minus however many header blocks are required.
1424          */
1425         head->h_len = cpu_to_be32(max(BBTOB(2), sunit) - hdrs * BBSIZE);
1426
1427         /*
1428          * Write out the unmount record, pack the first word into the record
1429          * header and stamp the block with the cycle.
1430          */
1431         p = nextfunc(p, BBSIZE, private);
1432         unmount_record(p);
1433
1434         head->h_cycle_data[0] = *(__be32 *)p;
1435         *(__be32 *)p = cycle_lsn;
1436
1437         /*
1438          * Finally, zero all remaining blocks in the record and stamp each with
1439          * the cycle. We don't need to pack any of these blocks because the
1440          * cycle data in the headers has already been zeroed.
1441          */
1442         len = max(len, hdrs + 1);
1443         for (i = hdrs + 1; i < len; i++) {
1444                 p = nextfunc(p, BBSIZE, private);
1445                 memset(p, 0, BBSIZE);
1446                 *(__be32 *)p = cycle_lsn;
1447         }
1448
1449         return BBTOB(len);
1450 }
1451
1452 void
1453 libxfs_buf_set_priority(
1454         struct xfs_buf  *bp,
1455         int             priority)
1456 {
1457         cache_node_set_priority(libxfs_bcache, &bp->b_node, priority);
1458 }
1459
1460 int
1461 libxfs_buf_priority(
1462         struct xfs_buf  *bp)
1463 {
1464         return cache_node_get_priority(&bp->b_node);
1465 }
1466
1467 /*
1468  * Log a message about and stale a buffer that a caller has decided is corrupt.
1469  *
1470  * This function should be called for the kinds of metadata corruption that
1471  * cannot be detect from a verifier, such as incorrect inter-block relationship
1472  * data.  Do /not/ call this function from a verifier function.
1473  *
1474  * The buffer must be XBF_DONE prior to the call.  Afterwards, the buffer will
1475  * be marked stale, but b_error will not be set.  The caller is responsible for
1476  * releasing the buffer or fixing it.
1477  */
1478 void
1479 __xfs_buf_mark_corrupt(
1480         struct xfs_buf          *bp,
1481         xfs_failaddr_t          fa)
1482 {
1483         ASSERT(bp->b_flags & XBF_DONE);
1484
1485         xfs_buf_corruption_error(bp, fa);
1486         xfs_buf_stale(bp);
1487 }