libxfs/rdwr.c

   1 /*
   2  * Copyright (c) 2000-2006 Silicon Graphics, Inc.
   3  * All Rights Reserved.
   4  *
   5  * This program is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU General Public License as
   7  * published by the Free Software Foundation.
   8  *
   9  * This program is distributed in the hope that it would be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12  * GNU General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU General Public License
  15  * along with this program; if not, write the Free Software Foundation,
  16  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  17  */
  18
  19
  20 #include "libxfs_priv.h"
  21 #include "init.h"
  22 #include "xfs_fs.h"
  23 #include "xfs_shared.h"
  24 #include "xfs_format.h"
  25 #include "xfs_log_format.h"
  26 #include "xfs_trans_resv.h"
  27 #include "xfs_mount.h"
  28 #include "xfs_inode_buf.h"
  29 #include "xfs_inode_fork.h"
  30 #include "xfs_inode.h"
  31 #include "xfs_trans.h"
  32
  33 #include "libxfs.h"             /* for LIBXFS_EXIT_ON_FAILURE */
  34
  35 /*
  36  * Important design/architecture note:
  37  *
  38  * The userspace code that uses the buffer cache is much less constrained than
  39  * the kernel code. The userspace code is pretty nasty in places, especially
  40  * when it comes to buffer error handling.  Very little of the userspace code
  41  * outside libxfs clears bp->b_error - very little code even checks it - so the
  42  * libxfs code is tripping on stale errors left by the userspace code.
  43  *
  44  * We can't clear errors or zero buffer contents in libxfs_getbuf-* like we do
  45  * in the kernel, because those functions are used by the libxfs_readbuf_*
  46  * functions and hence need to leave the buffers unchanged on cache hits. This
  47  * is actually the only way to gather a write error from a libxfs_writebuf()
  48  * call - you need to get the buffer again so you can check bp->b_error field -
  49  * assuming that the buffer is still in the cache when you check, that is.
  50  *
  51  * This is very different to the kernel code which does not release buffers on a
  52  * write so we can wait on IO and check errors. The kernel buffer cache also
  53  * guarantees a buffer of a known initial state from xfs_buf_get() even on a
  54  * cache hit.
  55  *
  56  * IOWs, userspace is behaving quite differently to the kernel and as a result
  57  * it leaks errors from reads, invalidations and writes through
  58  * libxfs_getbuf/libxfs_readbuf.
  59  *
  60  * The result of this is that until the userspace code outside libxfs is cleaned
  61  * up, functions that release buffers from userspace control (i.e
  62  * libxfs_writebuf/libxfs_putbuf) need to zero bp->b_error to prevent
  63  * propagation of stale errors into future buffer operations.
  64  */
  65
  66 #define BDSTRAT_SIZE    (256 * 1024)
  67
  68 #define IO_BCOMPARE_CHECK
  69
  70 /* XXX: (dgc) Propagate errors, only exit if fail-on-error flag set */
  71 int
  72 libxfs_device_zero(struct xfs_buftarg *btp, xfs_daddr_t start, uint len)
  73 {
  74         xfs_off_t       start_offset, end_offset, offset;
  75         ssize_t         zsize, bytes;
  76         char            *z;
  77         int             fd;
  78
  79         zsize = min(BDSTRAT_SIZE, BBTOB(len));
  80         if ((z = memalign(libxfs_device_alignment(), zsize)) == NULL) {
  81                 fprintf(stderr,
  82                         _("%s: %s can't memalign %d bytes: %s\n"),
  83                         progname, __FUNCTION__, (int)zsize, strerror(errno));
  84                 exit(1);
  85         }
  86         memset(z, 0, zsize);
  87
  88         fd = libxfs_device_to_fd(btp->dev);
  89         start_offset = LIBXFS_BBTOOFF64(start);
  90
  91         if ((lseek(fd, start_offset, SEEK_SET)) < 0) {
  92                 fprintf(stderr, _("%s: %s seek to offset %llu failed: %s\n"),
  93                         progname, __FUNCTION__,
  94                         (unsigned long long)start_offset, strerror(errno));
  95                 exit(1);
  96         }
  97
  98         end_offset = LIBXFS_BBTOOFF64(start + len) - start_offset;
  99         for (offset = 0; offset < end_offset; ) {
 100                 bytes = min((ssize_t)(end_offset - offset), zsize);
 101                 if ((bytes = write(fd, z, bytes)) < 0) {
 102                         fprintf(stderr, _("%s: %s write failed: %s\n"),
 103                                 progname, __FUNCTION__, strerror(errno));
 104                         exit(1);
 105                 } else if (bytes == 0) {
 106                         fprintf(stderr, _("%s: %s not progressing?\n"),
 107                                 progname, __FUNCTION__);
 108                         exit(1);
 109                 }
 110                 offset += bytes;
 111         }
 112         free(z);
 113         return 0;
 114 }
 115
 116 static void unmount_record(void *p)
 117 {
 118         xlog_op_header_t        *op = (xlog_op_header_t *)p;
 119         /* the data section must be 32 bit size aligned */
 120         struct {
 121             uint16_t magic;
 122             uint16_t pad1;
 123             uint32_t pad2; /* may as well make it 64 bits */
 124         } magic = { XLOG_UNMOUNT_TYPE, 0, 0 };
 125
 126         memset(p, 0, BBSIZE);
 127         /* dummy tid to mark this as written from userspace */
 128         op->oh_tid = cpu_to_be32(0xb0c0d0d0);
 129         op->oh_len = cpu_to_be32(sizeof(magic));
 130         op->oh_clientid = XFS_LOG;
 131         op->oh_flags = XLOG_UNMOUNT_TRANS;
 132         op->oh_res2 = 0;
 133
 134         /* and the data for this op */
 135         memcpy((char *)p + sizeof(xlog_op_header_t), &magic, sizeof(magic));
 136 }
 137
 138 static char *next(
 139         char            *ptr,
 140         int             offset,
 141         void            *private)
 142 {
 143         struct xfs_buf  *buf = (struct xfs_buf *)private;
 144
 145         if (buf &&
 146             (XFS_BUF_COUNT(buf) < (int)(ptr - XFS_BUF_PTR(buf)) + offset))
 147                 abort();
 148
 149         return ptr + offset;
 150 }
 151
 152 /*
 153  * Format the log. The caller provides either a buftarg which is used to access
 154  * the log via buffers or a direct pointer to a buffer that encapsulates the
 155  * entire log.
 156  */
 157 int
 158 libxfs_log_clear(
 159         struct xfs_buftarg      *btp,
 160         char                    *dptr,
 161         xfs_daddr_t             start,
 162         uint                    length,         /* basic blocks */
 163         uuid_t                  *fs_uuid,
 164         int                     version,
 165         int                     sunit,          /* bytes */
 166         int                     fmt,
 167         int                     cycle,
 168         bool                    max)
 169 {
 170         struct xfs_buf          *bp = NULL;
 171         int                     len;
 172         xfs_lsn_t               lsn;
 173         xfs_lsn_t               tail_lsn;
 174         xfs_daddr_t             blk;
 175         xfs_daddr_t             end_blk;
 176         char                    *ptr;
 177
 178         if (((btp && dptr) || (!btp && !dptr)) ||
 179             (btp && !btp->dev) || !fs_uuid)
 180                 return -EINVAL;
 181
 182         /* first zero the log */
 183         if (btp)
 184                 libxfs_device_zero(btp, start, length);
 185         else
 186                 memset(dptr, 0, BBTOB(length));
 187
 188         /*
 189          * Initialize the log record length and LSNs. XLOG_INIT_CYCLE is a
 190          * special reset case where we only write a single record where the lsn
 191          * and tail_lsn match. Otherwise, the record lsn starts at block 0 of
 192          * the specified cycle and points tail_lsn at the last record of the
 193          * previous cycle.
 194          */
 195         len = ((version == 2) && sunit) ? BTOBB(sunit) : 2;
 196         len = MAX(len, 2);
 197         lsn = xlog_assign_lsn(cycle, 0);
 198         if (cycle == XLOG_INIT_CYCLE)
 199                 tail_lsn = lsn;
 200         else
 201                 tail_lsn = xlog_assign_lsn(cycle - 1, length - len);
 202
 203         /* write out the first log record */
 204         ptr = dptr;
 205         if (btp) {
 206                 bp = libxfs_getbufr(btp, start, len);
 207                 ptr = XFS_BUF_PTR(bp);
 208         }
 209         libxfs_log_header(ptr, fs_uuid, version, sunit, fmt, lsn, tail_lsn,
 210                           next, bp);
 211         if (bp) {
 212                 bp->b_flags |= LIBXFS_B_DIRTY;
 213                 libxfs_putbufr(bp);
 214         }
 215
 216         /*
 217          * There's nothing else to do if this is a log reset. The kernel detects
 218          * the rest of the log is zeroed and starts at cycle 1.
 219          */
 220         if (cycle == XLOG_INIT_CYCLE)
 221                 return 0;
 222
 223         /*
 224          * Bump the record size for a full log format if the caller allows it.
 225          * This is primarily for performance reasons and most callers don't care
 226          * about record size since the log is clean after we're done.
 227          */
 228         if (max)
 229                 len = BTOBB(BDSTRAT_SIZE);
 230
 231         /*
 232          * Otherwise, fill everything beyond the initial record with records of
 233          * the previous cycle so the kernel head/tail detection works correctly.
 234          *
 235          * We don't particularly care about the record size or content here.
 236          * It's only important that the headers are in place such that the
 237          * kernel finds 1.) a clean log and 2.) the correct current cycle value.
 238          * Therefore, bump up the record size to the max to use larger I/Os and
 239          * improve performance.
 240          */
 241         cycle--;
 242         blk = start + len;
 243         if (dptr)
 244                 dptr += BBTOB(len);
 245         end_blk = start + length;
 246
 247         len = min(end_blk - blk, len);
 248         while (blk < end_blk) {
 249                 lsn = xlog_assign_lsn(cycle, blk - start);
 250                 tail_lsn = xlog_assign_lsn(cycle, blk - start - len);
 251
 252                 ptr = dptr;
 253                 if (btp) {
 254                         bp = libxfs_getbufr(btp, blk, len);
 255                         ptr = XFS_BUF_PTR(bp);
 256                 }
 257                 /*
 258                  * Note: pass the full buffer length as the sunit to initialize
 259                  * the entire buffer.
 260                  */
 261                 libxfs_log_header(ptr, fs_uuid, version, BBTOB(len), fmt, lsn,
 262                                   tail_lsn, next, bp);
 263                 if (bp) {
 264                         bp->b_flags |= LIBXFS_B_DIRTY;
 265                         libxfs_putbufr(bp);
 266                 }
 267
 268                 blk += len;
 269                 if (dptr)
 270                         dptr += BBTOB(len);
 271                 len = min(end_blk - blk, len);
 272         }
 273
 274         return 0;
 275 }
 276
 277 int
 278 libxfs_log_header(
 279         char                    *caddr,
 280         uuid_t                  *fs_uuid,
 281         int                     version,
 282         int                     sunit,
 283         int                     fmt,
 284         xfs_lsn_t               lsn,
 285         xfs_lsn_t               tail_lsn,
 286         libxfs_get_block_t      *nextfunc,
 287         void                    *private)
 288 {
 289         xlog_rec_header_t       *head = (xlog_rec_header_t *)caddr;
 290         char                    *p = caddr;
 291         __be32                  cycle_lsn;
 292         int                     i, len;
 293         int                     hdrs = 1;
 294
 295         if (lsn == NULLCOMMITLSN)
 296                 lsn = xlog_assign_lsn(XLOG_INIT_CYCLE, 0);
 297         if (tail_lsn == NULLCOMMITLSN)
 298                 tail_lsn = lsn;
 299
 300         len = ((version == 2) && sunit) ? BTOBB(sunit) : 1;
 301
 302         memset(p, 0, BBSIZE);
 303         head->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM);
 304         head->h_cycle = cpu_to_be32(CYCLE_LSN(lsn));
 305         head->h_version = cpu_to_be32(version);
 306         head->h_crc = cpu_to_le32(0);
 307         head->h_prev_block = cpu_to_be32(-1);
 308         head->h_num_logops = cpu_to_be32(1);
 309         head->h_fmt = cpu_to_be32(fmt);
 310         head->h_size = cpu_to_be32(MAX(sunit, XLOG_BIG_RECORD_BSIZE));
 311
 312         head->h_lsn = cpu_to_be64(lsn);
 313         head->h_tail_lsn = cpu_to_be64(tail_lsn);
 314
 315         memcpy(&head->h_fs_uuid, fs_uuid, sizeof(uuid_t));
 316
 317         /*
 318          * The kernel expects to see either a log record header magic value or
 319          * the LSN cycle at the top of every log block. The first word of each
 320          * non-header block is copied to the record headers and replaced with
 321          * the cycle value (see xlog_[un]pack_data() and xlog_get_cycle() for
 322          * details).
 323          *
 324          * Even though we only ever write an unmount record (one block), we
 325          * support writing log records up to the max log buffer size of 256k to
 326          * improve log format performance. This means a record can require up
 327          * to 8 headers (1 rec. header + 7 ext. headers) for the packed cycle
 328          * data (each header supports 32k of data).
 329          */
 330         cycle_lsn = CYCLE_LSN_DISK(head->h_lsn);
 331         if (version == 2 && sunit > XLOG_HEADER_CYCLE_SIZE) {
 332                 hdrs = sunit / XLOG_HEADER_CYCLE_SIZE;
 333                 if (sunit % XLOG_HEADER_CYCLE_SIZE)
 334                         hdrs++;
 335         }
 336
 337         /*
 338          * A fixed number of extended headers is expected based on h_size. If
 339          * required, format those now so the unmount record is located
 340          * correctly.
 341          *
 342          * Since we only write an unmount record, we only need one h_cycle_data
 343          * entry for the unmount record block. The subsequent record data
 344          * blocks are zeroed, which means we can stamp them directly with the
 345          * cycle and zero the rest of the cycle data in the extended headers.
 346          */
 347         if (hdrs > 1) {
 348                 for (i = 1; i < hdrs; i++) {
 349                         p = nextfunc(p, BBSIZE, private);
 350                         memset(p, 0, BBSIZE);
 351                         /* xlog_rec_ext_header.xh_cycle */
 352                         *(__be32 *)p = cycle_lsn;
 353                 }
 354         }
 355
 356         /*
 357          * The total length is the max of the stripe unit or 2 basic block
 358          * minimum (1 hdr blk + 1 data blk). The record length is the total
 359          * minus however many header blocks are required.
 360          */
 361         head->h_len = cpu_to_be32(MAX(BBTOB(2), sunit) - hdrs * BBSIZE);
 362
 363         /*
 364          * Write out the unmount record, pack the first word into the record
 365          * header and stamp the block with the cycle.
 366          */
 367         p = nextfunc(p, BBSIZE, private);
 368         unmount_record(p);
 369
 370         head->h_cycle_data[0] = *(__be32 *)p;
 371         *(__be32 *)p = cycle_lsn;
 372
 373         /*
 374          * Finally, zero all remaining blocks in the record and stamp each with
 375          * the cycle. We don't need to pack any of these blocks because the
 376          * cycle data in the headers has already been zeroed.
 377          */
 378         len = MAX(len, hdrs + 1);
 379         for (i = hdrs + 1; i < len; i++) {
 380                 p = nextfunc(p, BBSIZE, private);
 381                 memset(p, 0, BBSIZE);
 382                 *(__be32 *)p = cycle_lsn;
 383         }
 384
 385         return BBTOB(len);
 386 }
 387
 388 /*
 389  * Simple I/O (buffer cache) interface
 390  */
 391
 392
 393 #ifdef XFS_BUF_TRACING
 394
 395 #undef libxfs_readbuf
 396 #undef libxfs_readbuf_map
 397 #undef libxfs_writebuf
 398 #undef libxfs_getbuf
 399 #undef libxfs_getbuf_map
 400 #undef libxfs_getbuf_flags
 401 #undef libxfs_putbuf
 402
 403 xfs_buf_t       *libxfs_readbuf(struct xfs_buftarg *, xfs_daddr_t, int, int,
 404                                 const struct xfs_buf_ops *);
 405 xfs_buf_t       *libxfs_readbuf_map(struct xfs_buftarg *, struct xfs_buf_map *,
 406                                 int, int, const struct xfs_buf_ops *);
 407 int             libxfs_writebuf(xfs_buf_t *, int);
 408 xfs_buf_t       *libxfs_getbuf(struct xfs_buftarg *, xfs_daddr_t, int);
 409 xfs_buf_t       *libxfs_getbuf_map(struct xfs_buftarg *, struct xfs_buf_map *,
 410                                 int, int);
 411 xfs_buf_t       *libxfs_getbuf_flags(struct xfs_buftarg *, xfs_daddr_t, int,
 412                                 unsigned int);
 413 void            libxfs_putbuf (xfs_buf_t *);
 414
 415 #define __add_trace(bp, func, file, line)       \
 416 do {                                            \
 417         if (bp) {                               \
 418                 (bp)->b_func = (func);          \
 419                 (bp)->b_file = (file);          \
 420                 (bp)->b_line = (line);          \
 421         }                                       \
 422 } while (0)
 423
 424 xfs_buf_t *
 425 libxfs_trace_readbuf(const char *func, const char *file, int line,
 426                 struct xfs_buftarg *btp, xfs_daddr_t blkno, int len, int flags,
 427                 const struct xfs_buf_ops *ops)
 428 {
 429         xfs_buf_t       *bp = libxfs_readbuf(btp, blkno, len, flags, ops);
 430         __add_trace(bp, func, file, line);
 431         return bp;
 432 }
 433
 434 xfs_buf_t *
 435 libxfs_trace_readbuf_map(const char *func, const char *file, int line,
 436                 struct xfs_buftarg *btp, struct xfs_buf_map *map, int nmaps, int flags,
 437                 const struct xfs_buf_ops *ops)
 438 {
 439         xfs_buf_t       *bp = libxfs_readbuf_map(btp, map, nmaps, flags, ops);
 440         __add_trace(bp, func, file, line);
 441         return bp;
 442 }
 443
 444 int
 445 libxfs_trace_writebuf(const char *func, const char *file, int line, xfs_buf_t *bp, int flags)
 446 {
 447         __add_trace(bp, func, file, line);
 448         return libxfs_writebuf(bp, flags);
 449 }
 450
 451 xfs_buf_t *
 452 libxfs_trace_getbuf(const char *func, const char *file, int line,
 453                 struct xfs_buftarg *btp, xfs_daddr_t blkno, int len)
 454 {
 455         xfs_buf_t       *bp = libxfs_getbuf(btp, blkno, len);
 456         __add_trace(bp, func, file, line);
 457         return bp;
 458 }
 459
 460 xfs_buf_t *
 461 libxfs_trace_getbuf_map(const char *func, const char *file, int line,
 462                 struct xfs_buftarg *btp, struct xfs_buf_map *map, int nmaps,
 463                 int flags)
 464 {
 465         xfs_buf_t       *bp = libxfs_getbuf_map(btp, map, nmaps, flags);
 466         __add_trace(bp, func, file, line);
 467         return bp;
 468 }
 469
 470 xfs_buf_t *
 471 libxfs_trace_getbuf_flags(const char *func, const char *file, int line,
 472                 struct xfs_buftarg *btp, xfs_daddr_t blkno, int len, unsigned int flags)
 473 {
 474         xfs_buf_t       *bp = libxfs_getbuf_flags(btp, blkno, len, flags);
 475         __add_trace(bp, func, file, line);
 476         return bp;
 477 }
 478
 479 void
 480 libxfs_trace_putbuf(const char *func, const char *file, int line, xfs_buf_t *bp)
 481 {
 482         __add_trace(bp, func, file, line);
 483         libxfs_putbuf(bp);
 484 }
 485
 486
 487 #endif
 488
 489
 490 xfs_buf_t *
 491 libxfs_getsb(xfs_mount_t *mp, int flags)
 492 {
 493         return libxfs_readbuf(mp->m_ddev_targp, XFS_SB_DADDR,
 494                                 XFS_FSS_TO_BB(mp, 1), flags, &xfs_sb_buf_ops);
 495 }
 496
 497 kmem_zone_t                     *xfs_buf_zone;
 498
 499 static struct cache_mru         xfs_buf_freelist =
 500         {{&xfs_buf_freelist.cm_list, &xfs_buf_freelist.cm_list},
 501          0, PTHREAD_MUTEX_INITIALIZER };
 502
 503 /*
 504  * The bufkey is used to pass the new buffer information to the cache object
 505  * allocation routine. Because discontiguous buffers need to pass different
 506  * information, we need fields to pass that information. However, because the
 507  * blkno and bblen is needed for the initial cache entry lookup (i.e. for
 508  * bcompare) the fact that the map/nmaps is non-null to switch to discontiguous
 509  * buffer initialisation instead of a contiguous buffer.
 510  */
 511 struct xfs_bufkey {
 512         struct xfs_buftarg      *buftarg;
 513         xfs_daddr_t             blkno;
 514         unsigned int            bblen;
 515         struct xfs_buf_map      *map;
 516         int                     nmaps;
 517 };
 518
 519 /*  2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */
 520 #define GOLDEN_RATIO_PRIME      0x9e37fffffffc0001UL
 521 #define CACHE_LINE_SIZE         64
 522 static unsigned int
 523 libxfs_bhash(cache_key_t key, unsigned int hashsize, unsigned int hashshift)
 524 {
 525         uint64_t        hashval = ((struct xfs_bufkey *)key)->blkno;
 526         uint64_t        tmp;
 527
 528         tmp = hashval ^ (GOLDEN_RATIO_PRIME + hashval) / CACHE_LINE_SIZE;
 529         tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> hashshift);
 530         return tmp % hashsize;
 531 }
 532
 533 static int
 534 libxfs_bcompare(struct cache_node *node, cache_key_t key)
 535 {
 536         struct xfs_buf  *bp = (struct xfs_buf *)node;
 537         struct xfs_bufkey *bkey = (struct xfs_bufkey *)key;
 538
 539         if (bp->b_target->dev == bkey->buftarg->dev &&
 540             bp->b_bn == bkey->blkno) {
 541                 if (bp->b_bcount == BBTOB(bkey->bblen))
 542                         return CACHE_HIT;
 543 #ifdef IO_BCOMPARE_CHECK
 544                 if (!(libxfs_bcache->c_flags & CACHE_MISCOMPARE_PURGE)) {
 545                         fprintf(stderr,
 546         "%lx: Badness in key lookup (length)\n"
 547         "bp=(bno 0x%llx, len %u bytes) key=(bno 0x%llx, len %u bytes)\n",
 548                                 pthread_self(),
 549                                 (unsigned long long)bp->b_bn, (int)bp->b_bcount,
 550                                 (unsigned long long)bkey->blkno,
 551                                 BBTOB(bkey->bblen));
 552                 }
 553 #endif
 554                 return CACHE_PURGE;
 555         }
 556         return CACHE_MISS;
 557 }
 558
 559 void
 560 libxfs_bprint(xfs_buf_t *bp)
 561 {
 562         fprintf(stderr, "Buffer %p blkno=%llu bytes=%u flags=0x%x count=%u\n",
 563                 bp, (unsigned long long)bp->b_bn, (unsigned)bp->b_bcount,
 564                 bp->b_flags, bp->b_node.cn_count);
 565 }
 566
 567 static void
 568 __initbuf(xfs_buf_t *bp, struct xfs_buftarg *btp, xfs_daddr_t bno,
 569                 unsigned int bytes)
 570 {
 571         bp->b_flags = 0;
 572         bp->b_bn = bno;
 573         bp->b_bcount = bytes;
 574         bp->b_length = BTOBB(bytes);
 575         bp->b_target = btp;
 576         bp->b_error = 0;
 577         if (!bp->b_addr)
 578                 bp->b_addr = memalign(libxfs_device_alignment(), bytes);
 579         if (!bp->b_addr) {
 580                 fprintf(stderr,
 581                         _("%s: %s can't memalign %u bytes: %s\n"),
 582                         progname, __FUNCTION__, bytes,
 583                         strerror(errno));
 584                 exit(1);
 585         }
 586         memset(bp->b_addr, 0, bytes);
 587 #ifdef XFS_BUF_TRACING
 588         list_head_init(&bp->b_lock_list);
 589 #endif
 590         pthread_mutex_init(&bp->b_lock, NULL);
 591         bp->b_holder = 0;
 592         bp->b_recur = 0;
 593         bp->b_ops = NULL;
 594
 595         if (!bp->b_maps) {
 596                 bp->b_nmaps = 1;
 597                 bp->b_maps = &bp->__b_map;
 598                 bp->b_maps[0].bm_bn = bp->b_bn;
 599                 bp->b_maps[0].bm_len = bp->b_length;
 600         }
 601 }
 602
 603 static void
 604 libxfs_initbuf(xfs_buf_t *bp, struct xfs_buftarg *btp, xfs_daddr_t bno,
 605                 unsigned int bytes)
 606 {
 607         __initbuf(bp, btp, bno, bytes);
 608 }
 609
 610 static void
 611 libxfs_initbuf_map(xfs_buf_t *bp, struct xfs_buftarg *btp,
 612                 struct xfs_buf_map *map, int nmaps)
 613 {
 614         unsigned int bytes = 0;
 615         int i;
 616
 617         bytes = sizeof(struct xfs_buf_map) * nmaps;
 618         bp->b_maps = malloc(bytes);
 619         if (!bp->b_maps) {
 620                 fprintf(stderr,
 621                         _("%s: %s can't malloc %u bytes: %s\n"),
 622                         progname, __FUNCTION__, bytes,
 623                         strerror(errno));
 624                 exit(1);
 625         }
 626         bp->b_nmaps = nmaps;
 627
 628         bytes = 0;
 629         for ( i = 0; i < nmaps; i++) {
 630                 bp->b_maps[i].bm_bn = map[i].bm_bn;
 631                 bp->b_maps[i].bm_len = map[i].bm_len;
 632                 bytes += BBTOB(map[i].bm_len);
 633         }
 634
 635         __initbuf(bp, btp, map[0].bm_bn, bytes);
 636         bp->b_flags |= LIBXFS_B_DISCONTIG;
 637 }
 638
 639 xfs_buf_t *
 640 __libxfs_getbufr(int blen)
 641 {
 642         xfs_buf_t       *bp;
 643
 644         /*
 645          * first look for a buffer that can be used as-is,
 646          * if one cannot be found, see if there is a buffer,
 647          * and if so, free its buffer and set b_addr to NULL
 648          * before calling libxfs_initbuf.
 649          */
 650         pthread_mutex_lock(&xfs_buf_freelist.cm_mutex);
 651         if (!list_empty(&xfs_buf_freelist.cm_list)) {
 652                 list_for_each_entry(bp, &xfs_buf_freelist.cm_list, b_node.cn_mru) {
 653                         if (bp->b_bcount == blen) {
 654                                 list_del_init(&bp->b_node.cn_mru);
 655                                 break;
 656                         }
 657                 }
 658                 if (&bp->b_node.cn_mru == &xfs_buf_freelist.cm_list) {
 659                         bp = list_entry(xfs_buf_freelist.cm_list.next,
 660                                         xfs_buf_t, b_node.cn_mru);
 661                         list_del_init(&bp->b_node.cn_mru);
 662                         free(bp->b_addr);
 663                         bp->b_addr = NULL;
 664                         if (bp->b_maps != &bp->__b_map)
 665                                 free(bp->b_maps);
 666                         bp->b_maps = NULL;
 667                 }
 668         } else
 669                 bp = kmem_zone_zalloc(xfs_buf_zone, 0);
 670         pthread_mutex_unlock(&xfs_buf_freelist.cm_mutex);
 671         bp->b_ops = NULL;
 672         if (bp->b_flags & LIBXFS_B_DIRTY)
 673                 fprintf(stderr, "found dirty buffer (bulk) on free list!");
 674
 675         return bp;
 676 }
 677
 678 xfs_buf_t *
 679 libxfs_getbufr(struct xfs_buftarg *btp, xfs_daddr_t blkno, int bblen)
 680 {
 681         xfs_buf_t       *bp;
 682         int             blen = BBTOB(bblen);
 683
 684         bp =__libxfs_getbufr(blen);
 685         if (bp)
 686                 libxfs_initbuf(bp, btp, blkno, blen);
 687 #ifdef IO_DEBUG
 688         printf("%lx: %s: allocated %u bytes buffer, key=0x%llx(0x%llx), %p\n",
 689                 pthread_self(), __FUNCTION__, blen,
 690                 (long long)LIBXFS_BBTOOFF64(blkno), (long long)blkno, bp);
 691 #endif
 692
 693         return bp;
 694 }
 695
 696 xfs_buf_t *
 697 libxfs_getbufr_map(struct xfs_buftarg *btp, xfs_daddr_t blkno, int bblen,
 698                 struct xfs_buf_map *map, int nmaps)
 699 {
 700         xfs_buf_t       *bp;
 701         int             blen = BBTOB(bblen);
 702
 703         if (!map || !nmaps) {
 704                 fprintf(stderr,
 705                         _("%s: %s invalid map %p or nmaps %d\n"),
 706                         progname, __FUNCTION__, map, nmaps);
 707                 exit(1);
 708         }
 709
 710         if (blkno != map[0].bm_bn) {
 711                 fprintf(stderr,
 712                         _("%s: %s map blkno 0x%llx doesn't match key 0x%llx\n"),
 713                         progname, __FUNCTION__, (long long)map[0].bm_bn,
 714                         (long long)blkno);
 715                 exit(1);
 716         }
 717
 718         bp =__libxfs_getbufr(blen);
 719         if (bp)
 720                 libxfs_initbuf_map(bp, btp, map, nmaps);
 721 #ifdef IO_DEBUG
 722         printf("%lx: %s: allocated %u bytes buffer, key=0x%llx(0x%llx), %p\n",
 723                 pthread_self(), __FUNCTION__, blen,
 724                 (long long)LIBXFS_BBTOOFF64(blkno), (long long)blkno, bp);
 725 #endif
 726
 727         return bp;
 728 }
 729
 730 #ifdef XFS_BUF_TRACING
 731 struct list_head        lock_buf_list = {&lock_buf_list, &lock_buf_list};
 732 int                     lock_buf_count = 0;
 733 #endif
 734
 735 extern int     use_xfs_buf_lock;
 736
 737 static struct xfs_buf *
 738 __cache_lookup(struct xfs_bufkey *key, unsigned int flags)
 739 {
 740         struct xfs_buf  *bp;
 741
 742         cache_node_get(libxfs_bcache, key, (struct cache_node **)&bp);
 743         if (!bp)
 744                 return NULL;
 745
 746         if (use_xfs_buf_lock) {
 747                 int ret;
 748
 749                 ret = pthread_mutex_trylock(&bp->b_lock);
 750                 if (ret) {
 751                         ASSERT(ret == EAGAIN);
 752                         if (flags & LIBXFS_GETBUF_TRYLOCK)
 753                                 goto out_put;
 754
 755                         if (pthread_equal(bp->b_holder, pthread_self())) {
 756                                 fprintf(stderr,
 757         _("Warning: recursive buffer locking at block %" PRIu64 " detected\n"),
 758                                         key->blkno);
 759                                 bp->b_recur++;
 760                                 return bp;
 761                         } else {
 762                                 pthread_mutex_lock(&bp->b_lock);
 763                         }
 764                 }
 765
 766                 bp->b_holder = pthread_self();
 767         }
 768
 769         cache_node_set_priority(libxfs_bcache, (struct cache_node *)bp,
 770                 cache_node_get_priority((struct cache_node *)bp) -
 771                                                 CACHE_PREFETCH_PRIORITY);
 772 #ifdef XFS_BUF_TRACING
 773         pthread_mutex_lock(&libxfs_bcache->c_mutex);
 774         lock_buf_count++;
 775         list_add(&bp->b_lock_list, &lock_buf_list);
 776         pthread_mutex_unlock(&libxfs_bcache->c_mutex);
 777 #endif
 778 #ifdef IO_DEBUG
 779         printf("%lx %s: hit buffer %p for bno = 0x%llx/0x%llx\n",
 780                 pthread_self(), __FUNCTION__,
 781                 bp, bp->b_bn, (long long)LIBXFS_BBTOOFF64(key->blkno));
 782 #endif
 783
 784         return bp;
 785 out_put:
 786         cache_node_put(libxfs_bcache, (struct cache_node *)bp);
 787         return NULL;
 788 }
 789
 790 struct xfs_buf *
 791 libxfs_getbuf_flags(struct xfs_buftarg *btp, xfs_daddr_t blkno, int len,
 792                 unsigned int flags)
 793 {
 794         struct xfs_bufkey key = {0};
 795
 796         key.buftarg = btp;
 797         key.blkno = blkno;
 798         key.bblen = len;
 799
 800         return __cache_lookup(&key, flags);
 801 }
 802
 803 /*
 804  * Clean the buffer flags for libxfs_getbuf*(), which wants to return
 805  * an unused buffer with clean state.  This prevents CRC errors on a
 806  * re-read of a corrupt block that was prefetched and freed.  This
 807  * can happen with a massively corrupt directory that is discarded,
 808  * but whose blocks are then recycled into expanding lost+found.
 809  *
 810  * Note however that if the buffer's dirty (prefetch calls getbuf)
 811  * we'll leave the state alone because we don't want to discard blocks
 812  * that have been fixed.
 813  */
 814 static void
 815 reset_buf_state(
 816         struct xfs_buf  *bp)
 817 {
 818         if (bp && !(bp->b_flags & LIBXFS_B_DIRTY))
 819                 bp->b_flags &= ~(LIBXFS_B_UNCHECKED | LIBXFS_B_STALE |
 820                                 LIBXFS_B_UPTODATE);
 821 }
 822
 823 struct xfs_buf *
 824 libxfs_getbuf(struct xfs_buftarg *btp, xfs_daddr_t blkno, int len)
 825 {
 826         struct xfs_buf  *bp;
 827
 828         bp = libxfs_getbuf_flags(btp, blkno, len, 0);
 829         reset_buf_state(bp);
 830         return bp;
 831 }
 832
 833 static struct xfs_buf *
 834 __libxfs_getbuf_map(struct xfs_buftarg *btp, struct xfs_buf_map *map,
 835                     int nmaps, int flags)
 836 {
 837         struct xfs_bufkey key = {0};
 838         int i;
 839
 840         if (nmaps == 1)
 841                 return libxfs_getbuf_flags(btp, map[0].bm_bn, map[0].bm_len,
 842                                            flags);
 843
 844         key.buftarg = btp;
 845         key.blkno = map[0].bm_bn;
 846         for (i = 0; i < nmaps; i++) {
 847                 key.bblen += map[i].bm_len;
 848         }
 849         key.map = map;
 850         key.nmaps = nmaps;
 851
 852         return __cache_lookup(&key, flags);
 853 }
 854
 855 struct xfs_buf *
 856 libxfs_getbuf_map(struct xfs_buftarg *btp, struct xfs_buf_map *map,
 857                   int nmaps, int flags)
 858 {
 859         struct xfs_buf  *bp;
 860
 861         bp = __libxfs_getbuf_map(btp, map, nmaps, flags);
 862         reset_buf_state(bp);
 863         return bp;
 864 }
 865
 866 void
 867 libxfs_putbuf(xfs_buf_t *bp)
 868 {
 869         /*
 870          * ensure that any errors on this use of the buffer don't carry
 871          * over to the next user.
 872          */
 873         bp->b_error = 0;
 874
 875 #ifdef XFS_BUF_TRACING
 876         pthread_mutex_lock(&libxfs_bcache->c_mutex);
 877         lock_buf_count--;
 878         ASSERT(lock_buf_count >= 0);
 879         list_del_init(&bp->b_lock_list);
 880         pthread_mutex_unlock(&libxfs_bcache->c_mutex);
 881 #endif
 882         if (use_xfs_buf_lock) {
 883                 if (bp->b_recur) {
 884                         bp->b_recur--;
 885                 } else {
 886                         bp->b_holder = 0;
 887                         pthread_mutex_unlock(&bp->b_lock);
 888                 }
 889         }
 890
 891         cache_node_put(libxfs_bcache, (struct cache_node *)bp);
 892 }
 893
 894 void
 895 libxfs_purgebuf(xfs_buf_t *bp)
 896 {
 897         struct xfs_bufkey key = {0};
 898
 899         key.buftarg = bp->b_target;
 900         key.blkno = bp->b_bn;
 901         key.bblen = bp->b_length;
 902
 903         cache_node_purge(libxfs_bcache, &key, (struct cache_node *)bp);
 904 }
 905
 906 static struct cache_node *
 907 libxfs_balloc(cache_key_t key)
 908 {
 909         struct xfs_bufkey *bufkey = (struct xfs_bufkey *)key;
 910
 911         if (bufkey->map)
 912                 return (struct cache_node *)
 913                        libxfs_getbufr_map(bufkey->buftarg,
 914                                           bufkey->blkno, bufkey->bblen,
 915                                           bufkey->map, bufkey->nmaps);
 916         return (struct cache_node *)libxfs_getbufr(bufkey->buftarg,
 917                                           bufkey->blkno, bufkey->bblen);
 918 }
 919
 920
 921 static int
 922 __read_buf(int fd, void *buf, int len, off64_t offset, int flags)
 923 {
 924         int     sts;
 925
 926         sts = pread(fd, buf, len, offset);
 927         if (sts < 0) {
 928                 int error = errno;
 929                 fprintf(stderr, _("%s: read failed: %s\n"),
 930                         progname, strerror(error));
 931                 if (flags & LIBXFS_EXIT_ON_FAILURE)
 932                         exit(1);
 933                 return -error;
 934         } else if (sts != len) {
 935                 fprintf(stderr, _("%s: error - read only %d of %d bytes\n"),
 936                         progname, sts, len);
 937                 if (flags & LIBXFS_EXIT_ON_FAILURE)
 938                         exit(1);
 939                 return -EIO;
 940         }
 941         return 0;
 942 }
 943
 944 int
 945 libxfs_readbufr(struct xfs_buftarg *btp, xfs_daddr_t blkno, xfs_buf_t *bp,
 946                 int len, int flags)
 947 {
 948         int     fd = libxfs_device_to_fd(btp->dev);
 949         int     bytes = BBTOB(len);
 950         int     error;
 951
 952         ASSERT(BBTOB(len) <= bp->b_bcount);
 953
 954         error = __read_buf(fd, bp->b_addr, bytes, LIBXFS_BBTOOFF64(blkno), flags);
 955         if (!error &&
 956             bp->b_target->dev == btp->dev &&
 957             bp->b_bn == blkno &&
 958             bp->b_bcount == bytes)
 959                 bp->b_flags |= LIBXFS_B_UPTODATE;
 960 #ifdef IO_DEBUG
 961         printf("%lx: %s: read %u bytes, error %d, blkno=0x%llx(0x%llx), %p\n",
 962                 pthread_self(), __FUNCTION__, bytes, error,
 963                 (long long)LIBXFS_BBTOOFF64(blkno), (long long)blkno, bp);
 964 #endif
 965         return error;
 966 }
 967
 968 void
 969 libxfs_readbuf_verify(struct xfs_buf *bp, const struct xfs_buf_ops *ops)
 970 {
 971         if (!ops)
 972                 return;
 973         bp->b_ops = ops;
 974         bp->b_ops->verify_read(bp);
 975         bp->b_flags &= ~LIBXFS_B_UNCHECKED;
 976 }
 977
 978
 979 xfs_buf_t *
 980 libxfs_readbuf(struct xfs_buftarg *btp, xfs_daddr_t blkno, int len, int flags,
 981                 const struct xfs_buf_ops *ops)
 982 {
 983         xfs_buf_t       *bp;
 984         int             error;
 985
 986         bp = libxfs_getbuf_flags(btp, blkno, len, 0);
 987         if (!bp)
 988                 return NULL;
 989
 990         /*
 991          * if the buffer was prefetched, it is likely that it was not validated.
 992          * Hence if we are supplied an ops function and the buffer is marked as
 993          * unchecked, we need to validate it now.
 994          *
 995          * We do this verification even if the buffer is dirty - the
 996          * verification is almost certainly going to fail the CRC check in this
 997          * case as a dirty buffer has not had the CRC recalculated. However, we
 998          * should not be dirtying unchecked buffers and therefore failing it
 999          * here because it's dirty and unchecked indicates we've screwed up
1000          * somewhere else.
1001          */
1002         bp->b_error = 0;
1003         if ((bp->b_flags & (LIBXFS_B_UPTODATE|LIBXFS_B_DIRTY))) {
1004                 if (bp->b_flags & LIBXFS_B_UNCHECKED)
1005                         libxfs_readbuf_verify(bp, ops);
1006                 return bp;
1007         }
1008
1009         /*
1010          * Set the ops on a cache miss (i.e. first physical read) as the
1011          * verifier may change the ops to match the type of buffer it contains.
1012          * A cache hit might reset the verifier to the original type if we set
1013          * it again, but it won't get called again and set to match the buffer
1014          * contents. *cough* xfs_da_node_buf_ops *cough*.
1015          */
1016         error = libxfs_readbufr(btp, blkno, bp, len, flags);
1017         if (error)
1018                 bp->b_error = error;
1019         else
1020                 libxfs_readbuf_verify(bp, ops);
1021         return bp;
1022 }
1023
1024 int
1025 libxfs_readbufr_map(struct xfs_buftarg *btp, struct xfs_buf *bp, int flags)
1026 {
1027         int     fd;
1028         int     error = 0;
1029         char    *buf;
1030         int     i;
1031
1032         fd = libxfs_device_to_fd(btp->dev);
1033         buf = bp->b_addr;
1034         for (i = 0; i < bp->b_nmaps; i++) {
1035                 off64_t offset = LIBXFS_BBTOOFF64(bp->b_maps[i].bm_bn);
1036                 int len = BBTOB(bp->b_maps[i].bm_len);
1037
1038                 error = __read_buf(fd, buf, len, offset, flags);
1039                 if (error) {
1040                         bp->b_error = error;
1041                         break;
1042                 }
1043                 buf += len;
1044         }
1045
1046         if (!error)
1047                 bp->b_flags |= LIBXFS_B_UPTODATE;
1048 #ifdef IO_DEBUG
1049         printf("%lx: %s: read %lu bytes, error %d, blkno=%llu(%llu), %p\n",
1050                 pthread_self(), __FUNCTION__, buf - (char *)bp->b_addr, error,
1051                 (long long)LIBXFS_BBTOOFF64(bp->b_bn), (long long)bp->b_bn, bp);
1052 #endif
1053         return error;
1054 }
1055
1056 struct xfs_buf *
1057 libxfs_readbuf_map(struct xfs_buftarg *btp, struct xfs_buf_map *map, int nmaps,
1058                 int flags, const struct xfs_buf_ops *ops)
1059 {
1060         struct xfs_buf  *bp;
1061         int             error = 0;
1062
1063         if (nmaps == 1)
1064                 return libxfs_readbuf(btp, map[0].bm_bn, map[0].bm_len,
1065                                         flags, ops);
1066
1067         bp = __libxfs_getbuf_map(btp, map, nmaps, 0);
1068         if (!bp)
1069                 return NULL;
1070
1071         bp->b_error = 0;
1072         if ((bp->b_flags & (LIBXFS_B_UPTODATE|LIBXFS_B_DIRTY))) {
1073                 if (bp->b_flags & LIBXFS_B_UNCHECKED)
1074                         libxfs_readbuf_verify(bp, ops);
1075                 return bp;
1076         }
1077         error = libxfs_readbufr_map(btp, bp, flags);
1078         if (!error)
1079                 libxfs_readbuf_verify(bp, ops);
1080
1081 #ifdef IO_DEBUGX
1082         printf("%lx: %s: read %lu bytes, error %d, blkno=%llu(%llu), %p\n",
1083                 pthread_self(), __FUNCTION__, buf - (char *)bp->b_addr, error,
1084                 (long long)LIBXFS_BBTOOFF64(bp->b_bn), (long long)bp->b_bn, bp);
1085 #endif
1086         return bp;
1087 }
1088
1089 static int
1090 __write_buf(int fd, void *buf, int len, off64_t offset, int flags)
1091 {
1092         int     sts;
1093
1094         sts = pwrite(fd, buf, len, offset);
1095         if (sts < 0) {
1096                 int error = errno;
1097                 fprintf(stderr, _("%s: pwrite failed: %s\n"),
1098                         progname, strerror(error));
1099                 if (flags & LIBXFS_B_EXIT)
1100                         exit(1);
1101                 return -error;
1102         } else if (sts != len) {
1103                 fprintf(stderr, _("%s: error - pwrite only %d of %d bytes\n"),
1104                         progname, sts, len);
1105                 if (flags & LIBXFS_B_EXIT)
1106                         exit(1);
1107                 return -EIO;
1108         }
1109         return 0;
1110 }
1111
1112 int
1113 libxfs_writebufr(xfs_buf_t *bp)
1114 {
1115         int     fd = libxfs_device_to_fd(bp->b_target->dev);
1116
1117         /*
1118          * we never write buffers that are marked stale. This indicates they
1119          * contain data that has been invalidated, and even if the buffer is
1120          * dirty it must *never* be written. Verifiers are wonderful for finding
1121          * bugs like this. Make sure the error is obvious as to the cause.
1122          */
1123         if (bp->b_flags & LIBXFS_B_STALE) {
1124                 bp->b_error = -ESTALE;
1125                 return bp->b_error;
1126         }
1127
1128         /*
1129          * clear any pre-existing error status on the buffer. This can occur if
1130          * the buffer is corrupt on disk and the repair process doesn't clear
1131          * the error before fixing and writing it back.
1132          */
1133         bp->b_error = 0;
1134         if (bp->b_ops) {
1135                 bp->b_ops->verify_write(bp);
1136                 if (bp->b_error) {
1137                         fprintf(stderr,
1138         _("%s: write verifer failed on %s bno 0x%llx/0x%x\n"),
1139                                 __func__, bp->b_ops->name,
1140                                 (long long)bp->b_bn, bp->b_bcount);
1141                         return bp->b_error;
1142                 }
1143         }
1144
1145         if (!(bp->b_flags & LIBXFS_B_DISCONTIG)) {
1146                 bp->b_error = __write_buf(fd, bp->b_addr, bp->b_bcount,
1147                                     LIBXFS_BBTOOFF64(bp->b_bn), bp->b_flags);
1148         } else {
1149                 int     i;
1150                 char    *buf = bp->b_addr;
1151
1152                 for (i = 0; i < bp->b_nmaps; i++) {
1153                         off64_t offset = LIBXFS_BBTOOFF64(bp->b_maps[i].bm_bn);
1154                         int len = BBTOB(bp->b_maps[i].bm_len);
1155
1156                         bp->b_error = __write_buf(fd, buf, len, offset,
1157                                                   bp->b_flags);
1158                         if (bp->b_error)
1159                                 break;
1160                         buf += len;
1161                 }
1162         }
1163
1164 #ifdef IO_DEBUG
1165         printf("%lx: %s: wrote %u bytes, blkno=%llu(%llu), %p, error %d\n",
1166                         pthread_self(), __FUNCTION__, bp->b_bcount,
1167                         (long long)LIBXFS_BBTOOFF64(bp->b_bn),
1168                         (long long)bp->b_bn, bp, bp->b_error);
1169 #endif
1170         if (!bp->b_error) {
1171                 bp->b_flags |= LIBXFS_B_UPTODATE;
1172                 bp->b_flags &= ~(LIBXFS_B_DIRTY | LIBXFS_B_EXIT |
1173                                  LIBXFS_B_UNCHECKED);
1174         }
1175         return bp->b_error;
1176 }
1177
1178 int
1179 libxfs_writebuf_int(xfs_buf_t *bp, int flags)
1180 {
1181         /*
1182          * Clear any error hanging over from reading the buffer. This prevents
1183          * subsequent reads after this write from seeing stale errors.
1184          */
1185         bp->b_error = 0;
1186         bp->b_flags &= ~LIBXFS_B_STALE;
1187         bp->b_flags |= (LIBXFS_B_DIRTY | flags);
1188         return 0;
1189 }
1190
1191 int
1192 libxfs_writebuf(xfs_buf_t *bp, int flags)
1193 {
1194 #ifdef IO_DEBUG
1195         printf("%lx: %s: dirty blkno=%llu(%llu)\n",
1196                         pthread_self(), __FUNCTION__,
1197                         (long long)LIBXFS_BBTOOFF64(bp->b_bn),
1198                         (long long)bp->b_bn);
1199 #endif
1200         /*
1201          * Clear any error hanging over from reading the buffer. This prevents
1202          * subsequent reads after this write from seeing stale errors.
1203          */
1204         bp->b_error = 0;
1205         bp->b_flags &= ~LIBXFS_B_STALE;
1206         bp->b_flags |= (LIBXFS_B_DIRTY | flags);
1207         libxfs_putbuf(bp);
1208         return 0;
1209 }
1210
1211 void
1212 libxfs_iomove(xfs_buf_t *bp, uint boff, int len, void *data, int flags)
1213 {
1214 #ifdef IO_DEBUG
1215         if (boff + len > bp->b_bcount) {
1216                 printf("Badness, iomove out of range!\n"
1217                         "bp=(bno 0x%llx, bytes %u) range=(boff %u, bytes %u)\n",
1218                         (long long)bp->b_bn, bp->b_bcount, boff, len);
1219                 abort();
1220         }
1221 #endif
1222         switch (flags) {
1223         case LIBXFS_BZERO:
1224                 memset(bp->b_addr + boff, 0, len);
1225                 break;
1226         case LIBXFS_BREAD:
1227                 memcpy(data, bp->b_addr + boff, len);
1228                 break;
1229         case LIBXFS_BWRITE:
1230                 memcpy(bp->b_addr + boff, data, len);
1231                 break;
1232         }
1233 }
1234
1235 static void
1236 libxfs_brelse(
1237         struct cache_node       *node)
1238 {
1239         struct xfs_buf          *bp = (struct xfs_buf *)node;
1240
1241         if (!bp)
1242                 return;
1243         if (bp->b_flags & LIBXFS_B_DIRTY)
1244                 fprintf(stderr,
1245                         "releasing dirty buffer to free list!");
1246
1247         pthread_mutex_lock(&xfs_buf_freelist.cm_mutex);
1248         list_add(&bp->b_node.cn_mru, &xfs_buf_freelist.cm_list);
1249         pthread_mutex_unlock(&xfs_buf_freelist.cm_mutex);
1250 }
1251
1252 static unsigned int
1253 libxfs_bulkrelse(
1254         struct cache            *cache,
1255         struct list_head        *list)
1256 {
1257         xfs_buf_t               *bp;
1258         int                     count = 0;
1259
1260         if (list_empty(list))
1261                 return 0 ;
1262
1263         list_for_each_entry(bp, list, b_node.cn_mru) {
1264                 if (bp->b_flags & LIBXFS_B_DIRTY)
1265                         fprintf(stderr,
1266                                 "releasing dirty buffer (bulk) to free list!");
1267                 count++;
1268         }
1269
1270         pthread_mutex_lock(&xfs_buf_freelist.cm_mutex);
1271         list_splice(list, &xfs_buf_freelist.cm_list);
1272         pthread_mutex_unlock(&xfs_buf_freelist.cm_mutex);
1273
1274         return count;
1275 }
1276
1277 /*
1278  * Free everything from the xfs_buf_freelist MRU, used at final teardown
1279  */
1280 void
1281 libxfs_bcache_free(void)
1282 {
1283         struct list_head        *cm_list;
1284         xfs_buf_t               *bp, *next;
1285
1286         cm_list = &xfs_buf_freelist.cm_list;
1287         list_for_each_entry_safe(bp, next, cm_list, b_node.cn_mru) {
1288                 free(bp->b_addr);
1289                 if (bp->b_maps != &bp->__b_map)
1290                         free(bp->b_maps);
1291                 kmem_zone_free(xfs_buf_zone, bp);
1292         }
1293 }
1294
1295 /*
1296  * When a buffer is marked dirty, the error is cleared. Hence if we are trying
1297  * to flush a buffer prior to cache reclaim that has an error on it it means
1298  * we've already tried to flush it and it failed. Prevent repeated corruption
1299  * errors from being reported by skipping such buffers - when the corruption is
1300  * fixed the buffer will be marked dirty again and we can write it again.
1301  */
1302 static int
1303 libxfs_bflush(
1304         struct cache_node       *node)
1305 {
1306         struct xfs_buf          *bp = (struct xfs_buf *)node;
1307
1308         if (!bp->b_error && bp->b_flags & LIBXFS_B_DIRTY)
1309                 return libxfs_writebufr(bp);
1310         return bp->b_error;
1311 }
1312
1313 void
1314 libxfs_putbufr(xfs_buf_t *bp)
1315 {
1316         if (bp->b_flags & LIBXFS_B_DIRTY)
1317                 libxfs_writebufr(bp);
1318         libxfs_brelse((struct cache_node *)bp);
1319 }
1320
1321
1322 void
1323 libxfs_bcache_purge(void)
1324 {
1325         cache_purge(libxfs_bcache);
1326 }
1327
1328 void
1329 libxfs_bcache_flush(void)
1330 {
1331         cache_flush(libxfs_bcache);
1332 }
1333
1334 int
1335 libxfs_bcache_overflowed(void)
1336 {
1337         return cache_overflowed(libxfs_bcache);
1338 }
1339
1340 struct cache_operations libxfs_bcache_operations = {
1341         .hash           = libxfs_bhash,
1342         .alloc          = libxfs_balloc,
1343         .flush          = libxfs_bflush,
1344         .relse          = libxfs_brelse,
1345         .compare        = libxfs_bcompare,
1346         .bulkrelse      = libxfs_bulkrelse
1347 };
1348
1349
/*
 * Inode cache stubs.
 */

/* zone that libxfs_iget() allocates inodes from and libxfs_iput() frees to */
kmem_zone_t		*xfs_inode_zone;
/* declared only; libxfs_iput() frees an inode's i_itemp to this zone */
extern kmem_zone_t	*xfs_ili_zone;
1356
1357 /*
1358  * If there are inline format data / attr forks attached to this inode,
1359  * make sure they're not corrupt.
1360  */
1361 bool
1362 libxfs_inode_verify_forks(
1363         struct xfs_inode        *ip,
1364         struct xfs_ifork_ops    *ops)
1365 {
1366         struct xfs_ifork        *ifp;
1367         xfs_failaddr_t          fa;
1368
1369         if (!ops)
1370                 return true;
1371
1372         fa = xfs_ifork_verify_data(ip, ops);
1373         if (fa) {
1374                 ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
1375                 xfs_inode_verifier_error(ip, -EFSCORRUPTED, "data fork",
1376                                 ifp->if_u1.if_data, ifp->if_bytes, fa);
1377                 return false;
1378         }
1379
1380         fa = xfs_ifork_verify_attr(ip, ops);
1381         if (fa) {
1382                 ifp = XFS_IFORK_PTR(ip, XFS_ATTR_FORK);
1383                 xfs_inode_verifier_error(ip, -EFSCORRUPTED, "attr fork",
1384                                 ifp ? ifp->if_u1.if_data : NULL,
1385                                 ifp ? ifp->if_bytes : 0, fa);
1386                 return false;
1387         }
1388         return true;
1389 }
1390
1391 int
1392 libxfs_iget(
1393         struct xfs_mount        *mp,
1394         struct xfs_trans        *tp,
1395         xfs_ino_t               ino,
1396         uint                    lock_flags,
1397         struct xfs_inode        **ipp,
1398         struct xfs_ifork_ops    *ifork_ops)
1399 {
1400         struct xfs_inode        *ip;
1401         int                     error = 0;
1402
1403         ip = kmem_zone_zalloc(xfs_inode_zone, 0);
1404         if (!ip)
1405                 return -ENOMEM;
1406
1407         ip->i_ino = ino;
1408         ip->i_mount = mp;
1409         error = xfs_iread(mp, tp, ip, 0);
1410         if (error) {
1411                 kmem_zone_free(xfs_inode_zone, ip);
1412                 *ipp = NULL;
1413                 return error;
1414         }
1415
1416         if (!libxfs_inode_verify_forks(ip, ifork_ops)) {
1417                 libxfs_iput(ip);
1418                 return -EFSCORRUPTED;
1419         }
1420
1421         /*
1422          * set up the inode ops structure that the libxfs code relies on
1423          */
1424         if (XFS_ISDIR(ip))
1425                 ip->d_ops = mp->m_dir_inode_ops;
1426         else
1427                 ip->d_ops = mp->m_nondir_inode_ops;
1428
1429         *ipp = ip;
1430         return 0;
1431 }
1432
1433 static void
1434 libxfs_idestroy(xfs_inode_t *ip)
1435 {
1436         switch (VFS_I(ip)->i_mode & S_IFMT) {
1437                 case S_IFREG:
1438                 case S_IFDIR:
1439                 case S_IFLNK:
1440                         libxfs_idestroy_fork(ip, XFS_DATA_FORK);
1441                         break;
1442         }
1443         if (ip->i_afp)
1444                 libxfs_idestroy_fork(ip, XFS_ATTR_FORK);
1445         if (ip->i_cowfp)
1446                 xfs_idestroy_fork(ip, XFS_COW_FORK);
1447 }
1448
1449 void
1450 libxfs_iput(xfs_inode_t *ip)
1451 {
1452         if (ip->i_itemp)
1453                 kmem_zone_free(xfs_ili_zone, ip->i_itemp);
1454         ip->i_itemp = NULL;
1455         libxfs_idestroy(ip);
1456         kmem_zone_free(xfs_inode_zone, ip);
1457 }