/*
 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */
20 #include "libxfs_priv.h"
23 #include "xfs_shared.h"
24 #include "xfs_format.h"
25 #include "xfs_log_format.h"
26 #include "xfs_trans_resv.h"
27 #include "xfs_mount.h"
28 #include "xfs_inode_buf.h"
29 #include "xfs_inode_fork.h"
30 #include "xfs_inode.h"
31 #include "xfs_trans.h"
33 #include "libxfs.h" /* for LIBXFS_EXIT_ON_FAILURE */

/*
 * Important design/architecture note:
 *
 * The userspace code that uses the buffer cache is much less constrained than
 * the kernel code. The userspace code is pretty nasty in places, especially
 * when it comes to buffer error handling. Very little of the userspace code
 * outside libxfs clears bp->b_error - very little code even checks it - so the
 * libxfs code is tripping on stale errors left by the userspace code.
 *
 * We can't clear errors or zero buffer contents in libxfs_getbuf-* like we do
 * in the kernel, because those functions are used by the libxfs_readbuf_*
 * functions and hence need to leave the buffers unchanged on cache hits. This
 * is actually the only way to gather a write error from a libxfs_writebuf()
 * call - you need to get the buffer again so you can check the bp->b_error
 * field - assuming that the buffer is still in the cache when you check, that
 * is.
 *
 * This is very different from the kernel code, which does not release buffers
 * on write and so can wait on I/O and check errors. The kernel buffer cache
 * also guarantees a buffer of a known initial state from xfs_buf_get() even on
 * a cache hit.
 *
 * IOWs, userspace is behaving quite differently from the kernel and as a
 * result it leaks errors from reads, invalidations and writes through
 * libxfs_getbuf/libxfs_readbuf.
 *
 * The result of this is that until the userspace code outside libxfs is
 * cleaned up, functions that release buffers from userspace control (i.e.
 * libxfs_writebuf/libxfs_putbuf) need to zero bp->b_error to prevent
 * propagation of stale errors into future buffer operations.
 */
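
/*
 * Illustrative caller-side sketch of the write error check described above
 * (hypothetical code, not part of this file): since libxfs_writebuf() releases
 * the buffer, the only way to observe the write error afterwards is to look
 * the buffer up again and inspect bp->b_error - and only if it is still
 * cached.
 *
 *	error = libxfs_writebuf(bp, 0);
 *	...
 *	bp = libxfs_getbuf(btp, blkno, len);
 *	if (bp->b_error)
 *		handle_write_error(bp);		(hypothetical error handler)
 *	libxfs_putbuf(bp);
 */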

#define BDSTRAT_SIZE	(256 * 1024)

#define IO_BCOMPARE_CHECK

/* XXX: (dgc) Propagate errors, only exit if fail-on-error flag set */
int
libxfs_device_zero(struct xfs_buftarg *btp, xfs_daddr_t start, uint len)
{
	xfs_off_t	start_offset, end_offset, offset;
	ssize_t		zsize, bytes;
	char		*z;
	int		fd;

	zsize = min(BDSTRAT_SIZE, BBTOB(len));
	if ((z = memalign(libxfs_device_alignment(), zsize)) == NULL) {
		fprintf(stderr,
			_("%s: %s can't memalign %d bytes: %s\n"),
			progname, __FUNCTION__, (int)zsize, strerror(errno));
		exit(1);
	}
	memset(z, 0, zsize);

	fd = libxfs_device_to_fd(btp->dev);
	start_offset = LIBXFS_BBTOOFF64(start);

	if ((lseek64(fd, start_offset, SEEK_SET)) < 0) {
		fprintf(stderr, _("%s: %s seek to offset %llu failed: %s\n"),
			progname, __FUNCTION__,
			(unsigned long long)start_offset, strerror(errno));
		exit(1);
	}

	end_offset = LIBXFS_BBTOOFF64(start + len) - start_offset;
	for (offset = 0; offset < end_offset; ) {
		bytes = min((ssize_t)(end_offset - offset), zsize);
		if ((bytes = write(fd, z, bytes)) < 0) {
			fprintf(stderr, _("%s: %s write failed: %s\n"),
				progname, __FUNCTION__, strerror(errno));
			exit(1);
		} else if (bytes == 0) {
			fprintf(stderr, _("%s: %s not progressing?\n"),
				progname, __FUNCTION__);
			exit(1);
		}
		offset += bytes;
	}
	free(z);
	return 0;
}

static void unmount_record(void *p)
{
	xlog_op_header_t	*op = (xlog_op_header_t *)p;
	/* the data section must be 32 bit size aligned */
	struct {
		__uint16_t magic;
		__uint16_t pad1;
		__uint32_t pad2;	/* may as well make it 64 bits */
	} magic = { XLOG_UNMOUNT_TYPE, 0, 0 };

	memset(p, 0, BBSIZE);
	/* dummy tid to mark this as written from userspace */
	op->oh_tid = cpu_to_be32(0xb0c0d0d0);
	op->oh_len = cpu_to_be32(sizeof(magic));
	op->oh_clientid = XFS_LOG;
	op->oh_flags = XLOG_UNMOUNT_TRANS;

	/* and the data for this op */
	memcpy((char *)p + sizeof(xlog_op_header_t), &magic, sizeof(magic));
}

static char *next(
	char		*ptr,
	int		offset,
	void		*private)
{
	struct xfs_buf	*buf = (struct xfs_buf *)private;

	if (XFS_BUF_COUNT(buf) < (int)(ptr - XFS_BUF_PTR(buf)) + offset)
		abort();

	return ptr + offset;
}

/*
 * Format the log. The caller provides either a buftarg which is used to access
 * the log via buffers or a direct pointer to a buffer that encapsulates the
 * entire log.
 */
int
libxfs_log_clear(
	struct xfs_buftarg	*btp,
	char			*dptr,
	xfs_daddr_t		start,
	uint			length,		/* basic blocks */
	uuid_t			*fs_uuid,
	int			version,
	int			sunit,		/* bytes */
	int			fmt,
	int			cycle,
	bool			max)
{
	struct xfs_buf		*bp = NULL;
	char			*ptr;
	int			len;
	xfs_lsn_t		lsn;
	xfs_lsn_t		tail_lsn;
	xfs_daddr_t		blk;
	xfs_daddr_t		end_blk;

	if (((btp && dptr) || (!btp && !dptr)) ||
	    (btp && !btp->dev) || !fs_uuid)
		return -EINVAL;

	/* first zero the log */
	if (btp)
		libxfs_device_zero(btp, start, length);
	else
		memset(dptr, 0, BBTOB(length));

	/*
	 * Initialize the log record length and LSNs. XLOG_INIT_CYCLE is a
	 * special reset case where we only write a single record where the lsn
	 * and tail_lsn match. Otherwise, the record lsn starts at block 0 of
	 * the specified cycle and points tail_lsn at the last record of the
	 * previous cycle.
	 */
	len = ((version == 2) && sunit) ? BTOBB(sunit) : 2;
	lsn = xlog_assign_lsn(cycle, 0);
	if (cycle == XLOG_INIT_CYCLE)
		tail_lsn = lsn;
	else
		tail_lsn = xlog_assign_lsn(cycle - 1, length - len);

	/* write out the first log record */
	ptr = dptr;
	if (btp) {
		bp = libxfs_getbufr(btp, start, len);
		ptr = XFS_BUF_PTR(bp);
	}
	libxfs_log_header(ptr, fs_uuid, version, sunit, fmt, lsn, tail_lsn,
			  next, bp);
	if (bp) {
		bp->b_flags |= LIBXFS_B_DIRTY;
		libxfs_putbufr(bp);
	}

	/*
	 * There's nothing else to do if this is a log reset. The kernel
	 * detects the rest of the log is zeroed and starts at cycle 1.
	 */
	if (cycle == XLOG_INIT_CYCLE)
		return 0;

	/*
	 * Bump the record size for a full log format if the caller allows it.
	 * This is primarily for performance reasons and most callers don't
	 * care about record size since the log is clean after we're done.
	 */
	if (max)
		len = BTOBB(BDSTRAT_SIZE);

	/*
	 * Otherwise, fill everything beyond the initial record with records of
	 * the previous cycle so the kernel head/tail detection works correctly.
	 *
	 * We don't particularly care about the record size or content here.
	 * It's only important that the headers are in place such that the
	 * kernel finds 1.) a clean log and 2.) the correct current cycle
	 * value. Therefore, bump up the record size to the max to use larger
	 * I/Os and improve performance.
	 */
	cycle--;
	blk = start + len;
	if (dptr)
		dptr += BBTOB(len);
	end_blk = start + length;

	len = min(end_blk - blk, len);
	while (blk < end_blk) {
		lsn = xlog_assign_lsn(cycle, blk - start);
		tail_lsn = xlog_assign_lsn(cycle, blk - start - len);

		ptr = dptr;
		if (btp) {
			bp = libxfs_getbufr(btp, blk, len);
			ptr = XFS_BUF_PTR(bp);
		}
		/*
		 * Note: pass the full buffer length as the sunit to initialize
		 * the entire buffer.
		 */
		libxfs_log_header(ptr, fs_uuid, version, BBTOB(len), fmt, lsn,
				  tail_lsn, next, bp);
		if (bp) {
			bp->b_flags |= LIBXFS_B_DIRTY;
			libxfs_putbufr(bp);
		}

		blk += len;
		if (dptr)
			dptr += BBTOB(len);
		len = min(end_blk - blk, len);
	}

	return 0;
}

int
libxfs_log_header(
	char			*caddr,
	uuid_t			*fs_uuid,
	int			version,
	int			sunit,
	int			fmt,
	xfs_lsn_t		lsn,
	xfs_lsn_t		tail_lsn,
	libxfs_get_block_t	*nextfunc,
	void			*private)
{
	xlog_rec_header_t	*head = (xlog_rec_header_t *)caddr;
	char			*p = caddr;
	__be32			cycle_lsn;
	int			i, len;
	int			hdrs = 1;

	if (lsn == NULLCOMMITLSN)
		lsn = xlog_assign_lsn(XLOG_INIT_CYCLE, 0);
	if (tail_lsn == NULLCOMMITLSN)
		tail_lsn = lsn;

	len = ((version == 2) && sunit) ? BTOBB(sunit) : 1;

	memset(p, 0, BBSIZE);
	head->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM);
	head->h_cycle = cpu_to_be32(CYCLE_LSN(lsn));
	head->h_version = cpu_to_be32(version);
	head->h_crc = cpu_to_le32(0);
	head->h_prev_block = cpu_to_be32(-1);
	head->h_num_logops = cpu_to_be32(1);
	head->h_fmt = cpu_to_be32(fmt);
	head->h_size = cpu_to_be32(MAX(sunit, XLOG_BIG_RECORD_BSIZE));

	head->h_lsn = cpu_to_be64(lsn);
	head->h_tail_lsn = cpu_to_be64(tail_lsn);

	memcpy(&head->h_fs_uuid, fs_uuid, sizeof(uuid_t));

	/*
	 * The kernel expects to see either a log record header magic value or
	 * the LSN cycle at the top of every log block. The first word of each
	 * non-header block is copied to the record headers and replaced with
	 * the cycle value (see xlog_[un]pack_data() and xlog_get_cycle() for
	 * details).
	 *
	 * Even though we only ever write an unmount record (one block), we
	 * support writing log records up to the max log buffer size of 256k to
	 * improve log format performance. This means a record can require up
	 * to 8 headers (1 rec. header + 7 ext. headers) for the packed cycle
	 * data (each header supports 32k of data).
	 */
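	/*
	 * A rough sketch of that pack/unpack convention (illustrative only;
	 * the real helpers are the kernel's xlog_pack_data() and
	 * xlog_unpack_data()), where i indexes a data block of the record:
	 *
	 *	pack:    head->h_cycle_data[i] = *(__be32 *)block;
	 *	         *(__be32 *)block = cycle_lsn;
	 *	unpack:  *(__be32 *)block = head->h_cycle_data[i];
	 *
	 * The code below only needs the pack half, and only for the single
	 * unmount record block; every other block is zeroed first.
	 */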
	cycle_lsn = CYCLE_LSN_DISK(head->h_lsn);
	if (version == 2 && sunit > XLOG_HEADER_CYCLE_SIZE) {
		hdrs = sunit / XLOG_HEADER_CYCLE_SIZE;
		if (sunit % XLOG_HEADER_CYCLE_SIZE)
			hdrs++;
	}

	/*
	 * A fixed number of extended headers is expected based on h_size. If
	 * required, format those now so the unmount record is located
	 * correctly.
	 *
	 * Since we only write an unmount record, we only need one h_cycle_data
	 * entry for the unmount record block. The subsequent record data
	 * blocks are zeroed, which means we can stamp them directly with the
	 * cycle and zero the rest of the cycle data in the extended headers.
	 */
	for (i = 1; i < hdrs; i++) {
		p = nextfunc(p, BBSIZE, private);
		memset(p, 0, BBSIZE);
		/* xlog_rec_ext_header.xh_cycle */
		*(__be32 *)p = cycle_lsn;
	}

	/*
	 * The total length is the max of the stripe unit or 2 basic block
	 * minimum (1 hdr blk + 1 data blk). The record length is the total
	 * minus however many header blocks are required.
	 */
	head->h_len = cpu_to_be32(MAX(BBTOB(2), sunit) - hdrs * BBSIZE);

	/*
	 * Write out the unmount record, pack the first word into the record
	 * header and stamp the block with the cycle.
	 */
	p = nextfunc(p, BBSIZE, private);
	unmount_record(p);

	head->h_cycle_data[0] = *(__be32 *)p;
	*(__be32 *)p = cycle_lsn;

	/*
	 * Finally, zero all remaining blocks in the record and stamp each with
	 * the cycle. We don't need to pack any of these blocks because the
	 * cycle data in the headers has already been zeroed.
	 */
	len = MAX(len, hdrs + 1);
	for (i = hdrs + 1; i < len; i++) {
		p = nextfunc(p, BBSIZE, private);
		memset(p, 0, BBSIZE);
		*(__be32 *)p = cycle_lsn;
	}

	return BBTOB(len);
}

/*
 * Simple I/O (buffer cache) interface
 */

#ifdef XFS_BUF_TRACING

#undef libxfs_readbuf
#undef libxfs_readbuf_map
#undef libxfs_writebuf
#undef libxfs_getbuf
#undef libxfs_getbuf_map
#undef libxfs_getbuf_flags
#undef libxfs_putbuf

xfs_buf_t	*libxfs_readbuf(struct xfs_buftarg *, xfs_daddr_t, int, int,
			const struct xfs_buf_ops *);
xfs_buf_t	*libxfs_readbuf_map(struct xfs_buftarg *, struct xfs_buf_map *,
			int, int, const struct xfs_buf_ops *);
int		libxfs_writebuf(xfs_buf_t *, int);
xfs_buf_t	*libxfs_getbuf(struct xfs_buftarg *, xfs_daddr_t, int);
xfs_buf_t	*libxfs_getbuf_map(struct xfs_buftarg *, struct xfs_buf_map *,
			int, int);
xfs_buf_t	*libxfs_getbuf_flags(struct xfs_buftarg *, xfs_daddr_t, int,
			unsigned int);
void		libxfs_putbuf(xfs_buf_t *);

#define __add_trace(bp, func, file, line)	\
do {						\
	if (bp) {				\
		(bp)->b_func = (func);		\
		(bp)->b_file = (file);		\
		(bp)->b_line = (line);		\
	}					\
} while (0)

xfs_buf_t *
libxfs_trace_readbuf(const char *func, const char *file, int line,
		struct xfs_buftarg *btp, xfs_daddr_t blkno, int len, int flags,
		const struct xfs_buf_ops *ops)
{
	xfs_buf_t	*bp = libxfs_readbuf(btp, blkno, len, flags, ops);

	__add_trace(bp, func, file, line);
	return bp;
}

xfs_buf_t *
libxfs_trace_readbuf_map(const char *func, const char *file, int line,
		struct xfs_buftarg *btp, struct xfs_buf_map *map, int nmaps,
		int flags, const struct xfs_buf_ops *ops)
{
	xfs_buf_t	*bp = libxfs_readbuf_map(btp, map, nmaps, flags, ops);

	__add_trace(bp, func, file, line);
	return bp;
}

int
libxfs_trace_writebuf(const char *func, const char *file, int line,
		xfs_buf_t *bp, int flags)
{
	__add_trace(bp, func, file, line);
	return libxfs_writebuf(bp, flags);
}

xfs_buf_t *
libxfs_trace_getbuf(const char *func, const char *file, int line,
		struct xfs_buftarg *btp, xfs_daddr_t blkno, int len)
{
	xfs_buf_t	*bp = libxfs_getbuf(btp, blkno, len);

	__add_trace(bp, func, file, line);
	return bp;
}

xfs_buf_t *
libxfs_trace_getbuf_map(const char *func, const char *file, int line,
		struct xfs_buftarg *btp, struct xfs_buf_map *map, int nmaps,
		int flags)
{
	xfs_buf_t	*bp = libxfs_getbuf_map(btp, map, nmaps, flags);

	__add_trace(bp, func, file, line);
	return bp;
}

xfs_buf_t *
libxfs_trace_getbuf_flags(const char *func, const char *file, int line,
		struct xfs_buftarg *btp, xfs_daddr_t blkno, int len,
		unsigned int flags)
{
	xfs_buf_t	*bp = libxfs_getbuf_flags(btp, blkno, len, flags);

	__add_trace(bp, func, file, line);
	return bp;
}

void
libxfs_trace_putbuf(const char *func, const char *file, int line, xfs_buf_t *bp)
{
	__add_trace(bp, func, file, line);
	libxfs_putbuf(bp);
}

#endif	/* XFS_BUF_TRACING */

xfs_buf_t *
libxfs_getsb(xfs_mount_t *mp, int flags)
{
	return libxfs_readbuf(mp->m_ddev_targp, XFS_SB_DADDR,
				XFS_FSS_TO_BB(mp, 1), flags, &xfs_sb_buf_ops);
}

kmem_zone_t			*xfs_buf_zone;

static struct cache_mru xfs_buf_freelist =
	{{&xfs_buf_freelist.cm_list, &xfs_buf_freelist.cm_list},
	  0, PTHREAD_MUTEX_INITIALIZER };

/*
 * The bufkey is used to pass the new buffer information to the cache object
 * allocation routine. Because discontiguous buffers need to pass different
 * information, we need fields to pass that information. However, because the
 * blkno and bblen are needed for the initial cache entry lookup (i.e. for
 * bcompare), a non-null map/nmaps pair is what switches the code to
 * discontiguous buffer initialisation instead of a contiguous buffer.
 */
struct xfs_bufkey {
	struct xfs_buftarg	*buftarg;
	xfs_daddr_t		blkno;
	unsigned int		bblen;
	struct xfs_buf_map	*map;
	int			nmaps;
};

/*  2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */
#define GOLDEN_RATIO_PRIME	0x9e37fffffffc0001UL
#define CACHE_LINE_SIZE		64

static unsigned int
libxfs_bhash(cache_key_t key, unsigned int hashsize, unsigned int hashshift)
{
	uint64_t	hashval = ((struct xfs_bufkey *)key)->blkno;
	uint64_t	tmp;

	tmp = hashval ^ (GOLDEN_RATIO_PRIME + hashval) / CACHE_LINE_SIZE;
	tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> hashshift);
	return tmp % hashsize;
}

static int
libxfs_bcompare(struct cache_node *node, cache_key_t key)
{
	struct xfs_buf		*bp = (struct xfs_buf *)node;
	struct xfs_bufkey	*bkey = (struct xfs_bufkey *)key;

	if (bp->b_target->dev == bkey->buftarg->dev &&
	    bp->b_bn == bkey->blkno) {
		if (bp->b_bcount == BBTOB(bkey->bblen))
			return CACHE_HIT;
#ifdef IO_BCOMPARE_CHECK
		if (!(libxfs_bcache->c_flags & CACHE_MISCOMPARE_PURGE)) {
			fprintf(stderr,
	"%lx: Badness in key lookup (length)\n"
	"bp=(bno 0x%llx, len %u bytes) key=(bno 0x%llx, len %u bytes)\n",
				pthread_self(),
				(unsigned long long)bp->b_bn, (int)bp->b_bcount,
				(unsigned long long)bkey->blkno,
				BBTOB(bkey->bblen));
		}
#endif
		return CACHE_PURGE;
	}
	return CACHE_MISS;
}

void
libxfs_bprint(xfs_buf_t *bp)
{
	fprintf(stderr, "Buffer 0x%p blkno=%llu bytes=%u flags=0x%x count=%u\n",
		bp, (unsigned long long)bp->b_bn, (unsigned)bp->b_bcount,
		bp->b_flags, bp->b_node.cn_count);
}

static void
__initbuf(xfs_buf_t *bp, struct xfs_buftarg *btp, xfs_daddr_t bno,
		unsigned int bytes)
{
	bp->b_flags = 0;
	bp->b_bn = bno;
	bp->b_bcount = bytes;
	bp->b_length = BTOBB(bytes);
	bp->b_target = btp;
	bp->b_error = 0;
	if (!bp->b_addr)
		bp->b_addr = memalign(libxfs_device_alignment(), bytes);
	if (!bp->b_addr) {
		fprintf(stderr,
			_("%s: %s can't memalign %u bytes: %s\n"),
			progname, __FUNCTION__, bytes,
			strerror(errno));
		exit(1);
	}
	memset(bp->b_addr, 0, bytes);
#ifdef XFS_BUF_TRACING
	list_head_init(&bp->b_lock_list);
#endif
	pthread_mutex_init(&bp->b_lock, NULL);
	bp->b_holder = 0;
	bp->b_recur = 0;
	bp->b_ops = NULL;
}

static void
libxfs_initbuf(xfs_buf_t *bp, struct xfs_buftarg *btp, xfs_daddr_t bno,
		unsigned int bytes)
{
	__initbuf(bp, btp, bno, bytes);
}

static void
libxfs_initbuf_map(xfs_buf_t *bp, struct xfs_buftarg *btp,
		struct xfs_buf_map *map, int nmaps)
{
	unsigned int bytes = 0;
	int i;

	bytes = sizeof(struct xfs_buf_map) * nmaps;
	bp->b_maps = malloc(bytes);
	if (!bp->b_maps) {
		fprintf(stderr,
			_("%s: %s can't malloc %u bytes: %s\n"),
			progname, __FUNCTION__, bytes,
			strerror(errno));
		exit(1);
	}
	bp->b_nmaps = nmaps;

	bytes = 0;
	for ( i = 0; i < nmaps; i++) {
		bp->b_maps[i].bm_bn = map[i].bm_bn;
		bp->b_maps[i].bm_len = map[i].bm_len;
		bytes += BBTOB(map[i].bm_len);
	}

	__initbuf(bp, btp, map[0].bm_bn, bytes);
	bp->b_flags |= LIBXFS_B_DISCONTIG;
}

static xfs_buf_t *
__libxfs_getbufr(int blen)
{
	xfs_buf_t	*bp;

	/*
	 * first look for a buffer that can be used as-is,
	 * if one cannot be found, see if there is a buffer,
	 * and if so, free its buffer and set b_addr to NULL
	 * before calling libxfs_initbuf.
	 */
	pthread_mutex_lock(&xfs_buf_freelist.cm_mutex);
	if (!list_empty(&xfs_buf_freelist.cm_list)) {
		list_for_each_entry(bp, &xfs_buf_freelist.cm_list, b_node.cn_mru) {
			if (bp->b_bcount == blen) {
				list_del_init(&bp->b_node.cn_mru);
				break;
			}
		}
		if (&bp->b_node.cn_mru == &xfs_buf_freelist.cm_list) {
			bp = list_entry(xfs_buf_freelist.cm_list.next,
					xfs_buf_t, b_node.cn_mru);
			list_del_init(&bp->b_node.cn_mru);
			free(bp->b_addr);
			bp->b_addr = NULL;
			free(bp->b_maps);
			bp->b_maps = NULL;
		}
	} else
		bp = kmem_zone_zalloc(xfs_buf_zone, 0);
	pthread_mutex_unlock(&xfs_buf_freelist.cm_mutex);
	bp->b_ops = NULL;

	if (bp->b_flags & LIBXFS_B_DIRTY)
		fprintf(stderr, "found dirty buffer (bulk) on free list!");

	return bp;
}

xfs_buf_t *
libxfs_getbufr(struct xfs_buftarg *btp, xfs_daddr_t blkno, int bblen)
{
	xfs_buf_t	*bp;
	int		blen = BBTOB(bblen);

	bp = __libxfs_getbufr(blen);
	if (bp)
		libxfs_initbuf(bp, btp, blkno, blen);
#ifdef IO_DEBUG
	printf("%lx: %s: allocated %u bytes buffer, key=0x%llx(0x%llx), %p\n",
		pthread_self(), __FUNCTION__, blen,
		(long long)LIBXFS_BBTOOFF64(blkno), (long long)blkno, bp);
#endif

	return bp;
}

static xfs_buf_t *
libxfs_getbufr_map(struct xfs_buftarg *btp, xfs_daddr_t blkno, int bblen,
		struct xfs_buf_map *map, int nmaps)
{
	xfs_buf_t	*bp;
	int		blen = BBTOB(bblen);

	if (!map || !nmaps) {
		fprintf(stderr,
			_("%s: %s invalid map %p or nmaps %d\n"),
			progname, __FUNCTION__, map, nmaps);
		exit(1);
	}

	if (blkno != map[0].bm_bn) {
		fprintf(stderr,
			_("%s: %s map blkno 0x%llx doesn't match key 0x%llx\n"),
			progname, __FUNCTION__, (long long)map[0].bm_bn,
			(long long)blkno);
		exit(1);
	}

	bp = __libxfs_getbufr(blen);
	if (bp)
		libxfs_initbuf_map(bp, btp, map, nmaps);
#ifdef IO_DEBUG
	printf("%lx: %s: allocated %u bytes buffer, key=0x%llx(0x%llx), %p\n",
		pthread_self(), __FUNCTION__, blen,
		(long long)LIBXFS_BBTOOFF64(blkno), (long long)blkno, bp);
#endif

	return bp;
}

#ifdef XFS_BUF_TRACING
struct list_head	lock_buf_list = {&lock_buf_list, &lock_buf_list};
int			lock_buf_count = 0;
#endif

extern int		use_xfs_buf_lock;

static struct xfs_buf *
__cache_lookup(struct xfs_bufkey *key, unsigned int flags)
{
	struct xfs_buf	*bp;

	cache_node_get(libxfs_bcache, key, (struct cache_node **)&bp);
	if (!bp)
		return NULL;

	if (use_xfs_buf_lock) {
		int		ret;

		ret = pthread_mutex_trylock(&bp->b_lock);
		if (ret) {
			ASSERT(ret == EAGAIN);
			if (flags & LIBXFS_GETBUF_TRYLOCK)
				goto out_put;

			if (pthread_equal(bp->b_holder, pthread_self())) {
				fprintf(stderr,
	_("Warning: recursive buffer locking at block %" PRIu64 " detected\n"),
					key->blkno);
				bp->b_recur++;
				return bp;
			} else {
				pthread_mutex_lock(&bp->b_lock);
			}
		}

		bp->b_holder = pthread_self();
	}

	cache_node_set_priority(libxfs_bcache, (struct cache_node *)bp,
		cache_node_get_priority((struct cache_node *)bp) -
						CACHE_PREFETCH_PRIORITY);
#ifdef XFS_BUF_TRACING
	pthread_mutex_lock(&libxfs_bcache->c_mutex);
	lock_buf_count++;
	list_add(&bp->b_lock_list, &lock_buf_list);
	pthread_mutex_unlock(&libxfs_bcache->c_mutex);
#endif
#ifdef IO_DEBUG
	printf("%lx %s: hit buffer %p for bno = 0x%llx/0x%llx\n",
		pthread_self(), __FUNCTION__,
		bp, bp->b_bn, (long long)LIBXFS_BBTOOFF64(key->blkno));
#endif

	return bp;

out_put:
	cache_node_put(libxfs_bcache, (struct cache_node *)bp);
	return NULL;
}

struct xfs_buf *
libxfs_getbuf_flags(struct xfs_buftarg *btp, xfs_daddr_t blkno, int len,
		unsigned int flags)
{
	struct xfs_bufkey key = {0};

	key.buftarg = btp;
	key.blkno = blkno;
	key.bblen = len;

	return __cache_lookup(&key, flags);
}

/*
 * Clean the buffer flags for libxfs_getbuf*(), which wants to return
 * an unused buffer with clean state.  This prevents CRC errors on a
 * re-read of a corrupt block that was prefetched and freed.  This
 * can happen with a massively corrupt directory that is discarded,
 * but whose blocks are then recycled into expanding lost+found.
 *
 * Note however that if the buffer's dirty (prefetch calls getbuf)
 * we'll leave the state alone because we don't want to discard blocks
 * that have been fixed.
 */
static void
reset_buf_state(
	struct xfs_buf	*bp)
{
	if (bp && !(bp->b_flags & LIBXFS_B_DIRTY))
		bp->b_flags &= ~(LIBXFS_B_UNCHECKED | LIBXFS_B_STALE |
				LIBXFS_B_UPTODATE);
}

struct xfs_buf *
libxfs_getbuf(struct xfs_buftarg *btp, xfs_daddr_t blkno, int len)
{
	struct xfs_buf	*bp;

	bp = libxfs_getbuf_flags(btp, blkno, len, 0);
	reset_buf_state(bp);
	return bp;
}

static struct xfs_buf *
__libxfs_getbuf_map(struct xfs_buftarg *btp, struct xfs_buf_map *map,
		int nmaps, int flags)
{
	struct xfs_bufkey key = {0};
	int		i;

	if (nmaps == 1)
		return libxfs_getbuf_flags(btp, map[0].bm_bn, map[0].bm_len,
					flags);

	key.buftarg = btp;
	key.blkno = map[0].bm_bn;
	for (i = 0; i < nmaps; i++) {
		key.bblen += map[i].bm_len;
	}
	key.map = map;
	key.nmaps = nmaps;

	return __cache_lookup(&key, flags);
}

struct xfs_buf *
libxfs_getbuf_map(struct xfs_buftarg *btp, struct xfs_buf_map *map,
		int nmaps, int flags)
{
	struct xfs_buf	*bp;

	bp = __libxfs_getbuf_map(btp, map, nmaps, flags);
	reset_buf_state(bp);
	return bp;
}
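
/*
 * Hypothetical usage sketch for the discontiguous interface above (not code
 * from this file): a caller describes each extent in basic blocks in an
 * xfs_buf_map array and hands it to libxfs_getbuf_map()/libxfs_readbuf_map(),
 * which key the cache on the first block number and the summed length.
 *
 *	struct xfs_buf_map map[2] = {
 *		{ .bm_bn = blkno1, .bm_len = len1 },
 *		{ .bm_bn = blkno2, .bm_len = len2 },
 *	};
 *	bp = libxfs_readbuf_map(btp, map, 2, 0, ops);
 *	...
 *	libxfs_putbuf(bp);
 *
 * (blkno1/len1, blkno2/len2 and ops are placeholders for caller data.)
 */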

void
libxfs_putbuf(xfs_buf_t *bp)
{
	/*
	 * ensure that any errors on this use of the buffer don't carry
	 * over to the next user.
	 */
	bp->b_error = 0;

#ifdef XFS_BUF_TRACING
	pthread_mutex_lock(&libxfs_bcache->c_mutex);
	lock_buf_count--;
	ASSERT(lock_buf_count >= 0);
	list_del_init(&bp->b_lock_list);
	pthread_mutex_unlock(&libxfs_bcache->c_mutex);
#endif
	if (use_xfs_buf_lock) {
		if (bp->b_recur) {
			bp->b_recur--;
		} else {
			bp->b_holder = 0;
			pthread_mutex_unlock(&bp->b_lock);
		}
	}

	cache_node_put(libxfs_bcache, (struct cache_node *)bp);
}

void
libxfs_purgebuf(xfs_buf_t *bp)
{
	struct xfs_bufkey key = {0};

	key.buftarg = bp->b_target;
	key.blkno = bp->b_bn;
	key.bblen = bp->b_length;

	cache_node_purge(libxfs_bcache, &key, (struct cache_node *)bp);
}

static struct cache_node *
libxfs_balloc(cache_key_t key)
{
	struct xfs_bufkey *bufkey = (struct xfs_bufkey *)key;

	if (bufkey->map)
		return (struct cache_node *)
			libxfs_getbufr_map(bufkey->buftarg,
					bufkey->blkno, bufkey->bblen,
					bufkey->map, bufkey->nmaps);
	return (struct cache_node *)libxfs_getbufr(bufkey->buftarg,
					bufkey->blkno, bufkey->bblen);
}

static int
__read_buf(int fd, void *buf, int len, off64_t offset, int flags)
{
	int	sts;

	sts = pread64(fd, buf, len, offset);
	if (sts < 0) {
		int error = errno;

		fprintf(stderr, _("%s: read failed: %s\n"),
			progname, strerror(error));
		if (flags & LIBXFS_EXIT_ON_FAILURE)
			exit(1);
		return -error;
	} else if (sts != len) {
		fprintf(stderr, _("%s: error - read only %d of %d bytes\n"),
			progname, sts, len);
		if (flags & LIBXFS_EXIT_ON_FAILURE)
			exit(1);
		return -EIO;
	}
	return 0;
}

int
libxfs_readbufr(struct xfs_buftarg *btp, xfs_daddr_t blkno, xfs_buf_t *bp,
		int len, int flags)
{
	int	fd = libxfs_device_to_fd(btp->dev);
	int	bytes = BBTOB(len);
	int	error;

	ASSERT(BBTOB(len) <= bp->b_bcount);

	error = __read_buf(fd, bp->b_addr, bytes, LIBXFS_BBTOOFF64(blkno), flags);
	if (!error &&
	    bp->b_target->dev == btp->dev &&
	    bp->b_bn == blkno &&
	    bp->b_bcount == bytes)
		bp->b_flags |= LIBXFS_B_UPTODATE;
#ifdef IO_DEBUG
	printf("%lx: %s: read %u bytes, error %d, blkno=0x%llx(0x%llx), %p\n",
		pthread_self(), __FUNCTION__, bytes, error,
		(long long)LIBXFS_BBTOOFF64(blkno), (long long)blkno, bp);
#endif

	return error;
}

static void
libxfs_readbuf_verify(struct xfs_buf *bp, const struct xfs_buf_ops *ops)
{
	if (!ops)
		return;
	bp->b_ops = ops;
	bp->b_ops->verify_read(bp);
	bp->b_flags &= ~LIBXFS_B_UNCHECKED;
}

xfs_buf_t *
libxfs_readbuf(struct xfs_buftarg *btp, xfs_daddr_t blkno, int len, int flags,
		const struct xfs_buf_ops *ops)
{
	xfs_buf_t	*bp;
	int		error;

	bp = libxfs_getbuf_flags(btp, blkno, len, 0);
	if (!bp)
		return NULL;

	/*
	 * if the buffer was prefetched, it is likely that it was not
	 * validated. Hence if we are supplied an ops function and the buffer
	 * is marked as unchecked, we need to validate it now.
	 *
	 * We do this verification even if the buffer is dirty - the
	 * verification is almost certainly going to fail the CRC check in this
	 * case as a dirty buffer has not had the CRC recalculated. However, we
	 * should not be dirtying unchecked buffers and therefore failing it
	 * here because it's dirty and unchecked indicates we've screwed up
	 * somewhere else.
	 */
	bp->b_error = 0;
	if ((bp->b_flags & (LIBXFS_B_UPTODATE|LIBXFS_B_DIRTY))) {
		if (bp->b_flags & LIBXFS_B_UNCHECKED)
			libxfs_readbuf_verify(bp, ops);
		return bp;
	}

	/*
	 * Set the ops on a cache miss (i.e. first physical read) as the
	 * verifier may change the ops to match the type of buffer it contains.
	 * A cache hit might reset the verifier to the original type if we set
	 * it again, but it won't get called again and set to match the buffer
	 * contents. *cough* xfs_da_node_buf_ops *cough*.
	 */
	bp->b_ops = ops;
	error = libxfs_readbufr(btp, blkno, bp, len, flags);
	if (error)
		bp->b_error = error;
	else
		libxfs_readbuf_verify(bp, ops);
	return bp;
}

int
libxfs_readbufr_map(struct xfs_buftarg *btp, struct xfs_buf *bp, int flags)
{
	int	fd;
	int	error = 0;
	char	*buf;
	int	i;

	fd = libxfs_device_to_fd(btp->dev);
	buf = bp->b_addr;
	for (i = 0; i < bp->b_nmaps; i++) {
		off64_t	offset = LIBXFS_BBTOOFF64(bp->b_maps[i].bm_bn);
		int len = BBTOB(bp->b_maps[i].bm_len);

		error = __read_buf(fd, buf, len, offset, flags);
		if (error) {
			bp->b_error = error;
			break;
		}
		buf += len;
	}

	if (!error)
		bp->b_flags |= LIBXFS_B_UPTODATE;
#ifdef IO_DEBUG
	printf("%lx: %s: read %u bytes, error %d, blkno=0x%llx(0x%llx), %p\n",
		pthread_self(), __FUNCTION__, bp->b_bcount, error,
		(long long)LIBXFS_BBTOOFF64(bp->b_bn), (long long)bp->b_bn, bp);
#endif
	return error;
}

struct xfs_buf *
libxfs_readbuf_map(struct xfs_buftarg *btp, struct xfs_buf_map *map, int nmaps,
		int flags, const struct xfs_buf_ops *ops)
{
	struct xfs_buf	*bp;
	int		error = 0;

	if (nmaps == 1)
		return libxfs_readbuf(btp, map[0].bm_bn, map[0].bm_len,
					flags, ops);

	bp = __libxfs_getbuf_map(btp, map, nmaps, 0);
	if (!bp)
		return NULL;

	bp->b_error = 0;
	if ((bp->b_flags & (LIBXFS_B_UPTODATE|LIBXFS_B_DIRTY))) {
		if (bp->b_flags & LIBXFS_B_UNCHECKED)
			libxfs_readbuf_verify(bp, ops);
		return bp;
	}

	bp->b_ops = ops;
	error = libxfs_readbufr_map(btp, bp, flags);
	if (!error)
		libxfs_readbuf_verify(bp, ops);

#ifdef IO_DEBUG
	printf("%lx: %s: read %u bytes, error %d, blkno=%llu(%llu), %p\n",
		pthread_self(), __FUNCTION__, bp->b_bcount, error,
		(long long)LIBXFS_BBTOOFF64(bp->b_bn), (long long)bp->b_bn, bp);
#endif
	return bp;
}

static int
__write_buf(int fd, void *buf, int len, off64_t offset, int flags)
{
	int	sts;

	sts = pwrite64(fd, buf, len, offset);
	if (sts < 0) {
		int error = errno;

		fprintf(stderr, _("%s: pwrite64 failed: %s\n"),
			progname, strerror(error));
		if (flags & LIBXFS_B_EXIT)
			exit(1);
		return -error;
	} else if (sts != len) {
		fprintf(stderr, _("%s: error - pwrite64 only %d of %d bytes\n"),
			progname, sts, len);
		if (flags & LIBXFS_B_EXIT)
			exit(1);
		return -EIO;
	}
	return 0;
}

int
libxfs_writebufr(xfs_buf_t *bp)
{
	int	fd = libxfs_device_to_fd(bp->b_target->dev);

	/*
	 * we never write buffers that are marked stale. This indicates they
	 * contain data that has been invalidated, and even if the buffer is
	 * dirty it must *never* be written. Verifiers are wonderful for
	 * finding bugs like this. Make sure the error is obvious as to the
	 * cause.
	 */
	if (bp->b_flags & LIBXFS_B_STALE) {
		bp->b_error = -ESTALE;
		return bp->b_error;
	}

	/*
	 * clear any pre-existing error status on the buffer. This can occur if
	 * the buffer is corrupt on disk and the repair process doesn't clear
	 * the error before fixing and writing it back.
	 */
	bp->b_error = 0;
	if (bp->b_ops) {
		bp->b_ops->verify_write(bp);
		if (bp->b_error) {
			fprintf(stderr,
	_("%s: write verifier failed on %s bno 0x%llx/0x%x\n"),
				__func__, bp->b_ops->name,
				(long long)bp->b_bn, bp->b_bcount);
			return bp->b_error;
		}
	}

	if (!(bp->b_flags & LIBXFS_B_DISCONTIG)) {
		bp->b_error = __write_buf(fd, bp->b_addr, bp->b_bcount,
				    LIBXFS_BBTOOFF64(bp->b_bn), bp->b_flags);
	} else {
		int	i;
		char	*buf = bp->b_addr;

		for (i = 0; i < bp->b_nmaps; i++) {
			off64_t	offset = LIBXFS_BBTOOFF64(bp->b_maps[i].bm_bn);
			int len = BBTOB(bp->b_maps[i].bm_len);

			bp->b_error = __write_buf(fd, buf, len, offset,
						  bp->b_flags);
			if (bp->b_error)
				break;
			buf += len;
		}
	}

#ifdef IO_DEBUG
	printf("%lx: %s: wrote %u bytes, blkno=%llu(%llu), %p, error %d\n",
		pthread_self(), __FUNCTION__, bp->b_bcount,
		(long long)LIBXFS_BBTOOFF64(bp->b_bn),
		(long long)bp->b_bn, bp, bp->b_error);
#endif

	if (!bp->b_error) {
		bp->b_flags |= LIBXFS_B_UPTODATE;
		bp->b_flags &= ~(LIBXFS_B_DIRTY | LIBXFS_B_EXIT |
				 LIBXFS_B_UNCHECKED);
	}
	return bp->b_error;
}

int
libxfs_writebuf_int(xfs_buf_t *bp, int flags)
{
	/*
	 * Clear any error hanging over from reading the buffer. This prevents
	 * subsequent reads after this write from seeing stale errors.
	 */
	bp->b_error = 0;
	bp->b_flags &= ~LIBXFS_B_STALE;
	bp->b_flags |= (LIBXFS_B_DIRTY | flags);
	return 0;
}

int
libxfs_writebuf(xfs_buf_t *bp, int flags)
{
#ifdef IO_DEBUG
	printf("%lx: %s: dirty blkno=%llu(%llu)\n",
		pthread_self(), __FUNCTION__,
		(long long)LIBXFS_BBTOOFF64(bp->b_bn),
		(long long)bp->b_bn);
#endif
	/*
	 * Clear any error hanging over from reading the buffer. This prevents
	 * subsequent reads after this write from seeing stale errors.
	 */
	bp->b_error = 0;
	bp->b_flags &= ~LIBXFS_B_STALE;
	bp->b_flags |= (LIBXFS_B_DIRTY | flags);
	libxfs_putbuf(bp);
	return 0;
}

void
libxfs_iomove(xfs_buf_t *bp, uint boff, int len, void *data, int flags)
{
#ifdef IO_DEBUG
	if (boff + len > bp->b_bcount) {
		printf("Badness, iomove out of range!\n"
			"bp=(bno 0x%llx, bytes %u) range=(boff %u, bytes %u)\n",
			(long long)bp->b_bn, bp->b_bcount, boff, len);
		abort();
	}
#endif
	switch (flags) {
	case LIBXFS_BZERO:
		memset(bp->b_addr + boff, 0, len);
		break;
	case LIBXFS_BREAD:
		memcpy(data, bp->b_addr + boff, len);
		break;
	case LIBXFS_BWRITE:
		memcpy(bp->b_addr + boff, data, len);
		break;
	}
}

static void
libxfs_brelse(
	struct cache_node	*node)
{
	struct xfs_buf		*bp = (struct xfs_buf *)node;

	if (!bp)
		return;
	if (bp->b_flags & LIBXFS_B_DIRTY)
		fprintf(stderr,
			"releasing dirty buffer to free list!");

	pthread_mutex_lock(&xfs_buf_freelist.cm_mutex);
	list_add(&bp->b_node.cn_mru, &xfs_buf_freelist.cm_list);
	pthread_mutex_unlock(&xfs_buf_freelist.cm_mutex);
}

static unsigned int
libxfs_bulkrelse(
	struct cache		*cache,
	struct list_head	*list)
{
	xfs_buf_t	*bp;
	int		count = 0;

	if (list_empty(list))
		return 0;

	list_for_each_entry(bp, list, b_node.cn_mru) {
		if (bp->b_flags & LIBXFS_B_DIRTY)
			fprintf(stderr,
				"releasing dirty buffer (bulk) to free list!");
		count++;
	}

	pthread_mutex_lock(&xfs_buf_freelist.cm_mutex);
	list_splice(list, &xfs_buf_freelist.cm_list);
	pthread_mutex_unlock(&xfs_buf_freelist.cm_mutex);

	return count;
}

/*
 * When a buffer is marked dirty, the error is cleared. Hence if we are trying
 * to flush a buffer prior to cache reclaim that has an error on it, it means
 * we've already tried to flush it and it failed. Prevent repeated corruption
 * errors from being reported by skipping such buffers - when the corruption is
 * fixed the buffer will be marked dirty again and we can write it again.
 */
static int
libxfs_bflush(
	struct cache_node	*node)
{
	struct xfs_buf		*bp = (struct xfs_buf *)node;

	if (!bp->b_error && bp->b_flags & LIBXFS_B_DIRTY)
		return libxfs_writebufr(bp);
	return bp->b_error;
}

void
libxfs_putbufr(xfs_buf_t *bp)
{
	if (bp->b_flags & LIBXFS_B_DIRTY)
		libxfs_writebufr(bp);
	libxfs_brelse((struct cache_node *)bp);
}

void
libxfs_bcache_purge(void)
{
	cache_purge(libxfs_bcache);
}

void
libxfs_bcache_flush(void)
{
	cache_flush(libxfs_bcache);
}

int
libxfs_bcache_overflowed(void)
{
	return cache_overflowed(libxfs_bcache);
}

struct cache_operations libxfs_bcache_operations = {
	.hash		= libxfs_bhash,
	.alloc		= libxfs_balloc,
	.flush		= libxfs_bflush,
	.relse		= libxfs_brelse,
	.compare	= libxfs_bcompare,
	.bulkrelse	= libxfs_bulkrelse
};


/*
 * Inode cache stubs.
 */

extern kmem_zone_t	*xfs_ili_zone;
extern kmem_zone_t	*xfs_inode_zone;

int
libxfs_iget(xfs_mount_t *mp, xfs_trans_t *tp, xfs_ino_t ino, uint lock_flags,
		xfs_inode_t **ipp)
{
	xfs_inode_t	*ip;
	int		error = 0;

	ip = kmem_zone_zalloc(xfs_inode_zone, 0);
	if (!ip)
		return -ENOMEM;

	ip->i_ino = ino;
	ip->i_mount = mp;
	error = xfs_iread(mp, tp, ip, 0);
	if (error) {
		kmem_zone_free(xfs_inode_zone, ip);
		*ipp = NULL;
		return error;
	}

	/*
	 * set up the inode ops structure that the libxfs code relies on
	 */
	if (S_ISDIR(VFS_I(ip)->i_mode))
		ip->d_ops = mp->m_dir_inode_ops;
	else
		ip->d_ops = mp->m_nondir_inode_ops;

	*ipp = ip;
	return 0;
}

static void
libxfs_idestroy(xfs_inode_t *ip)
{
	switch (VFS_I(ip)->i_mode & S_IFMT) {
		case S_IFREG:
		case S_IFDIR:
		case S_IFLNK:
			libxfs_idestroy_fork(ip, XFS_DATA_FORK);
			break;
	}
	if (ip->i_afp)
		libxfs_idestroy_fork(ip, XFS_ATTR_FORK);
}

void
libxfs_iput(xfs_inode_t *ip)
{
	if (ip->i_itemp)
		kmem_zone_free(xfs_ili_zone, ip->i_itemp);
	ip->i_itemp = NULL;

	libxfs_idestroy(ip);
	kmem_zone_free(xfs_inode_zone, ip);
}