1 // SPDX-License-Identifier: GPL-2.0
3 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
8 #include "libxfs_priv.h"
11 #include "xfs_shared.h"
12 #include "xfs_format.h"
13 #include "xfs_log_format.h"
14 #include "xfs_trans_resv.h"
15 #include "xfs_mount.h"
16 #include "xfs_inode_buf.h"
17 #include "xfs_inode_fork.h"
18 #include "xfs_inode.h"
19 #include "xfs_trans.h"
20 #include "libfrog/platform.h"
24 static void libxfs_brelse(struct cache_node
*node
);
27 * Important design/architecture note:
29 * The userspace code that uses the buffer cache is much less constrained than
30 * the kernel code. The userspace code is pretty nasty in places, especially
31 * when it comes to buffer error handling. Very little of the userspace code
32 * outside libxfs clears bp->b_error - very little code even checks it - so the
33 * libxfs code is tripping on stale errors left by the userspace code.
35 * We can't clear errors or zero buffer contents in libxfs_buf_get-* like we do
36 * in the kernel, because those functions are used by the libxfs_readbuf_*
37 * functions and hence need to leave the buffers unchanged on cache hits. This
38 * is actually the only way to gather a write error from a libxfs_writebuf()
39 * call - you need to get the buffer again so you can check bp->b_error field -
40 * assuming that the buffer is still in the cache when you check, that is.
42 * This is very different to the kernel code which does not release buffers on a
43 * write so we can wait on IO and check errors. The kernel buffer cache also
44 * guarantees a buffer of a known initial state from xfs_buf_get() even on a cache hit.
47 * IOWs, userspace is behaving quite differently to the kernel and as a result
48 * it leaks errors from reads, invalidations and writes through
49 * libxfs_buf_get/libxfs_buf_read.
51 * The result of this is that until the userspace code outside libxfs is cleaned
52 * up, functions that release buffers from userspace control (i.e.
53 * libxfs_writebuf/libxfs_buf_relse) need to zero bp->b_error to prevent
54 * propagation of stale errors into future buffer operations.
57 #define BDSTRAT_SIZE (256 * 1024)
59 #define IO_BCOMPARE_CHECK
61 /* XXX: (dgc) Propagate errors, only exit if fail-on-error flag set */
63 libxfs_device_zero(struct xfs_buftarg
*btp
, xfs_daddr_t start
, uint len
)
65 xfs_off_t start_offset
, end_offset
, offset
;
71 fd
= libxfs_device_to_fd(btp
->bt_bdev
);
72 start_offset
= LIBXFS_BBTOOFF64(start
);
74 /* try to use special zeroing methods, fall back to writes if needed */
75 len_bytes
= LIBXFS_BBTOOFF64(len
);
76 error
= platform_zero_range(fd
, start_offset
, len_bytes
);
78 xfs_buftarg_trip_write(btp
);
82 zsize
= min(BDSTRAT_SIZE
, BBTOB(len
));
83 if ((z
= memalign(libxfs_device_alignment(), zsize
)) == NULL
) {
85 _("%s: %s can't memalign %d bytes: %s\n"),
86 progname
, __FUNCTION__
, (int)zsize
, strerror(errno
));
91 if ((lseek(fd
, start_offset
, SEEK_SET
)) < 0) {
92 fprintf(stderr
, _("%s: %s seek to offset %llu failed: %s\n"),
93 progname
, __FUNCTION__
,
94 (unsigned long long)start_offset
, strerror(errno
));
98 end_offset
= LIBXFS_BBTOOFF64(start
+ len
) - start_offset
;
99 for (offset
= 0; offset
< end_offset
; ) {
100 bytes
= min((ssize_t
)(end_offset
- offset
), zsize
);
101 if ((bytes
= write(fd
, z
, bytes
)) < 0) {
102 fprintf(stderr
, _("%s: %s write failed: %s\n"),
103 progname
, __FUNCTION__
, strerror(errno
));
105 } else if (bytes
== 0) {
106 fprintf(stderr
, _("%s: %s not progressing?\n"),
107 progname
, __FUNCTION__
);
110 xfs_buftarg_trip_write(btp
);
/*
 * Format an unmount log operation into the buffer at p.
 *
 * The first BBSIZE bytes of p are zeroed, an xlog_op_header_t is filled in
 * at the start of the buffer (marker tid, payload length, XFS_LOG client id
 * and the XLOG_UNMOUNT_TRANS flag), and the XLOG_UNMOUNT_TYPE payload is
 * copied immediately after the header. p must therefore point at no less
 * than BBSIZE writable bytes.
 */
117 static void unmount_record(void *p
)
119 xlog_op_header_t
*op
= (xlog_op_header_t
*)p
;
120 /* the data section must be 32 bit size aligned */
124 uint32_t pad2
; /* may as well make it 64 bits */
125 } magic
= { XLOG_UNMOUNT_TYPE
, 0, 0 };
127 memset(p
, 0, BBSIZE
);
128 /* dummy tid to mark this as written from userspace */
129 op
->oh_tid
= cpu_to_be32(0xb0c0d0d0);
/* oh_len is the size of the payload only, not header + payload */
130 op
->oh_len
= cpu_to_be32(sizeof(magic
));
131 op
->oh_clientid
= XFS_LOG
;
132 op
->oh_flags
= XLOG_UNMOUNT_TRANS
;
135 /* and the data for this op */
136 memcpy((char *)p
+ sizeof(xlog_op_header_t
), &magic
, sizeof(magic
));
144 struct xfs_buf
*buf
= (struct xfs_buf
*)private;
147 (BBTOB(buf
->b_length
) < (int)(ptr
- (char *)buf
->b_addr
) + offset
))
155 struct xfs_mount
*mp
)
159 libxfs_buf_read(mp
->m_ddev_targp
, XFS_SB_DADDR
, XFS_FSS_TO_BB(mp
, 1),
160 0, &bp
, &xfs_sb_buf_ops
);
164 struct kmem_cache
*xfs_buf_cache
;
166 static struct cache_mru xfs_buf_freelist
=
167 {{&xfs_buf_freelist
.cm_list
, &xfs_buf_freelist
.cm_list
},
168 0, PTHREAD_MUTEX_INITIALIZER
};
171 * The bufkey is used to pass the new buffer information to the cache object
172 * allocation routine. Because discontiguous buffers need to pass different
173 * information, we need fields to pass that information. However, because the
174 * blkno and bblen are needed for the initial cache entry lookup (i.e. for
175 * bcompare), we rely on map/nmaps being non-null to switch to discontiguous
176 * buffer initialisation instead of contiguous buffer initialisation.
179 struct xfs_buftarg
*buftarg
;
182 struct xfs_buf_map
*map
;
186 /* 2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */
187 #define GOLDEN_RATIO_PRIME 0x9e37fffffffc0001UL
188 #define CACHE_LINE_SIZE 64
/*
 * Hash callback for the buffer cache (installed as .hash in
 * libxfs_bcache_operations).
 *
 * key is interpreted as a struct xfs_bufkey and only its blkno takes part in
 * the hash. The block number is XORed with
 * (GOLDEN_RATIO_PRIME + blkno) / CACHE_LINE_SIZE (the division binds tighter
 * than the XOR), folded with itself shifted right by hashshift bits, and the
 * result is reduced modulo hashsize.
 */
190 libxfs_bhash(cache_key_t key
, unsigned int hashsize
, unsigned int hashshift
)
192 uint64_t hashval
= ((struct xfs_bufkey
*)key
)->blkno
;
195 tmp
= hashval
^ (GOLDEN_RATIO_PRIME
+ hashval
) / CACHE_LINE_SIZE
;
196 tmp
= tmp
^ ((tmp
^ GOLDEN_RATIO_PRIME
) >> hashshift
);
197 return tmp
% hashsize
;
201 libxfs_bcompare(struct cache_node
*node
, cache_key_t key
)
203 struct xfs_buf
*bp
= container_of(node
, struct xfs_buf
,
205 struct xfs_bufkey
*bkey
= (struct xfs_bufkey
*)key
;
207 if (bp
->b_target
->bt_bdev
== bkey
->buftarg
->bt_bdev
&&
208 bp
->b_cache_key
== bkey
->blkno
) {
209 if (bp
->b_length
== bkey
->bblen
)
211 #ifdef IO_BCOMPARE_CHECK
212 if (!(libxfs_bcache
->c_flags
& CACHE_MISCOMPARE_PURGE
)) {
214 "%lx: Badness in key lookup (length)\n"
215 "bp=(bno 0x%llx, len %u bytes) key=(bno 0x%llx, len %u bytes)\n",
217 (unsigned long long)xfs_buf_daddr(bp
),
219 (unsigned long long)bkey
->blkno
,
229 __initbuf(struct xfs_buf
*bp
, struct xfs_buftarg
*btp
, xfs_daddr_t bno
,
233 bp
->b_cache_key
= bno
;
234 bp
->b_length
= BTOBB(bytes
);
236 bp
->b_mount
= btp
->bt_mount
;
239 bp
->b_addr
= memalign(libxfs_device_alignment(), bytes
);
242 _("%s: %s can't memalign %u bytes: %s\n"),
243 progname
, __FUNCTION__
, bytes
,
247 memset(bp
->b_addr
, 0, bytes
);
248 pthread_mutex_init(&bp
->b_lock
, NULL
);
252 INIT_LIST_HEAD(&bp
->b_li_list
);
255 bp
->b_maps
= &bp
->__b_map
;
257 if (bp
->b_maps
== &bp
->__b_map
) {
259 bp
->b_maps
[0].bm_bn
= bno
;
260 bp
->b_maps
[0].bm_len
= bp
->b_length
;
265 libxfs_initbuf(struct xfs_buf
*bp
, struct xfs_buftarg
*btp
, xfs_daddr_t bno
,
268 __initbuf(bp
, btp
, bno
, bytes
);
272 libxfs_initbuf_map(struct xfs_buf
*bp
, struct xfs_buftarg
*btp
,
273 struct xfs_buf_map
*map
, int nmaps
)
275 unsigned int bytes
= 0;
278 bytes
= sizeof(struct xfs_buf_map
) * nmaps
;
279 bp
->b_maps
= malloc(bytes
);
282 _("%s: %s can't malloc %u bytes: %s\n"),
283 progname
, __FUNCTION__
, bytes
,
290 for ( i
= 0; i
< nmaps
; i
++) {
291 bp
->b_maps
[i
].bm_bn
= map
[i
].bm_bn
;
292 bp
->b_maps
[i
].bm_len
= map
[i
].bm_len
;
293 bytes
+= BBTOB(map
[i
].bm_len
);
296 __initbuf(bp
, btp
, map
[0].bm_bn
, bytes
);
297 bp
->b_flags
|= LIBXFS_B_DISCONTIG
;
300 static struct xfs_buf
*
301 __libxfs_getbufr(int blen
)
306 * first look for a buffer that can be used as-is,
307 * if one cannot be found, see if there is a buffer,
308 * and if so, free its buffer and set b_addr to NULL
309 * before calling libxfs_initbuf.
311 pthread_mutex_lock(&xfs_buf_freelist
.cm_mutex
);
312 if (!list_empty(&xfs_buf_freelist
.cm_list
)) {
313 list_for_each_entry(bp
, &xfs_buf_freelist
.cm_list
, b_node
.cn_mru
) {
314 if (bp
->b_length
== BTOBB(blen
)) {
315 list_del_init(&bp
->b_node
.cn_mru
);
319 if (&bp
->b_node
.cn_mru
== &xfs_buf_freelist
.cm_list
) {
320 bp
= list_entry(xfs_buf_freelist
.cm_list
.next
,
321 struct xfs_buf
, b_node
.cn_mru
);
322 list_del_init(&bp
->b_node
.cn_mru
);
325 if (bp
->b_maps
!= &bp
->__b_map
)
330 bp
= kmem_cache_zalloc(xfs_buf_cache
, 0);
331 pthread_mutex_unlock(&xfs_buf_freelist
.cm_mutex
);
333 if (bp
->b_flags
& LIBXFS_B_DIRTY
)
334 fprintf(stderr
, "found dirty buffer (bulk) on free list!\n");
/*
 * Obtain an xfs_buf for bblen basic blocks at blkno on btp.
 *
 * The length is converted to bytes with BBTOB(), a buffer of that byte size
 * is requested from __libxfs_getbufr(), and the result is initialised with
 * libxfs_initbuf().
 * NOTE(review): any NULL check on bp and the return statement are not
 * visible here.
 */
339 static struct xfs_buf
*
340 libxfs_getbufr(struct xfs_buftarg
*btp
, xfs_daddr_t blkno
, int bblen
)
343 int blen
= BBTOB(bblen
);
345 bp
=__libxfs_getbufr(blen
);
347 libxfs_initbuf(bp
, btp
, blkno
, blen
);
351 static struct xfs_buf
*
352 libxfs_getbufr_map(struct xfs_buftarg
*btp
, xfs_daddr_t blkno
, int bblen
,
353 struct xfs_buf_map
*map
, int nmaps
)
356 int blen
= BBTOB(bblen
);
358 if (!map
|| !nmaps
) {
360 _("%s: %s invalid map %p or nmaps %d\n"),
361 progname
, __FUNCTION__
, map
, nmaps
);
365 if (blkno
!= map
[0].bm_bn
) {
367 _("%s: %s map blkno 0x%llx doesn't match key 0x%llx\n"),
368 progname
, __FUNCTION__
, (long long)map
[0].bm_bn
,
373 bp
=__libxfs_getbufr(blen
);
375 libxfs_initbuf_map(bp
, btp
, map
, nmaps
);
383 if (use_xfs_buf_lock
)
384 pthread_mutex_lock(&bp
->b_lock
);
389 struct xfs_bufkey
*key
,
391 struct xfs_buf
**bpp
)
393 struct cache_node
*cn
= NULL
;
/* Look the key up in the global buffer cache; cn receives the node. */
398 cache_node_get(libxfs_bcache
, key
, &cn
);
/* Recover the xfs_buf that embeds the cache node. */
401 bp
= container_of(cn
, struct xfs_buf
, b_node
);
403 if (use_xfs_buf_lock
) {
/* Try the buffer lock without blocking first. */
406 ret
= pthread_mutex_trylock(&bp
->b_lock
);
/*
 * NOTE(review): POSIX pthread_mutex_trylock() reports an already-locked
 * mutex with EBUSY; EAGAIN is documented for exceeding the maximum number
 * of recursive locks. Confirm which value this assertion is meant to
 * accept.
 */
408 ASSERT(ret
== EAGAIN
);
/* Trylock callers give the cache reference back rather than waiting. */
409 if (flags
& LIBXFS_GETBUF_TRYLOCK
) {
410 cache_node_put(libxfs_bcache
, cn
);
/* The calling thread is already recorded as the holder of this buffer. */
414 if (pthread_equal(bp
->b_holder
, pthread_self())) {
416 _("Warning: recursive buffer locking at block %" PRIu64
" detected\n"),
/* Blocking acquisition of the buffer lock. */
422 pthread_mutex_lock(&bp
->b_lock
);
/* Record the calling thread as the lock holder. */
426 bp
->b_holder
= pthread_self();
/*
 * Reduce the node's priority by CACHE_PREFETCH_PRIORITY.
 * NOTE(review): the condition guarding this adjustment is not visible here.
 */
429 cache_node_set_priority(libxfs_bcache
, cn
,
430 cache_node_get_priority(cn
) - CACHE_PREFETCH_PRIORITY
);
437 struct xfs_buftarg
*btp
,
441 struct xfs_buf
**bpp
)
443 struct xfs_bufkey key
= {NULL
};
450 ret
= __cache_lookup(&key
, flags
, bpp
);
454 if (btp
== btp
->bt_mount
->m_ddev_targp
) {
455 (*bpp
)->b_pag
= xfs_perag_get(btp
->bt_mount
,
456 xfs_daddr_to_agno(btp
->bt_mount
, blkno
));
463 * Clean the buffer flags for libxfs_getbuf*(), which wants to return
464 * an unused buffer with clean state. This prevents CRC errors on a
465 * re-read of a corrupt block that was prefetched and freed. This
466 * can happen with a massively corrupt directory that is discarded,
467 * but whose blocks are then recycled into expanding lost+found.
469 * Note however that if the buffer's dirty (prefetch calls getbuf)
470 * we'll leave the state alone because we don't want to discard blocks
471 * that have been fixed.
477 if (bp
&& !(bp
->b_flags
& LIBXFS_B_DIRTY
))
478 bp
->b_flags
&= ~(LIBXFS_B_UNCHECKED
| LIBXFS_B_STALE
|
483 __libxfs_buf_get_map(
484 struct xfs_buftarg
*btp
,
485 struct xfs_buf_map
*map
,
488 struct xfs_buf
**bpp
)
490 struct xfs_bufkey key
= {NULL
};
494 return libxfs_getbuf_flags(btp
, map
[0].bm_bn
, map
[0].bm_len
,
498 key
.blkno
= map
[0].bm_bn
;
499 for (i
= 0; i
< nmaps
; i
++) {
500 key
.bblen
+= map
[i
].bm_len
;
505 return __cache_lookup(&key
, flags
, bpp
);
510 struct xfs_buftarg
*btp
,
511 struct xfs_buf_map
*map
,
514 struct xfs_buf
**bpp
)
518 error
= __libxfs_buf_get_map(btp
, map
, nmaps
, flags
, bpp
);
522 reset_buf_state(*bpp
);
531 * ensure that any errors on this use of the buffer don't carry
532 * over to the next user.
535 if (use_xfs_buf_lock
) {
540 pthread_mutex_unlock(&bp
->b_lock
);
544 if (!list_empty(&bp
->b_node
.cn_hash
))
545 cache_node_put(libxfs_bcache
, &bp
->b_node
);
546 else if (--bp
->b_node
.cn_count
== 0) {
547 if (bp
->b_flags
& LIBXFS_B_DIRTY
)
549 libxfs_brelse(&bp
->b_node
);
553 static struct cache_node
*
557 struct xfs_bufkey
*bufkey
= (struct xfs_bufkey
*)key
;
561 bp
= libxfs_getbufr_map(bufkey
->buftarg
, bufkey
->blkno
,
562 bufkey
->bblen
, bufkey
->map
, bufkey
->nmaps
);
564 bp
= libxfs_getbufr(bufkey
->buftarg
, bufkey
->blkno
,
571 __read_buf(int fd
, void *buf
, int len
, off64_t offset
, int flags
)
575 sts
= pread(fd
, buf
, len
, offset
);
578 fprintf(stderr
, _("%s: read failed: %s\n"),
579 progname
, strerror(error
));
581 } else if (sts
!= len
) {
582 fprintf(stderr
, _("%s: error - read only %d of %d bytes\n"),
590 libxfs_readbufr(struct xfs_buftarg
*btp
, xfs_daddr_t blkno
, struct xfs_buf
*bp
,
593 int fd
= libxfs_device_to_fd(btp
->bt_bdev
);
594 int bytes
= BBTOB(len
);
597 ASSERT(len
<= bp
->b_length
);
599 error
= __read_buf(fd
, bp
->b_addr
, bytes
, LIBXFS_BBTOOFF64(blkno
), flags
);
601 bp
->b_target
->bt_bdev
== btp
->bt_bdev
&&
602 bp
->b_cache_key
== blkno
&&
604 bp
->b_flags
|= LIBXFS_B_UPTODATE
;
/*
 * Run the read verifier attached to the buffer.
 *
 * Calls bp->b_ops->verify_read(bp) and then clears LIBXFS_B_UNCHECKED in
 * b_flags.
 * NOTE(review): the statements that use the ops argument and produce the
 * return value are not visible here; callers in this file assign the result
 * to an error variable.
 */
610 libxfs_readbuf_verify(
612 const struct xfs_buf_ops
*ops
)
618 bp
->b_ops
->verify_read(bp
);
619 bp
->b_flags
&= ~LIBXFS_B_UNCHECKED
;
624 libxfs_readbufr_map(struct xfs_buftarg
*btp
, struct xfs_buf
*bp
, int flags
)
631 fd
= libxfs_device_to_fd(btp
->bt_bdev
);
633 for (i
= 0; i
< bp
->b_nmaps
; i
++) {
634 off64_t offset
= LIBXFS_BBTOOFF64(bp
->b_maps
[i
].bm_bn
);
635 int len
= BBTOB(bp
->b_maps
[i
].bm_len
);
637 error
= __read_buf(fd
, buf
, len
, offset
, flags
);
646 bp
->b_flags
|= LIBXFS_B_UPTODATE
;
652 struct xfs_buftarg
*btp
,
653 struct xfs_buf_map
*map
,
656 struct xfs_buf
**bpp
,
657 const struct xfs_buf_ops
*ops
)
660 bool salvage
= flags
& LIBXFS_READBUF_SALVAGE
;
665 error
= libxfs_getbuf_flags(btp
, map
[0].bm_bn
, map
[0].bm_len
,
668 error
= __libxfs_buf_get_map(btp
, map
, nmaps
, 0, &bp
);
673 * If the buffer was prefetched, it is likely that it was not validated.
674 * Hence if we are supplied an ops function and the buffer is marked as
675 * unchecked, we need to validate it now.
677 * We do this verification even if the buffer is dirty - the
678 * verification is almost certainly going to fail the CRC check in this
679 * case as a dirty buffer has not had the CRC recalculated. However, we
680 * should not be dirtying unchecked buffers and therefore failing it
681 * here because it's dirty and unchecked indicates we've screwed up
684 * Note that if the caller passes in LIBXFS_READBUF_SALVAGE, that means
685 * they want the buffer even if it fails verification.
688 if (bp
->b_flags
& (LIBXFS_B_UPTODATE
| LIBXFS_B_DIRTY
)) {
689 if (bp
->b_flags
& LIBXFS_B_UNCHECKED
)
690 error
= libxfs_readbuf_verify(bp
, ops
);
691 if (error
&& !salvage
)
697 * Set the ops on a cache miss (i.e. first physical read) as the
698 * verifier may change the ops to match the type of buffer it contains.
699 * A cache hit might reset the verifier to the original type if we set
700 * it again, but it won't get called again and set to match the buffer
701 * contents. *cough* xfs_da_node_buf_ops *cough*.
704 error
= libxfs_readbufr(btp
, map
[0].bm_bn
, bp
, map
[0].bm_len
,
707 error
= libxfs_readbufr_map(btp
, bp
, flags
);
711 error
= libxfs_readbuf_verify(bp
, ops
);
712 if (error
&& !salvage
)
719 libxfs_buf_relse(bp
);
723 /* Allocate a raw uncached buffer. */
/*
 * Gets a buffer from libxfs_getbufr() and sets up its cache node by hand:
 * cn_hash is initialised as an empty list head and cn_count is set to 1.
 * NOTE(review): the daddr/bblen parameter lines, any NULL check on bp and
 * the return statement are not visible here.
 */
724 static inline struct xfs_buf
*
725 libxfs_getbufr_uncached(
726 struct xfs_buftarg
*targ
,
732 bp
= libxfs_getbufr(targ
, daddr
, bblen
);
736 INIT_LIST_HEAD(&bp
->b_node
.cn_hash
);
737 bp
->b_node
.cn_count
= 1;
742 * Allocate an uncached buffer that points nowhere. The refcount will be 1,
743 * and the cache node hash list will be empty to indicate that it's uncached.
/*
 * Stores in *bpp the buffer returned by libxfs_getbufr_uncached() for targ,
 * requested with disk address XFS_BUF_DADDR_NULL and a length of bblen.
 *
 * Returns 0 when *bpp is non-NULL, -ENOMEM otherwise.
 */
746 libxfs_buf_get_uncached(
747 struct xfs_buftarg
*targ
,
750 struct xfs_buf
**bpp
)
752 *bpp
= libxfs_getbufr_uncached(targ
, XFS_BUF_DADDR_NULL
, bblen
);
753 return *bpp
!= NULL
? 0 : -ENOMEM
;
757 * Allocate and read an uncached buffer. The refcount will be 1, and the cache
758 * node hash list will be empty to indicate that it's uncached.
761 libxfs_buf_read_uncached(
762 struct xfs_buftarg
*targ
,
766 struct xfs_buf
**bpp
,
767 const struct xfs_buf_ops
*ops
)
773 bp
= libxfs_getbufr_uncached(targ
, daddr
, bblen
);
777 error
= libxfs_readbufr(targ
, daddr
, bp
, bblen
, flags
);
781 error
= libxfs_readbuf_verify(bp
, ops
);
788 libxfs_buf_relse(bp
);
793 __write_buf(int fd
, void *buf
, int len
, off64_t offset
, int flags
)
797 sts
= pwrite(fd
, buf
, len
, offset
);
800 fprintf(stderr
, _("%s: pwrite failed: %s\n"),
801 progname
, strerror(error
));
803 } else if (sts
!= len
) {
804 fprintf(stderr
, _("%s: error - pwrite only %d of %d bytes\n"),
815 int fd
= libxfs_device_to_fd(bp
->b_target
->bt_bdev
);
818 * we never write buffers that are marked stale. This indicates they
819 * contain data that has been invalidated, and even if the buffer is
820 * dirty it must *never* be written. Verifiers are wonderful for finding
821 * bugs like this. Make sure the error is obvious as to the cause.
823 if (bp
->b_flags
& LIBXFS_B_STALE
) {
824 bp
->b_error
= -ESTALE
;
828 /* Trigger the writeback hook if there is one. */
829 if (bp
->b_mount
->m_buf_writeback_fn
)
830 bp
->b_mount
->m_buf_writeback_fn(bp
);
833 * clear any pre-existing error status on the buffer. This can occur if
834 * the buffer is corrupt on disk and the repair process doesn't clear
835 * the error before fixing and writing it back.
839 bp
->b_ops
->verify_write(bp
);
842 _("%s: write verifier failed on %s bno 0x%llx/0x%x\n"),
843 __func__
, bp
->b_ops
->name
,
844 (unsigned long long)xfs_buf_daddr(bp
),
850 if (!(bp
->b_flags
& LIBXFS_B_DISCONTIG
)) {
851 bp
->b_error
= __write_buf(fd
, bp
->b_addr
, BBTOB(bp
->b_length
),
852 LIBXFS_BBTOOFF64(xfs_buf_daddr(bp
)),
856 void *buf
= bp
->b_addr
;
858 for (i
= 0; i
< bp
->b_nmaps
; i
++) {
859 off64_t offset
= LIBXFS_BBTOOFF64(bp
->b_maps
[i
].bm_bn
);
860 int len
= BBTOB(bp
->b_maps
[i
].bm_len
);
862 bp
->b_error
= __write_buf(fd
, buf
, len
, offset
,
872 _("%s: write failed on %s bno 0x%llx/0x%x, err=%d\n"),
873 __func__
, bp
->b_ops
? bp
->b_ops
->name
: "(unknown)",
874 (unsigned long long)xfs_buf_daddr(bp
),
875 bp
->b_length
, -bp
->b_error
);
877 bp
->b_flags
|= LIBXFS_B_UPTODATE
;
878 bp
->b_flags
&= ~(LIBXFS_B_DIRTY
| LIBXFS_B_UNCHECKED
);
879 xfs_buftarg_trip_write(bp
->b_target
);
885 * Mark a buffer dirty. The dirty data will be written out when the cache
886 * is flushed (or at release time if the buffer is uncached).
/*
 * Flag updates performed here: LIBXFS_B_STALE is cleared from b_flags and
 * LIBXFS_B_DIRTY is set.
 * NOTE(review): the statement that clears the stale error, described by the
 * comment text below, is not visible here.
 */
889 libxfs_buf_mark_dirty(
893 * Clear any error hanging over from reading the buffer. This prevents
894 * subsequent reads after this write from seeing stale errors.
897 bp
->b_flags
&= ~LIBXFS_B_STALE
;
898 bp
->b_flags
|= LIBXFS_B_DIRTY
;
901 /* Prepare a buffer to be sent to the MRU list. */
903 libxfs_buf_prepare_mru(
907 xfs_perag_put(bp
->b_pag
);
910 if (!(bp
->b_flags
& LIBXFS_B_DIRTY
))
913 /* Complain about (and remember) dropping dirty buffers. */
914 fprintf(stderr
, _("%s: Releasing dirty buffer to free list!\n"),
917 if (bp
->b_error
== -EFSCORRUPTED
)
918 bp
->b_target
->flags
|= XFS_BUFTARG_CORRUPT_WRITE
;
919 bp
->b_target
->flags
|= XFS_BUFTARG_LOST_WRITE
;
924 struct cache_node
*node
)
926 struct xfs_buf
*bp
= container_of(node
, struct xfs_buf
,
931 libxfs_buf_prepare_mru(bp
);
933 pthread_mutex_lock(&xfs_buf_freelist
.cm_mutex
);
934 list_add(&bp
->b_node
.cn_mru
, &xfs_buf_freelist
.cm_list
);
935 pthread_mutex_unlock(&xfs_buf_freelist
.cm_mutex
);
941 struct list_head
*list
)
946 if (list_empty(list
))
949 list_for_each_entry(bp
, list
, b_node
.cn_mru
) {
950 libxfs_buf_prepare_mru(bp
);
954 pthread_mutex_lock(&xfs_buf_freelist
.cm_mutex
);
955 list_splice(list
, &xfs_buf_freelist
.cm_list
);
956 pthread_mutex_unlock(&xfs_buf_freelist
.cm_mutex
);
962 * Free everything from the xfs_buf_freelist MRU, used at final teardown
/*
 * Walk xfs_buf_freelist.cm_list with the removal-safe iterator and hand every
 * xfs_buf back to xfs_buf_cache with kmem_cache_free().
 *
 * Buffers whose b_maps does not point at the embedded __b_map get extra
 * handling before being freed.
 * NOTE(review): that statement is not visible here - presumably it frees the
 * separately allocated map array; confirm.
 */
965 libxfs_bcache_free(void)
967 struct list_head
*cm_list
;
968 struct xfs_buf
*bp
, *next
;
970 cm_list
= &xfs_buf_freelist
.cm_list
;
971 list_for_each_entry_safe(bp
, next
, cm_list
, b_node
.cn_mru
) {
973 if (bp
->b_maps
!= &bp
->__b_map
)
975 kmem_cache_free(xfs_buf_cache
, bp
);
980 * When a buffer is marked dirty, the error is cleared. Hence if we are trying
981 * to flush a buffer prior to cache reclaim that has an error on it, it means
982 * we've already tried to flush it and it failed. Prevent repeated corruption
983 * errors from being reported by skipping such buffers - when the corruption is
984 * fixed the buffer will be marked dirty again and we can write it again.
988 struct cache_node
*node
)
990 struct xfs_buf
*bp
= container_of(node
, struct xfs_buf
,
993 if (!bp
->b_error
&& bp
->b_flags
& LIBXFS_B_DIRTY
)
994 return libxfs_bwrite(bp
);
/* Thin wrapper: runs cache_purge() on the global libxfs_bcache. */
999 libxfs_bcache_purge(void)
1001 cache_purge(libxfs_bcache
);
/* Thin wrapper: runs cache_flush() on the global libxfs_bcache. */
1005 libxfs_bcache_flush(void)
1007 cache_flush(libxfs_bcache
);
/* Thin wrapper: returns the result of cache_overflowed() for libxfs_bcache. */
1011 libxfs_bcache_overflowed(void)
1013 return cache_overflowed(libxfs_bcache
);
/*
 * Callback table (struct cache_operations) that plugs the libxfs buffer
 * routines into the cache: hashing, allocation, flushing, single and bulk
 * release, and key comparison.
 */
1016 struct cache_operations libxfs_bcache_operations
= {
1017 .hash
= libxfs_bhash
,
1018 .alloc
= libxfs_balloc
,
1019 .flush
= libxfs_bflush
,
1020 .relse
= libxfs_brelse
,
1021 .compare
= libxfs_bcompare
,
1022 .bulkrelse
= libxfs_bulkrelse
1026 * Verify an on-disk magic value against the magic value specified in the
1027 * verifier structure. The verifier magic is in disk byte order so the caller is
1028 * expected to pass the value directly from disk.
1035 struct xfs_mount
*mp
= bp
->b_mount
;
1038 idx
= xfs_has_crc(mp
);
1039 if (unlikely(WARN_ON(!bp
->b_ops
|| !bp
->b_ops
->magic
[idx
])))
1041 return dmagic
== bp
->b_ops
->magic
[idx
];
1045 * Verify an on-disk magic value against the magic value specified in the
1046 * verifier structure. The verifier magic is in disk byte order so the caller is
1047 * expected to pass the value directly from disk.
1054 struct xfs_mount
*mp
= bp
->b_mount
;
1057 idx
= xfs_has_crc(mp
);
1058 if (unlikely(WARN_ON(!bp
->b_ops
|| !bp
->b_ops
->magic16
[idx
])))
1060 return dmagic
== bp
->b_ops
->magic16
[idx
];
1064 * Inode cache stubs.
1067 struct kmem_cache
*xfs_inode_cache
;
1068 extern struct kmem_cache
*xfs_ili_cache
;
1072 struct xfs_mount
*mp
,
1073 struct xfs_trans
*tp
,
1076 struct xfs_inode
**ipp
)
1078 struct xfs_inode
*ip
;
1082 ip
= kmem_cache_zalloc(xfs_inode_cache
, 0);
1086 VFS_I(ip
)->i_count
= 1;
1089 ip
->i_af
.if_format
= XFS_DINODE_FMT_EXTENTS
;
1090 ip
->i_df
.if_present
= 1;
1091 spin_lock_init(&VFS_I(ip
)->i_lock
);
1093 error
= xfs_imap(mp
, tp
, ip
->i_ino
, &ip
->i_imap
, 0);
1097 error
= xfs_imap_to_bp(mp
, tp
, &ip
->i_imap
, &bp
);
1101 error
= xfs_inode_from_disk(ip
,
1102 xfs_buf_offset(bp
, ip
->i_imap
.im_boffset
));
1104 xfs_buf_set_ref(bp
, XFS_INO_REF
);
1105 xfs_trans_brelse(tp
, bp
);
1114 kmem_cache_free(xfs_inode_cache
, ip
);
1120 libxfs_idestroy(xfs_inode_t
*ip
)
1122 switch (VFS_I(ip
)->i_mode
& S_IFMT
) {
1126 libxfs_idestroy_fork(&ip
->i_df
);
1129 if (ip
->i_af
.if_present
) {
1130 libxfs_idestroy_fork(&ip
->i_af
);
1131 libxfs_ifork_zap_attr(ip
);
1134 libxfs_idestroy_fork(ip
->i_cowfp
);
1135 kmem_cache_free(xfs_ifork_cache
, ip
->i_cowfp
);
1141 struct xfs_inode
*ip
)
1143 VFS_I(ip
)->i_count
--;
1145 if (VFS_I(ip
)->i_count
== 0) {
1146 ASSERT(ip
->i_itemp
== NULL
);
1147 libxfs_idestroy(ip
);
1148 kmem_cache_free(xfs_inode_cache
, ip
);
1153 * Flush everything dirty in the kernel and disk write caches to stable media.
1154 * Returns 0 for success or a negative error code.
/*
 * Resolves btp->bt_bdev to a file descriptor with libxfs_device_to_fd() and
 * passes both to platform_flush_device(). A non-zero result from that call is
 * reported to the caller as -errno; otherwise 0 is returned.
 *
 * A bt_bdev of 0 is special-cased before any of that happens.
 * NOTE(review): the statement taken in that case is not visible here -
 * presumably an early return; confirm.
 */
1157 libxfs_blkdev_issue_flush(
1158 struct xfs_buftarg
*btp
)
1162 if (btp
->bt_bdev
== 0)
1165 fd
= libxfs_device_to_fd(btp
->bt_bdev
);
1166 ret
= platform_flush_device(fd
, btp
->bt_bdev
);
1167 return ret
? -errno
: 0;
1171 * Write out a buffer list synchronously.
1173 * This will take the @buffer_list, write all buffers out and wait for I/O
1174 * completion on all of the buffers. @buffer_list is consumed by the function,
1175 * so callers must have some other way of tracking buffers if they require such functionality.
/*
 * Walks @buffer_list with the removal-safe iterator. Each buffer is unlinked
 * from the list through its b_list member, written with libxfs_bwrite() and
 * then released with libxfs_buf_relse().
 *
 * error starts at 0 and error2 receives the result of each write.
 * NOTE(review): how error2 is folded into error, and the return statement,
 * are not visible here - presumably the first failure is kept; confirm.
 */
1179 xfs_buf_delwri_submit(
1180 struct list_head
*buffer_list
)
1182 struct xfs_buf
*bp
, *n
;
1183 int error
= 0, error2
;
1185 list_for_each_entry_safe(bp
, n
, buffer_list
, b_list
) {
1186 list_del_init(&bp
->b_list
);
1187 error2
= libxfs_bwrite(bp
);
1190 libxfs_buf_relse(bp
);
1197 * Cancel a delayed write list.
1199 * Remove each buffer from the list, clear the delwri queue flag and drop the
1200 * associated buffer reference.
/*
 * Drains @list: while it is non-empty, the first buffer is taken, unlinked
 * through its b_list member and released with libxfs_buf_relse().
 * NOTE(review): any statement between taking the buffer and unlinking it
 * (e.g. a flag update) is not visible here.
 */
1203 xfs_buf_delwri_cancel(
1204 struct list_head
*list
)
1208 while (!list_empty(list
)) {
1209 bp
= list_first_entry(list
, struct xfs_buf
, b_list
);
1211 list_del_init(&bp
->b_list
);
1212 libxfs_buf_relse(bp
);
1217 * Format the log. The caller provides either a buftarg which is used to access
1218 * the log via buffers or a direct pointer to a buffer that encapsulates the
1223 struct xfs_buftarg
*btp
,
1226 uint length
, /* basic blocks */
1229 int sunit
, /* bytes */
1234 struct xfs_buf
*bp
= NULL
;
1239 xfs_daddr_t end_blk
;
1242 if (((btp
&& dptr
) || (!btp
&& !dptr
)) ||
1243 (btp
&& !btp
->bt_bdev
) || !fs_uuid
)
1246 /* first zero the log */
1248 libxfs_device_zero(btp
, start
, length
);
1250 memset(dptr
, 0, BBTOB(length
));
1253 * Initialize the log record length and LSNs. XLOG_INIT_CYCLE is a
1254 * special reset case where we only write a single record where the lsn
1255 * and tail_lsn match. Otherwise, the record lsn starts at block 0 of
1256 * the specified cycle and points tail_lsn at the last record of the previous cycle.
1259 len
= ((version
== 2) && sunit
) ? BTOBB(sunit
) : 2;
1261 lsn
= xlog_assign_lsn(cycle
, 0);
1262 if (cycle
== XLOG_INIT_CYCLE
)
1265 tail_lsn
= xlog_assign_lsn(cycle
- 1, length
- len
);
1267 /* write out the first log record */
1270 bp
= libxfs_getbufr_uncached(btp
, start
, len
);
1273 libxfs_log_header(ptr
, fs_uuid
, version
, sunit
, fmt
, lsn
, tail_lsn
,
1276 libxfs_buf_mark_dirty(bp
);
1277 libxfs_buf_relse(bp
);
1281 * There's nothing else to do if this is a log reset. The kernel detects
1282 * the rest of the log is zeroed and starts at cycle 1.
1284 if (cycle
== XLOG_INIT_CYCLE
)
1288 * Bump the record size for a full log format if the caller allows it.
1289 * This is primarily for performance reasons and most callers don't care
1290 * about record size since the log is clean after we're done.
1293 len
= BTOBB(BDSTRAT_SIZE
);
1296 * Otherwise, fill everything beyond the initial record with records of
1297 * the previous cycle so the kernel head/tail detection works correctly.
1299 * We don't particularly care about the record size or content here.
1300 * It's only important that the headers are in place such that the
1301 * kernel finds 1.) a clean log and 2.) the correct current cycle value.
1302 * Therefore, bump up the record size to the max to use larger I/Os and
1303 * improve performance.
1309 end_blk
= start
+ length
;
1311 len
= min(end_blk
- blk
, len
);
1312 while (blk
< end_blk
) {
1313 lsn
= xlog_assign_lsn(cycle
, blk
- start
);
1314 tail_lsn
= xlog_assign_lsn(cycle
, blk
- start
- len
);
1318 bp
= libxfs_getbufr_uncached(btp
, blk
, len
);
1322 * Note: pass the full buffer length as the sunit to initialize
1323 * the entire buffer.
1325 libxfs_log_header(ptr
, fs_uuid
, version
, BBTOB(len
), fmt
, lsn
,
1326 tail_lsn
, next
, bp
);
1328 libxfs_buf_mark_dirty(bp
);
1329 libxfs_buf_relse(bp
);
1335 len
= min(end_blk
- blk
, len
);
1350 libxfs_get_block_t
*nextfunc
,
1353 xlog_rec_header_t
*head
= (xlog_rec_header_t
*)caddr
;
1359 if (lsn
== NULLCOMMITLSN
)
1360 lsn
= xlog_assign_lsn(XLOG_INIT_CYCLE
, 0);
1361 if (tail_lsn
== NULLCOMMITLSN
)
1364 len
= ((version
== 2) && sunit
) ? BTOBB(sunit
) : 1;
1366 memset(p
, 0, BBSIZE
);
1367 head
->h_magicno
= cpu_to_be32(XLOG_HEADER_MAGIC_NUM
);
1368 head
->h_cycle
= cpu_to_be32(CYCLE_LSN(lsn
));
1369 head
->h_version
= cpu_to_be32(version
);
1370 head
->h_crc
= cpu_to_le32(0);
1371 head
->h_prev_block
= cpu_to_be32(-1);
1372 head
->h_num_logops
= cpu_to_be32(1);
1373 head
->h_fmt
= cpu_to_be32(fmt
);
1374 head
->h_size
= cpu_to_be32(max(sunit
, XLOG_BIG_RECORD_BSIZE
));
1376 head
->h_lsn
= cpu_to_be64(lsn
);
1377 head
->h_tail_lsn
= cpu_to_be64(tail_lsn
);
1379 memcpy(&head
->h_fs_uuid
, fs_uuid
, sizeof(uuid_t
));
1382 * The kernel expects to see either a log record header magic value or
1383 * the LSN cycle at the top of every log block. The first word of each
1384 * non-header block is copied to the record headers and replaced with
1385 * the cycle value (see xlog_[un]pack_data() and xlog_get_cycle() for details).
1388 * Even though we only ever write an unmount record (one block), we
1389 * support writing log records up to the max log buffer size of 256k to
1390 * improve log format performance. This means a record can require up
1391 * to 8 headers (1 rec. header + 7 ext. headers) for the packed cycle
1392 * data (each header supports 32k of data).
1394 cycle_lsn
= CYCLE_LSN_DISK(head
->h_lsn
);
1395 if (version
== 2 && sunit
> XLOG_HEADER_CYCLE_SIZE
) {
1396 hdrs
= sunit
/ XLOG_HEADER_CYCLE_SIZE
;
1397 if (sunit
% XLOG_HEADER_CYCLE_SIZE
)
1402 * A fixed number of extended headers is expected based on h_size. If
1403 * required, format those now so the unmount record is located
1406 * Since we only write an unmount record, we only need one h_cycle_data
1407 * entry for the unmount record block. The subsequent record data
1408 * blocks are zeroed, which means we can stamp them directly with the
1409 * cycle and zero the rest of the cycle data in the extended headers.
1412 for (i
= 1; i
< hdrs
; i
++) {
1413 p
= nextfunc(p
, BBSIZE
, private);
1414 memset(p
, 0, BBSIZE
);
1415 /* xlog_rec_ext_header.xh_cycle */
1416 *(__be32
*)p
= cycle_lsn
;
1421 * The total length is the max of the stripe unit or 2 basic block
1422 * minimum (1 hdr blk + 1 data blk). The record length is the total
1423 * minus however many header blocks are required.
1425 head
->h_len
= cpu_to_be32(max(BBTOB(2), sunit
) - hdrs
* BBSIZE
);
1428 * Write out the unmount record, pack the first word into the record
1429 * header and stamp the block with the cycle.
1431 p
= nextfunc(p
, BBSIZE
, private);
1434 head
->h_cycle_data
[0] = *(__be32
*)p
;
1435 *(__be32
*)p
= cycle_lsn
;
1438 * Finally, zero all remaining blocks in the record and stamp each with
1439 * the cycle. We don't need to pack any of these blocks because the
1440 * cycle data in the headers has already been zeroed.
1442 len
= max(len
, hdrs
+ 1);
1443 for (i
= hdrs
+ 1; i
< len
; i
++) {
1444 p
= nextfunc(p
, BBSIZE
, private);
1445 memset(p
, 0, BBSIZE
);
1446 *(__be32
*)p
= cycle_lsn
;
/*
 * Thin wrapper: sets the priority of the buffer's cache node (bp->b_node) in
 * libxfs_bcache via cache_node_set_priority().
 */
1453 libxfs_buf_set_priority(
1457 cache_node_set_priority(libxfs_bcache
, &bp
->b_node
, priority
);
/*
 * Thin wrapper: returns cache_node_get_priority() for the buffer's cache node
 * (bp->b_node).
 */
1461 libxfs_buf_priority(
1464 return cache_node_get_priority(&bp
->b_node
);
1468 * Log a message about and stale a buffer that a caller has decided is corrupt.
1470 * This function should be called for the kinds of metadata corruption that
1471 * cannot be detected by a verifier, such as incorrect inter-block relationship
1472 * data. Do /not/ call this function from a verifier function.
1474 * The buffer must be XBF_DONE prior to the call. Afterwards, the buffer will
1475 * be marked stale, but b_error will not be set. The caller is responsible for
1476 * releasing the buffer or fixing it.
1479 __xfs_buf_mark_corrupt(
1483 ASSERT(bp
->b_flags
& XBF_DONE
);
1485 xfs_buf_corruption_error(bp
, fa
);