// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
8 #include "libxfs_priv.h"
11 #include "xfs_shared.h"
12 #include "xfs_format.h"
13 #include "xfs_log_format.h"
14 #include "xfs_trans_resv.h"
15 #include "xfs_mount.h"
16 #include "xfs_inode_buf.h"
17 #include "xfs_inode_fork.h"
18 #include "xfs_inode.h"
19 #include "xfs_trans.h"
20 #include "libfrog/platform.h"
/* Forward declaration: release a cache node's buffer back to the free list. */
static void libxfs_brelse(struct cache_node *node);
/*
 * Important design/architecture note:
 *
 * The userspace code that uses the buffer cache is much less constrained than
 * the kernel code. The userspace code is pretty nasty in places, especially
 * when it comes to buffer error handling. Very little of the userspace code
 * outside libxfs clears bp->b_error - very little code even checks it - so the
 * libxfs code is tripping on stale errors left by the userspace code.
 *
 * We can't clear errors or zero buffer contents in libxfs_buf_get-* like we do
 * in the kernel, because those functions are used by the libxfs_readbuf_*
 * functions and hence need to leave the buffers unchanged on cache hits. This
 * is actually the only way to gather a write error from a libxfs_writebuf()
 * call - you need to get the buffer again so you can check bp->b_error field -
 * assuming that the buffer is still in the cache when you check, that is.
 *
 * This is very different to the kernel code which does not release buffers on a
 * write so we can wait on IO and check errors. The kernel buffer cache also
 * guarantees a buffer of a known initial state from xfs_buf_get() even on a
 * cache hit.
 *
 * IOWs, userspace is behaving quite differently to the kernel and as a result
 * it leaks errors from reads, invalidations and writes through
 * libxfs_buf_get/libxfs_buf_read.
 *
 * The result of this is that until the userspace code outside libxfs is cleaned
 * up, functions that release buffers from userspace control (i.e
 * libxfs_writebuf/libxfs_buf_relse) need to zero bp->b_error to prevent
 * propagation of stale errors into future buffer operations.
 */
/* Maximum I/O chunk used when zeroing a device range by writing. */
#define BDSTRAT_SIZE	(256 * 1024)

/* Enable verbose diagnostics on buffer-cache key miscompares. */
#define IO_BCOMPARE_CHECK

/* XXX: (dgc) Propagate errors, only exit if fail-on-error flag set */
63 libxfs_device_zero(struct xfs_buftarg
*btp
, xfs_daddr_t start
, uint len
)
65 int fd
= btp
->bt_bdev_fd
;
66 xfs_off_t start_offset
, end_offset
, offset
;
72 start_offset
= LIBXFS_BBTOOFF64(start
);
74 /* try to use special zeroing methods, fall back to writes if needed */
75 len_bytes
= LIBXFS_BBTOOFF64(len
);
76 error
= platform_zero_range(fd
, start_offset
, len_bytes
);
78 xfs_buftarg_trip_write(btp
);
82 zsize
= min(BDSTRAT_SIZE
, BBTOB(len
));
83 if ((z
= memalign(libxfs_device_alignment(), zsize
)) == NULL
) {
85 _("%s: %s can't memalign %d bytes: %s\n"),
86 progname
, __FUNCTION__
, (int)zsize
, strerror(errno
));
91 if ((lseek(fd
, start_offset
, SEEK_SET
)) < 0) {
92 fprintf(stderr
, _("%s: %s seek to offset %llu failed: %s\n"),
93 progname
, __FUNCTION__
,
94 (unsigned long long)start_offset
, strerror(errno
));
98 end_offset
= LIBXFS_BBTOOFF64(start
+ len
) - start_offset
;
99 for (offset
= 0; offset
< end_offset
; ) {
100 bytes
= min((ssize_t
)(end_offset
- offset
), zsize
);
101 if ((bytes
= write(fd
, z
, bytes
)) < 0) {
102 fprintf(stderr
, _("%s: %s write failed: %s\n"),
103 progname
, __FUNCTION__
, strerror(errno
));
105 } else if (bytes
== 0) {
106 fprintf(stderr
, _("%s: %s not progressing?\n"),
107 progname
, __FUNCTION__
);
110 xfs_buftarg_trip_write(btp
);
117 static void unmount_record(void *p
)
119 xlog_op_header_t
*op
= (xlog_op_header_t
*)p
;
120 /* the data section must be 32 bit size aligned */
124 uint32_t pad2
; /* may as well make it 64 bits */
125 } magic
= { XLOG_UNMOUNT_TYPE
, 0, 0 };
127 memset(p
, 0, BBSIZE
);
128 /* dummy tid to mark this as written from userspace */
129 op
->oh_tid
= cpu_to_be32(0xb0c0d0d0);
130 op
->oh_len
= cpu_to_be32(sizeof(magic
));
131 op
->oh_clientid
= XFS_LOG
;
132 op
->oh_flags
= XLOG_UNMOUNT_TRANS
;
135 /* and the data for this op */
136 memcpy((char *)p
+ sizeof(xlog_op_header_t
), &magic
, sizeof(magic
));
144 struct xfs_buf
*buf
= (struct xfs_buf
*)private;
147 (BBTOB(buf
->b_length
) < (int)(ptr
- (char *)buf
->b_addr
) + offset
))
155 struct xfs_mount
*mp
)
159 libxfs_buf_read(mp
->m_ddev_targp
, XFS_SB_DADDR
, XFS_FSS_TO_BB(mp
, 1),
160 0, &bp
, &xfs_sb_buf_ops
);
164 struct kmem_cache
*xfs_buf_cache
;
166 static struct cache_mru xfs_buf_freelist
=
167 {{&xfs_buf_freelist
.cm_list
, &xfs_buf_freelist
.cm_list
},
168 0, PTHREAD_MUTEX_INITIALIZER
};
171 * The bufkey is used to pass the new buffer information to the cache object
172 * allocation routine. Because discontiguous buffers need to pass different
173 * information, we need fields to pass that information. However, because the
174 * blkno and bblen is needed for the initial cache entry lookup (i.e. for
175 * bcompare) the fact that the map/nmaps is non-null to switch to discontiguous
176 * buffer initialisation instead of a contiguous buffer.
179 struct xfs_buftarg
*buftarg
;
182 struct xfs_buf_map
*map
;
186 /* 2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */
187 #define GOLDEN_RATIO_PRIME 0x9e37fffffffc0001UL
188 #define CACHE_LINE_SIZE 64
190 libxfs_bhash(cache_key_t key
, unsigned int hashsize
, unsigned int hashshift
)
192 uint64_t hashval
= ((struct xfs_bufkey
*)key
)->blkno
;
195 tmp
= hashval
^ (GOLDEN_RATIO_PRIME
+ hashval
) / CACHE_LINE_SIZE
;
196 tmp
= tmp
^ ((tmp
^ GOLDEN_RATIO_PRIME
) >> hashshift
);
197 return tmp
% hashsize
;
201 libxfs_bcompare(struct cache_node
*node
, cache_key_t key
)
203 struct xfs_buf
*bp
= container_of(node
, struct xfs_buf
,
205 struct xfs_bufkey
*bkey
= (struct xfs_bufkey
*)key
;
207 if (bp
->b_target
->bt_bdev
== bkey
->buftarg
->bt_bdev
&&
208 bp
->b_cache_key
== bkey
->blkno
) {
209 if (bp
->b_length
== bkey
->bblen
)
211 #ifdef IO_BCOMPARE_CHECK
212 if (!(libxfs_bcache
->c_flags
& CACHE_MISCOMPARE_PURGE
)) {
214 "%lx: Badness in key lookup (length)\n"
215 "bp=(bno 0x%llx, len %u bytes) key=(bno 0x%llx, len %u bytes)\n",
217 (unsigned long long)xfs_buf_daddr(bp
),
219 (unsigned long long)bkey
->blkno
,
229 __initbuf(struct xfs_buf
*bp
, struct xfs_buftarg
*btp
, xfs_daddr_t bno
,
233 bp
->b_cache_key
= bno
;
234 bp
->b_length
= BTOBB(bytes
);
236 bp
->b_mount
= btp
->bt_mount
;
239 bp
->b_addr
= memalign(libxfs_device_alignment(), bytes
);
242 _("%s: %s can't memalign %u bytes: %s\n"),
243 progname
, __FUNCTION__
, bytes
,
247 memset(bp
->b_addr
, 0, bytes
);
248 pthread_mutex_init(&bp
->b_lock
, NULL
);
252 INIT_LIST_HEAD(&bp
->b_li_list
);
255 bp
->b_maps
= &bp
->__b_map
;
257 if (bp
->b_maps
== &bp
->__b_map
) {
259 bp
->b_maps
[0].bm_bn
= bno
;
260 bp
->b_maps
[0].bm_len
= bp
->b_length
;
265 libxfs_initbuf(struct xfs_buf
*bp
, struct xfs_buftarg
*btp
, xfs_daddr_t bno
,
268 __initbuf(bp
, btp
, bno
, bytes
);
272 libxfs_initbuf_map(struct xfs_buf
*bp
, struct xfs_buftarg
*btp
,
273 struct xfs_buf_map
*map
, int nmaps
)
275 unsigned int bytes
= 0;
278 bytes
= sizeof(struct xfs_buf_map
) * nmaps
;
279 bp
->b_maps
= malloc(bytes
);
282 _("%s: %s can't malloc %u bytes: %s\n"),
283 progname
, __FUNCTION__
, bytes
,
290 for ( i
= 0; i
< nmaps
; i
++) {
291 bp
->b_maps
[i
].bm_bn
= map
[i
].bm_bn
;
292 bp
->b_maps
[i
].bm_len
= map
[i
].bm_len
;
293 bytes
+= BBTOB(map
[i
].bm_len
);
296 __initbuf(bp
, btp
, map
[0].bm_bn
, bytes
);
297 bp
->b_flags
|= LIBXFS_B_DISCONTIG
;
300 static struct xfs_buf
*
301 __libxfs_getbufr(int blen
)
306 * first look for a buffer that can be used as-is,
307 * if one cannot be found, see if there is a buffer,
308 * and if so, free its buffer and set b_addr to NULL
309 * before calling libxfs_initbuf.
311 pthread_mutex_lock(&xfs_buf_freelist
.cm_mutex
);
312 if (!list_empty(&xfs_buf_freelist
.cm_list
)) {
313 list_for_each_entry(bp
, &xfs_buf_freelist
.cm_list
, b_node
.cn_mru
) {
314 if (bp
->b_length
== BTOBB(blen
)) {
315 list_del_init(&bp
->b_node
.cn_mru
);
319 if (&bp
->b_node
.cn_mru
== &xfs_buf_freelist
.cm_list
) {
320 bp
= list_entry(xfs_buf_freelist
.cm_list
.next
,
321 struct xfs_buf
, b_node
.cn_mru
);
322 list_del_init(&bp
->b_node
.cn_mru
);
325 if (bp
->b_maps
!= &bp
->__b_map
)
330 bp
= kmem_cache_zalloc(xfs_buf_cache
, 0);
331 pthread_mutex_unlock(&xfs_buf_freelist
.cm_mutex
);
333 if (bp
->b_flags
& LIBXFS_B_DIRTY
)
334 fprintf(stderr
, "found dirty buffer (bulk) on free list!\n");
339 static struct xfs_buf
*
340 libxfs_getbufr(struct xfs_buftarg
*btp
, xfs_daddr_t blkno
, int bblen
)
343 int blen
= BBTOB(bblen
);
345 bp
=__libxfs_getbufr(blen
);
347 libxfs_initbuf(bp
, btp
, blkno
, blen
);
351 static struct xfs_buf
*
352 libxfs_getbufr_map(struct xfs_buftarg
*btp
, xfs_daddr_t blkno
, int bblen
,
353 struct xfs_buf_map
*map
, int nmaps
)
356 int blen
= BBTOB(bblen
);
358 if (!map
|| !nmaps
) {
360 _("%s: %s invalid map %p or nmaps %d\n"),
361 progname
, __FUNCTION__
, map
, nmaps
);
365 if (blkno
!= map
[0].bm_bn
) {
367 _("%s: %s map blkno 0x%llx doesn't match key 0x%llx\n"),
368 progname
, __FUNCTION__
, (long long)map
[0].bm_bn
,
373 bp
=__libxfs_getbufr(blen
);
375 libxfs_initbuf_map(bp
, btp
, map
, nmaps
);
383 if (use_xfs_buf_lock
)
384 pthread_mutex_lock(&bp
->b_lock
);
391 if (use_xfs_buf_lock
)
392 pthread_mutex_unlock(&bp
->b_lock
);
397 struct xfs_bufkey
*key
,
399 struct xfs_buf
**bpp
)
401 struct cache_node
*cn
= NULL
;
406 cache_node_get(libxfs_bcache
, key
, &cn
);
409 bp
= container_of(cn
, struct xfs_buf
, b_node
);
411 if (use_xfs_buf_lock
) {
414 ret
= pthread_mutex_trylock(&bp
->b_lock
);
416 ASSERT(ret
== EAGAIN
);
417 if (flags
& LIBXFS_GETBUF_TRYLOCK
) {
418 cache_node_put(libxfs_bcache
, cn
);
422 if (pthread_equal(bp
->b_holder
, pthread_self())) {
424 _("Warning: recursive buffer locking at block %" PRIu64
" detected\n"),
430 pthread_mutex_lock(&bp
->b_lock
);
434 bp
->b_holder
= pthread_self();
437 cache_node_set_priority(libxfs_bcache
, cn
,
438 cache_node_get_priority(cn
) - CACHE_PREFETCH_PRIORITY
);
445 struct xfs_buftarg
*btp
,
449 struct xfs_buf
**bpp
)
451 struct xfs_bufkey key
= {NULL
};
458 ret
= __cache_lookup(&key
, flags
, bpp
);
462 if (btp
== btp
->bt_mount
->m_ddev_targp
) {
463 (*bpp
)->b_pag
= xfs_perag_get(btp
->bt_mount
,
464 xfs_daddr_to_agno(btp
->bt_mount
, blkno
));
471 * Clean the buffer flags for libxfs_getbuf*(), which wants to return
472 * an unused buffer with clean state. This prevents CRC errors on a
473 * re-read of a corrupt block that was prefetched and freed. This
474 * can happen with a massively corrupt directory that is discarded,
475 * but whose blocks are then recycled into expanding lost+found.
477 * Note however that if the buffer's dirty (prefetch calls getbuf)
478 * we'll leave the state alone because we don't want to discard blocks
479 * that have been fixed.
485 if (bp
&& !(bp
->b_flags
& LIBXFS_B_DIRTY
))
486 bp
->b_flags
&= ~(LIBXFS_B_UNCHECKED
| LIBXFS_B_STALE
|
491 __libxfs_buf_get_map(
492 struct xfs_buftarg
*btp
,
493 struct xfs_buf_map
*map
,
496 struct xfs_buf
**bpp
)
498 struct xfs_bufkey key
= {NULL
};
502 return libxfs_getbuf_flags(btp
, map
[0].bm_bn
, map
[0].bm_len
,
506 key
.blkno
= map
[0].bm_bn
;
507 for (i
= 0; i
< nmaps
; i
++) {
508 key
.bblen
+= map
[i
].bm_len
;
513 return __cache_lookup(&key
, flags
, bpp
);
518 struct xfs_buftarg
*btp
,
519 struct xfs_buf_map
*map
,
522 struct xfs_buf
**bpp
)
526 error
= __libxfs_buf_get_map(btp
, map
, nmaps
, flags
, bpp
);
530 reset_buf_state(*bpp
);
539 * ensure that any errors on this use of the buffer don't carry
540 * over to the next user.
543 if (use_xfs_buf_lock
) {
548 pthread_mutex_unlock(&bp
->b_lock
);
552 if (!list_empty(&bp
->b_node
.cn_hash
))
553 cache_node_put(libxfs_bcache
, &bp
->b_node
);
554 else if (--bp
->b_node
.cn_count
== 0) {
555 if (bp
->b_flags
& LIBXFS_B_DIRTY
)
557 libxfs_brelse(&bp
->b_node
);
561 static struct cache_node
*
565 struct xfs_bufkey
*bufkey
= (struct xfs_bufkey
*)key
;
569 bp
= libxfs_getbufr_map(bufkey
->buftarg
, bufkey
->blkno
,
570 bufkey
->bblen
, bufkey
->map
, bufkey
->nmaps
);
572 bp
= libxfs_getbufr(bufkey
->buftarg
, bufkey
->blkno
,
579 __read_buf(int fd
, void *buf
, int len
, off64_t offset
, int flags
)
583 sts
= pread(fd
, buf
, len
, offset
);
586 fprintf(stderr
, _("%s: read failed: %s\n"),
587 progname
, strerror(error
));
589 } else if (sts
!= len
) {
590 fprintf(stderr
, _("%s: error - read only %d of %d bytes\n"),
598 libxfs_readbufr(struct xfs_buftarg
*btp
, xfs_daddr_t blkno
, struct xfs_buf
*bp
,
601 int fd
= btp
->bt_bdev_fd
;
602 int bytes
= BBTOB(len
);
605 ASSERT(len
<= bp
->b_length
);
607 error
= __read_buf(fd
, bp
->b_addr
, bytes
, LIBXFS_BBTOOFF64(blkno
), flags
);
609 bp
->b_target
->bt_bdev
== btp
->bt_bdev
&&
610 bp
->b_cache_key
== blkno
&&
612 bp
->b_flags
|= LIBXFS_B_UPTODATE
;
618 libxfs_readbuf_verify(
620 const struct xfs_buf_ops
*ops
)
626 bp
->b_ops
->verify_read(bp
);
627 bp
->b_flags
&= ~LIBXFS_B_UNCHECKED
;
632 libxfs_readbufr_map(struct xfs_buftarg
*btp
, struct xfs_buf
*bp
, int flags
)
634 int fd
= btp
->bt_bdev_fd
;
640 for (i
= 0; i
< bp
->b_nmaps
; i
++) {
641 off64_t offset
= LIBXFS_BBTOOFF64(bp
->b_maps
[i
].bm_bn
);
642 int len
= BBTOB(bp
->b_maps
[i
].bm_len
);
644 error
= __read_buf(fd
, buf
, len
, offset
, flags
);
653 bp
->b_flags
|= LIBXFS_B_UPTODATE
;
659 struct xfs_buftarg
*btp
,
660 struct xfs_buf_map
*map
,
663 struct xfs_buf
**bpp
,
664 const struct xfs_buf_ops
*ops
)
667 bool salvage
= flags
& LIBXFS_READBUF_SALVAGE
;
672 error
= libxfs_getbuf_flags(btp
, map
[0].bm_bn
, map
[0].bm_len
,
675 error
= __libxfs_buf_get_map(btp
, map
, nmaps
, 0, &bp
);
680 * If the buffer was prefetched, it is likely that it was not validated.
681 * Hence if we are supplied an ops function and the buffer is marked as
682 * unchecked, we need to validate it now.
684 * We do this verification even if the buffer is dirty - the
685 * verification is almost certainly going to fail the CRC check in this
686 * case as a dirty buffer has not had the CRC recalculated. However, we
687 * should not be dirtying unchecked buffers and therefore failing it
688 * here because it's dirty and unchecked indicates we've screwed up
691 * Note that if the caller passes in LIBXFS_READBUF_SALVAGE, that means
692 * they want the buffer even if it fails verification.
695 if (bp
->b_flags
& (LIBXFS_B_UPTODATE
| LIBXFS_B_DIRTY
)) {
696 if (bp
->b_flags
& LIBXFS_B_UNCHECKED
)
697 error
= libxfs_readbuf_verify(bp
, ops
);
698 if (error
&& !salvage
)
704 * Set the ops on a cache miss (i.e. first physical read) as the
705 * verifier may change the ops to match the type of buffer it contains.
706 * A cache hit might reset the verifier to the original type if we set
707 * it again, but it won't get called again and set to match the buffer
708 * contents. *cough* xfs_da_node_buf_ops *cough*.
711 error
= libxfs_readbufr(btp
, map
[0].bm_bn
, bp
, map
[0].bm_len
,
714 error
= libxfs_readbufr_map(btp
, bp
, flags
);
718 error
= libxfs_readbuf_verify(bp
, ops
);
719 if (error
&& !salvage
)
726 libxfs_buf_relse(bp
);
730 /* Allocate a raw uncached buffer. */
731 static inline struct xfs_buf
*
732 libxfs_getbufr_uncached(
733 struct xfs_buftarg
*targ
,
739 bp
= libxfs_getbufr(targ
, daddr
, bblen
);
743 INIT_LIST_HEAD(&bp
->b_node
.cn_hash
);
744 bp
->b_node
.cn_count
= 1;
749 * Allocate an uncached buffer that points nowhere. The refcount will be 1,
750 * and the cache node hash list will be empty to indicate that it's uncached.
753 libxfs_buf_get_uncached(
754 struct xfs_buftarg
*targ
,
757 struct xfs_buf
**bpp
)
759 *bpp
= libxfs_getbufr_uncached(targ
, XFS_BUF_DADDR_NULL
, bblen
);
760 return *bpp
!= NULL
? 0 : -ENOMEM
;
764 * Allocate and read an uncached buffer. The refcount will be 1, and the cache
765 * node hash list will be empty to indicate that it's uncached.
768 libxfs_buf_read_uncached(
769 struct xfs_buftarg
*targ
,
773 struct xfs_buf
**bpp
,
774 const struct xfs_buf_ops
*ops
)
780 bp
= libxfs_getbufr_uncached(targ
, daddr
, bblen
);
784 error
= libxfs_readbufr(targ
, daddr
, bp
, bblen
, flags
);
788 error
= libxfs_readbuf_verify(bp
, ops
);
795 libxfs_buf_relse(bp
);
800 __write_buf(int fd
, void *buf
, int len
, off64_t offset
, int flags
)
804 sts
= pwrite(fd
, buf
, len
, offset
);
807 fprintf(stderr
, _("%s: pwrite failed: %s\n"),
808 progname
, strerror(error
));
810 } else if (sts
!= len
) {
811 fprintf(stderr
, _("%s: error - pwrite only %d of %d bytes\n"),
822 int fd
= bp
->b_target
->bt_bdev_fd
;
825 * we never write buffers that are marked stale. This indicates they
826 * contain data that has been invalidated, and even if the buffer is
827 * dirty it must *never* be written. Verifiers are wonderful for finding
828 * bugs like this. Make sure the error is obvious as to the cause.
830 if (bp
->b_flags
& LIBXFS_B_STALE
) {
831 bp
->b_error
= -ESTALE
;
835 /* Trigger the writeback hook if there is one. */
836 if (bp
->b_mount
->m_buf_writeback_fn
)
837 bp
->b_mount
->m_buf_writeback_fn(bp
);
840 * clear any pre-existing error status on the buffer. This can occur if
841 * the buffer is corrupt on disk and the repair process doesn't clear
842 * the error before fixing and writing it back.
846 bp
->b_ops
->verify_write(bp
);
849 _("%s: write verifier failed on %s bno 0x%llx/0x%x\n"),
850 __func__
, bp
->b_ops
->name
,
851 (unsigned long long)xfs_buf_daddr(bp
),
857 if (!(bp
->b_flags
& LIBXFS_B_DISCONTIG
)) {
858 bp
->b_error
= __write_buf(fd
, bp
->b_addr
, BBTOB(bp
->b_length
),
859 LIBXFS_BBTOOFF64(xfs_buf_daddr(bp
)),
863 void *buf
= bp
->b_addr
;
865 for (i
= 0; i
< bp
->b_nmaps
; i
++) {
866 off64_t offset
= LIBXFS_BBTOOFF64(bp
->b_maps
[i
].bm_bn
);
867 int len
= BBTOB(bp
->b_maps
[i
].bm_len
);
869 bp
->b_error
= __write_buf(fd
, buf
, len
, offset
,
879 _("%s: write failed on %s bno 0x%llx/0x%x, err=%d\n"),
880 __func__
, bp
->b_ops
? bp
->b_ops
->name
: "(unknown)",
881 (unsigned long long)xfs_buf_daddr(bp
),
882 bp
->b_length
, -bp
->b_error
);
884 bp
->b_flags
|= LIBXFS_B_UPTODATE
;
885 bp
->b_flags
&= ~(LIBXFS_B_DIRTY
| LIBXFS_B_UNCHECKED
);
886 xfs_buftarg_trip_write(bp
->b_target
);
892 * Mark a buffer dirty. The dirty data will be written out when the cache
893 * is flushed (or at release time if the buffer is uncached).
896 libxfs_buf_mark_dirty(
900 * Clear any error hanging over from reading the buffer. This prevents
901 * subsequent reads after this write from seeing stale errors.
904 bp
->b_flags
&= ~LIBXFS_B_STALE
;
905 bp
->b_flags
|= LIBXFS_B_DIRTY
;
908 /* Prepare a buffer to be sent to the MRU list. */
910 libxfs_buf_prepare_mru(
914 xfs_perag_put(bp
->b_pag
);
917 if (!(bp
->b_flags
& LIBXFS_B_DIRTY
))
920 /* Complain about (and remember) dropping dirty buffers. */
921 fprintf(stderr
, _("%s: Releasing dirty buffer to free list!\n"),
924 if (bp
->b_error
== -EFSCORRUPTED
)
925 bp
->b_target
->flags
|= XFS_BUFTARG_CORRUPT_WRITE
;
926 bp
->b_target
->flags
|= XFS_BUFTARG_LOST_WRITE
;
931 struct cache_node
*node
)
933 struct xfs_buf
*bp
= container_of(node
, struct xfs_buf
,
938 libxfs_buf_prepare_mru(bp
);
940 pthread_mutex_lock(&xfs_buf_freelist
.cm_mutex
);
941 list_add(&bp
->b_node
.cn_mru
, &xfs_buf_freelist
.cm_list
);
942 pthread_mutex_unlock(&xfs_buf_freelist
.cm_mutex
);
948 struct list_head
*list
)
953 if (list_empty(list
))
956 list_for_each_entry(bp
, list
, b_node
.cn_mru
) {
957 libxfs_buf_prepare_mru(bp
);
961 pthread_mutex_lock(&xfs_buf_freelist
.cm_mutex
);
962 list_splice(list
, &xfs_buf_freelist
.cm_list
);
963 pthread_mutex_unlock(&xfs_buf_freelist
.cm_mutex
);
969 * Free everything from the xfs_buf_freelist MRU, used at final teardown
972 libxfs_bcache_free(void)
974 struct list_head
*cm_list
;
975 struct xfs_buf
*bp
, *next
;
977 cm_list
= &xfs_buf_freelist
.cm_list
;
978 list_for_each_entry_safe(bp
, next
, cm_list
, b_node
.cn_mru
) {
980 if (bp
->b_maps
!= &bp
->__b_map
)
982 kmem_cache_free(xfs_buf_cache
, bp
);
987 * When a buffer is marked dirty, the error is cleared. Hence if we are trying
988 * to flush a buffer prior to cache reclaim that has an error on it it means
989 * we've already tried to flush it and it failed. Prevent repeated corruption
990 * errors from being reported by skipping such buffers - when the corruption is
991 * fixed the buffer will be marked dirty again and we can write it again.
995 struct cache_node
*node
)
997 struct xfs_buf
*bp
= container_of(node
, struct xfs_buf
,
1000 if (!bp
->b_error
&& bp
->b_flags
& LIBXFS_B_DIRTY
)
1001 return libxfs_bwrite(bp
);
1006 libxfs_bcache_purge(void)
1008 cache_purge(libxfs_bcache
);
1012 libxfs_bcache_flush(void)
1014 cache_flush(libxfs_bcache
);
1018 libxfs_bcache_overflowed(void)
1020 return cache_overflowed(libxfs_bcache
);
1023 struct cache_operations libxfs_bcache_operations
= {
1024 .hash
= libxfs_bhash
,
1025 .alloc
= libxfs_balloc
,
1026 .flush
= libxfs_bflush
,
1027 .relse
= libxfs_brelse
,
1028 .compare
= libxfs_bcompare
,
1029 .bulkrelse
= libxfs_bulkrelse
1033 * Verify an on-disk magic value against the magic value specified in the
1034 * verifier structure. The verifier magic is in disk byte order so the caller is
1035 * expected to pass the value directly from disk.
1042 struct xfs_mount
*mp
= bp
->b_mount
;
1045 idx
= xfs_has_crc(mp
);
1046 if (unlikely(WARN_ON(!bp
->b_ops
|| !bp
->b_ops
->magic
[idx
])))
1048 return dmagic
== bp
->b_ops
->magic
[idx
];
1052 * Verify an on-disk magic value against the magic value specified in the
1053 * verifier structure. The verifier magic is in disk byte order so the caller is
1054 * expected to pass the value directly from disk.
1061 struct xfs_mount
*mp
= bp
->b_mount
;
1064 idx
= xfs_has_crc(mp
);
1065 if (unlikely(WARN_ON(!bp
->b_ops
|| !bp
->b_ops
->magic16
[idx
])))
1067 return dmagic
== bp
->b_ops
->magic16
[idx
];
/*
 * Inode cache stubs.
 */

struct kmem_cache		*xfs_inode_cache;
extern struct kmem_cache	*xfs_ili_cache;
1079 struct xfs_mount
*mp
,
1080 struct xfs_trans
*tp
,
1083 struct xfs_inode
**ipp
)
1085 struct xfs_inode
*ip
;
1087 struct xfs_perag
*pag
;
1090 /* reject inode numbers outside existing AGs */
1091 if (!ino
|| XFS_INO_TO_AGNO(mp
, ino
) >= mp
->m_sb
.sb_agcount
)
1094 ip
= kmem_cache_zalloc(xfs_inode_cache
, 0);
1098 VFS_I(ip
)->i_count
= 1;
1101 ip
->i_af
.if_format
= XFS_DINODE_FMT_EXTENTS
;
1102 spin_lock_init(&VFS_I(ip
)->i_lock
);
1104 pag
= xfs_perag_get(mp
, XFS_INO_TO_AGNO(mp
, ip
->i_ino
));
1105 error
= xfs_imap(pag
, tp
, ip
->i_ino
, &ip
->i_imap
, 0);
1111 error
= xfs_imap_to_bp(mp
, tp
, &ip
->i_imap
, &bp
);
1115 error
= xfs_inode_from_disk(ip
,
1116 xfs_buf_offset(bp
, ip
->i_imap
.im_boffset
));
1118 xfs_buf_set_ref(bp
, XFS_INO_REF
);
1119 xfs_trans_brelse(tp
, bp
);
1128 kmem_cache_free(xfs_inode_cache
, ip
);
1134 libxfs_idestroy(xfs_inode_t
*ip
)
1136 switch (VFS_I(ip
)->i_mode
& S_IFMT
) {
1140 libxfs_idestroy_fork(&ip
->i_df
);
1144 libxfs_ifork_zap_attr(ip
);
1147 libxfs_idestroy_fork(ip
->i_cowfp
);
1148 kmem_cache_free(xfs_ifork_cache
, ip
->i_cowfp
);
1154 struct xfs_inode
*ip
)
1156 VFS_I(ip
)->i_count
--;
1158 if (VFS_I(ip
)->i_count
== 0) {
1159 ASSERT(ip
->i_itemp
== NULL
);
1160 libxfs_idestroy(ip
);
1161 kmem_cache_free(xfs_inode_cache
, ip
);
1166 * Flush everything dirty in the kernel and disk write caches to stable media.
1167 * Returns 0 for success or a negative error code.
1170 libxfs_blkdev_issue_flush(
1171 struct xfs_buftarg
*btp
)
1175 if (btp
->bt_bdev
== 0)
1178 ret
= platform_flush_device(btp
->bt_bdev_fd
, btp
->bt_bdev
);
1179 return ret
? -errno
: 0;
1183 * Write out a buffer list synchronously.
1185 * This will take the @buffer_list, write all buffers out and wait for I/O
1186 * completion on all of the buffers. @buffer_list is consumed by the function,
1187 * so callers must have some other way of tracking buffers if they require such
1191 xfs_buf_delwri_submit(
1192 struct list_head
*buffer_list
)
1194 struct xfs_buf
*bp
, *n
;
1195 int error
= 0, error2
;
1197 list_for_each_entry_safe(bp
, n
, buffer_list
, b_list
) {
1198 list_del_init(&bp
->b_list
);
1199 error2
= libxfs_bwrite(bp
);
1202 libxfs_buf_relse(bp
);
1209 * Cancel a delayed write list.
1211 * Remove each buffer from the list, clear the delwri queue flag and drop the
1212 * associated buffer reference.
1215 xfs_buf_delwri_cancel(
1216 struct list_head
*list
)
1220 while (!list_empty(list
)) {
1221 bp
= list_first_entry(list
, struct xfs_buf
, b_list
);
1223 list_del_init(&bp
->b_list
);
1224 libxfs_buf_relse(bp
);
1229 * Format the log. The caller provides either a buftarg which is used to access
1230 * the log via buffers or a direct pointer to a buffer that encapsulates the
1235 struct xfs_buftarg
*btp
,
1238 uint length
, /* basic blocks */
1241 int sunit
, /* bytes */
1246 struct xfs_buf
*bp
= NULL
;
1251 xfs_daddr_t end_blk
;
1254 if (((btp
&& dptr
) || (!btp
&& !dptr
)) ||
1255 (btp
&& !btp
->bt_bdev
) || !fs_uuid
)
1258 /* first zero the log */
1260 libxfs_device_zero(btp
, start
, length
);
1262 memset(dptr
, 0, BBTOB(length
));
1265 * Initialize the log record length and LSNs. XLOG_INIT_CYCLE is a
1266 * special reset case where we only write a single record where the lsn
1267 * and tail_lsn match. Otherwise, the record lsn starts at block 0 of
1268 * the specified cycle and points tail_lsn at the last record of the
1271 len
= ((version
== 2) && sunit
) ? BTOBB(sunit
) : 2;
1273 lsn
= xlog_assign_lsn(cycle
, 0);
1274 if (cycle
== XLOG_INIT_CYCLE
)
1277 tail_lsn
= xlog_assign_lsn(cycle
- 1, length
- len
);
1279 /* write out the first log record */
1282 bp
= libxfs_getbufr_uncached(btp
, start
, len
);
1285 libxfs_log_header(ptr
, fs_uuid
, version
, sunit
, fmt
, lsn
, tail_lsn
,
1288 libxfs_buf_mark_dirty(bp
);
1289 libxfs_buf_relse(bp
);
1293 * There's nothing else to do if this is a log reset. The kernel detects
1294 * the rest of the log is zeroed and starts at cycle 1.
1296 if (cycle
== XLOG_INIT_CYCLE
)
1300 * Bump the record size for a full log format if the caller allows it.
1301 * This is primarily for performance reasons and most callers don't care
1302 * about record size since the log is clean after we're done.
1305 len
= BTOBB(BDSTRAT_SIZE
);
1308 * Otherwise, fill everything beyond the initial record with records of
1309 * the previous cycle so the kernel head/tail detection works correctly.
1311 * We don't particularly care about the record size or content here.
1312 * It's only important that the headers are in place such that the
1313 * kernel finds 1.) a clean log and 2.) the correct current cycle value.
1314 * Therefore, bump up the record size to the max to use larger I/Os and
1315 * improve performance.
1321 end_blk
= start
+ length
;
1323 len
= min(end_blk
- blk
, len
);
1324 while (blk
< end_blk
) {
1325 lsn
= xlog_assign_lsn(cycle
, blk
- start
);
1326 tail_lsn
= xlog_assign_lsn(cycle
, blk
- start
- len
);
1330 bp
= libxfs_getbufr_uncached(btp
, blk
, len
);
1334 * Note: pass the full buffer length as the sunit to initialize
1335 * the entire buffer.
1337 libxfs_log_header(ptr
, fs_uuid
, version
, BBTOB(len
), fmt
, lsn
,
1338 tail_lsn
, next
, bp
);
1340 libxfs_buf_mark_dirty(bp
);
1341 libxfs_buf_relse(bp
);
1347 len
= min(end_blk
- blk
, len
);
1362 libxfs_get_block_t
*nextfunc
,
1365 xlog_rec_header_t
*head
= (xlog_rec_header_t
*)caddr
;
1371 if (lsn
== NULLCOMMITLSN
)
1372 lsn
= xlog_assign_lsn(XLOG_INIT_CYCLE
, 0);
1373 if (tail_lsn
== NULLCOMMITLSN
)
1376 len
= ((version
== 2) && sunit
) ? BTOBB(sunit
) : 1;
1378 memset(p
, 0, BBSIZE
);
1379 head
->h_magicno
= cpu_to_be32(XLOG_HEADER_MAGIC_NUM
);
1380 head
->h_cycle
= cpu_to_be32(CYCLE_LSN(lsn
));
1381 head
->h_version
= cpu_to_be32(version
);
1382 head
->h_crc
= cpu_to_le32(0);
1383 head
->h_prev_block
= cpu_to_be32(-1);
1384 head
->h_num_logops
= cpu_to_be32(1);
1385 head
->h_fmt
= cpu_to_be32(fmt
);
1386 head
->h_size
= cpu_to_be32(max(sunit
, XLOG_BIG_RECORD_BSIZE
));
1388 head
->h_lsn
= cpu_to_be64(lsn
);
1389 head
->h_tail_lsn
= cpu_to_be64(tail_lsn
);
1391 memcpy(&head
->h_fs_uuid
, fs_uuid
, sizeof(uuid_t
));
1394 * The kernel expects to see either a log record header magic value or
1395 * the LSN cycle at the top of every log block. The first word of each
1396 * non-header block is copied to the record headers and replaced with
1397 * the cycle value (see xlog_[un]pack_data() and xlog_get_cycle() for
1400 * Even though we only ever write an unmount record (one block), we
1401 * support writing log records up to the max log buffer size of 256k to
1402 * improve log format performance. This means a record can require up
1403 * to 8 headers (1 rec. header + 7 ext. headers) for the packed cycle
1404 * data (each header supports 32k of data).
1406 cycle_lsn
= CYCLE_LSN_DISK(head
->h_lsn
);
1407 if (version
== 2 && sunit
> XLOG_HEADER_CYCLE_SIZE
) {
1408 hdrs
= sunit
/ XLOG_HEADER_CYCLE_SIZE
;
1409 if (sunit
% XLOG_HEADER_CYCLE_SIZE
)
1414 * A fixed number of extended headers is expected based on h_size. If
1415 * required, format those now so the unmount record is located
1418 * Since we only write an unmount record, we only need one h_cycle_data
1419 * entry for the unmount record block. The subsequent record data
1420 * blocks are zeroed, which means we can stamp them directly with the
1421 * cycle and zero the rest of the cycle data in the extended headers.
1424 for (i
= 1; i
< hdrs
; i
++) {
1425 p
= nextfunc(p
, BBSIZE
, private);
1426 memset(p
, 0, BBSIZE
);
1427 /* xlog_rec_ext_header.xh_cycle */
1428 *(__be32
*)p
= cycle_lsn
;
1433 * The total length is the max of the stripe unit or 2 basic block
1434 * minimum (1 hdr blk + 1 data blk). The record length is the total
1435 * minus however many header blocks are required.
1437 head
->h_len
= cpu_to_be32(max(BBTOB(2), sunit
) - hdrs
* BBSIZE
);
1440 * Write out the unmount record, pack the first word into the record
1441 * header and stamp the block with the cycle.
1443 p
= nextfunc(p
, BBSIZE
, private);
1446 head
->h_cycle_data
[0] = *(__be32
*)p
;
1447 *(__be32
*)p
= cycle_lsn
;
1450 * Finally, zero all remaining blocks in the record and stamp each with
1451 * the cycle. We don't need to pack any of these blocks because the
1452 * cycle data in the headers has already been zeroed.
1454 len
= max(len
, hdrs
+ 1);
1455 for (i
= hdrs
+ 1; i
< len
; i
++) {
1456 p
= nextfunc(p
, BBSIZE
, private);
1457 memset(p
, 0, BBSIZE
);
1458 *(__be32
*)p
= cycle_lsn
;
1465 libxfs_buf_set_priority(
1469 cache_node_set_priority(libxfs_bcache
, &bp
->b_node
, priority
);
1473 libxfs_buf_priority(
1476 return cache_node_get_priority(&bp
->b_node
);
1480 * Log a message about and stale a buffer that a caller has decided is corrupt.
1482 * This function should be called for the kinds of metadata corruption that
1483 * cannot be detect from a verifier, such as incorrect inter-block relationship
1484 * data. Do /not/ call this function from a verifier function.
1486 * The buffer must be XBF_DONE prior to the call. Afterwards, the buffer will
1487 * be marked stale, but b_error will not be set. The caller is responsible for
1488 * releasing the buffer or fixing it.
1491 __xfs_buf_mark_corrupt(
1495 ASSERT(bp
->b_flags
& XBF_DONE
);
1497 xfs_buf_corruption_error(bp
, fa
);