// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
8 #include "libxfs_priv.h"
11 #include "xfs_shared.h"
12 #include "xfs_format.h"
13 #include "xfs_log_format.h"
14 #include "xfs_trans_resv.h"
15 #include "xfs_mount.h"
16 #include "xfs_inode_buf.h"
17 #include "xfs_inode_fork.h"
18 #include "xfs_inode.h"
19 #include "xfs_trans.h"
21 #include "libxfs.h" /* for LIBXFS_EXIT_ON_FAILURE */
24 * Important design/architecture note:
26 * The userspace code that uses the buffer cache is much less constrained than
27 * the kernel code. The userspace code is pretty nasty in places, especially
28 * when it comes to buffer error handling. Very little of the userspace code
29 * outside libxfs clears bp->b_error - very little code even checks it - so the
30 * libxfs code is tripping on stale errors left by the userspace code.
32 * We can't clear errors or zero buffer contents in libxfs_getbuf-* like we do
33 * in the kernel, because those functions are used by the libxfs_readbuf_*
34 * functions and hence need to leave the buffers unchanged on cache hits. This
35 * is actually the only way to gather a write error from a libxfs_writebuf()
36 * call - you need to get the buffer again so you can check bp->b_error field -
37 * assuming that the buffer is still in the cache when you check, that is.
39 * This is very different to the kernel code which does not release buffers on a
40 * write so we can wait on IO and check errors. The kernel buffer cache also
41 * guarantees a buffer of a known initial state from xfs_buf_get() even on a
44 * IOWs, userspace is behaving quite differently to the kernel and as a result
45 * it leaks errors from reads, invalidations and writes through
46 * libxfs_getbuf/libxfs_readbuf.
48 * The result of this is that until the userspace code outside libxfs is cleaned
49 * up, functions that release buffers from userspace control (i.e
50 * libxfs_writebuf/libxfs_putbuf) need to zero bp->b_error to prevent
51 * propagation of stale errors into future buffer operations.
54 #define BDSTRAT_SIZE (256 * 1024)
56 #define IO_BCOMPARE_CHECK
58 /* XXX: (dgc) Propagate errors, only exit if fail-on-error flag set */
60 libxfs_device_zero(struct xfs_buftarg
*btp
, xfs_daddr_t start
, uint len
)
62 xfs_off_t start_offset
, end_offset
, offset
;
67 zsize
= min(BDSTRAT_SIZE
, BBTOB(len
));
68 if ((z
= memalign(libxfs_device_alignment(), zsize
)) == NULL
) {
70 _("%s: %s can't memalign %d bytes: %s\n"),
71 progname
, __FUNCTION__
, (int)zsize
, strerror(errno
));
76 fd
= libxfs_device_to_fd(btp
->dev
);
77 start_offset
= LIBXFS_BBTOOFF64(start
);
79 if ((lseek(fd
, start_offset
, SEEK_SET
)) < 0) {
80 fprintf(stderr
, _("%s: %s seek to offset %llu failed: %s\n"),
81 progname
, __FUNCTION__
,
82 (unsigned long long)start_offset
, strerror(errno
));
86 end_offset
= LIBXFS_BBTOOFF64(start
+ len
) - start_offset
;
87 for (offset
= 0; offset
< end_offset
; ) {
88 bytes
= min((ssize_t
)(end_offset
- offset
), zsize
);
89 if ((bytes
= write(fd
, z
, bytes
)) < 0) {
90 fprintf(stderr
, _("%s: %s write failed: %s\n"),
91 progname
, __FUNCTION__
, strerror(errno
));
93 } else if (bytes
== 0) {
94 fprintf(stderr
, _("%s: %s not progressing?\n"),
95 progname
, __FUNCTION__
);
104 static void unmount_record(void *p
)
106 xlog_op_header_t
*op
= (xlog_op_header_t
*)p
;
107 /* the data section must be 32 bit size aligned */
111 uint32_t pad2
; /* may as well make it 64 bits */
112 } magic
= { XLOG_UNMOUNT_TYPE
, 0, 0 };
114 memset(p
, 0, BBSIZE
);
115 /* dummy tid to mark this as written from userspace */
116 op
->oh_tid
= cpu_to_be32(0xb0c0d0d0);
117 op
->oh_len
= cpu_to_be32(sizeof(magic
));
118 op
->oh_clientid
= XFS_LOG
;
119 op
->oh_flags
= XLOG_UNMOUNT_TRANS
;
122 /* and the data for this op */
123 memcpy((char *)p
+ sizeof(xlog_op_header_t
), &magic
, sizeof(magic
));
131 struct xfs_buf
*buf
= (struct xfs_buf
*)private;
134 (buf
->b_bcount
< (int)(ptr
- (char *)buf
->b_addr
) + offset
))
141 * Format the log. The caller provides either a buftarg which is used to access
142 * the log via buffers or a direct pointer to a buffer that encapsulates the
147 struct xfs_buftarg
*btp
,
150 uint length
, /* basic blocks */
153 int sunit
, /* bytes */
158 struct xfs_buf
*bp
= NULL
;
166 if (((btp
&& dptr
) || (!btp
&& !dptr
)) ||
167 (btp
&& !btp
->dev
) || !fs_uuid
)
170 /* first zero the log */
172 libxfs_device_zero(btp
, start
, length
);
174 memset(dptr
, 0, BBTOB(length
));
177 * Initialize the log record length and LSNs. XLOG_INIT_CYCLE is a
178 * special reset case where we only write a single record where the lsn
179 * and tail_lsn match. Otherwise, the record lsn starts at block 0 of
180 * the specified cycle and points tail_lsn at the last record of the
183 len
= ((version
== 2) && sunit
) ? BTOBB(sunit
) : 2;
185 lsn
= xlog_assign_lsn(cycle
, 0);
186 if (cycle
== XLOG_INIT_CYCLE
)
189 tail_lsn
= xlog_assign_lsn(cycle
- 1, length
- len
);
191 /* write out the first log record */
194 bp
= libxfs_getbufr(btp
, start
, len
);
197 libxfs_log_header(ptr
, fs_uuid
, version
, sunit
, fmt
, lsn
, tail_lsn
,
200 bp
->b_flags
|= LIBXFS_B_DIRTY
;
205 * There's nothing else to do if this is a log reset. The kernel detects
206 * the rest of the log is zeroed and starts at cycle 1.
208 if (cycle
== XLOG_INIT_CYCLE
)
212 * Bump the record size for a full log format if the caller allows it.
213 * This is primarily for performance reasons and most callers don't care
214 * about record size since the log is clean after we're done.
217 len
= BTOBB(BDSTRAT_SIZE
);
220 * Otherwise, fill everything beyond the initial record with records of
221 * the previous cycle so the kernel head/tail detection works correctly.
223 * We don't particularly care about the record size or content here.
224 * It's only important that the headers are in place such that the
225 * kernel finds 1.) a clean log and 2.) the correct current cycle value.
226 * Therefore, bump up the record size to the max to use larger I/Os and
227 * improve performance.
233 end_blk
= start
+ length
;
235 len
= min(end_blk
- blk
, len
);
236 while (blk
< end_blk
) {
237 lsn
= xlog_assign_lsn(cycle
, blk
- start
);
238 tail_lsn
= xlog_assign_lsn(cycle
, blk
- start
- len
);
242 bp
= libxfs_getbufr(btp
, blk
, len
);
246 * Note: pass the full buffer length as the sunit to initialize
249 libxfs_log_header(ptr
, fs_uuid
, version
, BBTOB(len
), fmt
, lsn
,
252 bp
->b_flags
|= LIBXFS_B_DIRTY
;
259 len
= min(end_blk
- blk
, len
);
274 libxfs_get_block_t
*nextfunc
,
277 xlog_rec_header_t
*head
= (xlog_rec_header_t
*)caddr
;
283 if (lsn
== NULLCOMMITLSN
)
284 lsn
= xlog_assign_lsn(XLOG_INIT_CYCLE
, 0);
285 if (tail_lsn
== NULLCOMMITLSN
)
288 len
= ((version
== 2) && sunit
) ? BTOBB(sunit
) : 1;
290 memset(p
, 0, BBSIZE
);
291 head
->h_magicno
= cpu_to_be32(XLOG_HEADER_MAGIC_NUM
);
292 head
->h_cycle
= cpu_to_be32(CYCLE_LSN(lsn
));
293 head
->h_version
= cpu_to_be32(version
);
294 head
->h_crc
= cpu_to_le32(0);
295 head
->h_prev_block
= cpu_to_be32(-1);
296 head
->h_num_logops
= cpu_to_be32(1);
297 head
->h_fmt
= cpu_to_be32(fmt
);
298 head
->h_size
= cpu_to_be32(max(sunit
, XLOG_BIG_RECORD_BSIZE
));
300 head
->h_lsn
= cpu_to_be64(lsn
);
301 head
->h_tail_lsn
= cpu_to_be64(tail_lsn
);
303 memcpy(&head
->h_fs_uuid
, fs_uuid
, sizeof(uuid_t
));
306 * The kernel expects to see either a log record header magic value or
307 * the LSN cycle at the top of every log block. The first word of each
308 * non-header block is copied to the record headers and replaced with
309 * the cycle value (see xlog_[un]pack_data() and xlog_get_cycle() for
312 * Even though we only ever write an unmount record (one block), we
313 * support writing log records up to the max log buffer size of 256k to
314 * improve log format performance. This means a record can require up
315 * to 8 headers (1 rec. header + 7 ext. headers) for the packed cycle
316 * data (each header supports 32k of data).
318 cycle_lsn
= CYCLE_LSN_DISK(head
->h_lsn
);
319 if (version
== 2 && sunit
> XLOG_HEADER_CYCLE_SIZE
) {
320 hdrs
= sunit
/ XLOG_HEADER_CYCLE_SIZE
;
321 if (sunit
% XLOG_HEADER_CYCLE_SIZE
)
326 * A fixed number of extended headers is expected based on h_size. If
327 * required, format those now so the unmount record is located
330 * Since we only write an unmount record, we only need one h_cycle_data
331 * entry for the unmount record block. The subsequent record data
332 * blocks are zeroed, which means we can stamp them directly with the
333 * cycle and zero the rest of the cycle data in the extended headers.
336 for (i
= 1; i
< hdrs
; i
++) {
337 p
= nextfunc(p
, BBSIZE
, private);
338 memset(p
, 0, BBSIZE
);
339 /* xlog_rec_ext_header.xh_cycle */
340 *(__be32
*)p
= cycle_lsn
;
345 * The total length is the max of the stripe unit or 2 basic block
346 * minimum (1 hdr blk + 1 data blk). The record length is the total
347 * minus however many header blocks are required.
349 head
->h_len
= cpu_to_be32(max(BBTOB(2), sunit
) - hdrs
* BBSIZE
);
352 * Write out the unmount record, pack the first word into the record
353 * header and stamp the block with the cycle.
355 p
= nextfunc(p
, BBSIZE
, private);
358 head
->h_cycle_data
[0] = *(__be32
*)p
;
359 *(__be32
*)p
= cycle_lsn
;
362 * Finally, zero all remaining blocks in the record and stamp each with
363 * the cycle. We don't need to pack any of these blocks because the
364 * cycle data in the headers has already been zeroed.
366 len
= max(len
, hdrs
+ 1);
367 for (i
= hdrs
+ 1; i
< len
; i
++) {
368 p
= nextfunc(p
, BBSIZE
, private);
369 memset(p
, 0, BBSIZE
);
370 *(__be32
*)p
= cycle_lsn
;
377 * Simple I/O (buffer cache) interface
#ifdef XFS_BUF_TRACING

#undef libxfs_readbuf
#undef libxfs_readbuf_map
#undef libxfs_writebuf
#undef libxfs_getbuf
#undef libxfs_getbuf_map
#undef libxfs_getbuf_flags
#undef libxfs_putbuf

xfs_buf_t	*libxfs_readbuf(struct xfs_buftarg *, xfs_daddr_t, int, int,
				const struct xfs_buf_ops *);
xfs_buf_t	*libxfs_readbuf_map(struct xfs_buftarg *, struct xfs_buf_map *,
				int, int, const struct xfs_buf_ops *);
int		libxfs_writebuf(xfs_buf_t *, int);
xfs_buf_t	*libxfs_getbuf(struct xfs_buftarg *, xfs_daddr_t, int);
xfs_buf_t	*libxfs_getbuf_map(struct xfs_buftarg *, struct xfs_buf_map *,
				int, int);
xfs_buf_t	*libxfs_getbuf_flags(struct xfs_buftarg *, xfs_daddr_t, int,
				unsigned int);
void		libxfs_putbuf (xfs_buf_t *);

/* record the call site on the buffer for later diagnosis */
#define __add_trace(bp, func, file, line)	\
do {						\
	if (bp) {				\
		(bp)->b_func = (func);		\
		(bp)->b_file = (file);		\
		(bp)->b_line = (line);		\
	}					\
} while (0)

xfs_buf_t *
libxfs_trace_readbuf(const char *func, const char *file, int line,
		struct xfs_buftarg *btp, xfs_daddr_t blkno, int len, int flags,
		const struct xfs_buf_ops *ops)
{
	xfs_buf_t	*bp = libxfs_readbuf(btp, blkno, len, flags, ops);
	__add_trace(bp, func, file, line);
	return bp;
}

xfs_buf_t *
libxfs_trace_readbuf_map(const char *func, const char *file, int line,
		struct xfs_buftarg *btp, struct xfs_buf_map *map, int nmaps,
		int flags, const struct xfs_buf_ops *ops)
{
	xfs_buf_t	*bp = libxfs_readbuf_map(btp, map, nmaps, flags, ops);
	__add_trace(bp, func, file, line);
	return bp;
}

int
libxfs_trace_writebuf(const char *func, const char *file, int line,
		xfs_buf_t *bp, int flags)
{
	__add_trace(bp, func, file, line);
	return libxfs_writebuf(bp, flags);
}

xfs_buf_t *
libxfs_trace_getbuf(const char *func, const char *file, int line,
		struct xfs_buftarg *btp, xfs_daddr_t blkno, int len)
{
	xfs_buf_t	*bp = libxfs_getbuf(btp, blkno, len);
	__add_trace(bp, func, file, line);
	return bp;
}

xfs_buf_t *
libxfs_trace_getbuf_map(const char *func, const char *file, int line,
		struct xfs_buftarg *btp, struct xfs_buf_map *map, int nmaps,
		int flags)
{
	xfs_buf_t	*bp = libxfs_getbuf_map(btp, map, nmaps, flags);
	__add_trace(bp, func, file, line);
	return bp;
}

xfs_buf_t *
libxfs_trace_getbuf_flags(const char *func, const char *file, int line,
		struct xfs_buftarg *btp, xfs_daddr_t blkno, int len,
		unsigned int flags)
{
	xfs_buf_t	*bp = libxfs_getbuf_flags(btp, blkno, len, flags);
	__add_trace(bp, func, file, line);
	return bp;
}

void
libxfs_trace_putbuf(const char *func, const char *file, int line, xfs_buf_t *bp)
{
	__add_trace(bp, func, file, line);
	libxfs_putbuf(bp);
}

#endif	/* XFS_BUF_TRACING */
479 libxfs_getsb(xfs_mount_t
*mp
, int flags
)
481 return libxfs_readbuf(mp
->m_ddev_targp
, XFS_SB_DADDR
,
482 XFS_FSS_TO_BB(mp
, 1), flags
, &xfs_sb_buf_ops
);
485 kmem_zone_t
*xfs_buf_zone
;
487 static struct cache_mru xfs_buf_freelist
=
488 {{&xfs_buf_freelist
.cm_list
, &xfs_buf_freelist
.cm_list
},
489 0, PTHREAD_MUTEX_INITIALIZER
};
492 * The bufkey is used to pass the new buffer information to the cache object
493 * allocation routine. Because discontiguous buffers need to pass different
494 * information, we need fields to pass that information. However, because the
495 * blkno and bblen is needed for the initial cache entry lookup (i.e. for
496 * bcompare) the fact that the map/nmaps is non-null to switch to discontiguous
497 * buffer initialisation instead of a contiguous buffer.
500 struct xfs_buftarg
*buftarg
;
503 struct xfs_buf_map
*map
;
507 /* 2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */
508 #define GOLDEN_RATIO_PRIME 0x9e37fffffffc0001UL
509 #define CACHE_LINE_SIZE 64
511 libxfs_bhash(cache_key_t key
, unsigned int hashsize
, unsigned int hashshift
)
513 uint64_t hashval
= ((struct xfs_bufkey
*)key
)->blkno
;
516 tmp
= hashval
^ (GOLDEN_RATIO_PRIME
+ hashval
) / CACHE_LINE_SIZE
;
517 tmp
= tmp
^ ((tmp
^ GOLDEN_RATIO_PRIME
) >> hashshift
);
518 return tmp
% hashsize
;
522 libxfs_bcompare(struct cache_node
*node
, cache_key_t key
)
524 struct xfs_buf
*bp
= (struct xfs_buf
*)node
;
525 struct xfs_bufkey
*bkey
= (struct xfs_bufkey
*)key
;
527 if (bp
->b_target
->dev
== bkey
->buftarg
->dev
&&
528 bp
->b_bn
== bkey
->blkno
) {
529 if (bp
->b_bcount
== BBTOB(bkey
->bblen
))
531 #ifdef IO_BCOMPARE_CHECK
532 if (!(libxfs_bcache
->c_flags
& CACHE_MISCOMPARE_PURGE
)) {
534 "%lx: Badness in key lookup (length)\n"
535 "bp=(bno 0x%llx, len %u bytes) key=(bno 0x%llx, len %u bytes)\n",
537 (unsigned long long)bp
->b_bn
, (int)bp
->b_bcount
,
538 (unsigned long long)bkey
->blkno
,
548 libxfs_bprint(xfs_buf_t
*bp
)
550 fprintf(stderr
, "Buffer %p blkno=%llu bytes=%u flags=0x%x count=%u\n",
551 bp
, (unsigned long long)bp
->b_bn
, (unsigned)bp
->b_bcount
,
552 bp
->b_flags
, bp
->b_node
.cn_count
);
556 __initbuf(xfs_buf_t
*bp
, struct xfs_buftarg
*btp
, xfs_daddr_t bno
,
561 bp
->b_bcount
= bytes
;
562 bp
->b_length
= BTOBB(bytes
);
566 bp
->b_addr
= memalign(libxfs_device_alignment(), bytes
);
569 _("%s: %s can't memalign %u bytes: %s\n"),
570 progname
, __FUNCTION__
, bytes
,
574 memset(bp
->b_addr
, 0, bytes
);
575 #ifdef XFS_BUF_TRACING
576 list_head_init(&bp
->b_lock_list
);
578 pthread_mutex_init(&bp
->b_lock
, NULL
);
585 bp
->b_maps
= &bp
->__b_map
;
586 bp
->b_maps
[0].bm_bn
= bp
->b_bn
;
587 bp
->b_maps
[0].bm_len
= bp
->b_length
;
592 libxfs_initbuf(xfs_buf_t
*bp
, struct xfs_buftarg
*btp
, xfs_daddr_t bno
,
595 __initbuf(bp
, btp
, bno
, bytes
);
599 libxfs_initbuf_map(xfs_buf_t
*bp
, struct xfs_buftarg
*btp
,
600 struct xfs_buf_map
*map
, int nmaps
)
602 unsigned int bytes
= 0;
605 bytes
= sizeof(struct xfs_buf_map
) * nmaps
;
606 bp
->b_maps
= malloc(bytes
);
609 _("%s: %s can't malloc %u bytes: %s\n"),
610 progname
, __FUNCTION__
, bytes
,
617 for ( i
= 0; i
< nmaps
; i
++) {
618 bp
->b_maps
[i
].bm_bn
= map
[i
].bm_bn
;
619 bp
->b_maps
[i
].bm_len
= map
[i
].bm_len
;
620 bytes
+= BBTOB(map
[i
].bm_len
);
623 __initbuf(bp
, btp
, map
[0].bm_bn
, bytes
);
624 bp
->b_flags
|= LIBXFS_B_DISCONTIG
;
628 __libxfs_getbufr(int blen
)
633 * first look for a buffer that can be used as-is,
634 * if one cannot be found, see if there is a buffer,
635 * and if so, free its buffer and set b_addr to NULL
636 * before calling libxfs_initbuf.
638 pthread_mutex_lock(&xfs_buf_freelist
.cm_mutex
);
639 if (!list_empty(&xfs_buf_freelist
.cm_list
)) {
640 list_for_each_entry(bp
, &xfs_buf_freelist
.cm_list
, b_node
.cn_mru
) {
641 if (bp
->b_bcount
== blen
) {
642 list_del_init(&bp
->b_node
.cn_mru
);
646 if (&bp
->b_node
.cn_mru
== &xfs_buf_freelist
.cm_list
) {
647 bp
= list_entry(xfs_buf_freelist
.cm_list
.next
,
648 xfs_buf_t
, b_node
.cn_mru
);
649 list_del_init(&bp
->b_node
.cn_mru
);
652 if (bp
->b_maps
!= &bp
->__b_map
)
657 bp
= kmem_zone_zalloc(xfs_buf_zone
, 0);
658 pthread_mutex_unlock(&xfs_buf_freelist
.cm_mutex
);
660 if (bp
->b_flags
& LIBXFS_B_DIRTY
)
661 fprintf(stderr
, "found dirty buffer (bulk) on free list!");
667 libxfs_getbufr(struct xfs_buftarg
*btp
, xfs_daddr_t blkno
, int bblen
)
670 int blen
= BBTOB(bblen
);
672 bp
=__libxfs_getbufr(blen
);
674 libxfs_initbuf(bp
, btp
, blkno
, blen
);
676 printf("%lx: %s: allocated %u bytes buffer, key=0x%llx(0x%llx), %p\n",
677 pthread_self(), __FUNCTION__
, blen
,
678 (long long)LIBXFS_BBTOOFF64(blkno
), (long long)blkno
, bp
);
685 libxfs_getbufr_map(struct xfs_buftarg
*btp
, xfs_daddr_t blkno
, int bblen
,
686 struct xfs_buf_map
*map
, int nmaps
)
689 int blen
= BBTOB(bblen
);
691 if (!map
|| !nmaps
) {
693 _("%s: %s invalid map %p or nmaps %d\n"),
694 progname
, __FUNCTION__
, map
, nmaps
);
698 if (blkno
!= map
[0].bm_bn
) {
700 _("%s: %s map blkno 0x%llx doesn't match key 0x%llx\n"),
701 progname
, __FUNCTION__
, (long long)map
[0].bm_bn
,
706 bp
=__libxfs_getbufr(blen
);
708 libxfs_initbuf_map(bp
, btp
, map
, nmaps
);
710 printf("%lx: %s: allocated %u bytes buffer, key=0x%llx(0x%llx), %p\n",
711 pthread_self(), __FUNCTION__
, blen
,
712 (long long)LIBXFS_BBTOOFF64(blkno
), (long long)blkno
, bp
);
#ifdef XFS_BUF_TRACING
/* list and count of currently-locked buffers, for trace diagnostics */
struct list_head	lock_buf_list = {&lock_buf_list, &lock_buf_list};
int			lock_buf_count = 0;
#endif

extern int	use_xfs_buf_lock;
725 static struct xfs_buf
*
726 __cache_lookup(struct xfs_bufkey
*key
, unsigned int flags
)
730 cache_node_get(libxfs_bcache
, key
, (struct cache_node
**)&bp
);
734 if (use_xfs_buf_lock
) {
737 ret
= pthread_mutex_trylock(&bp
->b_lock
);
739 ASSERT(ret
== EAGAIN
);
740 if (flags
& LIBXFS_GETBUF_TRYLOCK
)
743 if (pthread_equal(bp
->b_holder
, pthread_self())) {
745 _("Warning: recursive buffer locking at block %" PRIu64
" detected\n"),
750 pthread_mutex_lock(&bp
->b_lock
);
754 bp
->b_holder
= pthread_self();
757 cache_node_set_priority(libxfs_bcache
, (struct cache_node
*)bp
,
758 cache_node_get_priority((struct cache_node
*)bp
) -
759 CACHE_PREFETCH_PRIORITY
);
760 #ifdef XFS_BUF_TRACING
761 pthread_mutex_lock(&libxfs_bcache
->c_mutex
);
763 list_add(&bp
->b_lock_list
, &lock_buf_list
);
764 pthread_mutex_unlock(&libxfs_bcache
->c_mutex
);
767 printf("%lx %s: hit buffer %p for bno = 0x%llx/0x%llx\n",
768 pthread_self(), __FUNCTION__
,
769 bp
, bp
->b_bn
, (long long)LIBXFS_BBTOOFF64(key
->blkno
));
774 cache_node_put(libxfs_bcache
, (struct cache_node
*)bp
);
779 libxfs_getbuf_flags(struct xfs_buftarg
*btp
, xfs_daddr_t blkno
, int len
,
782 struct xfs_bufkey key
= {NULL
};
788 return __cache_lookup(&key
, flags
);
792 * Clean the buffer flags for libxfs_getbuf*(), which wants to return
793 * an unused buffer with clean state. This prevents CRC errors on a
794 * re-read of a corrupt block that was prefetched and freed. This
795 * can happen with a massively corrupt directory that is discarded,
796 * but whose blocks are then recycled into expanding lost+found.
798 * Note however that if the buffer's dirty (prefetch calls getbuf)
799 * we'll leave the state alone because we don't want to discard blocks
800 * that have been fixed.
806 if (bp
&& !(bp
->b_flags
& LIBXFS_B_DIRTY
))
807 bp
->b_flags
&= ~(LIBXFS_B_UNCHECKED
| LIBXFS_B_STALE
|
812 libxfs_getbuf(struct xfs_buftarg
*btp
, xfs_daddr_t blkno
, int len
)
816 bp
= libxfs_getbuf_flags(btp
, blkno
, len
, 0);
821 static struct xfs_buf
*
822 __libxfs_getbuf_map(struct xfs_buftarg
*btp
, struct xfs_buf_map
*map
,
823 int nmaps
, int flags
)
825 struct xfs_bufkey key
= {NULL
};
829 return libxfs_getbuf_flags(btp
, map
[0].bm_bn
, map
[0].bm_len
,
833 key
.blkno
= map
[0].bm_bn
;
834 for (i
= 0; i
< nmaps
; i
++) {
835 key
.bblen
+= map
[i
].bm_len
;
840 return __cache_lookup(&key
, flags
);
/* Get a (possibly discontiguous) buffer with clean (reset) state. */
struct xfs_buf *
libxfs_getbuf_map(struct xfs_buftarg *btp, struct xfs_buf_map *map,
		int nmaps, int flags)
{
	struct xfs_buf	*bp;

	bp = __libxfs_getbuf_map(btp, map, nmaps, flags);
	reset_buf_state(bp);
	return bp;
}
855 libxfs_putbuf(xfs_buf_t
*bp
)
858 * ensure that any errors on this use of the buffer don't carry
859 * over to the next user.
863 #ifdef XFS_BUF_TRACING
864 pthread_mutex_lock(&libxfs_bcache
->c_mutex
);
866 ASSERT(lock_buf_count
>= 0);
867 list_del_init(&bp
->b_lock_list
);
868 pthread_mutex_unlock(&libxfs_bcache
->c_mutex
);
870 if (use_xfs_buf_lock
) {
875 pthread_mutex_unlock(&bp
->b_lock
);
879 cache_node_put(libxfs_bcache
, (struct cache_node
*)bp
);
883 libxfs_purgebuf(xfs_buf_t
*bp
)
885 struct xfs_bufkey key
= {NULL
};
887 key
.buftarg
= bp
->b_target
;
888 key
.blkno
= bp
->b_bn
;
889 key
.bblen
= bp
->b_length
;
891 cache_node_purge(libxfs_bcache
, &key
, (struct cache_node
*)bp
);
894 static struct cache_node
*
895 libxfs_balloc(cache_key_t key
)
897 struct xfs_bufkey
*bufkey
= (struct xfs_bufkey
*)key
;
900 return (struct cache_node
*)
901 libxfs_getbufr_map(bufkey
->buftarg
,
902 bufkey
->blkno
, bufkey
->bblen
,
903 bufkey
->map
, bufkey
->nmaps
);
904 return (struct cache_node
*)libxfs_getbufr(bufkey
->buftarg
,
905 bufkey
->blkno
, bufkey
->bblen
);
910 __read_buf(int fd
, void *buf
, int len
, off64_t offset
, int flags
)
914 sts
= pread(fd
, buf
, len
, offset
);
917 fprintf(stderr
, _("%s: read failed: %s\n"),
918 progname
, strerror(error
));
919 if (flags
& LIBXFS_EXIT_ON_FAILURE
)
922 } else if (sts
!= len
) {
923 fprintf(stderr
, _("%s: error - read only %d of %d bytes\n"),
925 if (flags
& LIBXFS_EXIT_ON_FAILURE
)
933 libxfs_readbufr(struct xfs_buftarg
*btp
, xfs_daddr_t blkno
, xfs_buf_t
*bp
,
936 int fd
= libxfs_device_to_fd(btp
->dev
);
937 int bytes
= BBTOB(len
);
940 ASSERT(BBTOB(len
) <= bp
->b_bcount
);
942 error
= __read_buf(fd
, bp
->b_addr
, bytes
, LIBXFS_BBTOOFF64(blkno
), flags
);
944 bp
->b_target
->dev
== btp
->dev
&&
946 bp
->b_bcount
== bytes
)
947 bp
->b_flags
|= LIBXFS_B_UPTODATE
;
949 printf("%lx: %s: read %u bytes, error %d, blkno=0x%llx(0x%llx), %p\n",
950 pthread_self(), __FUNCTION__
, bytes
, error
,
951 (long long)LIBXFS_BBTOOFF64(blkno
), (long long)blkno
, bp
);
957 libxfs_readbuf_verify(struct xfs_buf
*bp
, const struct xfs_buf_ops
*ops
)
962 bp
->b_ops
->verify_read(bp
);
963 bp
->b_flags
&= ~LIBXFS_B_UNCHECKED
;
968 libxfs_readbuf(struct xfs_buftarg
*btp
, xfs_daddr_t blkno
, int len
, int flags
,
969 const struct xfs_buf_ops
*ops
)
974 bp
= libxfs_getbuf_flags(btp
, blkno
, len
, 0);
979 * if the buffer was prefetched, it is likely that it was not validated.
980 * Hence if we are supplied an ops function and the buffer is marked as
981 * unchecked, we need to validate it now.
983 * We do this verification even if the buffer is dirty - the
984 * verification is almost certainly going to fail the CRC check in this
985 * case as a dirty buffer has not had the CRC recalculated. However, we
986 * should not be dirtying unchecked buffers and therefore failing it
987 * here because it's dirty and unchecked indicates we've screwed up
991 if ((bp
->b_flags
& (LIBXFS_B_UPTODATE
|LIBXFS_B_DIRTY
))) {
992 if (bp
->b_flags
& LIBXFS_B_UNCHECKED
)
993 libxfs_readbuf_verify(bp
, ops
);
998 * Set the ops on a cache miss (i.e. first physical read) as the
999 * verifier may change the ops to match the type of buffer it contains.
1000 * A cache hit might reset the verifier to the original type if we set
1001 * it again, but it won't get called again and set to match the buffer
1002 * contents. *cough* xfs_da_node_buf_ops *cough*.
1004 error
= libxfs_readbufr(btp
, blkno
, bp
, len
, flags
);
1006 bp
->b_error
= error
;
1008 libxfs_readbuf_verify(bp
, ops
);
1013 libxfs_readbufr_map(struct xfs_buftarg
*btp
, struct xfs_buf
*bp
, int flags
)
1020 fd
= libxfs_device_to_fd(btp
->dev
);
1022 for (i
= 0; i
< bp
->b_nmaps
; i
++) {
1023 off64_t offset
= LIBXFS_BBTOOFF64(bp
->b_maps
[i
].bm_bn
);
1024 int len
= BBTOB(bp
->b_maps
[i
].bm_len
);
1026 error
= __read_buf(fd
, buf
, len
, offset
, flags
);
1028 bp
->b_error
= error
;
1035 bp
->b_flags
|= LIBXFS_B_UPTODATE
;
1037 printf("%lx: %s: read %lu bytes, error %d, blkno=%llu(%llu), %p\n",
1038 pthread_self(), __FUNCTION__
, buf
- (char *)bp
->b_addr
, error
,
1039 (long long)LIBXFS_BBTOOFF64(bp
->b_bn
), (long long)bp
->b_bn
, bp
);
1045 libxfs_readbuf_map(struct xfs_buftarg
*btp
, struct xfs_buf_map
*map
, int nmaps
,
1046 int flags
, const struct xfs_buf_ops
*ops
)
1052 return libxfs_readbuf(btp
, map
[0].bm_bn
, map
[0].bm_len
,
1055 bp
= __libxfs_getbuf_map(btp
, map
, nmaps
, 0);
1060 if ((bp
->b_flags
& (LIBXFS_B_UPTODATE
|LIBXFS_B_DIRTY
))) {
1061 if (bp
->b_flags
& LIBXFS_B_UNCHECKED
)
1062 libxfs_readbuf_verify(bp
, ops
);
1065 error
= libxfs_readbufr_map(btp
, bp
, flags
);
1067 libxfs_readbuf_verify(bp
, ops
);
1070 printf("%lx: %s: read %lu bytes, error %d, blkno=%llu(%llu), %p\n",
1071 pthread_self(), __FUNCTION__
, buf
- (char *)bp
->b_addr
, error
,
1072 (long long)LIBXFS_BBTOOFF64(bp
->b_bn
), (long long)bp
->b_bn
, bp
);
1078 __write_buf(int fd
, void *buf
, int len
, off64_t offset
, int flags
)
1082 sts
= pwrite(fd
, buf
, len
, offset
);
1085 fprintf(stderr
, _("%s: pwrite failed: %s\n"),
1086 progname
, strerror(error
));
1087 if (flags
& LIBXFS_B_EXIT
)
1090 } else if (sts
!= len
) {
1091 fprintf(stderr
, _("%s: error - pwrite only %d of %d bytes\n"),
1092 progname
, sts
, len
);
1093 if (flags
& LIBXFS_B_EXIT
)
1101 libxfs_writebufr(xfs_buf_t
*bp
)
1103 int fd
= libxfs_device_to_fd(bp
->b_target
->dev
);
1106 * we never write buffers that are marked stale. This indicates they
1107 * contain data that has been invalidated, and even if the buffer is
1108 * dirty it must *never* be written. Verifiers are wonderful for finding
1109 * bugs like this. Make sure the error is obvious as to the cause.
1111 if (bp
->b_flags
& LIBXFS_B_STALE
) {
1112 bp
->b_error
= -ESTALE
;
1117 * clear any pre-existing error status on the buffer. This can occur if
1118 * the buffer is corrupt on disk and the repair process doesn't clear
1119 * the error before fixing and writing it back.
1123 bp
->b_ops
->verify_write(bp
);
1126 _("%s: write verifer failed on %s bno 0x%llx/0x%x\n"),
1127 __func__
, bp
->b_ops
->name
,
1128 (long long)bp
->b_bn
, bp
->b_bcount
);
1133 if (!(bp
->b_flags
& LIBXFS_B_DISCONTIG
)) {
1134 bp
->b_error
= __write_buf(fd
, bp
->b_addr
, bp
->b_bcount
,
1135 LIBXFS_BBTOOFF64(bp
->b_bn
), bp
->b_flags
);
1138 void *buf
= bp
->b_addr
;
1140 for (i
= 0; i
< bp
->b_nmaps
; i
++) {
1141 off64_t offset
= LIBXFS_BBTOOFF64(bp
->b_maps
[i
].bm_bn
);
1142 int len
= BBTOB(bp
->b_maps
[i
].bm_len
);
1144 bp
->b_error
= __write_buf(fd
, buf
, len
, offset
,
1153 printf("%lx: %s: wrote %u bytes, blkno=%llu(%llu), %p, error %d\n",
1154 pthread_self(), __FUNCTION__
, bp
->b_bcount
,
1155 (long long)LIBXFS_BBTOOFF64(bp
->b_bn
),
1156 (long long)bp
->b_bn
, bp
, bp
->b_error
);
1159 bp
->b_flags
|= LIBXFS_B_UPTODATE
;
1160 bp
->b_flags
&= ~(LIBXFS_B_DIRTY
| LIBXFS_B_EXIT
|
1161 LIBXFS_B_UNCHECKED
);
1167 libxfs_writebuf_int(xfs_buf_t
*bp
, int flags
)
1170 * Clear any error hanging over from reading the buffer. This prevents
1171 * subsequent reads after this write from seeing stale errors.
1174 bp
->b_flags
&= ~LIBXFS_B_STALE
;
1175 bp
->b_flags
|= (LIBXFS_B_DIRTY
| flags
);
1180 libxfs_writebuf(xfs_buf_t
*bp
, int flags
)
1183 printf("%lx: %s: dirty blkno=%llu(%llu)\n",
1184 pthread_self(), __FUNCTION__
,
1185 (long long)LIBXFS_BBTOOFF64(bp
->b_bn
),
1186 (long long)bp
->b_bn
);
1189 * Clear any error hanging over from reading the buffer. This prevents
1190 * subsequent reads after this write from seeing stale errors.
1193 bp
->b_flags
&= ~LIBXFS_B_STALE
;
1194 bp
->b_flags
|= (LIBXFS_B_DIRTY
| flags
);
1200 libxfs_iomove(xfs_buf_t
*bp
, uint boff
, int len
, void *data
, int flags
)
1203 if (boff
+ len
> bp
->b_bcount
) {
1204 printf("Badness, iomove out of range!\n"
1205 "bp=(bno 0x%llx, bytes %u) range=(boff %u, bytes %u)\n",
1206 (long long)bp
->b_bn
, bp
->b_bcount
, boff
, len
);
1212 memset(bp
->b_addr
+ boff
, 0, len
);
1215 memcpy(data
, bp
->b_addr
+ boff
, len
);
1218 memcpy(bp
->b_addr
+ boff
, data
, len
);
1225 struct cache_node
*node
)
1227 struct xfs_buf
*bp
= (struct xfs_buf
*)node
;
1231 if (bp
->b_flags
& LIBXFS_B_DIRTY
)
1233 "releasing dirty buffer to free list!");
1235 pthread_mutex_lock(&xfs_buf_freelist
.cm_mutex
);
1236 list_add(&bp
->b_node
.cn_mru
, &xfs_buf_freelist
.cm_list
);
1237 pthread_mutex_unlock(&xfs_buf_freelist
.cm_mutex
);
1242 struct cache
*cache
,
1243 struct list_head
*list
)
1248 if (list_empty(list
))
1251 list_for_each_entry(bp
, list
, b_node
.cn_mru
) {
1252 if (bp
->b_flags
& LIBXFS_B_DIRTY
)
1254 "releasing dirty buffer (bulk) to free list!");
1258 pthread_mutex_lock(&xfs_buf_freelist
.cm_mutex
);
1259 list_splice(list
, &xfs_buf_freelist
.cm_list
);
1260 pthread_mutex_unlock(&xfs_buf_freelist
.cm_mutex
);
1266 * Free everything from the xfs_buf_freelist MRU, used at final teardown
1269 libxfs_bcache_free(void)
1271 struct list_head
*cm_list
;
1272 xfs_buf_t
*bp
, *next
;
1274 cm_list
= &xfs_buf_freelist
.cm_list
;
1275 list_for_each_entry_safe(bp
, next
, cm_list
, b_node
.cn_mru
) {
1277 if (bp
->b_maps
!= &bp
->__b_map
)
1279 kmem_zone_free(xfs_buf_zone
, bp
);
1284 * When a buffer is marked dirty, the error is cleared. Hence if we are trying
1285 * to flush a buffer prior to cache reclaim that has an error on it it means
1286 * we've already tried to flush it and it failed. Prevent repeated corruption
1287 * errors from being reported by skipping such buffers - when the corruption is
1288 * fixed the buffer will be marked dirty again and we can write it again.
1292 struct cache_node
*node
)
1294 struct xfs_buf
*bp
= (struct xfs_buf
*)node
;
1296 if (!bp
->b_error
&& bp
->b_flags
& LIBXFS_B_DIRTY
)
1297 return libxfs_writebufr(bp
);
1302 libxfs_putbufr(xfs_buf_t
*bp
)
1304 if (bp
->b_flags
& LIBXFS_B_DIRTY
)
1305 libxfs_writebufr(bp
);
1306 libxfs_brelse((struct cache_node
*)bp
);
1311 libxfs_bcache_purge(void)
1313 cache_purge(libxfs_bcache
);
1317 libxfs_bcache_flush(void)
1319 cache_flush(libxfs_bcache
);
1323 libxfs_bcache_overflowed(void)
1325 return cache_overflowed(libxfs_bcache
);
1328 struct cache_operations libxfs_bcache_operations
= {
1329 .hash
= libxfs_bhash
,
1330 .alloc
= libxfs_balloc
,
1331 .flush
= libxfs_bflush
,
1332 .relse
= libxfs_brelse
,
1333 .compare
= libxfs_bcompare
,
1334 .bulkrelse
= libxfs_bulkrelse
1339 * Inode cache stubs.
1342 kmem_zone_t
*xfs_inode_zone
;
1343 extern kmem_zone_t
*xfs_ili_zone
;
1346 * If there are inline format data / attr forks attached to this inode,
1347 * make sure they're not corrupt.
1350 libxfs_inode_verify_forks(
1351 struct xfs_inode
*ip
,
1352 struct xfs_ifork_ops
*ops
)
1354 struct xfs_ifork
*ifp
;
1360 fa
= xfs_ifork_verify_data(ip
, ops
);
1362 ifp
= XFS_IFORK_PTR(ip
, XFS_DATA_FORK
);
1363 xfs_inode_verifier_error(ip
, -EFSCORRUPTED
, "data fork",
1364 ifp
->if_u1
.if_data
, ifp
->if_bytes
, fa
);
1368 fa
= xfs_ifork_verify_attr(ip
, ops
);
1370 ifp
= XFS_IFORK_PTR(ip
, XFS_ATTR_FORK
);
1371 xfs_inode_verifier_error(ip
, -EFSCORRUPTED
, "attr fork",
1372 ifp
? ifp
->if_u1
.if_data
: NULL
,
1373 ifp
? ifp
->if_bytes
: 0, fa
);
1381 struct xfs_mount
*mp
,
1382 struct xfs_trans
*tp
,
1385 struct xfs_inode
**ipp
,
1386 struct xfs_ifork_ops
*ifork_ops
)
1388 struct xfs_inode
*ip
;
1391 ip
= kmem_zone_zalloc(xfs_inode_zone
, 0);
1397 error
= xfs_iread(mp
, tp
, ip
, 0);
1399 kmem_zone_free(xfs_inode_zone
, ip
);
1404 if (!libxfs_inode_verify_forks(ip
, ifork_ops
)) {
1406 return -EFSCORRUPTED
;
1410 * set up the inode ops structure that the libxfs code relies on
1413 ip
->d_ops
= mp
->m_dir_inode_ops
;
1415 ip
->d_ops
= mp
->m_nondir_inode_ops
;
1422 libxfs_idestroy(xfs_inode_t
*ip
)
1424 switch (VFS_I(ip
)->i_mode
& S_IFMT
) {
1428 libxfs_idestroy_fork(ip
, XFS_DATA_FORK
);
1432 libxfs_idestroy_fork(ip
, XFS_ATTR_FORK
);
1434 xfs_idestroy_fork(ip
, XFS_COW_FORK
);
1439 struct xfs_inode
*ip
)
1442 kmem_zone_free(xfs_ili_zone
, ip
->i_itemp
);
1444 libxfs_idestroy(ip
);
1445 kmem_zone_free(xfs_inode_zone
, ip
);