// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
 */

#include "libxfs_priv.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode_buf.h"
#include "xfs_inode_fork.h"
#include "xfs_inode.h"
#include "xfs_trans.h"

#include "libxfs.h"		/* for LIBXFS_EXIT_ON_FAILURE */
/*
 * Important design/architecture note:
 *
 * The userspace code that uses the buffer cache is much less constrained than
 * the kernel code. The userspace code is pretty nasty in places, especially
 * when it comes to buffer error handling. Very little of the userspace code
 * outside libxfs clears bp->b_error - very little code even checks it - so the
 * libxfs code is tripping on stale errors left by the userspace code.
 *
 * We can't clear errors or zero buffer contents in libxfs_getbuf_* like we do
 * in the kernel, because those functions are used by the libxfs_readbuf_*
 * functions and hence need to leave the buffers unchanged on cache hits. This
 * is actually the only way to gather a write error from a libxfs_writebuf()
 * call - you need to get the buffer again so you can check the bp->b_error
 * field - assuming that the buffer is still in the cache when you check, that
 * is.
 *
 * This is very different to the kernel code, which does not release buffers
 * on a write so we can wait on IO and check errors. The kernel buffer cache
 * also guarantees a buffer of a known initial state from xfs_buf_get() even
 * on a cache hit.
 *
 * IOWs, userspace is behaving quite differently to the kernel and as a result
 * it leaks errors from reads, invalidations and writes through
 * libxfs_getbuf/libxfs_readbuf.
 *
 * The result of this is that until the userspace code outside libxfs is
 * cleaned up, functions that release buffers from userspace control (i.e.
 * libxfs_writebuf/libxfs_putbuf) need to zero bp->b_error to prevent
 * propagation of stale errors into future buffer operations.
 */
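
/*
 * For illustration, the flow the note above implies for a caller that wants
 * to observe a write failure. This is a hypothetical usage sketch using the
 * real API names from this file, not code that exists here:
 *
 *	bp = libxfs_getbuf(btp, blkno, len);
 *	... modify bp->b_addr ...
 *	libxfs_writebuf(bp, 0);			// marks dirty, releases bp
 *	bp = libxfs_getbuf(btp, blkno, len);	// re-lookup to see the result
 *	if (bp->b_error)
 *		... the earlier write failed ...
 *	libxfs_putbuf(bp);
 *
 * The check only works if the buffer is still cached at the time of the
 * second lookup, which is exactly the fragility described above.
 */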
#define BDSTRAT_SIZE	(256 * 1024)

#define IO_BCOMPARE_CHECK

/* XXX: (dgc) Propagate errors, only exit if fail-on-error flag set */
int
libxfs_device_zero(struct xfs_buftarg *btp, xfs_daddr_t start, uint len)
{
	xfs_off_t	start_offset, end_offset, offset;
	ssize_t		zsize, bytes;
	char		*z;
	int		fd;

	zsize = min(BDSTRAT_SIZE, BBTOB(len));
	if ((z = memalign(libxfs_device_alignment(), zsize)) == NULL) {
		fprintf(stderr,
			_("%s: %s can't memalign %d bytes: %s\n"),
			progname, __FUNCTION__, (int)zsize, strerror(errno));
		exit(1);
	}
	memset(z, 0, zsize);

	fd = libxfs_device_to_fd(btp->dev);
	start_offset = LIBXFS_BBTOOFF64(start);

	if ((lseek(fd, start_offset, SEEK_SET)) < 0) {
		fprintf(stderr, _("%s: %s seek to offset %llu failed: %s\n"),
			progname, __FUNCTION__,
			(unsigned long long)start_offset, strerror(errno));
		exit(1);
	}

	end_offset = LIBXFS_BBTOOFF64(start + len) - start_offset;
	for (offset = 0; offset < end_offset; ) {
		bytes = min((ssize_t)(end_offset - offset), zsize);
		if ((bytes = write(fd, z, bytes)) < 0) {
			fprintf(stderr, _("%s: %s write failed: %s\n"),
				progname, __FUNCTION__, strerror(errno));
			exit(1);
		} else if (bytes == 0) {
			fprintf(stderr, _("%s: %s not progressing?\n"),
				progname, __FUNCTION__);
			exit(1);
		}
		offset += bytes;
	}
	free(z);
	return 0;
}
static void unmount_record(void *p)
{
	xlog_op_header_t	*op = (xlog_op_header_t *)p;
	/* the data section must be 32 bit size aligned */
	struct {
		uint16_t magic;
		uint16_t pad1;
		uint32_t pad2;		/* may as well make it 64 bits */
	} magic = { XLOG_UNMOUNT_TYPE, 0, 0 };

	memset(p, 0, BBSIZE);
	/* dummy tid to mark this as written from userspace */
	op->oh_tid = cpu_to_be32(0xb0c0d0d0);
	op->oh_len = cpu_to_be32(sizeof(magic));
	op->oh_clientid = XFS_LOG;
	op->oh_flags = XLOG_UNMOUNT_TRANS;
	op->oh_res2 = 0;

	/* and the data for this op */
	memcpy((char *)p + sizeof(xlog_op_header_t), &magic, sizeof(magic));
}
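
/*
 * Resulting block layout - a sketch derived from the code above, not an
 * on-disk specification: a single BBSIZE (512 byte) block containing
 *
 *	[xlog_op_header_t][u16 XLOG_UNMOUNT_TYPE][u16 pad][u32 pad][zeros...]
 *
 * The 8-byte magic payload keeps the op data 32-bit size aligned, and the
 * leading memset() guarantees the trailing bytes are zero.
 */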
static char *next(
	char		*ptr,
	int		offset,
	void		*private)
{
	struct xfs_buf	*buf = (struct xfs_buf *)private;

	if (buf &&
	    (buf->b_bcount < (int)(ptr - (char *)buf->b_addr) + offset))
		abort();

	return ptr + offset;
}
/*
 * Format the log. The caller provides either a buftarg which is used to
 * access the log via buffers or a direct pointer to a buffer that
 * encapsulates the entire log.
 */
int
libxfs_log_clear(
	struct xfs_buftarg	*btp,
	char			*dptr,
	xfs_daddr_t		start,
	uint			length,		/* basic blocks */
	uuid_t			*fs_uuid,
	int			version,
	int			sunit,		/* bytes */
	int			fmt,
	int			cycle,
	bool			max)
{
	struct xfs_buf		*bp = NULL;
	int			len;
	xfs_lsn_t		lsn;
	xfs_lsn_t		tail_lsn;
	xfs_daddr_t		blk;
	xfs_daddr_t		end_blk;
	char			*ptr;

	if (((btp && dptr) || (!btp && !dptr)) ||
	    (btp && !btp->dev) || !fs_uuid)
		return -EINVAL;

	/* first zero the log */
	if (btp)
		libxfs_device_zero(btp, start, length);
	else
		memset(dptr, 0, BBTOB(length));

	/*
	 * Initialize the log record length and LSNs. XLOG_INIT_CYCLE is a
	 * special reset case where we only write a single record where the
	 * lsn and tail_lsn match. Otherwise, the record lsn starts at block 0
	 * of the specified cycle and points tail_lsn at the last record of
	 * the previous cycle.
	 */
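	/*
	 * Worked example, assuming the usual XFS encoding where
	 * xlog_assign_lsn(cycle, block) packs the cycle into the high 32 bits
	 * and the block offset into the low 32 bits: clearing to cycle 3 with
	 * a 2-block record on a 1024-block log gives
	 * lsn = (3 << 32) | 0 and tail_lsn = (2 << 32) | 1022, i.e. the head
	 * sits at block 0 of cycle 3 and the tail points at the last record
	 * of cycle 2.
	 */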
	len = ((version == 2) && sunit) ? BTOBB(sunit) : 2;
	len = max(len, 2);
	lsn = xlog_assign_lsn(cycle, 0);
	if (cycle == XLOG_INIT_CYCLE)
		tail_lsn = lsn;
	else
		tail_lsn = xlog_assign_lsn(cycle - 1, length - len);

	/* write out the first log record */
	ptr = dptr;
	if (btp) {
		bp = libxfs_getbufr(btp, start, len);
		ptr = bp->b_addr;
	}
	libxfs_log_header(ptr, fs_uuid, version, sunit, fmt, lsn, tail_lsn,
			  next, bp);
	if (bp) {
		bp->b_flags |= LIBXFS_B_DIRTY;
		libxfs_putbufr(bp);
	}

	/*
	 * There's nothing else to do if this is a log reset. The kernel
	 * detects the rest of the log is zeroed and starts at cycle 1.
	 */
	if (cycle == XLOG_INIT_CYCLE)
		return 0;

	/*
	 * Bump the record size for a full log format if the caller allows it.
	 * This is primarily for performance reasons and most callers don't
	 * care about record size since the log is clean after we're done.
	 */
	if (max)
		len = BTOBB(BDSTRAT_SIZE);

	/*
	 * Otherwise, fill everything beyond the initial record with records
	 * of the previous cycle so the kernel head/tail detection works
	 * correctly.
	 *
	 * We don't particularly care about the record size or content here.
	 * It's only important that the headers are in place such that the
	 * kernel finds 1.) a clean log and 2.) the correct current cycle
	 * value. Therefore, bump up the record size to the max to use larger
	 * I/Os and improve performance.
	 */
	cycle--;
	blk = start + len;
	if (dptr)
		dptr += BBTOB(len);
	end_blk = start + length;

	len = min(end_blk - blk, len);
	while (blk < end_blk) {
		lsn = xlog_assign_lsn(cycle, blk - start);
		tail_lsn = xlog_assign_lsn(cycle, blk - start - len);

		ptr = dptr;
		if (btp) {
			bp = libxfs_getbufr(btp, blk, len);
			ptr = bp->b_addr;
		}
		/*
		 * Note: pass the full buffer length as the sunit to
		 * initialize the entire buffer.
		 */
		libxfs_log_header(ptr, fs_uuid, version, BBTOB(len), fmt, lsn,
				  tail_lsn, next, bp);
		if (bp) {
			bp->b_flags |= LIBXFS_B_DIRTY;
			libxfs_putbufr(bp);
		}

		blk += len;
		if (dptr)
			dptr += BBTOB(len);
		len = min(end_blk - blk, len);
	}

	return 0;
}
int
libxfs_log_header(
	char			*caddr,
	uuid_t			*fs_uuid,
	int			version,
	int			sunit,
	int			fmt,
	xfs_lsn_t		lsn,
	xfs_lsn_t		tail_lsn,
	libxfs_get_block_t	*nextfunc,
	void			*private)
{
	xlog_rec_header_t	*head = (xlog_rec_header_t *)caddr;
	char			*p = caddr;
	__be32			cycle_lsn;
	int			i, len;
	int			hdrs = 1;

	if (lsn == NULLCOMMITLSN)
		lsn = xlog_assign_lsn(XLOG_INIT_CYCLE, 0);
	if (tail_lsn == NULLCOMMITLSN)
		tail_lsn = lsn;

	len = ((version == 2) && sunit) ? BTOBB(sunit) : 1;

	memset(p, 0, BBSIZE);
	head->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM);
	head->h_cycle = cpu_to_be32(CYCLE_LSN(lsn));
	head->h_version = cpu_to_be32(version);
	head->h_crc = cpu_to_le32(0);
	head->h_prev_block = cpu_to_be32(-1);
	head->h_num_logops = cpu_to_be32(1);
	head->h_fmt = cpu_to_be32(fmt);
	head->h_size = cpu_to_be32(max(sunit, XLOG_BIG_RECORD_BSIZE));

	head->h_lsn = cpu_to_be64(lsn);
	head->h_tail_lsn = cpu_to_be64(tail_lsn);

	memcpy(&head->h_fs_uuid, fs_uuid, sizeof(uuid_t));

	/*
	 * The kernel expects to see either a log record header magic value or
	 * the LSN cycle at the top of every log block. The first word of each
	 * non-header block is copied to the record headers and replaced with
	 * the cycle value (see xlog_[un]pack_data() and xlog_get_cycle() for
	 * details).
	 *
	 * Even though we only ever write an unmount record (one block), we
	 * support writing log records up to the max log buffer size of 256k
	 * to improve log format performance. This means a record can require
	 * up to 8 headers (1 rec. header + 7 ext. headers) for the packed
	 * cycle data (each header supports 32k of data).
	 */
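	/*
	 * For example, using the numbers from the comment above: a 256k
	 * record with 32k of cycle data per header needs 256k / 32k = 8
	 * headers, i.e. 1 record header plus 7 extended headers, while a 64k
	 * stripe unit needs only 64k / 32k = 2.
	 */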
	cycle_lsn = CYCLE_LSN_DISK(head->h_lsn);
	if (version == 2 && sunit > XLOG_HEADER_CYCLE_SIZE) {
		hdrs = sunit / XLOG_HEADER_CYCLE_SIZE;
		if (sunit % XLOG_HEADER_CYCLE_SIZE)
			hdrs++;
	}

	/*
	 * A fixed number of extended headers is expected based on h_size. If
	 * required, format those now so the unmount record is located
	 * correctly.
	 *
	 * Since we only write an unmount record, we only need one h_cycle_data
	 * entry for the unmount record block. The subsequent record data
	 * blocks are zeroed, which means we can stamp them directly with the
	 * cycle and zero the rest of the cycle data in the extended headers.
	 */
	for (i = 1; i < hdrs; i++) {
		p = nextfunc(p, BBSIZE, private);
		memset(p, 0, BBSIZE);
		/* xlog_rec_ext_header.xh_cycle */
		*(__be32 *)p = cycle_lsn;
	}

	/*
	 * The total length is the max of the stripe unit or 2 basic block
	 * minimum (1 hdr blk + 1 data blk). The record length is the total
	 * minus however many header blocks are required.
	 */
	head->h_len = cpu_to_be32(max(BBTOB(2), sunit) - hdrs * BBSIZE);

	/*
	 * Write out the unmount record, pack the first word into the record
	 * header and stamp the block with the cycle.
	 */
	p = nextfunc(p, BBSIZE, private);
	unmount_record(p);

	head->h_cycle_data[0] = *(__be32 *)p;
	*(__be32 *)p = cycle_lsn;

	/*
	 * Finally, zero all remaining blocks in the record and stamp each
	 * with the cycle. We don't need to pack any of these blocks because
	 * the cycle data in the headers has already been zeroed.
	 */
	len = max(len, hdrs + 1);
	for (i = hdrs + 1; i < len; i++) {
		p = nextfunc(p, BBSIZE, private);
		memset(p, 0, BBSIZE);
		*(__be32 *)p = cycle_lsn;
	}

	return BBTOB(len);
}
/*
 * Simple I/O (buffer cache) interface
 */

#ifdef XFS_BUF_TRACING

#undef libxfs_readbuf
#undef libxfs_readbuf_map
#undef libxfs_writebuf
#undef libxfs_getbuf
#undef libxfs_getbuf_map
#undef libxfs_getbuf_flags
#undef libxfs_putbuf

xfs_buf_t	*libxfs_readbuf(struct xfs_buftarg *, xfs_daddr_t, int, int,
				const struct xfs_buf_ops *);
xfs_buf_t	*libxfs_readbuf_map(struct xfs_buftarg *, struct xfs_buf_map *,
				int, int, const struct xfs_buf_ops *);
int		libxfs_writebuf(xfs_buf_t *, int);
xfs_buf_t	*libxfs_getbuf(struct xfs_buftarg *, xfs_daddr_t, int);
xfs_buf_t	*libxfs_getbuf_map(struct xfs_buftarg *, struct xfs_buf_map *,
				int, int);
xfs_buf_t	*libxfs_getbuf_flags(struct xfs_buftarg *, xfs_daddr_t, int,
				unsigned int);
void		libxfs_putbuf(xfs_buf_t *);
#define __add_trace(bp, func, file, line)	\
do {						\
	if (bp) {				\
		(bp)->b_func = (func);		\
		(bp)->b_file = (file);		\
		(bp)->b_line = (line);		\
	}					\
} while (0)
xfs_buf_t *
libxfs_trace_readbuf(const char *func, const char *file, int line,
		struct xfs_buftarg *btp, xfs_daddr_t blkno, int len, int flags,
		const struct xfs_buf_ops *ops)
{
	xfs_buf_t	*bp = libxfs_readbuf(btp, blkno, len, flags, ops);
	__add_trace(bp, func, file, line);
	return bp;
}

xfs_buf_t *
libxfs_trace_readbuf_map(const char *func, const char *file, int line,
		struct xfs_buftarg *btp, struct xfs_buf_map *map, int nmaps,
		int flags, const struct xfs_buf_ops *ops)
{
	xfs_buf_t	*bp = libxfs_readbuf_map(btp, map, nmaps, flags, ops);
	__add_trace(bp, func, file, line);
	return bp;
}

int
libxfs_trace_writebuf(const char *func, const char *file, int line,
		xfs_buf_t *bp, int flags)
{
	__add_trace(bp, func, file, line);
	return libxfs_writebuf(bp, flags);
}

xfs_buf_t *
libxfs_trace_getbuf(const char *func, const char *file, int line,
		struct xfs_buftarg *btp, xfs_daddr_t blkno, int len)
{
	xfs_buf_t	*bp = libxfs_getbuf(btp, blkno, len);
	__add_trace(bp, func, file, line);
	return bp;
}

xfs_buf_t *
libxfs_trace_getbuf_map(const char *func, const char *file, int line,
		struct xfs_buftarg *btp, struct xfs_buf_map *map, int nmaps,
		int flags)
{
	xfs_buf_t	*bp = libxfs_getbuf_map(btp, map, nmaps, flags);
	__add_trace(bp, func, file, line);
	return bp;
}

xfs_buf_t *
libxfs_trace_getbuf_flags(const char *func, const char *file, int line,
		struct xfs_buftarg *btp, xfs_daddr_t blkno, int len,
		unsigned int flags)
{
	xfs_buf_t	*bp = libxfs_getbuf_flags(btp, blkno, len, flags);
	__add_trace(bp, func, file, line);
	return bp;
}

void
libxfs_trace_putbuf(const char *func, const char *file, int line,
		xfs_buf_t *bp)
{
	__add_trace(bp, func, file, line);
	libxfs_putbuf(bp);
}

#endif	/* XFS_BUF_TRACING */
xfs_buf_t *
libxfs_getsb(xfs_mount_t *mp, int flags)
{
	return libxfs_readbuf(mp->m_ddev_targp, XFS_SB_DADDR,
			XFS_FSS_TO_BB(mp, 1), flags, &xfs_sb_buf_ops);
}

kmem_zone_t			*xfs_buf_zone;

static struct cache_mru		xfs_buf_freelist =
	{{&xfs_buf_freelist.cm_list, &xfs_buf_freelist.cm_list},
	 0, PTHREAD_MUTEX_INITIALIZER };
/*
 * The bufkey is used to pass the new buffer information to the cache object
 * allocation routine. Because discontiguous buffers need to pass different
 * information, we need fields to pass that information. The blkno and bblen
 * are needed for the initial cache entry lookup (i.e. for bcompare), so we
 * use a non-null map/nmaps to switch to discontiguous buffer initialisation
 * instead of a contiguous buffer.
 */
struct xfs_bufkey {
	struct xfs_buftarg	*buftarg;
	xfs_daddr_t		blkno;
	unsigned int		bblen;
	struct xfs_buf_map	*map;
	int			nmaps;
};
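
/*
 * Usage sketch, shown here for illustration only - it mirrors
 * __libxfs_getbuf_map() further down. A contiguous lookup fills in
 * buftarg/blkno/bblen and leaves map/nmaps zeroed; a discontiguous lookup
 * also sets map/nmaps so libxfs_balloc() routes allocation to
 * libxfs_getbufr_map():
 *
 *	struct xfs_bufkey key = {NULL};
 *
 *	key.buftarg = btp;
 *	key.blkno = map[0].bm_bn;
 *	key.bblen = total_bblen;	// sum of map[i].bm_len
 *	key.map = map;			// stays NULL for contiguous buffers
 *	key.nmaps = nmaps;		// stays 0 for contiguous buffers
 */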
/*  2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */
#define GOLDEN_RATIO_PRIME	0x9e37fffffffc0001UL
#define CACHE_LINE_SIZE		64
static unsigned int
libxfs_bhash(cache_key_t key, unsigned int hashsize, unsigned int hashshift)
{
	uint64_t	hashval = ((struct xfs_bufkey *)key)->blkno;
	uint64_t	tmp;

	tmp = hashval ^ (GOLDEN_RATIO_PRIME + hashval) / CACHE_LINE_SIZE;
	tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> hashshift);
	return tmp % hashsize;
}
static int
libxfs_bcompare(struct cache_node *node, cache_key_t key)
{
	struct xfs_buf		*bp = (struct xfs_buf *)node;
	struct xfs_bufkey	*bkey = (struct xfs_bufkey *)key;

	if (bp->b_target->dev == bkey->buftarg->dev &&
	    bp->b_bn == bkey->blkno) {
		if (bp->b_bcount == BBTOB(bkey->bblen))
			return CACHE_HIT;
#ifdef IO_BCOMPARE_CHECK
		if (!(libxfs_bcache->c_flags & CACHE_MISCOMPARE_PURGE)) {
			fprintf(stderr,
	"%lx: Badness in key lookup (length)\n"
	"bp=(bno 0x%llx, len %u bytes) key=(bno 0x%llx, len %u bytes)\n",
				pthread_self(),
				(unsigned long long)bp->b_bn, (int)bp->b_bcount,
				(unsigned long long)bkey->blkno,
				BBTOB(bkey->bblen));
		}
#endif
		return CACHE_PURGE;
	}
	return CACHE_MISS;
}
static void
__initbuf(xfs_buf_t *bp, struct xfs_buftarg *btp, xfs_daddr_t bno,
		unsigned int bytes)
{
	bp->b_flags = 0;
	bp->b_bn = bno;
	bp->b_bcount = bytes;
	bp->b_length = BTOBB(bytes);
	bp->b_target = btp;
	bp->b_error = 0;
	if (!bp->b_addr)
		bp->b_addr = memalign(libxfs_device_alignment(), bytes);
	if (!bp->b_addr) {
		fprintf(stderr,
			_("%s: %s can't memalign %u bytes: %s\n"),
			progname, __FUNCTION__, bytes,
			strerror(errno));
		exit(1);
	}
	memset(bp->b_addr, 0, bytes);
#ifdef XFS_BUF_TRACING
	list_head_init(&bp->b_lock_list);
#endif
	pthread_mutex_init(&bp->b_lock, NULL);
	bp->b_holder = 0;
	bp->b_recur = 0;
	bp->b_ops = NULL;

	if (!bp->b_maps) {
		bp->b_nmaps = 1;
		bp->b_maps = &bp->__b_map;
		bp->b_maps[0].bm_bn = bp->b_bn;
		bp->b_maps[0].bm_len = bp->b_length;
	}
}
static void
libxfs_initbuf(xfs_buf_t *bp, struct xfs_buftarg *btp, xfs_daddr_t bno,
		unsigned int bytes)
{
	__initbuf(bp, btp, bno, bytes);
}
static void
libxfs_initbuf_map(xfs_buf_t *bp, struct xfs_buftarg *btp,
		struct xfs_buf_map *map, int nmaps)
{
	unsigned int	bytes = 0;
	int		i;

	bytes = sizeof(struct xfs_buf_map) * nmaps;
	bp->b_maps = malloc(bytes);
	if (bp->b_maps == NULL) {
		fprintf(stderr,
			_("%s: %s can't malloc %u bytes: %s\n"),
			progname, __FUNCTION__, bytes,
			strerror(errno));
		exit(1);
	}
	bp->b_nmaps = nmaps;

	bytes = 0;
	for ( i = 0; i < nmaps; i++) {
		bp->b_maps[i].bm_bn = map[i].bm_bn;
		bp->b_maps[i].bm_len = map[i].bm_len;
		bytes += BBTOB(map[i].bm_len);
	}

	__initbuf(bp, btp, map[0].bm_bn, bytes);
	bp->b_flags |= LIBXFS_B_DISCONTIG;
}
static xfs_buf_t *
__libxfs_getbufr(int blen)
{
	xfs_buf_t	*bp;

	/*
	 * first look for a buffer that can be used as-is,
	 * if one cannot be found, see if there is a buffer,
	 * and if so, free its buffer and set b_addr to NULL
	 * before calling libxfs_initbuf.
	 */
	pthread_mutex_lock(&xfs_buf_freelist.cm_mutex);
	if (!list_empty(&xfs_buf_freelist.cm_list)) {
		list_for_each_entry(bp, &xfs_buf_freelist.cm_list, b_node.cn_mru) {
			if (bp->b_bcount == blen) {
				list_del_init(&bp->b_node.cn_mru);
				break;
			}
		}
		if (&bp->b_node.cn_mru == &xfs_buf_freelist.cm_list) {
			bp = list_entry(xfs_buf_freelist.cm_list.next,
					xfs_buf_t, b_node.cn_mru);
			list_del_init(&bp->b_node.cn_mru);
			free(bp->b_addr);
			bp->b_addr = NULL;
			if (bp->b_maps != &bp->__b_map)
				free(bp->b_maps);
			bp->b_maps = NULL;
		}
	} else
		bp = kmem_zone_zalloc(xfs_buf_zone, 0);
	pthread_mutex_unlock(&xfs_buf_freelist.cm_mutex);
	bp->b_ops = NULL;
	if (bp->b_flags & LIBXFS_B_DIRTY)
		fprintf(stderr, "found dirty buffer (bulk) on free list!");

	return bp;
}
xfs_buf_t *
libxfs_getbufr(struct xfs_buftarg *btp, xfs_daddr_t blkno, int bblen)
{
	xfs_buf_t	*bp;
	int		blen = BBTOB(bblen);

	bp = __libxfs_getbufr(blen);
	if (bp)
		libxfs_initbuf(bp, btp, blkno, blen);
#ifdef IO_DEBUG
	printf("%lx: %s: allocated %u bytes buffer, key=0x%llx(0x%llx), %p\n",
		pthread_self(), __FUNCTION__, blen,
		(long long)LIBXFS_BBTOOFF64(blkno), (long long)blkno, bp);
#endif

	return bp;
}
static xfs_buf_t *
libxfs_getbufr_map(struct xfs_buftarg *btp, xfs_daddr_t blkno, int bblen,
		struct xfs_buf_map *map, int nmaps)
{
	xfs_buf_t	*bp;
	int		blen = BBTOB(bblen);

	if (!map || !nmaps) {
		fprintf(stderr,
			_("%s: %s invalid map %p or nmaps %d\n"),
			progname, __FUNCTION__, map, nmaps);
		exit(1);
	}

	if (blkno != map[0].bm_bn) {
		fprintf(stderr,
			_("%s: %s map blkno 0x%llx doesn't match key 0x%llx\n"),
			progname, __FUNCTION__, (long long)map[0].bm_bn,
			(long long)blkno);
		exit(1);
	}

	bp = __libxfs_getbufr(blen);
	if (bp)
		libxfs_initbuf_map(bp, btp, map, nmaps);
#ifdef IO_DEBUG
	printf("%lx: %s: allocated %u bytes buffer, key=0x%llx(0x%llx), %p\n",
		pthread_self(), __FUNCTION__, blen,
		(long long)LIBXFS_BBTOOFF64(blkno), (long long)blkno, bp);
#endif

	return bp;
}
#ifdef XFS_BUF_TRACING
struct list_head	lock_buf_list = {&lock_buf_list, &lock_buf_list};
int			lock_buf_count = 0;
#endif

static struct xfs_buf *
__cache_lookup(struct xfs_bufkey *key, unsigned int flags)
{
	struct xfs_buf	*bp;

	cache_node_get(libxfs_bcache, key, (struct cache_node **)&bp);
	if (!bp)
		return NULL;

	if (use_xfs_buf_lock) {
		int		ret;

		ret = pthread_mutex_trylock(&bp->b_lock);
		if (ret) {
			ASSERT(ret == EAGAIN);
			if (flags & LIBXFS_GETBUF_TRYLOCK)
				goto out_put;

			if (pthread_equal(bp->b_holder, pthread_self())) {
				fprintf(stderr,
	_("Warning: recursive buffer locking at block %" PRIu64 " detected\n"),
					key->blkno);
				bp->b_recur++;
				return bp;
			} else {
				pthread_mutex_lock(&bp->b_lock);
			}
		}

		bp->b_holder = pthread_self();
	}

	cache_node_set_priority(libxfs_bcache, (struct cache_node *)bp,
			cache_node_get_priority((struct cache_node *)bp) -
						CACHE_PREFETCH_PRIORITY);
#ifdef XFS_BUF_TRACING
	pthread_mutex_lock(&libxfs_bcache->c_mutex);
	lock_buf_count++;
	list_add(&bp->b_lock_list, &lock_buf_list);
	pthread_mutex_unlock(&libxfs_bcache->c_mutex);
#endif
#ifdef IO_DEBUG
	printf("%lx %s: hit buffer %p for bno = 0x%llx/0x%llx\n",
		pthread_self(), __FUNCTION__,
		bp, bp->b_bn, (long long)LIBXFS_BBTOOFF64(key->blkno));
#endif

	return bp;
out_put:
	cache_node_put(libxfs_bcache, (struct cache_node *)bp);
	return NULL;
}
struct xfs_buf *
libxfs_getbuf_flags(struct xfs_buftarg *btp, xfs_daddr_t blkno, int len,
		unsigned int flags)
{
	struct xfs_bufkey key = {NULL};

	key.buftarg = btp;
	key.blkno = blkno;
	key.bblen = len;

	return __cache_lookup(&key, flags);
}
/*
 * Clean the buffer flags for libxfs_getbuf*(), which wants to return
 * an unused buffer with clean state. This prevents CRC errors on a
 * re-read of a corrupt block that was prefetched and freed. This
 * can happen with a massively corrupt directory that is discarded,
 * but whose blocks are then recycled into expanding lost+found.
 *
 * Note however that if the buffer's dirty (prefetch calls getbuf)
 * we'll leave the state alone because we don't want to discard blocks
 * that have been fixed.
 */
static void
reset_buf_state(
	struct xfs_buf	*bp)
{
	if (bp && !(bp->b_flags & LIBXFS_B_DIRTY))
		bp->b_flags &= ~(LIBXFS_B_UNCHECKED | LIBXFS_B_STALE |
				LIBXFS_B_UPTODATE);
}

struct xfs_buf *
libxfs_getbuf(struct xfs_buftarg *btp, xfs_daddr_t blkno, int len)
{
	struct xfs_buf	*bp;

	bp = libxfs_getbuf_flags(btp, blkno, len, 0);
	reset_buf_state(bp);
	return bp;
}
*
812 __libxfs_getbuf_map(struct xfs_buftarg
*btp
, struct xfs_buf_map
*map
,
813 int nmaps
, int flags
)
815 struct xfs_bufkey key
= {NULL
};
819 return libxfs_getbuf_flags(btp
, map
[0].bm_bn
, map
[0].bm_len
,
823 key
.blkno
= map
[0].bm_bn
;
824 for (i
= 0; i
< nmaps
; i
++) {
825 key
.bblen
+= map
[i
].bm_len
;
830 return __cache_lookup(&key
, flags
);
834 libxfs_getbuf_map(struct xfs_buftarg
*btp
, struct xfs_buf_map
*map
,
835 int nmaps
, int flags
)
839 bp
= __libxfs_getbuf_map(btp
, map
, nmaps
, flags
);
void
libxfs_putbuf(xfs_buf_t *bp)
{
	/*
	 * ensure that any errors on this use of the buffer don't carry
	 * over to the next user.
	 */
	bp->b_error = 0;

#ifdef XFS_BUF_TRACING
	pthread_mutex_lock(&libxfs_bcache->c_mutex);
	lock_buf_count--;
	ASSERT(lock_buf_count >= 0);
	list_del_init(&bp->b_lock_list);
	pthread_mutex_unlock(&libxfs_bcache->c_mutex);
#endif
	if (use_xfs_buf_lock) {
		if (bp->b_recur) {
			bp->b_recur--;
		} else {
			bp->b_holder = 0;
			pthread_mutex_unlock(&bp->b_lock);
		}
	}

	cache_node_put(libxfs_bcache, (struct cache_node *)bp);
}
void
libxfs_purgebuf(xfs_buf_t *bp)
{
	struct xfs_bufkey key = {NULL};

	key.buftarg = bp->b_target;
	key.blkno = bp->b_bn;
	key.bblen = bp->b_length;

	cache_node_purge(libxfs_bcache, &key, (struct cache_node *)bp);
}
static struct cache_node *
libxfs_balloc(cache_key_t key)
{
	struct xfs_bufkey *bufkey = (struct xfs_bufkey *)key;

	if (bufkey->map)
		return (struct cache_node *)
		       libxfs_getbufr_map(bufkey->buftarg,
					bufkey->blkno, bufkey->bblen,
					bufkey->map, bufkey->nmaps);
	return (struct cache_node *)libxfs_getbufr(bufkey->buftarg,
					bufkey->blkno, bufkey->bblen);
}
static int
__read_buf(int fd, void *buf, int len, off64_t offset, int flags)
{
	int	sts;

	sts = pread(fd, buf, len, offset);
	if (sts < 0) {
		int error = errno;
		fprintf(stderr, _("%s: read failed: %s\n"),
			progname, strerror(error));
		if (flags & LIBXFS_EXIT_ON_FAILURE)
			exit(1);
		return -error;
	} else if (sts != len) {
		fprintf(stderr, _("%s: error - read only %d of %d bytes\n"),
			progname, sts, len);
		if (flags & LIBXFS_EXIT_ON_FAILURE)
			exit(1);
		return -EIO;
	}
	return 0;
}
int
libxfs_readbufr(struct xfs_buftarg *btp, xfs_daddr_t blkno, xfs_buf_t *bp,
		int len, int flags)
{
	int	fd = libxfs_device_to_fd(btp->dev);
	int	bytes = BBTOB(len);
	int	error;

	ASSERT(BBTOB(len) <= bp->b_bcount);

	error = __read_buf(fd, bp->b_addr, bytes, LIBXFS_BBTOOFF64(blkno), flags);
	if (!error &&
	    bp->b_target->dev == btp->dev &&
	    bp->b_bn == blkno &&
	    bp->b_bcount == bytes)
		bp->b_flags |= LIBXFS_B_UPTODATE;
#ifdef IO_DEBUG
	printf("%lx: %s: read %u bytes, error %d, blkno=0x%llx(0x%llx), %p\n",
		pthread_self(), __FUNCTION__, bytes, error,
		(long long)LIBXFS_BBTOOFF64(blkno), (long long)blkno, bp);
#endif
	return error;
}
void
libxfs_readbuf_verify(struct xfs_buf *bp, const struct xfs_buf_ops *ops)
{
	if (!ops)
		return;
	bp->b_ops = ops;
	bp->b_ops->verify_read(bp);
	bp->b_flags &= ~LIBXFS_B_UNCHECKED;
}
xfs_buf_t *
libxfs_readbuf(struct xfs_buftarg *btp, xfs_daddr_t blkno, int len, int flags,
		const struct xfs_buf_ops *ops)
{
	xfs_buf_t	*bp;
	int		error;

	bp = libxfs_getbuf_flags(btp, blkno, len, 0);
	if (!bp)
		return NULL;

	/*
	 * if the buffer was prefetched, it is likely that it was not
	 * validated. Hence if we are supplied an ops function and the buffer
	 * is marked as unchecked, we need to validate it now.
	 *
	 * We do this verification even if the buffer is dirty - the
	 * verification is almost certainly going to fail the CRC check in
	 * this case as a dirty buffer has not had the CRC recalculated.
	 * However, we should not be dirtying unchecked buffers and therefore
	 * failing it here because it's dirty and unchecked indicates we've
	 * screwed up somewhere else.
	 */
	bp->b_error = 0;
	if ((bp->b_flags & (LIBXFS_B_UPTODATE|LIBXFS_B_DIRTY))) {
		if (bp->b_flags & LIBXFS_B_UNCHECKED)
			libxfs_readbuf_verify(bp, ops);
		return bp;
	}

	/*
	 * Set the ops on a cache miss (i.e. first physical read) as the
	 * verifier may change the ops to match the type of buffer it
	 * contains. A cache hit might reset the verifier to the original type
	 * if we set it again, but it won't get called again and set to match
	 * the buffer contents. *cough* xfs_da_node_buf_ops *cough*.
	 */
	bp->b_ops = ops;
	error = libxfs_readbufr(btp, blkno, bp, len, flags);
	if (error)
		bp->b_error = error;
	else
		libxfs_readbuf_verify(bp, ops);
	return bp;
}
int
libxfs_readbufr_map(struct xfs_buftarg *btp, struct xfs_buf *bp, int flags)
{
	int	fd;
	int	error = 0;
	char	*buf;
	int	i;

	fd = libxfs_device_to_fd(btp->dev);
	buf = bp->b_addr;
	for (i = 0; i < bp->b_nmaps; i++) {
		off64_t	offset = LIBXFS_BBTOOFF64(bp->b_maps[i].bm_bn);
		int	len = BBTOB(bp->b_maps[i].bm_len);

		error = __read_buf(fd, buf, len, offset, flags);
		if (error) {
			bp->b_error = error;
			break;
		}
		buf += len;
	}

	if (!error)
		bp->b_flags |= LIBXFS_B_UPTODATE;
#ifdef IO_DEBUG
	printf("%lx: %s: read %lu bytes, error %d, blkno=%llu(%llu), %p\n",
		pthread_self(), __FUNCTION__, buf - (char *)bp->b_addr, error,
		(long long)LIBXFS_BBTOOFF64(bp->b_bn), (long long)bp->b_bn, bp);
#endif
	return error;
}
struct xfs_buf *
libxfs_readbuf_map(struct xfs_buftarg *btp, struct xfs_buf_map *map, int nmaps,
		int flags, const struct xfs_buf_ops *ops)
{
	struct xfs_buf	*bp;
	int		error = 0;

	if (nmaps == 1)
		return libxfs_readbuf(btp, map[0].bm_bn, map[0].bm_len,
					flags, ops);

	bp = __libxfs_getbuf_map(btp, map, nmaps, 0);
	if (!bp)
		return NULL;

	bp->b_error = 0;
	if ((bp->b_flags & (LIBXFS_B_UPTODATE|LIBXFS_B_DIRTY))) {
		if (bp->b_flags & LIBXFS_B_UNCHECKED)
			libxfs_readbuf_verify(bp, ops);
		return bp;
	}

	bp->b_ops = ops;
	error = libxfs_readbufr_map(btp, bp, flags);
	if (!error)
		libxfs_readbuf_verify(bp, ops);

#ifdef IO_DEBUGX
	printf("%lx: %s: read %lu bytes, error %d, blkno=%llu(%llu), %p\n",
		pthread_self(), __FUNCTION__, buf - (char *)bp->b_addr, error,
		(long long)LIBXFS_BBTOOFF64(bp->b_bn), (long long)bp->b_bn, bp);
#endif
	return bp;
}
static int
__write_buf(int fd, void *buf, int len, off64_t offset, int flags)
{
	int	sts;

	sts = pwrite(fd, buf, len, offset);
	if (sts < 0) {
		int error = errno;
		fprintf(stderr, _("%s: pwrite failed: %s\n"),
			progname, strerror(error));
		if (flags & LIBXFS_B_EXIT)
			exit(1);
		return -error;
	} else if (sts != len) {
		fprintf(stderr, _("%s: error - pwrite only %d of %d bytes\n"),
			progname, sts, len);
		if (flags & LIBXFS_B_EXIT)
			exit(1);
		return -EIO;
	}
	return 0;
}
int
libxfs_writebufr(xfs_buf_t *bp)
{
	int	fd = libxfs_device_to_fd(bp->b_target->dev);

	/*
	 * we never write buffers that are marked stale. This indicates they
	 * contain data that has been invalidated, and even if the buffer is
	 * dirty it must *never* be written. Verifiers are wonderful for
	 * finding bugs like this. Make sure the error is obvious as to the
	 * cause.
	 */
	if (bp->b_flags & LIBXFS_B_STALE) {
		bp->b_error = -ESTALE;
		return bp->b_error;
	}

	/*
	 * clear any pre-existing error status on the buffer. This can occur
	 * if the buffer is corrupt on disk and the repair process doesn't
	 * clear the error before fixing and writing it back.
	 */
	bp->b_error = 0;
	if (bp->b_ops) {
		bp->b_ops->verify_write(bp);
		if (bp->b_error) {
			fprintf(stderr,
	_("%s: write verifier failed on %s bno 0x%llx/0x%x\n"),
				__func__, bp->b_ops->name,
				(long long)bp->b_bn, bp->b_bcount);
			return bp->b_error;
		}
	}

	if (!(bp->b_flags & LIBXFS_B_DISCONTIG)) {
		bp->b_error = __write_buf(fd, bp->b_addr, bp->b_bcount,
				    LIBXFS_BBTOOFF64(bp->b_bn), bp->b_flags);
	} else {
		int	i;
		void	*buf = bp->b_addr;

		for (i = 0; i < bp->b_nmaps; i++) {
			off64_t	offset = LIBXFS_BBTOOFF64(bp->b_maps[i].bm_bn);
			int	len = BBTOB(bp->b_maps[i].bm_len);

			bp->b_error = __write_buf(fd, buf, len, offset,
						  bp->b_flags);
			if (bp->b_error)
				break;
			buf += len;
		}
	}

#ifdef IO_DEBUG
	printf("%lx: %s: wrote %u bytes, blkno=%llu(%llu), %p, error %d\n",
		pthread_self(), __FUNCTION__, bp->b_bcount,
		(long long)LIBXFS_BBTOOFF64(bp->b_bn),
		(long long)bp->b_bn, bp, bp->b_error);
#endif
	if (!bp->b_error) {
		bp->b_flags |= LIBXFS_B_UPTODATE;
		bp->b_flags &= ~(LIBXFS_B_DIRTY | LIBXFS_B_EXIT |
				 LIBXFS_B_UNCHECKED);
	}
	return bp->b_error;
}
int
libxfs_writebuf_int(xfs_buf_t *bp, int flags)
{
	/*
	 * Clear any error hanging over from reading the buffer. This prevents
	 * subsequent reads after this write from seeing stale errors.
	 */
	bp->b_error = 0;
	bp->b_flags &= ~LIBXFS_B_STALE;
	bp->b_flags |= (LIBXFS_B_DIRTY | flags);
	return 0;
}
int
libxfs_writebuf(xfs_buf_t *bp, int flags)
{
#ifdef IO_DEBUG
	printf("%lx: %s: dirty blkno=%llu(%llu)\n",
		pthread_self(), __FUNCTION__,
		(long long)LIBXFS_BBTOOFF64(bp->b_bn),
		(long long)bp->b_bn);
#endif
	/*
	 * Clear any error hanging over from reading the buffer. This prevents
	 * subsequent reads after this write from seeing stale errors.
	 */
	bp->b_error = 0;
	bp->b_flags &= ~LIBXFS_B_STALE;
	bp->b_flags |= (LIBXFS_B_DIRTY | flags);
	libxfs_putbuf(bp);
	return 0;
}
int
libxfs_iomove(xfs_buf_t *bp, uint boff, int len, void *data, int flags)
{
#ifdef XFS_BUF_TRACING
	if (boff + len > bp->b_bcount) {
		printf("Badness, iomove out of range!\n"
			"bp=(bno 0x%llx, bytes %u) range=(boff %u, bytes %u)\n",
			(long long)bp->b_bn, bp->b_bcount, boff, len);
		abort();
	}
#endif
	switch (flags) {
	case LIBXFS_BZERO:
		memset(bp->b_addr + boff, 0, len);
		break;
	case LIBXFS_BREAD:
		memcpy(data, bp->b_addr + boff, len);
		break;
	case LIBXFS_BWRITE:
		memcpy(bp->b_addr + boff, data, len);
		break;
	}
	return 0;
}
static void
libxfs_brelse(
	struct cache_node	*node)
{
	struct xfs_buf		*bp = (struct xfs_buf *)node;

	if (!bp)
		return;
	if (bp->b_flags & LIBXFS_B_DIRTY)
		fprintf(stderr,
			"releasing dirty buffer to free list!");

	pthread_mutex_lock(&xfs_buf_freelist.cm_mutex);
	list_add(&bp->b_node.cn_mru, &xfs_buf_freelist.cm_list);
	pthread_mutex_unlock(&xfs_buf_freelist.cm_mutex);
}
static unsigned int
libxfs_bulkrelse(
	struct cache		*cache,
	struct list_head	*list)
{
	xfs_buf_t	*bp;
	int		count = 0;

	if (list_empty(list))
		return 0;

	list_for_each_entry(bp, list, b_node.cn_mru) {
		if (bp->b_flags & LIBXFS_B_DIRTY)
			fprintf(stderr,
				"releasing dirty buffer (bulk) to free list!");
		count++;
	}

	pthread_mutex_lock(&xfs_buf_freelist.cm_mutex);
	list_splice(list, &xfs_buf_freelist.cm_list);
	pthread_mutex_unlock(&xfs_buf_freelist.cm_mutex);

	return count;
}
/*
 * Free everything from the xfs_buf_freelist MRU, used at final teardown
 */
void
libxfs_bcache_free(void)
{
	struct list_head	*cm_list;
	xfs_buf_t		*bp, *next;

	cm_list = &xfs_buf_freelist.cm_list;
	list_for_each_entry_safe(bp, next, cm_list, b_node.cn_mru) {
		free(bp->b_addr);
		if (bp->b_maps != &bp->__b_map)
			free(bp->b_maps);
		kmem_zone_free(xfs_buf_zone, bp);
	}
}
/*
 * When a buffer is marked dirty, the error is cleared. Hence if we are trying
 * to flush a buffer prior to cache reclaim that has an error on it, it means
 * we've already tried to flush it and it failed. Prevent repeated corruption
 * errors from being reported by skipping such buffers - when the corruption
 * is fixed the buffer will be marked dirty again and we can write it again.
 */
static int
libxfs_bflush(
	struct cache_node	*node)
{
	struct xfs_buf		*bp = (struct xfs_buf *)node;

	if (!bp->b_error && bp->b_flags & LIBXFS_B_DIRTY)
		return libxfs_writebufr(bp);
	return bp->b_error;
}
void
libxfs_putbufr(xfs_buf_t *bp)
{
	if (bp->b_flags & LIBXFS_B_DIRTY)
		libxfs_writebufr(bp);
	libxfs_brelse((struct cache_node *)bp);
}
void
libxfs_bcache_purge(void)
{
	cache_purge(libxfs_bcache);
}

void
libxfs_bcache_flush(void)
{
	cache_flush(libxfs_bcache);
}

int
libxfs_bcache_overflowed(void)
{
	return cache_overflowed(libxfs_bcache);
}

struct cache_operations libxfs_bcache_operations = {
	.hash		= libxfs_bhash,
	.alloc		= libxfs_balloc,
	.flush		= libxfs_bflush,
	.relse		= libxfs_brelse,
	.compare	= libxfs_bcompare,
	.bulkrelse	= libxfs_bulkrelse
};
/*
 * Inode cache stubs.
 */

kmem_zone_t		*xfs_inode_zone;
extern kmem_zone_t	*xfs_ili_zone;

/*
 * If there are inline format data / attr forks attached to this inode,
 * make sure they're not corrupt.
 */
bool
libxfs_inode_verify_forks(
	struct xfs_inode	*ip,
	struct xfs_ifork_ops	*ops)
{
	struct xfs_ifork	*ifp;
	xfs_failaddr_t		fa;

	if (!ops)
		return true;

	fa = xfs_ifork_verify_data(ip, ops);
	if (fa) {
		ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
		xfs_inode_verifier_error(ip, -EFSCORRUPTED, "data fork",
				ifp->if_u1.if_data, ifp->if_bytes, fa);
		return false;
	}

	fa = xfs_ifork_verify_attr(ip, ops);
	if (fa) {
		ifp = XFS_IFORK_PTR(ip, XFS_ATTR_FORK);
		xfs_inode_verifier_error(ip, -EFSCORRUPTED, "attr fork",
				ifp ? ifp->if_u1.if_data : NULL,
				ifp ? ifp->if_bytes : 0, fa);
		return false;
	}

	return true;
}
int
libxfs_iget(
	struct xfs_mount	*mp,
	struct xfs_trans	*tp,
	xfs_ino_t		ino,
	uint			lock_flags,
	struct xfs_inode	**ipp,
	struct xfs_ifork_ops	*ifork_ops)
{
	struct xfs_inode	*ip;
	int			error = 0;

	ip = kmem_zone_zalloc(xfs_inode_zone, 0);
	if (!ip)
		return -ENOMEM;

	ip->i_ino = ino;
	ip->i_mount = mp;
	error = xfs_iread(mp, tp, ip, 0);
	if (error) {
		kmem_zone_free(xfs_inode_zone, ip);
		*ipp = NULL;
		return error;
	}

	if (!libxfs_inode_verify_forks(ip, ifork_ops)) {
		libxfs_irele(ip);
		return -EFSCORRUPTED;
	}

	/*
	 * set up the inode ops structure that the libxfs code relies on
	 */
	if (S_ISDIR(VFS_I(ip)->i_mode))
		ip->d_ops = mp->m_dir_inode_ops;
	else
		ip->d_ops = mp->m_nondir_inode_ops;

	*ipp = ip;
	return 0;
}
static void
libxfs_idestroy(xfs_inode_t *ip)
{
	switch (VFS_I(ip)->i_mode & S_IFMT) {
		case S_IFREG:
		case S_IFDIR:
		case S_IFLNK:
			libxfs_idestroy_fork(ip, XFS_DATA_FORK);
			break;
	}
	if (ip->i_afp)
		libxfs_idestroy_fork(ip, XFS_ATTR_FORK);
	if (ip->i_cowfp)
		xfs_idestroy_fork(ip, XFS_COW_FORK);
}

void
libxfs_irele(
	struct xfs_inode	*ip)
{
	if (ip->i_itemp)
		kmem_zone_free(xfs_ili_zone, ip->i_itemp);
	ip->i_itemp = NULL;
	libxfs_idestroy(ip);
	kmem_zone_free(xfs_inode_zone, ip);
}