/*
 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */
20 #include "libxfs_priv.h"
23 #include "xfs_shared.h"
24 #include "xfs_format.h"
25 #include "xfs_log_format.h"
26 #include "xfs_trans_resv.h"
27 #include "xfs_mount.h"
28 #include "xfs_inode_buf.h"
29 #include "xfs_inode_fork.h"
30 #include "xfs_inode.h"
31 #include "xfs_trans.h"
33 #include "libxfs.h" /* for LIBXFS_EXIT_ON_FAILURE */
36 * Important design/architecture note:
38 * The userspace code that uses the buffer cache is much less constrained than
39 * the kernel code. The userspace code is pretty nasty in places, especially
40 * when it comes to buffer error handling. Very little of the userspace code
41 * outside libxfs clears bp->b_error - very little code even checks it - so the
42 * libxfs code is tripping on stale errors left by the userspace code.
44 * We can't clear errors or zero buffer contents in libxfs_getbuf-* like we do
45 * in the kernel, because those functions are used by the libxfs_readbuf_*
46 * functions and hence need to leave the buffers unchanged on cache hits. This
47 * is actually the only way to gather a write error from a libxfs_writebuf()
48 * call - you need to get the buffer again so you can check bp->b_error field -
49 * assuming that the buffer is still in the cache when you check, that is.
51 * This is very different to the kernel code which does not release buffers on a
52 * write so we can wait on IO and check errors. The kernel buffer cache also
53 * guarantees a buffer of a known initial state from xfs_buf_get() even on a
56 * IOWs, userspace is behaving quite differently to the kernel and as a result
57 * it leaks errors from reads, invalidations and writes through
58 * libxfs_getbuf/libxfs_readbuf.
60 * The result of this is that until the userspace code outside libxfs is cleaned
61 * up, functions that release buffers from userspace control (i.e
62 * libxfs_writebuf/libxfs_putbuf) need to zero bp->b_error to prevent
63 * propagation of stale errors into future buffer operations.
66 #define BDSTRAT_SIZE (256 * 1024)
68 #define IO_BCOMPARE_CHECK
70 /* XXX: (dgc) Propagate errors, only exit if fail-on-error flag set */
72 libxfs_device_zero(struct xfs_buftarg
*btp
, xfs_daddr_t start
, uint len
)
74 xfs_off_t start_offset
, end_offset
, offset
;
79 zsize
= min(BDSTRAT_SIZE
, BBTOB(len
));
80 if ((z
= memalign(libxfs_device_alignment(), zsize
)) == NULL
) {
82 _("%s: %s can't memalign %d bytes: %s\n"),
83 progname
, __FUNCTION__
, (int)zsize
, strerror(errno
));
88 fd
= libxfs_device_to_fd(btp
->dev
);
89 start_offset
= LIBXFS_BBTOOFF64(start
);
91 if ((lseek(fd
, start_offset
, SEEK_SET
)) < 0) {
92 fprintf(stderr
, _("%s: %s seek to offset %llu failed: %s\n"),
93 progname
, __FUNCTION__
,
94 (unsigned long long)start_offset
, strerror(errno
));
98 end_offset
= LIBXFS_BBTOOFF64(start
+ len
) - start_offset
;
99 for (offset
= 0; offset
< end_offset
; ) {
100 bytes
= min((ssize_t
)(end_offset
- offset
), zsize
);
101 if ((bytes
= write(fd
, z
, bytes
)) < 0) {
102 fprintf(stderr
, _("%s: %s write failed: %s\n"),
103 progname
, __FUNCTION__
, strerror(errno
));
105 } else if (bytes
== 0) {
106 fprintf(stderr
, _("%s: %s not progressing?\n"),
107 progname
, __FUNCTION__
);
116 static void unmount_record(void *p
)
118 xlog_op_header_t
*op
= (xlog_op_header_t
*)p
;
119 /* the data section must be 32 bit size aligned */
123 uint32_t pad2
; /* may as well make it 64 bits */
124 } magic
= { XLOG_UNMOUNT_TYPE
, 0, 0 };
126 memset(p
, 0, BBSIZE
);
127 /* dummy tid to mark this as written from userspace */
128 op
->oh_tid
= cpu_to_be32(0xb0c0d0d0);
129 op
->oh_len
= cpu_to_be32(sizeof(magic
));
130 op
->oh_clientid
= XFS_LOG
;
131 op
->oh_flags
= XLOG_UNMOUNT_TRANS
;
134 /* and the data for this op */
135 memcpy((char *)p
+ sizeof(xlog_op_header_t
), &magic
, sizeof(magic
));
143 struct xfs_buf
*buf
= (struct xfs_buf
*)private;
146 (XFS_BUF_COUNT(buf
) < (int)(ptr
- XFS_BUF_PTR(buf
)) + offset
))
153 * Format the log. The caller provides either a buftarg which is used to access
154 * the log via buffers or a direct pointer to a buffer that encapsulates the
159 struct xfs_buftarg
*btp
,
162 uint length
, /* basic blocks */
165 int sunit
, /* bytes */
170 struct xfs_buf
*bp
= NULL
;
178 if (((btp
&& dptr
) || (!btp
&& !dptr
)) ||
179 (btp
&& !btp
->dev
) || !fs_uuid
)
182 /* first zero the log */
184 libxfs_device_zero(btp
, start
, length
);
186 memset(dptr
, 0, BBTOB(length
));
189 * Initialize the log record length and LSNs. XLOG_INIT_CYCLE is a
190 * special reset case where we only write a single record where the lsn
191 * and tail_lsn match. Otherwise, the record lsn starts at block 0 of
192 * the specified cycle and points tail_lsn at the last record of the
195 len
= ((version
== 2) && sunit
) ? BTOBB(sunit
) : 2;
197 lsn
= xlog_assign_lsn(cycle
, 0);
198 if (cycle
== XLOG_INIT_CYCLE
)
201 tail_lsn
= xlog_assign_lsn(cycle
- 1, length
- len
);
203 /* write out the first log record */
206 bp
= libxfs_getbufr(btp
, start
, len
);
207 ptr
= XFS_BUF_PTR(bp
);
209 libxfs_log_header(ptr
, fs_uuid
, version
, sunit
, fmt
, lsn
, tail_lsn
,
212 bp
->b_flags
|= LIBXFS_B_DIRTY
;
217 * There's nothing else to do if this is a log reset. The kernel detects
218 * the rest of the log is zeroed and starts at cycle 1.
220 if (cycle
== XLOG_INIT_CYCLE
)
224 * Bump the record size for a full log format if the caller allows it.
225 * This is primarily for performance reasons and most callers don't care
226 * about record size since the log is clean after we're done.
229 len
= BTOBB(BDSTRAT_SIZE
);
232 * Otherwise, fill everything beyond the initial record with records of
233 * the previous cycle so the kernel head/tail detection works correctly.
235 * We don't particularly care about the record size or content here.
236 * It's only important that the headers are in place such that the
237 * kernel finds 1.) a clean log and 2.) the correct current cycle value.
238 * Therefore, bump up the record size to the max to use larger I/Os and
239 * improve performance.
245 end_blk
= start
+ length
;
247 len
= min(end_blk
- blk
, len
);
248 while (blk
< end_blk
) {
249 lsn
= xlog_assign_lsn(cycle
, blk
- start
);
250 tail_lsn
= xlog_assign_lsn(cycle
, blk
- start
- len
);
254 bp
= libxfs_getbufr(btp
, blk
, len
);
255 ptr
= XFS_BUF_PTR(bp
);
258 * Note: pass the full buffer length as the sunit to initialize
261 libxfs_log_header(ptr
, fs_uuid
, version
, BBTOB(len
), fmt
, lsn
,
264 bp
->b_flags
|= LIBXFS_B_DIRTY
;
271 len
= min(end_blk
- blk
, len
);
286 libxfs_get_block_t
*nextfunc
,
289 xlog_rec_header_t
*head
= (xlog_rec_header_t
*)caddr
;
295 if (lsn
== NULLCOMMITLSN
)
296 lsn
= xlog_assign_lsn(XLOG_INIT_CYCLE
, 0);
297 if (tail_lsn
== NULLCOMMITLSN
)
300 len
= ((version
== 2) && sunit
) ? BTOBB(sunit
) : 1;
302 memset(p
, 0, BBSIZE
);
303 head
->h_magicno
= cpu_to_be32(XLOG_HEADER_MAGIC_NUM
);
304 head
->h_cycle
= cpu_to_be32(CYCLE_LSN(lsn
));
305 head
->h_version
= cpu_to_be32(version
);
306 head
->h_crc
= cpu_to_le32(0);
307 head
->h_prev_block
= cpu_to_be32(-1);
308 head
->h_num_logops
= cpu_to_be32(1);
309 head
->h_fmt
= cpu_to_be32(fmt
);
310 head
->h_size
= cpu_to_be32(MAX(sunit
, XLOG_BIG_RECORD_BSIZE
));
312 head
->h_lsn
= cpu_to_be64(lsn
);
313 head
->h_tail_lsn
= cpu_to_be64(tail_lsn
);
315 memcpy(&head
->h_fs_uuid
, fs_uuid
, sizeof(uuid_t
));
318 * The kernel expects to see either a log record header magic value or
319 * the LSN cycle at the top of every log block. The first word of each
320 * non-header block is copied to the record headers and replaced with
321 * the cycle value (see xlog_[un]pack_data() and xlog_get_cycle() for
324 * Even though we only ever write an unmount record (one block), we
325 * support writing log records up to the max log buffer size of 256k to
326 * improve log format performance. This means a record can require up
327 * to 8 headers (1 rec. header + 7 ext. headers) for the packed cycle
328 * data (each header supports 32k of data).
330 cycle_lsn
= CYCLE_LSN_DISK(head
->h_lsn
);
331 if (version
== 2 && sunit
> XLOG_HEADER_CYCLE_SIZE
) {
332 hdrs
= sunit
/ XLOG_HEADER_CYCLE_SIZE
;
333 if (sunit
% XLOG_HEADER_CYCLE_SIZE
)
338 * A fixed number of extended headers is expected based on h_size. If
339 * required, format those now so the unmount record is located
342 * Since we only write an unmount record, we only need one h_cycle_data
343 * entry for the unmount record block. The subsequent record data
344 * blocks are zeroed, which means we can stamp them directly with the
345 * cycle and zero the rest of the cycle data in the extended headers.
348 for (i
= 1; i
< hdrs
; i
++) {
349 p
= nextfunc(p
, BBSIZE
, private);
350 memset(p
, 0, BBSIZE
);
351 /* xlog_rec_ext_header.xh_cycle */
352 *(__be32
*)p
= cycle_lsn
;
357 * The total length is the max of the stripe unit or 2 basic block
358 * minimum (1 hdr blk + 1 data blk). The record length is the total
359 * minus however many header blocks are required.
361 head
->h_len
= cpu_to_be32(MAX(BBTOB(2), sunit
) - hdrs
* BBSIZE
);
364 * Write out the unmount record, pack the first word into the record
365 * header and stamp the block with the cycle.
367 p
= nextfunc(p
, BBSIZE
, private);
370 head
->h_cycle_data
[0] = *(__be32
*)p
;
371 *(__be32
*)p
= cycle_lsn
;
374 * Finally, zero all remaining blocks in the record and stamp each with
375 * the cycle. We don't need to pack any of these blocks because the
376 * cycle data in the headers has already been zeroed.
378 len
= MAX(len
, hdrs
+ 1);
379 for (i
= hdrs
+ 1; i
< len
; i
++) {
380 p
= nextfunc(p
, BBSIZE
, private);
381 memset(p
, 0, BBSIZE
);
382 *(__be32
*)p
= cycle_lsn
;
389 * Simple I/O (buffer cache) interface
393 #ifdef XFS_BUF_TRACING
395 #undef libxfs_readbuf
396 #undef libxfs_readbuf_map
397 #undef libxfs_writebuf
399 #undef libxfs_getbuf_map
400 #undef libxfs_getbuf_flags
403 xfs_buf_t
*libxfs_readbuf(struct xfs_buftarg
*, xfs_daddr_t
, int, int,
404 const struct xfs_buf_ops
*);
405 xfs_buf_t
*libxfs_readbuf_map(struct xfs_buftarg
*, struct xfs_buf_map
*,
406 int, int, const struct xfs_buf_ops
*);
407 int libxfs_writebuf(xfs_buf_t
*, int);
408 xfs_buf_t
*libxfs_getbuf(struct xfs_buftarg
*, xfs_daddr_t
, int);
409 xfs_buf_t
*libxfs_getbuf_map(struct xfs_buftarg
*, struct xfs_buf_map
*,
411 xfs_buf_t
*libxfs_getbuf_flags(struct xfs_buftarg
*, xfs_daddr_t
, int,
413 void libxfs_putbuf (xfs_buf_t
*);
415 #define __add_trace(bp, func, file, line) \
418 (bp)->b_func = (func); \
419 (bp)->b_file = (file); \
420 (bp)->b_line = (line); \
425 libxfs_trace_readbuf(const char *func
, const char *file
, int line
,
426 struct xfs_buftarg
*btp
, xfs_daddr_t blkno
, int len
, int flags
,
427 const struct xfs_buf_ops
*ops
)
429 xfs_buf_t
*bp
= libxfs_readbuf(btp
, blkno
, len
, flags
, ops
);
430 __add_trace(bp
, func
, file
, line
);
435 libxfs_trace_readbuf_map(const char *func
, const char *file
, int line
,
436 struct xfs_buftarg
*btp
, struct xfs_buf_map
*map
, int nmaps
, int flags
,
437 const struct xfs_buf_ops
*ops
)
439 xfs_buf_t
*bp
= libxfs_readbuf_map(btp
, map
, nmaps
, flags
, ops
);
440 __add_trace(bp
, func
, file
, line
);
445 libxfs_trace_writebuf(const char *func
, const char *file
, int line
, xfs_buf_t
*bp
, int flags
)
447 __add_trace(bp
, func
, file
, line
);
448 return libxfs_writebuf(bp
, flags
);
452 libxfs_trace_getbuf(const char *func
, const char *file
, int line
,
453 struct xfs_buftarg
*btp
, xfs_daddr_t blkno
, int len
)
455 xfs_buf_t
*bp
= libxfs_getbuf(btp
, blkno
, len
);
456 __add_trace(bp
, func
, file
, line
);
461 libxfs_trace_getbuf_map(const char *func
, const char *file
, int line
,
462 struct xfs_buftarg
*btp
, struct xfs_buf_map
*map
, int nmaps
,
465 xfs_buf_t
*bp
= libxfs_getbuf_map(btp
, map
, nmaps
, flags
);
466 __add_trace(bp
, func
, file
, line
);
471 libxfs_trace_getbuf_flags(const char *func
, const char *file
, int line
,
472 struct xfs_buftarg
*btp
, xfs_daddr_t blkno
, int len
, unsigned int flags
)
474 xfs_buf_t
*bp
= libxfs_getbuf_flags(btp
, blkno
, len
, flags
);
475 __add_trace(bp
, func
, file
, line
);
480 libxfs_trace_putbuf(const char *func
, const char *file
, int line
, xfs_buf_t
*bp
)
482 __add_trace(bp
, func
, file
, line
);
491 libxfs_getsb(xfs_mount_t
*mp
, int flags
)
493 return libxfs_readbuf(mp
->m_ddev_targp
, XFS_SB_DADDR
,
494 XFS_FSS_TO_BB(mp
, 1), flags
, &xfs_sb_buf_ops
);
497 kmem_zone_t
*xfs_buf_zone
;
499 static struct cache_mru xfs_buf_freelist
=
500 {{&xfs_buf_freelist
.cm_list
, &xfs_buf_freelist
.cm_list
},
501 0, PTHREAD_MUTEX_INITIALIZER
};
504 * The bufkey is used to pass the new buffer information to the cache object
505 * allocation routine. Because discontiguous buffers need to pass different
506 * information, we need fields to pass that information. However, because the
507 * blkno and bblen is needed for the initial cache entry lookup (i.e. for
508 * bcompare) the fact that the map/nmaps is non-null to switch to discontiguous
509 * buffer initialisation instead of a contiguous buffer.
512 struct xfs_buftarg
*buftarg
;
515 struct xfs_buf_map
*map
;
519 /* 2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */
520 #define GOLDEN_RATIO_PRIME 0x9e37fffffffc0001UL
521 #define CACHE_LINE_SIZE 64
523 libxfs_bhash(cache_key_t key
, unsigned int hashsize
, unsigned int hashshift
)
525 uint64_t hashval
= ((struct xfs_bufkey
*)key
)->blkno
;
528 tmp
= hashval
^ (GOLDEN_RATIO_PRIME
+ hashval
) / CACHE_LINE_SIZE
;
529 tmp
= tmp
^ ((tmp
^ GOLDEN_RATIO_PRIME
) >> hashshift
);
530 return tmp
% hashsize
;
534 libxfs_bcompare(struct cache_node
*node
, cache_key_t key
)
536 struct xfs_buf
*bp
= (struct xfs_buf
*)node
;
537 struct xfs_bufkey
*bkey
= (struct xfs_bufkey
*)key
;
539 if (bp
->b_target
->dev
== bkey
->buftarg
->dev
&&
540 bp
->b_bn
== bkey
->blkno
) {
541 if (bp
->b_bcount
== BBTOB(bkey
->bblen
))
543 #ifdef IO_BCOMPARE_CHECK
544 if (!(libxfs_bcache
->c_flags
& CACHE_MISCOMPARE_PURGE
)) {
546 "%lx: Badness in key lookup (length)\n"
547 "bp=(bno 0x%llx, len %u bytes) key=(bno 0x%llx, len %u bytes)\n",
549 (unsigned long long)bp
->b_bn
, (int)bp
->b_bcount
,
550 (unsigned long long)bkey
->blkno
,
560 libxfs_bprint(xfs_buf_t
*bp
)
562 fprintf(stderr
, "Buffer 0x%p blkno=%llu bytes=%u flags=0x%x count=%u\n",
563 bp
, (unsigned long long)bp
->b_bn
, (unsigned)bp
->b_bcount
,
564 bp
->b_flags
, bp
->b_node
.cn_count
);
568 __initbuf(xfs_buf_t
*bp
, struct xfs_buftarg
*btp
, xfs_daddr_t bno
,
573 bp
->b_bcount
= bytes
;
574 bp
->b_length
= BTOBB(bytes
);
578 bp
->b_addr
= memalign(libxfs_device_alignment(), bytes
);
581 _("%s: %s can't memalign %u bytes: %s\n"),
582 progname
, __FUNCTION__
, bytes
,
586 memset(bp
->b_addr
, 0, bytes
);
587 #ifdef XFS_BUF_TRACING
588 list_head_init(&bp
->b_lock_list
);
590 pthread_mutex_init(&bp
->b_lock
, NULL
);
597 bp
->b_maps
= &bp
->__b_map
;
598 bp
->b_maps
[0].bm_bn
= bp
->b_bn
;
599 bp
->b_maps
[0].bm_len
= bp
->b_length
;
604 libxfs_initbuf(xfs_buf_t
*bp
, struct xfs_buftarg
*btp
, xfs_daddr_t bno
,
607 __initbuf(bp
, btp
, bno
, bytes
);
611 libxfs_initbuf_map(xfs_buf_t
*bp
, struct xfs_buftarg
*btp
,
612 struct xfs_buf_map
*map
, int nmaps
)
614 unsigned int bytes
= 0;
617 bytes
= sizeof(struct xfs_buf_map
) * nmaps
;
618 bp
->b_maps
= malloc(bytes
);
621 _("%s: %s can't malloc %u bytes: %s\n"),
622 progname
, __FUNCTION__
, bytes
,
629 for ( i
= 0; i
< nmaps
; i
++) {
630 bp
->b_maps
[i
].bm_bn
= map
[i
].bm_bn
;
631 bp
->b_maps
[i
].bm_len
= map
[i
].bm_len
;
632 bytes
+= BBTOB(map
[i
].bm_len
);
635 __initbuf(bp
, btp
, map
[0].bm_bn
, bytes
);
636 bp
->b_flags
|= LIBXFS_B_DISCONTIG
;
640 __libxfs_getbufr(int blen
)
645 * first look for a buffer that can be used as-is,
646 * if one cannot be found, see if there is a buffer,
647 * and if so, free its buffer and set b_addr to NULL
648 * before calling libxfs_initbuf.
650 pthread_mutex_lock(&xfs_buf_freelist
.cm_mutex
);
651 if (!list_empty(&xfs_buf_freelist
.cm_list
)) {
652 list_for_each_entry(bp
, &xfs_buf_freelist
.cm_list
, b_node
.cn_mru
) {
653 if (bp
->b_bcount
== blen
) {
654 list_del_init(&bp
->b_node
.cn_mru
);
658 if (&bp
->b_node
.cn_mru
== &xfs_buf_freelist
.cm_list
) {
659 bp
= list_entry(xfs_buf_freelist
.cm_list
.next
,
660 xfs_buf_t
, b_node
.cn_mru
);
661 list_del_init(&bp
->b_node
.cn_mru
);
664 if (bp
->b_maps
!= &bp
->__b_map
)
669 bp
= kmem_zone_zalloc(xfs_buf_zone
, 0);
670 pthread_mutex_unlock(&xfs_buf_freelist
.cm_mutex
);
672 if (bp
->b_flags
& LIBXFS_B_DIRTY
)
673 fprintf(stderr
, "found dirty buffer (bulk) on free list!");
679 libxfs_getbufr(struct xfs_buftarg
*btp
, xfs_daddr_t blkno
, int bblen
)
682 int blen
= BBTOB(bblen
);
684 bp
=__libxfs_getbufr(blen
);
686 libxfs_initbuf(bp
, btp
, blkno
, blen
);
688 printf("%lx: %s: allocated %u bytes buffer, key=0x%llx(0x%llx), %p\n",
689 pthread_self(), __FUNCTION__
, blen
,
690 (long long)LIBXFS_BBTOOFF64(blkno
), (long long)blkno
, bp
);
697 libxfs_getbufr_map(struct xfs_buftarg
*btp
, xfs_daddr_t blkno
, int bblen
,
698 struct xfs_buf_map
*map
, int nmaps
)
701 int blen
= BBTOB(bblen
);
703 if (!map
|| !nmaps
) {
705 _("%s: %s invalid map %p or nmaps %d\n"),
706 progname
, __FUNCTION__
, map
, nmaps
);
710 if (blkno
!= map
[0].bm_bn
) {
712 _("%s: %s map blkno 0x%llx doesn't match key 0x%llx\n"),
713 progname
, __FUNCTION__
, (long long)map
[0].bm_bn
,
718 bp
=__libxfs_getbufr(blen
);
720 libxfs_initbuf_map(bp
, btp
, map
, nmaps
);
722 printf("%lx: %s: allocated %u bytes buffer, key=0x%llx(0x%llx), %p\n",
723 pthread_self(), __FUNCTION__
, blen
,
724 (long long)LIBXFS_BBTOOFF64(blkno
), (long long)blkno
, bp
);
730 #ifdef XFS_BUF_TRACING
731 struct list_head lock_buf_list
= {&lock_buf_list
, &lock_buf_list
};
732 int lock_buf_count
= 0;
735 extern int use_xfs_buf_lock
;
737 static struct xfs_buf
*
738 __cache_lookup(struct xfs_bufkey
*key
, unsigned int flags
)
742 cache_node_get(libxfs_bcache
, key
, (struct cache_node
**)&bp
);
746 if (use_xfs_buf_lock
) {
749 ret
= pthread_mutex_trylock(&bp
->b_lock
);
751 ASSERT(ret
== EAGAIN
);
752 if (flags
& LIBXFS_GETBUF_TRYLOCK
)
755 if (pthread_equal(bp
->b_holder
, pthread_self())) {
757 _("Warning: recursive buffer locking at block %" PRIu64
" detected\n"),
762 pthread_mutex_lock(&bp
->b_lock
);
766 bp
->b_holder
= pthread_self();
769 cache_node_set_priority(libxfs_bcache
, (struct cache_node
*)bp
,
770 cache_node_get_priority((struct cache_node
*)bp
) -
771 CACHE_PREFETCH_PRIORITY
);
772 #ifdef XFS_BUF_TRACING
773 pthread_mutex_lock(&libxfs_bcache
->c_mutex
);
775 list_add(&bp
->b_lock_list
, &lock_buf_list
);
776 pthread_mutex_unlock(&libxfs_bcache
->c_mutex
);
779 printf("%lx %s: hit buffer %p for bno = 0x%llx/0x%llx\n",
780 pthread_self(), __FUNCTION__
,
781 bp
, bp
->b_bn
, (long long)LIBXFS_BBTOOFF64(key
->blkno
));
786 cache_node_put(libxfs_bcache
, (struct cache_node
*)bp
);
791 libxfs_getbuf_flags(struct xfs_buftarg
*btp
, xfs_daddr_t blkno
, int len
,
794 struct xfs_bufkey key
= {0};
800 return __cache_lookup(&key
, flags
);
804 * Clean the buffer flags for libxfs_getbuf*(), which wants to return
805 * an unused buffer with clean state. This prevents CRC errors on a
806 * re-read of a corrupt block that was prefetched and freed. This
807 * can happen with a massively corrupt directory that is discarded,
808 * but whose blocks are then recycled into expanding lost+found.
810 * Note however that if the buffer's dirty (prefetch calls getbuf)
811 * we'll leave the state alone because we don't want to discard blocks
812 * that have been fixed.
818 if (bp
&& !(bp
->b_flags
& LIBXFS_B_DIRTY
))
819 bp
->b_flags
&= ~(LIBXFS_B_UNCHECKED
| LIBXFS_B_STALE
|
824 libxfs_getbuf(struct xfs_buftarg
*btp
, xfs_daddr_t blkno
, int len
)
828 bp
= libxfs_getbuf_flags(btp
, blkno
, len
, 0);
833 static struct xfs_buf
*
834 __libxfs_getbuf_map(struct xfs_buftarg
*btp
, struct xfs_buf_map
*map
,
835 int nmaps
, int flags
)
837 struct xfs_bufkey key
= {0};
841 return libxfs_getbuf_flags(btp
, map
[0].bm_bn
, map
[0].bm_len
,
845 key
.blkno
= map
[0].bm_bn
;
846 for (i
= 0; i
< nmaps
; i
++) {
847 key
.bblen
+= map
[i
].bm_len
;
852 return __cache_lookup(&key
, flags
);
856 libxfs_getbuf_map(struct xfs_buftarg
*btp
, struct xfs_buf_map
*map
,
857 int nmaps
, int flags
)
861 bp
= __libxfs_getbuf_map(btp
, map
, nmaps
, flags
);
867 libxfs_putbuf(xfs_buf_t
*bp
)
870 * ensure that any errors on this use of the buffer don't carry
871 * over to the next user.
875 #ifdef XFS_BUF_TRACING
876 pthread_mutex_lock(&libxfs_bcache
->c_mutex
);
878 ASSERT(lock_buf_count
>= 0);
879 list_del_init(&bp
->b_lock_list
);
880 pthread_mutex_unlock(&libxfs_bcache
->c_mutex
);
882 if (use_xfs_buf_lock
) {
887 pthread_mutex_unlock(&bp
->b_lock
);
891 cache_node_put(libxfs_bcache
, (struct cache_node
*)bp
);
895 libxfs_purgebuf(xfs_buf_t
*bp
)
897 struct xfs_bufkey key
= {0};
899 key
.buftarg
= bp
->b_target
;
900 key
.blkno
= bp
->b_bn
;
901 key
.bblen
= bp
->b_length
;
903 cache_node_purge(libxfs_bcache
, &key
, (struct cache_node
*)bp
);
906 static struct cache_node
*
907 libxfs_balloc(cache_key_t key
)
909 struct xfs_bufkey
*bufkey
= (struct xfs_bufkey
*)key
;
912 return (struct cache_node
*)
913 libxfs_getbufr_map(bufkey
->buftarg
,
914 bufkey
->blkno
, bufkey
->bblen
,
915 bufkey
->map
, bufkey
->nmaps
);
916 return (struct cache_node
*)libxfs_getbufr(bufkey
->buftarg
,
917 bufkey
->blkno
, bufkey
->bblen
);
922 __read_buf(int fd
, void *buf
, int len
, off64_t offset
, int flags
)
926 sts
= pread(fd
, buf
, len
, offset
);
929 fprintf(stderr
, _("%s: read failed: %s\n"),
930 progname
, strerror(error
));
931 if (flags
& LIBXFS_EXIT_ON_FAILURE
)
934 } else if (sts
!= len
) {
935 fprintf(stderr
, _("%s: error - read only %d of %d bytes\n"),
937 if (flags
& LIBXFS_EXIT_ON_FAILURE
)
945 libxfs_readbufr(struct xfs_buftarg
*btp
, xfs_daddr_t blkno
, xfs_buf_t
*bp
,
948 int fd
= libxfs_device_to_fd(btp
->dev
);
949 int bytes
= BBTOB(len
);
952 ASSERT(BBTOB(len
) <= bp
->b_bcount
);
954 error
= __read_buf(fd
, bp
->b_addr
, bytes
, LIBXFS_BBTOOFF64(blkno
), flags
);
956 bp
->b_target
->dev
== btp
->dev
&&
958 bp
->b_bcount
== bytes
)
959 bp
->b_flags
|= LIBXFS_B_UPTODATE
;
961 printf("%lx: %s: read %u bytes, error %d, blkno=0x%llx(0x%llx), %p\n",
962 pthread_self(), __FUNCTION__
, bytes
, error
,
963 (long long)LIBXFS_BBTOOFF64(blkno
), (long long)blkno
, bp
);
969 libxfs_readbuf_verify(struct xfs_buf
*bp
, const struct xfs_buf_ops
*ops
)
974 bp
->b_ops
->verify_read(bp
);
975 bp
->b_flags
&= ~LIBXFS_B_UNCHECKED
;
980 libxfs_readbuf(struct xfs_buftarg
*btp
, xfs_daddr_t blkno
, int len
, int flags
,
981 const struct xfs_buf_ops
*ops
)
986 bp
= libxfs_getbuf_flags(btp
, blkno
, len
, 0);
991 * if the buffer was prefetched, it is likely that it was not validated.
992 * Hence if we are supplied an ops function and the buffer is marked as
993 * unchecked, we need to validate it now.
995 * We do this verification even if the buffer is dirty - the
996 * verification is almost certainly going to fail the CRC check in this
997 * case as a dirty buffer has not had the CRC recalculated. However, we
998 * should not be dirtying unchecked buffers and therefore failing it
999 * here because it's dirty and unchecked indicates we've screwed up
1003 if ((bp
->b_flags
& (LIBXFS_B_UPTODATE
|LIBXFS_B_DIRTY
))) {
1004 if (bp
->b_flags
& LIBXFS_B_UNCHECKED
)
1005 libxfs_readbuf_verify(bp
, ops
);
1010 * Set the ops on a cache miss (i.e. first physical read) as the
1011 * verifier may change the ops to match the type of buffer it contains.
1012 * A cache hit might reset the verifier to the original type if we set
1013 * it again, but it won't get called again and set to match the buffer
1014 * contents. *cough* xfs_da_node_buf_ops *cough*.
1016 error
= libxfs_readbufr(btp
, blkno
, bp
, len
, flags
);
1018 bp
->b_error
= error
;
1020 libxfs_readbuf_verify(bp
, ops
);
1025 libxfs_readbufr_map(struct xfs_buftarg
*btp
, struct xfs_buf
*bp
, int flags
)
1032 fd
= libxfs_device_to_fd(btp
->dev
);
1034 for (i
= 0; i
< bp
->b_nmaps
; i
++) {
1035 off64_t offset
= LIBXFS_BBTOOFF64(bp
->b_maps
[i
].bm_bn
);
1036 int len
= BBTOB(bp
->b_maps
[i
].bm_len
);
1038 error
= __read_buf(fd
, buf
, len
, offset
, flags
);
1040 bp
->b_error
= error
;
1047 bp
->b_flags
|= LIBXFS_B_UPTODATE
;
1049 printf("%lx: %s: read %lu bytes, error %d, blkno=%llu(%llu), %p\n",
1050 pthread_self(), __FUNCTION__
, buf
- (char *)bp
->b_addr
, error
,
1051 (long long)LIBXFS_BBTOOFF64(bp
->b_bn
), (long long)bp
->b_bn
, bp
);
1057 libxfs_readbuf_map(struct xfs_buftarg
*btp
, struct xfs_buf_map
*map
, int nmaps
,
1058 int flags
, const struct xfs_buf_ops
*ops
)
1064 return libxfs_readbuf(btp
, map
[0].bm_bn
, map
[0].bm_len
,
1067 bp
= __libxfs_getbuf_map(btp
, map
, nmaps
, 0);
1072 if ((bp
->b_flags
& (LIBXFS_B_UPTODATE
|LIBXFS_B_DIRTY
))) {
1073 if (bp
->b_flags
& LIBXFS_B_UNCHECKED
)
1074 libxfs_readbuf_verify(bp
, ops
);
1077 error
= libxfs_readbufr_map(btp
, bp
, flags
);
1079 libxfs_readbuf_verify(bp
, ops
);
1082 printf("%lx: %s: read %lu bytes, error %d, blkno=%llu(%llu), %p\n",
1083 pthread_self(), __FUNCTION__
, buf
- (char *)bp
->b_addr
, error
,
1084 (long long)LIBXFS_BBTOOFF64(bp
->b_bn
), (long long)bp
->b_bn
, bp
);
1090 __write_buf(int fd
, void *buf
, int len
, off64_t offset
, int flags
)
1094 sts
= pwrite(fd
, buf
, len
, offset
);
1097 fprintf(stderr
, _("%s: pwrite failed: %s\n"),
1098 progname
, strerror(error
));
1099 if (flags
& LIBXFS_B_EXIT
)
1102 } else if (sts
!= len
) {
1103 fprintf(stderr
, _("%s: error - pwrite only %d of %d bytes\n"),
1104 progname
, sts
, len
);
1105 if (flags
& LIBXFS_B_EXIT
)
1113 libxfs_writebufr(xfs_buf_t
*bp
)
1115 int fd
= libxfs_device_to_fd(bp
->b_target
->dev
);
1118 * we never write buffers that are marked stale. This indicates they
1119 * contain data that has been invalidated, and even if the buffer is
1120 * dirty it must *never* be written. Verifiers are wonderful for finding
1121 * bugs like this. Make sure the error is obvious as to the cause.
1123 if (bp
->b_flags
& LIBXFS_B_STALE
) {
1124 bp
->b_error
= -ESTALE
;
1129 * clear any pre-existing error status on the buffer. This can occur if
1130 * the buffer is corrupt on disk and the repair process doesn't clear
1131 * the error before fixing and writing it back.
1135 bp
->b_ops
->verify_write(bp
);
1138 _("%s: write verifer failed on %s bno 0x%llx/0x%x\n"),
1139 __func__
, bp
->b_ops
->name
,
1140 (long long)bp
->b_bn
, bp
->b_bcount
);
1145 if (!(bp
->b_flags
& LIBXFS_B_DISCONTIG
)) {
1146 bp
->b_error
= __write_buf(fd
, bp
->b_addr
, bp
->b_bcount
,
1147 LIBXFS_BBTOOFF64(bp
->b_bn
), bp
->b_flags
);
1150 char *buf
= bp
->b_addr
;
1152 for (i
= 0; i
< bp
->b_nmaps
; i
++) {
1153 off64_t offset
= LIBXFS_BBTOOFF64(bp
->b_maps
[i
].bm_bn
);
1154 int len
= BBTOB(bp
->b_maps
[i
].bm_len
);
1156 bp
->b_error
= __write_buf(fd
, buf
, len
, offset
,
1165 printf("%lx: %s: wrote %u bytes, blkno=%llu(%llu), %p, error %d\n",
1166 pthread_self(), __FUNCTION__
, bp
->b_bcount
,
1167 (long long)LIBXFS_BBTOOFF64(bp
->b_bn
),
1168 (long long)bp
->b_bn
, bp
, bp
->b_error
);
1171 bp
->b_flags
|= LIBXFS_B_UPTODATE
;
1172 bp
->b_flags
&= ~(LIBXFS_B_DIRTY
| LIBXFS_B_EXIT
|
1173 LIBXFS_B_UNCHECKED
);
1179 libxfs_writebuf_int(xfs_buf_t
*bp
, int flags
)
1182 * Clear any error hanging over from reading the buffer. This prevents
1183 * subsequent reads after this write from seeing stale errors.
1186 bp
->b_flags
&= ~LIBXFS_B_STALE
;
1187 bp
->b_flags
|= (LIBXFS_B_DIRTY
| flags
);
1192 libxfs_writebuf(xfs_buf_t
*bp
, int flags
)
1195 printf("%lx: %s: dirty blkno=%llu(%llu)\n",
1196 pthread_self(), __FUNCTION__
,
1197 (long long)LIBXFS_BBTOOFF64(bp
->b_bn
),
1198 (long long)bp
->b_bn
);
1201 * Clear any error hanging over from reading the buffer. This prevents
1202 * subsequent reads after this write from seeing stale errors.
1205 bp
->b_flags
&= ~LIBXFS_B_STALE
;
1206 bp
->b_flags
|= (LIBXFS_B_DIRTY
| flags
);
1212 libxfs_iomove(xfs_buf_t
*bp
, uint boff
, int len
, void *data
, int flags
)
1215 if (boff
+ len
> bp
->b_bcount
) {
1216 printf("Badness, iomove out of range!\n"
1217 "bp=(bno 0x%llx, bytes %u) range=(boff %u, bytes %u)\n",
1218 (long long)bp
->b_bn
, bp
->b_bcount
, boff
, len
);
1224 memset(bp
->b_addr
+ boff
, 0, len
);
1227 memcpy(data
, bp
->b_addr
+ boff
, len
);
1230 memcpy(bp
->b_addr
+ boff
, data
, len
);
1237 struct cache_node
*node
)
1239 struct xfs_buf
*bp
= (struct xfs_buf
*)node
;
1243 if (bp
->b_flags
& LIBXFS_B_DIRTY
)
1245 "releasing dirty buffer to free list!");
1247 pthread_mutex_lock(&xfs_buf_freelist
.cm_mutex
);
1248 list_add(&bp
->b_node
.cn_mru
, &xfs_buf_freelist
.cm_list
);
1249 pthread_mutex_unlock(&xfs_buf_freelist
.cm_mutex
);
1254 struct cache
*cache
,
1255 struct list_head
*list
)
1260 if (list_empty(list
))
1263 list_for_each_entry(bp
, list
, b_node
.cn_mru
) {
1264 if (bp
->b_flags
& LIBXFS_B_DIRTY
)
1266 "releasing dirty buffer (bulk) to free list!");
1270 pthread_mutex_lock(&xfs_buf_freelist
.cm_mutex
);
1271 list_splice(list
, &xfs_buf_freelist
.cm_list
);
1272 pthread_mutex_unlock(&xfs_buf_freelist
.cm_mutex
);
1278 * When a buffer is marked dirty, the error is cleared. Hence if we are trying
1279 * to flush a buffer prior to cache reclaim that has an error on it it means
1280 * we've already tried to flush it and it failed. Prevent repeated corruption
1281 * errors from being reported by skipping such buffers - when the corruption is
1282 * fixed the buffer will be marked dirty again and we can write it again.
1286 struct cache_node
*node
)
1288 struct xfs_buf
*bp
= (struct xfs_buf
*)node
;
1290 if (!bp
->b_error
&& bp
->b_flags
& LIBXFS_B_DIRTY
)
1291 return libxfs_writebufr(bp
);
1296 libxfs_putbufr(xfs_buf_t
*bp
)
1298 if (bp
->b_flags
& LIBXFS_B_DIRTY
)
1299 libxfs_writebufr(bp
);
1300 libxfs_brelse((struct cache_node
*)bp
);
1305 libxfs_bcache_purge(void)
1307 cache_purge(libxfs_bcache
);
1311 libxfs_bcache_flush(void)
1313 cache_flush(libxfs_bcache
);
1317 libxfs_bcache_overflowed(void)
1319 return cache_overflowed(libxfs_bcache
);
1322 struct cache_operations libxfs_bcache_operations
= {
1323 .hash
= libxfs_bhash
,
1324 .alloc
= libxfs_balloc
,
1325 .flush
= libxfs_bflush
,
1326 .relse
= libxfs_brelse
,
1327 .compare
= libxfs_bcompare
,
1328 .bulkrelse
= libxfs_bulkrelse
1333 * Inode cache stubs.
1336 extern kmem_zone_t
*xfs_ili_zone
;
1337 extern kmem_zone_t
*xfs_inode_zone
;
1340 libxfs_iget(xfs_mount_t
*mp
, xfs_trans_t
*tp
, xfs_ino_t ino
, uint lock_flags
,
1346 ip
= kmem_zone_zalloc(xfs_inode_zone
, 0);
1352 error
= xfs_iread(mp
, tp
, ip
, 0);
1354 kmem_zone_free(xfs_inode_zone
, ip
);
1360 * set up the inode ops structure that the libxfs code relies on
1363 ip
->d_ops
= mp
->m_dir_inode_ops
;
1365 ip
->d_ops
= mp
->m_nondir_inode_ops
;
1372 libxfs_idestroy(xfs_inode_t
*ip
)
1374 switch (VFS_I(ip
)->i_mode
& S_IFMT
) {
1378 libxfs_idestroy_fork(ip
, XFS_DATA_FORK
);
1382 libxfs_idestroy_fork(ip
, XFS_ATTR_FORK
);
1384 xfs_idestroy_fork(ip
, XFS_COW_FORK
);
1388 libxfs_iput(xfs_inode_t
*ip
)
1391 kmem_zone_free(xfs_ili_zone
, ip
->i_itemp
);
1393 libxfs_idestroy(ip
);
1394 kmem_zone_free(xfs_inode_zone
, ip
);