]> git.ipfire.org Git - thirdparty/xfsprogs-dev.git/blame - libxfs/rdwr.c
xfs: cleanup xfs_idestroy_fork
[thirdparty/xfsprogs-dev.git] / libxfs / rdwr.c
CommitLineData
959ef981 1// SPDX-License-Identifier: GPL-2.0
2bd0ea18 2/*
f1b058f9 3 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
da23017d 4 * All Rights Reserved.
2bd0ea18
NS
5 */
6
b626fb59 7
9c799827 8#include "libxfs_priv.h"
1aef52f8 9#include "init.h"
b626fb59
DC
10#include "xfs_fs.h"
11#include "xfs_shared.h"
12#include "xfs_format.h"
13#include "xfs_log_format.h"
14#include "xfs_trans_resv.h"
15#include "xfs_mount.h"
16#include "xfs_inode_buf.h"
17#include "xfs_inode_fork.h"
18#include "xfs_inode.h"
19#include "xfs_trans.h"
c335b673 20#include "libfrog/platform.h"
b626fb59 21
ac7ad9aa 22#include "libxfs.h"
2bd0ea18 23
1a12e432
DW
24static void libxfs_brelse(struct cache_node *node);
25
6af7c1ea
DC
26/*
27 * Important design/architecture note:
28 *
29 * The userspace code that uses the buffer cache is much less constrained than
30 * the kernel code. The userspace code is pretty nasty in places, especially
31 * when it comes to buffer error handling. Very little of the userspace code
32 * outside libxfs clears bp->b_error - very little code even checks it - so the
33 * libxfs code is tripping on stale errors left by the userspace code.
34 *
8b4de37c 35 * We can't clear errors or zero buffer contents in libxfs_buf_get-* like we do
6af7c1ea
DC
36 * in the kernel, because those functions are used by the libxfs_readbuf_*
37 * functions and hence need to leave the buffers unchanged on cache hits. This
38 * is actually the only way to gather a write error from a libxfs_writebuf()
39 * call - you need to get the buffer again so you can check bp->b_error field -
40 * assuming that the buffer is still in the cache when you check, that is.
41 *
42 * This is very different to the kernel code which does not release buffers on a
43 * write so we can wait on IO and check errors. The kernel buffer cache also
44 * guarantees a buffer of a known initial state from xfs_buf_get() even on a
45 * cache hit.
46 *
47 * IOWs, userspace is behaving quite differently to the kernel and as a result
48 * it leaks errors from reads, invalidations and writes through
361379e0 49 * libxfs_buf_get/libxfs_buf_read.
6af7c1ea
DC
50 *
51 * The result of this is that until the userspace code outside libxfs is cleaned
52 * up, functions that release buffers from userspace control (i.e
e02ba985 53 * libxfs_writebuf/libxfs_buf_relse) need to zero bp->b_error to prevent
6af7c1ea
DC
54 * propagation of stale errors into future buffer operations.
55 */
56
5000d01d 57#define BDSTRAT_SIZE (256 * 1024)
2bd0ea18 58
2556c98b
BN
59#define IO_BCOMPARE_CHECK
60
9542ae13
DC
/* XXX: (dgc) Propagate errors, only exit if fail-on-error flag set */
/*
 * Zero @len basic blocks on @btp's device starting at daddr @start.
 *
 * First tries the platform's fast zeroing primitive; if that reports an
 * error (e.g. unsupported), falls back to writing aligned zeroed chunks
 * of at most BDSTRAT_SIZE bytes.  Any failure on the write path exits
 * the program.  Returns 0 on success.
 */
int
libxfs_device_zero(struct xfs_buftarg *btp, xfs_daddr_t start, uint len)
{
	xfs_off_t	start_offset, end_offset, offset;
	ssize_t		zsize, bytes;
	size_t		len_bytes;
	char		*z;
	int		error, fd;

	fd = libxfs_device_to_fd(btp->dev);
	start_offset = LIBXFS_BBTOOFF64(start);

	/* try to use special zeroing methods, fall back to writes if needed */
	len_bytes = LIBXFS_BBTOOFF64(len);
	error = platform_zero_range(fd, start_offset, len_bytes);
	if (!error)
		return 0;

	/* buffer must be device-aligned for (possibly direct) I/O */
	zsize = min(BDSTRAT_SIZE, BBTOB(len));
	if ((z = memalign(libxfs_device_alignment(), zsize)) == NULL) {
		fprintf(stderr,
			_("%s: %s can't memalign %d bytes: %s\n"),
			progname, __FUNCTION__, (int)zsize, strerror(errno));
		exit(1);
	}
	memset(z, 0, zsize);

	if ((lseek(fd, start_offset, SEEK_SET)) < 0) {
		fprintf(stderr, _("%s: %s seek to offset %llu failed: %s\n"),
			progname, __FUNCTION__,
			(unsigned long long)start_offset, strerror(errno));
		exit(1);
	}

	/* end_offset is the byte length to zero, relative to start_offset */
	end_offset = LIBXFS_BBTOOFF64(start + len) - start_offset;
	for (offset = 0; offset < end_offset; ) {
		bytes = min((ssize_t)(end_offset - offset), zsize);
		if ((bytes = write(fd, z, bytes)) < 0) {
			fprintf(stderr, _("%s: %s write failed: %s\n"),
				progname, __FUNCTION__, strerror(errno));
			exit(1);
		} else if (bytes == 0) {
			/* zero-byte write would loop forever; bail out */
			fprintf(stderr, _("%s: %s not progressing?\n"),
				progname, __FUNCTION__);
			exit(1);
		}
		offset += bytes;
	}
	free(z);
	return 0;
}
113
/*
 * Format an unmount log record into the single log block at @p: zero the
 * block, write an op header at the front, then the 32-bit-size-aligned
 * XLOG_UNMOUNT_TYPE magic payload immediately after it.
 */
static void unmount_record(void *p)
{
	xlog_op_header_t *op = (xlog_op_header_t *)p;
	/* the data section must be 32 bit size aligned */
	struct {
		uint16_t magic;
		uint16_t pad1;
		uint32_t pad2; /* may as well make it 64 bits */
	} magic = { XLOG_UNMOUNT_TYPE, 0, 0 };

	memset(p, 0, BBSIZE);
	/* dummy tid to mark this as written from userspace */
	op->oh_tid = cpu_to_be32(0xb0c0d0d0);
	op->oh_len = cpu_to_be32(sizeof(magic));
	op->oh_clientid = XFS_LOG;
	op->oh_flags = XLOG_UNMOUNT_TRANS;
	op->oh_res2 = 0;

	/* and the data for this op */
	memcpy((char *)p + sizeof(xlog_op_header_t), &magic, sizeof(magic));
}
135
1c12a814
BF
/*
 * Advance @ptr by @offset bytes within the xfs_buf passed via @private,
 * aborting if the step would move past the end of the buffer's data.
 * A NULL @private skips the bounds check.
 */
static char *next(
	char		*ptr,
	int		offset,
	void		*private)
{
	struct xfs_buf	*buf = (struct xfs_buf *)private;

	if (buf &&
	    (buf->b_bcount < (int)(ptr - (char *)buf->b_addr) + offset))
		abort();

	return ptr + offset;
}
149
2556c98b
BN
150/*
151 * Simple I/O (buffer cache) interface
152 */
153
154
155#ifdef XFS_BUF_TRACING
156
8288ea3d 157#undef libxfs_buf_read_map
2556c98b 158#undef libxfs_writebuf
f315ae4f 159#undef libxfs_buf_get_map
2556c98b 160
4c947857
DW
161int libxfs_buf_read_map(struct xfs_buftarg *btp,
162 struct xfs_buf_map *maps, int nmaps, int flags,
163 struct xfs_buf **bpp,
164 const struct xfs_buf_ops *ops);
2556c98b 165int libxfs_writebuf(xfs_buf_t *, int);
583ca112
DW
166int libxfs_buf_get_map(struct xfs_buftarg *btp,
167 struct xfs_buf_map *maps, int nmaps, int flags,
168 struct xfs_buf **bpp);
e02ba985 169void libxfs_buf_relse(struct xfs_buf *bp);
2556c98b 170
a2ceac1f
DC
171#define __add_trace(bp, func, file, line) \
172do { \
173 if (bp) { \
174 (bp)->b_func = (func); \
175 (bp)->b_file = (file); \
176 (bp)->b_line = (line); \
177 } \
178} while (0)
179
31079e67 180int
361379e0
DW
181libxfs_trace_readbuf(
182 const char *func,
183 const char *file,
184 int line,
185 struct xfs_buftarg *btp,
186 xfs_daddr_t blkno,
187 size_t len,
188 int flags,
31079e67
DW
189 const struct xfs_buf_ops *ops,
190 struct xfs_buf **bpp)
2556c98b 191{
31079e67 192 int error;
361379e0 193 DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
2556c98b 194
31079e67
DW
195 error = libxfs_buf_read_map(btp, &map, 1, flags, bpp, ops);
196 __add_trace(*bpp, func, file, line);
197 return error;
2556c98b
BN
198}
199
4c947857
DW
/*
 * Tracing wrapper for a multi-extent buffer read: forwards to
 * libxfs_buf_read_map() and records the call site on the buffer.
 */
int
libxfs_trace_readbuf_map(
	const char		*func,
	const char		*file,
	int			line,
	struct xfs_buftarg	*btp,
	struct xfs_buf_map	*map,
	int			nmaps,
	int			flags,
	struct xfs_buf		**bpp,
	const struct xfs_buf_ops *ops)
{
	int			ret;

	ret = libxfs_buf_read_map(btp, map, nmaps, flags, bpp, ops);
	__add_trace(*bpp, func, file, line);
	return ret;
}
218
18b4f688
DW
/*
 * Tracing wrapper for dirtying a buffer: records the call site, then
 * marks the buffer dirty via libxfs_buf_mark_dirty().
 */
void
libxfs_trace_dirtybuf(
	const char		*func,
	const char		*file,
	int			line,
	struct xfs_buf		*bp)
{
	__add_trace(bp, func, file, line);
	libxfs_buf_mark_dirty(bp);
}
229
58a8b31f 230int
8b4de37c
DW
231libxfs_trace_getbuf(
232 const char *func,
233 const char *file,
234 int line,
235 struct xfs_buftarg *btp,
236 xfs_daddr_t blkno,
58a8b31f
DW
237 size_t len,
238 struct xfs_buf **bpp)
2556c98b 239{
58a8b31f 240 int error;
8b4de37c
DW
241 DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
242
58a8b31f 243 error = libxfs_buf_get_map(target, &map, 1, 0, bpp);
a2ceac1f 244 __add_trace(bp, func, file, line);
58a8b31f 245 return error;
a2ceac1f 246}
2556c98b 247
583ca112
DW
/*
 * Tracing wrapper for a multi-extent buffer get: forwards to
 * libxfs_buf_get_map() and records the call site on the buffer.
 */
int
libxfs_trace_getbuf_map(
	const char		*func,
	const char		*file,
	int			line,
	struct xfs_buftarg	*btp,
	struct xfs_buf_map	*map,
	int			nmaps,
	int			flags,
	struct xfs_buf		**bpp)
{
	int			error;

	error = libxfs_buf_get_map(btp, map, nmaps, flags, bpp);
	__add_trace(*bpp, func, file, line);
	return error;
}
265
266void
267libxfs_trace_putbuf(const char *func, const char *file, int line, xfs_buf_t *bp)
268{
a2ceac1f 269 __add_trace(bp, func, file, line);
e02ba985 270 libxfs_buf_relse(bp);
2556c98b
BN
271}
272
273
274#endif
275
276
361379e0
DW
/*
 * Read the primary superblock (sector XFS_SB_DADDR) from the data
 * device, verified with xfs_sb_buf_ops.  Returns the buffer, or NULL
 * if libxfs_buf_read() did not produce one (its error code is dropped).
 */
struct xfs_buf *
libxfs_getsb(
	struct xfs_mount	*mp)
{
	struct xfs_buf		*bp;

	libxfs_buf_read(mp->m_ddev_targp, XFS_SB_DADDR, XFS_FSS_TO_BB(mp, 1),
			0, &bp, &xfs_sb_buf_ops);
	return bp;
}
287
5e656dbb 288kmem_zone_t *xfs_buf_zone;
69ec88b5
BN
289
290static struct cache_mru xfs_buf_freelist =
291 {{&xfs_buf_freelist.cm_list, &xfs_buf_freelist.cm_list},
292 0, PTHREAD_MUTEX_INITIALIZER };
f1b058f9 293
a2ceac1f
DC
294/*
295 * The bufkey is used to pass the new buffer information to the cache object
296 * allocation routine. Because discontiguous buffers need to pass different
297 * information, we need fields to pass that information. However, because the
298 * blkno and bblen is needed for the initial cache entry lookup (i.e. for
299 * bcompare) the fact that the map/nmaps is non-null to switch to discontiguous
300 * buffer initialisation instead of a contiguous buffer.
301 */
302struct xfs_bufkey {
75c8b434 303 struct xfs_buftarg *buftarg;
a2ceac1f
DC
304 xfs_daddr_t blkno;
305 unsigned int bblen;
306 struct xfs_buf_map *map;
307 int nmaps;
308};
f1b058f9 309
602dcc0e
DC
/* 2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */
#define GOLDEN_RATIO_PRIME	0x9e37fffffffc0001UL
#define CACHE_LINE_SIZE		64
/*
 * Hash a buffer lookup key (by disk block number) into one of
 * @hashsize buckets.  Multiplicative-style 64-bit hash mixed with a
 * golden-ratio prime; @hashshift folds high bits back in.
 */
static unsigned int
libxfs_bhash(cache_key_t key, unsigned int hashsize, unsigned int hashshift)
{
	uint64_t	hashval = ((struct xfs_bufkey *)key)->blkno;
	uint64_t	tmp;

	tmp = hashval ^ (GOLDEN_RATIO_PRIME + hashval) / CACHE_LINE_SIZE;
	tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> hashshift);
	return tmp % hashsize;
}
323
/*
 * Cache comparison callback: match a cached buffer against a lookup key.
 * Same device and block number with matching length is a CACHE_HIT; a
 * block-number match with a different length returns CACHE_PURGE so the
 * stale entry is dropped (optionally complaining first); otherwise
 * CACHE_MISS.
 */
static int
libxfs_bcompare(struct cache_node *node, cache_key_t key)
{
	struct xfs_buf	*bp = container_of(node, struct xfs_buf,
					   b_node);
	struct xfs_bufkey *bkey = (struct xfs_bufkey *)key;

	if (bp->b_target->dev == bkey->buftarg->dev &&
	    bp->b_bn == bkey->blkno) {
		if (bp->b_bcount == BBTOB(bkey->bblen))
			return CACHE_HIT;
#ifdef IO_BCOMPARE_CHECK
		/* only whine when miscompare-purge isn't the expected mode */
		if (!(libxfs_bcache->c_flags & CACHE_MISCOMPARE_PURGE)) {
			fprintf(stderr,
	"%lx: Badness in key lookup (length)\n"
	"bp=(bno 0x%llx, len %u bytes) key=(bno 0x%llx, len %u bytes)\n",
				pthread_self(),
				(unsigned long long)bp->b_bn, (int)bp->b_bcount,
				(unsigned long long)bkey->blkno,
				BBTOB(bkey->bblen));
		}
#endif
		return CACHE_PURGE;
	}
	return CACHE_MISS;
}
350
/*
 * (Re)initialise an xfs_buf for @bytes of data at daddr @bno on @btp.
 * Allocates aligned data storage if the buffer doesn't already have any
 * (exits on allocation failure), zeroes the data, resets lock/holder
 * state, and sets up the inline single-extent map when no discontiguous
 * map has been attached.
 */
static void
__initbuf(xfs_buf_t *bp, struct xfs_buftarg *btp, xfs_daddr_t bno,
		unsigned int bytes)
{
	bp->b_flags = 0;
	bp->b_bn = bno;
	bp->b_bcount = bytes;
	bp->b_length = BTOBB(bytes);
	bp->b_target = btp;
	bp->b_mount = btp->bt_mount;
	bp->b_error = 0;
	/* keep existing storage (freelist reuse); allocate only if absent */
	if (!bp->b_addr)
		bp->b_addr = memalign(libxfs_device_alignment(), bytes);
	if (!bp->b_addr) {
		fprintf(stderr,
			_("%s: %s can't memalign %u bytes: %s\n"),
			progname, __FUNCTION__, bytes,
			strerror(errno));
		exit(1);
	}
	memset(bp->b_addr, 0, bytes);
#ifdef XFS_BUF_TRACING
	list_head_init(&bp->b_lock_list);
#endif
	pthread_mutex_init(&bp->b_lock, NULL);
	bp->b_holder = 0;
	bp->b_recur = 0;
	bp->b_ops = NULL;

	/* default to the inline one-extent map covering the whole buffer */
	if (!bp->b_maps) {
		bp->b_nmaps = 1;
		bp->b_maps = &bp->__b_map;
		bp->b_maps[0].bm_bn = bp->b_bn;
		bp->b_maps[0].bm_len = bp->b_length;
	}
}
387
/* Initialise a contiguous (single-extent) buffer; thin __initbuf wrapper. */
static void
libxfs_initbuf(xfs_buf_t *bp, struct xfs_buftarg *btp, xfs_daddr_t bno,
		unsigned int bytes)
{
	__initbuf(bp, btp, bno, bytes);
}
394
/*
 * Initialise a discontiguous buffer from @nmaps extents in @map.
 * Allocates and copies a private map array (exits on allocation
 * failure), sums the extent lengths to size the data area, then runs
 * the common __initbuf() setup and flags the buffer DISCONTIG.
 */
static void
libxfs_initbuf_map(xfs_buf_t *bp, struct xfs_buftarg *btp,
		struct xfs_buf_map *map, int nmaps)
{
	unsigned int bytes = 0;
	int i;

	bytes = sizeof(struct xfs_buf_map) * nmaps;
	bp->b_maps = malloc(bytes);
	if (!bp->b_maps) {
		fprintf(stderr,
			_("%s: %s can't malloc %u bytes: %s\n"),
			progname, __FUNCTION__, bytes,
			strerror(errno));
		exit(1);
	}
	bp->b_nmaps = nmaps;

	/* total data size is the sum of all extent lengths */
	bytes = 0;
	for ( i = 0; i < nmaps; i++) {
		bp->b_maps[i].bm_bn = map[i].bm_bn;
		bp->b_maps[i].bm_len = map[i].bm_len;
		bytes += BBTOB(map[i].bm_len);
	}

	/* __initbuf() sees b_maps != NULL and leaves our map in place */
	__initbuf(bp, btp, map[0].bm_bn, bytes);
	bp->b_flags |= LIBXFS_B_DISCONTIG;
}
423
/*
 * Obtain a raw xfs_buf structure for @blen bytes of data, preferring the
 * free list over a fresh zone allocation.  A freelist buffer of exactly
 * the right size is reused as-is; otherwise the oldest freelist buffer
 * is stripped of its data/map storage so libxfs_initbuf* can rebuild it.
 */
static xfs_buf_t *
__libxfs_getbufr(int blen)
{
	xfs_buf_t	*bp;

	/*
	 * first look for a buffer that can be used as-is,
	 * if one cannot be found, see if there is a buffer,
	 * and if so, free its buffer and set b_addr to NULL
	 * before calling libxfs_initbuf.
	 */
	pthread_mutex_lock(&xfs_buf_freelist.cm_mutex);
	if (!list_empty(&xfs_buf_freelist.cm_list)) {
		list_for_each_entry(bp, &xfs_buf_freelist.cm_list, b_node.cn_mru) {
			if (bp->b_bcount == blen) {
				list_del_init(&bp->b_node.cn_mru);
				break;
			}
		}
		/*
		 * loop cursor back at the list head means no size match was
		 * found: cannibalise the oldest entry instead.
		 */
		if (&bp->b_node.cn_mru == &xfs_buf_freelist.cm_list) {
			bp = list_entry(xfs_buf_freelist.cm_list.next,
					xfs_buf_t, b_node.cn_mru);
			list_del_init(&bp->b_node.cn_mru);
			free(bp->b_addr);
			bp->b_addr = NULL;
			if (bp->b_maps != &bp->__b_map)
				free(bp->b_maps);
			bp->b_maps = NULL;
		}
	} else
		bp = kmem_zone_zalloc(xfs_buf_zone, 0);
	pthread_mutex_unlock(&xfs_buf_freelist.cm_mutex);
	bp->b_ops = NULL;
	/* a dirty buffer on the free list means a lost write somewhere */
	if (bp->b_flags & LIBXFS_B_DIRTY)
		fprintf(stderr, "found dirty buffer (bulk) on free list!\n");

	return bp;
}
462
/*
 * Allocate and initialise a contiguous buffer of @bblen basic blocks at
 * daddr @blkno on @btp.  Returns the buffer (NULL only if
 * __libxfs_getbufr() produced none).
 */
static xfs_buf_t *
libxfs_getbufr(struct xfs_buftarg *btp, xfs_daddr_t blkno, int bblen)
{
	xfs_buf_t	*bp;
	int		blen = BBTOB(bblen);

	bp =__libxfs_getbufr(blen);
	if (bp)
		libxfs_initbuf(bp, btp, blkno, blen);
#ifdef IO_DEBUG
	printf("%lx: %s: allocated %u bytes buffer, key=0x%llx(0x%llx), %p\n",
		pthread_self(), __FUNCTION__, blen,
		(long long)LIBXFS_BBTOOFF64(blkno), (long long)blkno, bp);
#endif

	return bp;
}
480
/*
 * Allocate and initialise a discontiguous buffer of @bblen basic blocks
 * described by @map/@nmaps.  Sanity-checks that a map was supplied and
 * that its first extent matches the lookup key @blkno, exiting on
 * mismatch (internal logic error, not a user error).
 */
static xfs_buf_t *
libxfs_getbufr_map(struct xfs_buftarg *btp, xfs_daddr_t blkno, int bblen,
		struct xfs_buf_map *map, int nmaps)
{
	xfs_buf_t	*bp;
	int		blen = BBTOB(bblen);

	if (!map || !nmaps) {
		fprintf(stderr,
			_("%s: %s invalid map %p or nmaps %d\n"),
			progname, __FUNCTION__, map, nmaps);
		exit(1);
	}

	if (blkno != map[0].bm_bn) {
		fprintf(stderr,
			_("%s: %s map blkno 0x%llx doesn't match key 0x%llx\n"),
			progname, __FUNCTION__, (long long)map[0].bm_bn,
			(long long)blkno);
		exit(1);
	}

	bp =__libxfs_getbufr(blen);
	if (bp)
		libxfs_initbuf_map(bp, btp, map, nmaps);
#ifdef IO_DEBUG
	printf("%lx: %s: allocated %u bytes buffer, key=0x%llx(0x%llx), %p\n",
		pthread_self(), __FUNCTION__, blen,
		(long long)LIBXFS_BBTOOFF64(blkno), (long long)blkno, bp);
#endif

	return bp;
}
2556c98b
BN
514
515#ifdef XFS_BUF_TRACING
516struct list_head lock_buf_list = {&lock_buf_list, &lock_buf_list};
517int lock_buf_count = 0;
518#endif
e6b359b3 519
b3b1affe
DW
/*
 * Look up (or create) a cached buffer for @key and return it locked in
 * *@bpp.  Returns 0 on success, -ENOMEM if the cache could not supply a
 * node, -EAGAIN if LIBXFS_GETBUF_TRYLOCK was set and the buffer is held
 * elsewhere.  Recursive locking by the same thread is tolerated with a
 * warning (b_recur is bumped instead of re-locking).  On success the
 * node's cache priority is boosted to resist reclaim.
 */
static int
__cache_lookup(
	struct xfs_bufkey	*key,
	unsigned int		flags,
	struct xfs_buf		**bpp)
{
	struct cache_node	*cn = NULL;
	struct xfs_buf		*bp;

	*bpp = NULL;

	cache_node_get(libxfs_bcache, key, &cn);
	if (!cn)
		return -ENOMEM;
	bp = container_of(cn, struct xfs_buf, b_node);

	if (use_xfs_buf_lock) {
		int		ret;

		ret = pthread_mutex_trylock(&bp->b_lock);
		if (ret) {
			ASSERT(ret == EAGAIN);
			if (flags & LIBXFS_GETBUF_TRYLOCK) {
				/* drop the cache reference we just took */
				cache_node_put(libxfs_bcache, cn);
				return -EAGAIN;
			}

			if (pthread_equal(bp->b_holder, pthread_self())) {
				fprintf(stderr,
	_("Warning: recursive buffer locking at block %" PRIu64 " detected\n"),
					key->blkno);
				bp->b_recur++;
				*bpp = bp;
				return 0;
			} else {
				/* block until the other holder releases it */
				pthread_mutex_lock(&bp->b_lock);
			}
		}

		bp->b_holder = pthread_self();
	}

	/* recently used: raise priority so the cache keeps it around */
	cache_node_set_priority(libxfs_bcache, cn,
			cache_node_get_priority(cn) - CACHE_PREFETCH_PRIORITY);
#ifdef XFS_BUF_TRACING
	pthread_mutex_lock(&libxfs_bcache->c_mutex);
	lock_buf_count++;
	list_add(&bp->b_lock_list, &lock_buf_list);
	pthread_mutex_unlock(&libxfs_bcache->c_mutex);
#endif
#ifdef IO_DEBUG
	printf("%lx %s: hit buffer %p for bno = 0x%llx/0x%llx\n",
		pthread_self(), __FUNCTION__,
		bp, bp->b_bn, (long long)LIBXFS_BBTOOFF64(key->blkno));
#endif

	*bpp = bp;
	return 0;
}
579
/*
 * Contiguous-buffer cache lookup: build a single-extent bufkey for
 * @blkno/@len on @btp and hand it to __cache_lookup().
 */
static int
libxfs_getbuf_flags(
	struct xfs_buftarg	*btp,
	xfs_daddr_t		blkno,
	int			len,
	unsigned int		flags,
	struct xfs_buf		**bpp)
{
	struct xfs_bufkey	key = {NULL};

	key.buftarg = btp;
	key.blkno = blkno;
	key.bblen = len;

	return __cache_lookup(&key, flags, bpp);
}
596
e8ecd760
DW
597/*
598 * Clean the buffer flags for libxfs_getbuf*(), which wants to return
599 * an unused buffer with clean state. This prevents CRC errors on a
600 * re-read of a corrupt block that was prefetched and freed. This
601 * can happen with a massively corrupt directory that is discarded,
602 * but whose blocks are then recycled into expanding lost+found.
603 *
604 * Note however that if the buffer's dirty (prefetch calls getbuf)
605 * we'll leave the state alone because we don't want to discard blocks
606 * that have been fixed.
607 */
608static void
609reset_buf_state(
610 struct xfs_buf *bp)
611{
612 if (bp && !(bp->b_flags & LIBXFS_B_DIRTY))
613 bp->b_flags &= ~(LIBXFS_B_UNCHECKED | LIBXFS_B_STALE |
614 LIBXFS_B_UPTODATE);
615}
616
/*
 * Cache lookup for a possibly discontiguous buffer.  Single-extent
 * requests take the simpler contiguous path; otherwise the bufkey
 * carries the map and the summed extent length for the lookup.
 */
static int
__libxfs_buf_get_map(
	struct xfs_buftarg	*btp,
	struct xfs_buf_map	*map,
	int			nmaps,
	int			flags,
	struct xfs_buf		**bpp)
{
	struct xfs_bufkey	key = {NULL};
	int			i;

	if (nmaps == 1)
		return libxfs_getbuf_flags(btp, map[0].bm_bn, map[0].bm_len,
				flags, bpp);

	key.buftarg = btp;
	key.blkno = map[0].bm_bn;
	/* key length is the total of all extents */
	for (i = 0; i < nmaps; i++) {
		key.bblen += map[i].bm_len;
	}
	key.map = map;
	key.nmaps = nmaps;

	return __cache_lookup(&key, flags, bpp);
}
642
583ca112
DW
/*
 * Public buffer get: look the buffer up in the cache and, on success,
 * scrub stale state flags so callers see a clean, unchecked buffer.
 */
int
libxfs_buf_get_map(
	struct xfs_buftarg	*btp,
	struct xfs_buf_map	*map,
	int			nmaps,
	int			flags,
	struct xfs_buf		**bpp)
{
	int			error;

	error = __libxfs_buf_get_map(btp, map, nmaps, flags, bpp);
	if (!error)
		reset_buf_state(*bpp);
	return error;
}
660
/*
 * Release a buffer reference.  Clears b_error so stale errors don't leak
 * to the next user, drops the thread lock (honouring recursion counts),
 * and puts the cache reference.  An uncached buffer (empty cn_hash list)
 * is written back if dirty and freed when its last reference goes away.
 */
void
libxfs_buf_relse(
	struct xfs_buf	*bp)
{
	/*
	 * ensure that any errors on this use of the buffer don't carry
	 * over to the next user.
	 */
	bp->b_error = 0;

#ifdef XFS_BUF_TRACING
	pthread_mutex_lock(&libxfs_bcache->c_mutex);
	lock_buf_count--;
	ASSERT(lock_buf_count >= 0);
	list_del_init(&bp->b_lock_list);
	pthread_mutex_unlock(&libxfs_bcache->c_mutex);
#endif
	if (use_xfs_buf_lock) {
		if (bp->b_recur) {
			/* unwind one level of recursive locking */
			bp->b_recur--;
		} else {
			bp->b_holder = 0;
			pthread_mutex_unlock(&bp->b_lock);
		}
	}

	/* cached buffers go back via the cache; uncached are freed here */
	if (!list_empty(&bp->b_node.cn_hash))
		cache_node_put(libxfs_bcache, &bp->b_node);
	else if (--bp->b_node.cn_count == 0) {
		if (bp->b_flags & LIBXFS_B_DIRTY)
			libxfs_bwrite(bp);
		libxfs_brelse(&bp->b_node);
	}
}
695
/*
 * Cache allocation callback: build a new buffer for @key, choosing the
 * discontiguous constructor when the key carries a map.  Returns the
 * embedded cache node of the new buffer.
 */
static struct cache_node *
libxfs_balloc(
	cache_key_t		key)
{
	struct xfs_bufkey	*bufkey = (struct xfs_bufkey *)key;
	struct xfs_buf		*bp;

	if (bufkey->map)
		bp = libxfs_getbufr_map(bufkey->buftarg, bufkey->blkno,
				bufkey->bblen, bufkey->map, bufkey->nmaps);
	else
		bp = libxfs_getbufr(bufkey->buftarg, bufkey->blkno,
				bufkey->bblen);
	return &bp->b_node;
}
711
a2ceac1f
DC
712
713static int
714__read_buf(int fd, void *buf, int len, off64_t offset, int flags)
2bd0ea18 715{
bcea58c7 716 int sts;
2bd0ea18 717
2f9a125c 718 sts = pread(fd, buf, len, offset);
bcea58c7 719 if (sts < 0) {
11202ec2 720 int error = errno;
9440d84d 721 fprintf(stderr, _("%s: read failed: %s\n"),
c3928e39 722 progname, strerror(error));
11202ec2 723 return -error;
a2ceac1f 724 } else if (sts != len) {
bcea58c7 725 fprintf(stderr, _("%s: error - read only %d of %d bytes\n"),
a2ceac1f 726 progname, sts, len);
12b53197 727 return -EIO;
2bd0ea18 728 }
a2ceac1f
DC
729 return 0;
730}
731
/*
 * Read @len basic blocks at daddr @blkno into an already-allocated
 * buffer @bp.  Marks the buffer UPTODATE only when the read succeeded
 * and the buffer identity (device, block, size) matches the request.
 * The error is also stored in bp->b_error and returned.
 */
int
libxfs_readbufr(struct xfs_buftarg *btp, xfs_daddr_t blkno, xfs_buf_t *bp,
		int len, int flags)
{
	int	fd = libxfs_device_to_fd(btp->dev);
	int	bytes = BBTOB(len);
	int	error;

	ASSERT(BBTOB(len) <= bp->b_bcount);

	error = __read_buf(fd, bp->b_addr, bytes, LIBXFS_BBTOOFF64(blkno), flags);
	if (!error &&
	    bp->b_target->dev == btp->dev &&
	    bp->b_bn == blkno &&
	    bp->b_bcount == bytes)
		bp->b_flags |= LIBXFS_B_UPTODATE;
#ifdef IO_DEBUG
	printf("%lx: %s: read %u bytes, error %d, blkno=0x%llx(0x%llx), %p\n",
		pthread_self(), __FUNCTION__, bytes, error,
		(long long)LIBXFS_BBTOOFF64(blkno), (long long)blkno, bp);
#endif
	bp->b_error = error;
	return error;
}
756
456371d8
DW
/*
 * Run the read verifier @ops on @bp, attach the ops to the buffer, and
 * clear the UNCHECKED flag.  With no ops supplied, just report the
 * buffer's current error.  Returns bp->b_error after verification.
 */
int
libxfs_readbuf_verify(
	struct xfs_buf		*bp,
	const struct xfs_buf_ops *ops)
{
	if (!ops)
		return bp->b_error;

	bp->b_ops = ops;
	bp->b_ops->verify_read(bp);
	bp->b_flags &= ~LIBXFS_B_UNCHECKED;
	return bp->b_error;
}
770
/*
 * Read a discontiguous buffer: issue one __read_buf() per extent in
 * bp->b_maps, packing the data contiguously into bp->b_addr.  Stops at
 * the first failing extent (error recorded in bp->b_error); the buffer
 * is marked UPTODATE only when every extent read succeeded.
 */
int
libxfs_readbufr_map(struct xfs_buftarg *btp, struct xfs_buf *bp, int flags)
{
	int	fd;
	int	error = 0;
	void	*buf;
	int	i;

	fd = libxfs_device_to_fd(btp->dev);
	buf = bp->b_addr;
	for (i = 0; i < bp->b_nmaps; i++) {
		off64_t	offset = LIBXFS_BBTOOFF64(bp->b_maps[i].bm_bn);
		int len = BBTOB(bp->b_maps[i].bm_len);

		error = __read_buf(fd, buf, len, offset, flags);
		if (error) {
			bp->b_error = error;
			break;
		}
		buf += len;
	}

	if (!error)
		bp->b_flags |= LIBXFS_B_UPTODATE;
#ifdef IO_DEBUG
	printf("%lx: %s: read %lu bytes, error %d, blkno=%llu(%llu), %p\n",
		pthread_self(), __FUNCTION__, buf - (char *)bp->b_addr, error,
		(long long)LIBXFS_BBTOOFF64(bp->b_bn), (long long)bp->b_bn, bp);
#endif
	return error;
}
802
4c947857
DW
/*
 * Get a buffer for @map/@nmaps and ensure its contents are read and
 * verified.  Cache hits that are UPTODATE or DIRTY skip the physical
 * read but are verified if still UNCHECKED.  On verification or read
 * failure the buffer is released and the error returned — unless the
 * caller passed LIBXFS_READBUF_SALVAGE, in which case verification
 * failures still hand the buffer back.  Returns 0 with *@bpp set on
 * success.
 */
int
libxfs_buf_read_map(
	struct xfs_buftarg	*btp,
	struct xfs_buf_map	*map,
	int			nmaps,
	int			flags,
	struct xfs_buf		**bpp,
	const struct xfs_buf_ops *ops)
{
	struct xfs_buf		*bp;
	bool			salvage = flags & LIBXFS_READBUF_SALVAGE;
	int			error = 0;

	*bpp = NULL;
	if (nmaps == 1)
		error = libxfs_getbuf_flags(btp, map[0].bm_bn, map[0].bm_len,
				0, &bp);
	else
		error = __libxfs_buf_get_map(btp, map, nmaps, 0, &bp);
	if (error)
		return error;

	/*
	 * If the buffer was prefetched, it is likely that it was not validated.
	 * Hence if we are supplied an ops function and the buffer is marked as
	 * unchecked, we need to validate it now.
	 *
	 * We do this verification even if the buffer is dirty - the
	 * verification is almost certainly going to fail the CRC check in this
	 * case as a dirty buffer has not had the CRC recalculated. However, we
	 * should not be dirtying unchecked buffers and therefore failing it
	 * here because it's dirty and unchecked indicates we've screwed up
	 * somewhere else.
	 *
	 * Note that if the caller passes in LIBXFS_READBUF_SALVAGE, that means
	 * they want the buffer even if it fails verification.
	 */
	bp->b_error = 0;
	if (bp->b_flags & (LIBXFS_B_UPTODATE | LIBXFS_B_DIRTY)) {
		if (bp->b_flags & LIBXFS_B_UNCHECKED)
			error = libxfs_readbuf_verify(bp, ops);
		if (error && !salvage)
			goto err;
		goto ok;
	}

	/*
	 * Set the ops on a cache miss (i.e. first physical read) as the
	 * verifier may change the ops to match the type of buffer it contains.
	 * A cache hit might reset the verifier to the original type if we set
	 * it again, but it won't get called again and set to match the buffer
	 * contents. *cough* xfs_da_node_buf_ops *cough*.
	 */
	if (nmaps == 1)
		error = libxfs_readbufr(btp, map[0].bm_bn, bp, map[0].bm_len,
				flags);
	else
		error = libxfs_readbufr_map(btp, bp, flags);
	if (error)
		goto err;

	error = libxfs_readbuf_verify(bp, ops);
	if (error && !salvage)
		goto err;

ok:
#ifdef IO_DEBUGX
	printf("%lx: %s: read %lu bytes, error %d, blkno=%llu(%llu), %p\n",
		pthread_self(), __FUNCTION__, buf - (char *)bp->b_addr, error,
		(long long)LIBXFS_BBTOOFF64(bp->b_bn), (long long)bp->b_bn, bp);
#endif
	*bpp = bp;
	return 0;
err:
	libxfs_buf_relse(bp);
	return error;
}
880
3f8a028e
DW
/* Allocate a raw uncached buffer. */
static inline struct xfs_buf *
libxfs_getbufr_uncached(
	struct xfs_buftarg	*targ,
	xfs_daddr_t		daddr,
	size_t			bblen)
{
	struct xfs_buf		*bp;

	bp = libxfs_getbufr(targ, daddr, bblen);
	if (!bp)
		return NULL;

	/* empty hash list marks the buffer as uncached; refcount starts at 1 */
	INIT_LIST_HEAD(&bp->b_node.cn_hash);
	bp->b_node.cn_count = 1;
	return bp;
}
898
899/*
900 * Allocate an uncached buffer that points nowhere. The refcount will be 1,
901 * and the cache node hash list will be empty to indicate that it's uncached.
902 */
d918bc57 903int
3f8a028e
DW
904libxfs_buf_get_uncached(
905 struct xfs_buftarg *targ,
906 size_t bblen,
d918bc57
DW
907 int flags,
908 struct xfs_buf **bpp)
3f8a028e 909{
d918bc57
DW
910 *bpp = libxfs_getbufr_uncached(targ, XFS_BUF_DADDR_NULL, bblen);
911 return *bpp != NULL ? 0 : -ENOMEM;
3f8a028e
DW
912}
913
/*
 * Allocate and read an uncached buffer. The refcount will be 1, and the cache
 * node hash list will be empty to indicate that it's uncached.
 *
 * Returns 0 with *@bpp set, -ENOMEM on allocation failure, or the
 * read/verify error after releasing the buffer.
 */
int
libxfs_buf_read_uncached(
	struct xfs_buftarg	*targ,
	xfs_daddr_t		daddr,
	size_t			bblen,
	int			flags,
	struct xfs_buf		**bpp,
	const struct xfs_buf_ops *ops)
{
	struct xfs_buf		*bp;
	int			error;

	*bpp = NULL;
	bp = libxfs_getbufr_uncached(targ, daddr, bblen);
	if (!bp)
		return -ENOMEM;

	error = libxfs_readbufr(targ, daddr, bp, bblen, flags);
	if (error)
		goto err;

	error = libxfs_readbuf_verify(bp, ops);
	if (error)
		goto err;

	*bpp = bp;
	return 0;
err:
	libxfs_buf_relse(bp);
	return error;
}
949
a2ceac1f
DC
/*
 * pwrite() @len bytes from @buf at @offset.  Returns 0 on success, the
 * negated errno on a failed write, or -EIO on a short write.  @flags is
 * currently unused here.
 */
static int
__write_buf(int fd, void *buf, int len, off64_t offset, int flags)
{
	int	sts;

	sts = pwrite(fd, buf, len, offset);
	if (sts < 0) {
		/* save errno before fprintf can clobber it */
		int error = errno;
		fprintf(stderr, _("%s: pwrite failed: %s\n"),
			progname, strerror(error));
		return -error;
	} else if (sts != len) {
		fprintf(stderr, _("%s: error - pwrite only %d of %d bytes\n"),
			progname, sts, len);
		return -EIO;
	}
	return 0;
}
968
/*
 * Write @bp back to its device.  Stale buffers are rejected with
 * -ESTALE; the write verifier (if attached) runs first and aborts the
 * write on failure.  Discontiguous buffers are written one extent at a
 * time.  On success the buffer becomes UPTODATE and loses its DIRTY and
 * UNCHECKED flags.  Returns bp->b_error (0 on success).
 */
int
libxfs_bwrite(
	struct xfs_buf	*bp)
{
	int		fd = libxfs_device_to_fd(bp->b_target->dev);

	/*
	 * we never write buffers that are marked stale. This indicates they
	 * contain data that has been invalidated, and even if the buffer is
	 * dirty it must *never* be written. Verifiers are wonderful for finding
	 * bugs like this. Make sure the error is obvious as to the cause.
	 */
	if (bp->b_flags & LIBXFS_B_STALE) {
		bp->b_error = -ESTALE;
		return bp->b_error;
	}

	/*
	 * clear any pre-existing error status on the buffer. This can occur if
	 * the buffer is corrupt on disk and the repair process doesn't clear
	 * the error before fixing and writing it back.
	 */
	bp->b_error = 0;
	if (bp->b_ops) {
		bp->b_ops->verify_write(bp);
		if (bp->b_error) {
			fprintf(stderr,
	_("%s: write verifier failed on %s bno 0x%llx/0x%x\n"),
				__func__, bp->b_ops->name,
				(long long)bp->b_bn, bp->b_bcount);
			return bp->b_error;
		}
	}

	if (!(bp->b_flags & LIBXFS_B_DISCONTIG)) {
		bp->b_error = __write_buf(fd, bp->b_addr, bp->b_bcount,
				LIBXFS_BBTOOFF64(bp->b_bn), bp->b_flags);
	} else {
		int	i;
		void	*buf = bp->b_addr;

		/* one pwrite per extent; stop at the first failure */
		for (i = 0; i < bp->b_nmaps; i++) {
			off64_t	offset = LIBXFS_BBTOOFF64(bp->b_maps[i].bm_bn);
			int len = BBTOB(bp->b_maps[i].bm_len);

			bp->b_error = __write_buf(fd, buf, len, offset,
					bp->b_flags);
			if (bp->b_error)
				break;
			buf += len;
		}
	}

#ifdef IO_DEBUG
	printf("%lx: %s: wrote %u bytes, blkno=%llu(%llu), %p, error %d\n",
			pthread_self(), __FUNCTION__, bp->b_bcount,
			(long long)LIBXFS_BBTOOFF64(bp->b_bn),
			(long long)bp->b_bn, bp, bp->b_error);
#endif
	if (bp->b_error) {
		fprintf(stderr,
	_("%s: write failed on %s bno 0x%llx/0x%x, err=%d\n"),
			__func__, bp->b_ops ? bp->b_ops->name : "(unknown)",
			(long long)bp->b_bn, bp->b_bcount, -bp->b_error);
	} else {
		bp->b_flags |= LIBXFS_B_UPTODATE;
		bp->b_flags &= ~(LIBXFS_B_DIRTY | LIBXFS_B_UNCHECKED);
	}
	return bp->b_error;
}
1039
18b4f688
DW
1040/*
1041 * Mark a buffer dirty. The dirty data will be written out when the cache
1042 * is flushed (or at release time if the buffer is uncached).
1043 */
1044void
1045libxfs_buf_mark_dirty(
f524ae04 1046 struct xfs_buf *bp)
f1b058f9 1047{
e0607266
DC
1048#ifdef IO_DEBUG
1049 printf("%lx: %s: dirty blkno=%llu(%llu)\n",
1050 pthread_self(), __FUNCTION__,
1051 (long long)LIBXFS_BBTOOFF64(bp->b_bn),
1052 (long long)bp->b_bn);
1053#endif
203d38cc
DC
1054 /*
1055 * Clear any error hanging over from reading the buffer. This prevents
1056 * subsequent reads after this write from seeing stale errors.
1057 */
1058 bp->b_error = 0;
6af7c1ea 1059 bp->b_flags &= ~LIBXFS_B_STALE;
f524ae04 1060 bp->b_flags |= LIBXFS_B_DIRTY;
2bd0ea18
NS
1061}
1062
c335b673
DW
1063/* Complain about (and remember) dropping dirty buffers. */
1064static void
1065libxfs_whine_dirty_buf(
1066 struct xfs_buf *bp)
1067{
1068 fprintf(stderr, _("%s: Releasing dirty buffer to free list!\n"),
1069 progname);
1070
1071 if (bp->b_error == -EFSCORRUPTED)
1072 bp->b_target->flags |= XFS_BUFTARG_CORRUPT_WRITE;
1073 bp->b_target->flags |= XFS_BUFTARG_LOST_WRITE;
1074}
1075
/*
 * Cache release callback: move a buffer out of the cache onto the global
 * buffer free list, complaining if unwritten dirty data is being dropped.
 * The buffer memory is recycled by later allocations, not freed here.
 */
static void
libxfs_brelse(
	struct cache_node	*node)
{
	struct xfs_buf		*bp = container_of(node, struct xfs_buf,
						   b_node);

	if (!bp)
		return;
	/* dropping a dirty buffer loses its data - shout about it */
	if (bp->b_flags & LIBXFS_B_DIRTY)
		libxfs_whine_dirty_buf(bp);

	/* the free list MRU is shared, so take its mutex around the insert */
	pthread_mutex_lock(&xfs_buf_freelist.cm_mutex);
	list_add(&bp->b_node.cn_mru, &xfs_buf_freelist.cm_list);
	pthread_mutex_unlock(&xfs_buf_freelist.cm_mutex);
}
1092
e08f5594 1093static unsigned int
69ec88b5 1094libxfs_bulkrelse(
0a7942b3
DC
1095 struct cache *cache,
1096 struct list_head *list)
2556c98b 1097{
69ec88b5 1098 xfs_buf_t *bp;
e08f5594 1099 int count = 0;
2556c98b 1100
69ec88b5 1101 if (list_empty(list))
e08f5594 1102 return 0 ;
69ec88b5
BN
1103
1104 list_for_each_entry(bp, list, b_node.cn_mru) {
2556c98b 1105 if (bp->b_flags & LIBXFS_B_DIRTY)
c335b673 1106 libxfs_whine_dirty_buf(bp);
e08f5594 1107 count++;
2556c98b 1108 }
69ec88b5
BN
1109
1110 pthread_mutex_lock(&xfs_buf_freelist.cm_mutex);
0b90dda6 1111 list_splice(list, &xfs_buf_freelist.cm_list);
69ec88b5 1112 pthread_mutex_unlock(&xfs_buf_freelist.cm_mutex);
e08f5594
BN
1113
1114 return count;
69ec88b5
BN
1115}
1116
864028ed
ES
1117/*
1118 * Free everything from the xfs_buf_freelist MRU, used at final teardown
1119 */
1120void
1121libxfs_bcache_free(void)
1122{
1123 struct list_head *cm_list;
1124 xfs_buf_t *bp, *next;
1125
1126 cm_list = &xfs_buf_freelist.cm_list;
1127 list_for_each_entry_safe(bp, next, cm_list, b_node.cn_mru) {
1128 free(bp->b_addr);
1129 if (bp->b_maps != &bp->__b_map)
1130 free(bp->b_maps);
cef0cc3b 1131 kmem_cache_free(xfs_buf_zone, bp);
864028ed
ES
1132 }
1133}
1134
e8f1e8aa
DC
1135/*
1136 * When a buffer is marked dirty, the error is cleared. Hence if we are trying
1137 * to flush a buffer prior to cache reclaim that has an error on it it means
1138 * we've already tried to flush it and it failed. Prevent repeated corruption
1139 * errors from being reported by skipping such buffers - when the corruption is
1140 * fixed the buffer will be marked dirty again and we can write it again.
1141 */
0a7942b3
DC
1142static int
1143libxfs_bflush(
1144 struct cache_node *node)
69ec88b5 1145{
063516bb
DW
1146 struct xfs_buf *bp = container_of(node, struct xfs_buf,
1147 b_node);
69ec88b5 1148
e8f1e8aa 1149 if (!bp->b_error && bp->b_flags & LIBXFS_B_DIRTY)
331d5956 1150 return libxfs_bwrite(bp);
e8f1e8aa 1151 return bp->b_error;
2556c98b
BN
1152}
1153
/* Discard all buffers held in the global buffer cache. */
void
libxfs_bcache_purge(void)
{
	cache_purge(libxfs_bcache);
}
1159
/* Write back all dirty buffers held in the global buffer cache. */
void
libxfs_bcache_flush(void)
{
	cache_flush(libxfs_bcache);
}
1165
/* Report whether the global buffer cache has exceeded its size target. */
int
libxfs_bcache_overflowed(void)
{
	return cache_overflowed(libxfs_bcache);
}
1171
/* Buffer cache callbacks wired into the generic cache implementation. */
struct cache_operations libxfs_bcache_operations = {
	.hash = libxfs_bhash,
	.alloc = libxfs_balloc,
	.flush = libxfs_bflush,
	.relse = libxfs_brelse,
	.compare = libxfs_bcompare,
	.bulkrelse = libxfs_bulkrelse
};
1180
c08793bd
BF
1181/*
1182 * Verify an on-disk magic value against the magic value specified in the
1183 * verifier structure. The verifier magic is in disk byte order so the caller is
1184 * expected to pass the value directly from disk.
1185 */
1186bool
1187xfs_verify_magic(
1188 struct xfs_buf *bp,
9e26de8f 1189 __be32 dmagic)
c08793bd 1190{
7861ef77 1191 struct xfs_mount *mp = bp->b_mount;
c08793bd
BF
1192 int idx;
1193
1194 idx = xfs_sb_version_hascrc(&mp->m_sb);
1195 if (unlikely(WARN_ON(!bp->b_ops || !bp->b_ops->magic[idx])))
1196 return false;
1197 return dmagic == bp->b_ops->magic[idx];
9e26de8f
DW
1198}
1199
1200/*
1201 * Verify an on-disk magic value against the magic value specified in the
1202 * verifier structure. The verifier magic is in disk byte order so the caller is
1203 * expected to pass the value directly from disk.
1204 */
1205bool
1206xfs_verify_magic16(
1207 struct xfs_buf *bp,
1208 __be16 dmagic)
1209{
7861ef77 1210 struct xfs_mount *mp = bp->b_mount;
9e26de8f
DW
1211 int idx;
1212
1213 idx = xfs_sb_version_hascrc(&mp->m_sb);
1214 if (unlikely(WARN_ON(!bp->b_ops || !bp->b_ops->magic16[idx])))
1215 return false;
1216 return dmagic == bp->b_ops->magic16[idx];
c08793bd 1217}
2bd0ea18 1218
/*
 * Inode cache stubs.
 */

/* allocation zone for in-memory inodes; used by libxfs_iget/libxfs_irele */
kmem_zone_t *xfs_inode_zone;
/* inode log item zone; defined in another translation unit */
extern kmem_zone_t *xfs_ili_zone;
/*
 * Read an inode from disk into a freshly allocated in-memory inode.
 *
 * Userspace has no real inode cache, so every call allocates a new zeroed
 * inode, maps it to its on-disk location, reads the cluster buffer and
 * decodes the dinode. @lock_flags is accepted for kernel API compatibility
 * but is not used here.
 *
 * Returns 0 and sets *ipp on success; returns a negative errno and sets
 * *ipp to NULL on failure.
 */
int
libxfs_iget(
	struct xfs_mount	*mp,
	struct xfs_trans	*tp,
	xfs_ino_t		ino,
	uint			lock_flags,
	struct xfs_inode	**ipp)
{
	struct xfs_inode	*ip;
	struct xfs_dinode	*dip;
	struct xfs_buf		*bp;
	int			error = 0;

	ip = kmem_zone_zalloc(xfs_inode_zone, 0);
	if (!ip)
		return -ENOMEM;

	ip->i_ino = ino;
	ip->i_mount = mp;
	/* locate the inode's position within its cluster buffer */
	error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, 0);
	if (error)
		goto out_destroy;

	error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &bp, 0);
	if (error)
		goto out_destroy;

	error = xfs_inode_from_disk(ip, dip);
	if (!error)
		xfs_buf_set_ref(bp, XFS_INO_REF);
	/*
	 * The buffer is released whether the decode succeeded or not; @dip
	 * points into it, so it must not be used past this point.
	 */
	xfs_trans_brelse(tp, bp);

	if (error)
		goto out_destroy;

	*ipp = ip;
	return 0;

out_destroy:
	kmem_cache_free(xfs_inode_zone, ip);
	*ipp = NULL;
	return error;
}
1269
1270static void
014e5f6d
ES
1271libxfs_idestroy(xfs_inode_t *ip)
1272{
e37bf53c 1273 switch (VFS_I(ip)->i_mode & S_IFMT) {
014e5f6d
ES
1274 case S_IFREG:
1275 case S_IFDIR:
1276 case S_IFLNK:
a87a40a2 1277 libxfs_idestroy_fork(&ip->i_df);
014e5f6d
ES
1278 break;
1279 }
a87a40a2
CH
1280 if (ip->i_afp) {
1281 libxfs_idestroy_fork(ip->i_afp);
1282 kmem_cache_free(xfs_ifork_zone, ip->i_afp);
1283 }
1284 if (ip->i_cowfp) {
1285 libxfs_idestroy_fork(ip->i_cowfp);
1286 kmem_cache_free(xfs_ifork_zone, ip->i_cowfp);
1287 }
014e5f6d
ES
1288}
1289
/*
 * Release an inode. With no userspace inode cache this immediately tears
 * down the forks and frees the inode memory.
 */
void
libxfs_irele(
	struct xfs_inode	*ip)
{
	/* the inode log item must have been detached by now */
	ASSERT(ip->i_itemp == NULL);
	libxfs_idestroy(ip);
	kmem_cache_free(xfs_inode_zone, ip);
}
7db2e3c1 1298
c335b673
DW
1299/*
1300 * Flush everything dirty in the kernel and disk write caches to stable media.
1301 * Returns 0 for success or a negative error code.
1302 */
1303int
1304libxfs_blkdev_issue_flush(
1305 struct xfs_buftarg *btp)
1306{
1307 int fd, ret;
1308
1309 if (btp->dev == 0)
1310 return 0;
1311
1312 fd = libxfs_device_to_fd(btp->dev);
1313 ret = platform_flush_device(fd, btp->dev);
1314 return ret ? -errno : 0;
1315}
1316
7db2e3c1
DW
1317/*
1318 * Write out a buffer list synchronously.
1319 *
1320 * This will take the @buffer_list, write all buffers out and wait for I/O
1321 * completion on all of the buffers. @buffer_list is consumed by the function,
1322 * so callers must have some other way of tracking buffers if they require such
1323 * functionality.
1324 */
1325int
1326xfs_buf_delwri_submit(
1327 struct list_head *buffer_list)
1328{
1329 struct xfs_buf *bp, *n;
1330 int error = 0, error2;
1331
1332 list_for_each_entry_safe(bp, n, buffer_list, b_list) {
1333 list_del_init(&bp->b_list);
331d5956 1334 error2 = libxfs_bwrite(bp);
7db2e3c1
DW
1335 if (!error)
1336 error = error2;
e02ba985 1337 libxfs_buf_relse(bp);
7db2e3c1
DW
1338 }
1339
1340 return error;
1341}
c92c796e 1342
e6d5a6f8
DW
1343/*
1344 * Cancel a delayed write list.
1345 *
1346 * Remove each buffer from the list, clear the delwri queue flag and drop the
1347 * associated buffer reference.
1348 */
1349void
1350xfs_buf_delwri_cancel(
1351 struct list_head *list)
1352{
1353 struct xfs_buf *bp;
1354
1355 while (!list_empty(list)) {
1356 bp = list_first_entry(list, struct xfs_buf, b_list);
1357
1358 list_del_init(&bp->b_list);
1359 libxfs_buf_relse(bp);
1360 }
1361}
1362
/*
 * Format the log. The caller provides either a buftarg which is used to
 * access the log via buffers or a direct pointer to a buffer that
 * encapsulates the entire log.
 *
 * Exactly one of @btp and @dptr must be supplied. @length and @start are in
 * basic blocks; @sunit is in bytes. When @cycle is XLOG_INIT_CYCLE only a
 * single reset record is written; otherwise the whole log is stamped with
 * records of the given cycle. @max allows bumping the record size up to the
 * maximum log buffer size for speed. Returns 0 or a negative errno.
 */
int
libxfs_log_clear(
	struct xfs_buftarg	*btp,
	char			*dptr,
	xfs_daddr_t		start,
	uint			length,		/* basic blocks */
	uuid_t			*fs_uuid,
	int			version,
	int			sunit,		/* bytes */
	int			fmt,
	int			cycle,
	bool			max)
{
	struct xfs_buf		*bp = NULL;
	int			len;
	xfs_lsn_t		lsn;
	xfs_lsn_t		tail_lsn;
	xfs_daddr_t		blk;
	xfs_daddr_t		end_blk;
	char			*ptr;

	/* exactly one output mechanism, a usable device, and a uuid */
	if (((btp && dptr) || (!btp && !dptr)) ||
	    (btp && !btp->dev) || !fs_uuid)
		return -EINVAL;

	/* first zero the log */
	if (btp)
		libxfs_device_zero(btp, start, length);
	else
		memset(dptr, 0, BBTOB(length));

	/*
	 * Initialize the log record length and LSNs. XLOG_INIT_CYCLE is a
	 * special reset case where we only write a single record where the lsn
	 * and tail_lsn match. Otherwise, the record lsn starts at block 0 of
	 * the specified cycle and points tail_lsn at the last record of the
	 * previous cycle.
	 */
	len = ((version == 2) && sunit) ? BTOBB(sunit) : 2;
	len = max(len, 2);	/* never less than header + one data block */
	lsn = xlog_assign_lsn(cycle, 0);
	if (cycle == XLOG_INIT_CYCLE)
		tail_lsn = lsn;
	else
		tail_lsn = xlog_assign_lsn(cycle - 1, length - len);

	/* write out the first log record */
	ptr = dptr;
	if (btp) {
		bp = libxfs_getbufr_uncached(btp, start, len);
		ptr = bp->b_addr;
	}
	libxfs_log_header(ptr, fs_uuid, version, sunit, fmt, lsn, tail_lsn,
			  next, bp);
	if (bp) {
		libxfs_buf_mark_dirty(bp);
		libxfs_buf_relse(bp);
	}

	/*
	 * There's nothing else to do if this is a log reset. The kernel
	 * detects the rest of the log is zeroed and starts at cycle 1.
	 */
	if (cycle == XLOG_INIT_CYCLE)
		return 0;

	/*
	 * Bump the record size for a full log format if the caller allows it.
	 * This is primarily for performance reasons and most callers don't
	 * care about record size since the log is clean after we're done.
	 */
	if (max)
		len = BTOBB(BDSTRAT_SIZE);

	/*
	 * Otherwise, fill everything beyond the initial record with records of
	 * the previous cycle so the kernel head/tail detection works
	 * correctly.
	 *
	 * We don't particularly care about the record size or content here.
	 * It's only important that the headers are in place such that the
	 * kernel finds 1.) a clean log and 2.) the correct current cycle
	 * value. Therefore, bump up the record size to the max to use larger
	 * I/Os and improve performance.
	 */
	cycle--;
	blk = start + len;
	if (dptr)
		dptr += BBTOB(len);
	end_blk = start + length;

	/* clamp the final record so it never runs past the end of the log */
	len = min(end_blk - blk, len);
	while (blk < end_blk) {
		lsn = xlog_assign_lsn(cycle, blk - start);
		tail_lsn = xlog_assign_lsn(cycle, blk - start - len);

		ptr = dptr;
		if (btp) {
			bp = libxfs_getbufr_uncached(btp, blk, len);
			ptr = bp->b_addr;
		}
		/*
		 * Note: pass the full buffer length as the sunit to initialize
		 * the entire buffer.
		 */
		libxfs_log_header(ptr, fs_uuid, version, BBTOB(len), fmt, lsn,
				  tail_lsn, next, bp);
		if (bp) {
			libxfs_buf_mark_dirty(bp);
			libxfs_buf_relse(bp);
		}

		blk += len;
		if (dptr)
			dptr += BBTOB(len);
		len = min(end_blk - blk, len);
	}

	return 0;
}
1487
/*
 * Format one log record at *caddr: the record header, any required extended
 * headers, an unmount record, and cycle-stamped padding blocks.
 *
 * @nextfunc advances the output position by BBSIZE and returns the next
 * block to fill; @private is passed through to it (the xfs_buf for
 * buffer-based output, unused for direct memory output). NULLCOMMITLSN for
 * @lsn/@tail_lsn selects the initial-cycle defaults. Returns the number of
 * bytes formatted, i.e. BBTOB() of the record length in basic blocks.
 */
int
libxfs_log_header(
	char			*caddr,
	uuid_t			*fs_uuid,
	int			version,
	int			sunit,
	int			fmt,
	xfs_lsn_t		lsn,
	xfs_lsn_t		tail_lsn,
	libxfs_get_block_t	*nextfunc,
	void			*private)
{
	xlog_rec_header_t	*head = (xlog_rec_header_t *)caddr;
	char			*p = caddr;
	__be32			cycle_lsn;
	int			i, len;
	int			hdrs = 1;

	if (lsn == NULLCOMMITLSN)
		lsn = xlog_assign_lsn(XLOG_INIT_CYCLE, 0);
	if (tail_lsn == NULLCOMMITLSN)
		tail_lsn = lsn;

	/* total record length in basic blocks, including all headers */
	len = ((version == 2) && sunit) ? BTOBB(sunit) : 1;

	memset(p, 0, BBSIZE);
	head->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM);
	head->h_cycle = cpu_to_be32(CYCLE_LSN(lsn));
	head->h_version = cpu_to_be32(version);
	head->h_crc = cpu_to_le32(0);
	head->h_prev_block = cpu_to_be32(-1);
	head->h_num_logops = cpu_to_be32(1);
	head->h_fmt = cpu_to_be32(fmt);
	head->h_size = cpu_to_be32(max(sunit, XLOG_BIG_RECORD_BSIZE));

	head->h_lsn = cpu_to_be64(lsn);
	head->h_tail_lsn = cpu_to_be64(tail_lsn);

	memcpy(&head->h_fs_uuid, fs_uuid, sizeof(uuid_t));

	/*
	 * The kernel expects to see either a log record header magic value or
	 * the LSN cycle at the top of every log block. The first word of each
	 * non-header block is copied to the record headers and replaced with
	 * the cycle value (see xlog_[un]pack_data() and xlog_get_cycle() for
	 * details).
	 *
	 * Even though we only ever write an unmount record (one block), we
	 * support writing log records up to the max log buffer size of 256k to
	 * improve log format performance. This means a record can require up
	 * to 8 headers (1 rec. header + 7 ext. headers) for the packed cycle
	 * data (each header supports 32k of data).
	 */
	cycle_lsn = CYCLE_LSN_DISK(head->h_lsn);
	if (version == 2 && sunit > XLOG_HEADER_CYCLE_SIZE) {
		hdrs = sunit / XLOG_HEADER_CYCLE_SIZE;
		if (sunit % XLOG_HEADER_CYCLE_SIZE)
			hdrs++;
	}

	/*
	 * A fixed number of extended headers is expected based on h_size. If
	 * required, format those now so the unmount record is located
	 * correctly.
	 *
	 * Since we only write an unmount record, we only need one h_cycle_data
	 * entry for the unmount record block. The subsequent record data
	 * blocks are zeroed, which means we can stamp them directly with the
	 * cycle and zero the rest of the cycle data in the extended headers.
	 */
	if (hdrs > 1) {
		for (i = 1; i < hdrs; i++) {
			p = nextfunc(p, BBSIZE, private);
			memset(p, 0, BBSIZE);
			/* xlog_rec_ext_header.xh_cycle */
			*(__be32 *)p = cycle_lsn;
		}
	}

	/*
	 * The total length is the max of the stripe unit or 2 basic block
	 * minimum (1 hdr blk + 1 data blk). The record length is the total
	 * minus however many header blocks are required.
	 */
	head->h_len = cpu_to_be32(max(BBTOB(2), sunit) - hdrs * BBSIZE);

	/*
	 * Write out the unmount record, pack the first word into the record
	 * header and stamp the block with the cycle.
	 */
	p = nextfunc(p, BBSIZE, private);
	unmount_record(p);

	head->h_cycle_data[0] = *(__be32 *)p;
	*(__be32 *)p = cycle_lsn;

	/*
	 * Finally, zero all remaining blocks in the record and stamp each with
	 * the cycle. We don't need to pack any of these blocks because the
	 * cycle data in the headers has already been zeroed.
	 */
	len = max(len, hdrs + 1);
	for (i = hdrs + 1; i < len; i++) {
		p = nextfunc(p, BBSIZE, private);
		memset(p, 0, BBSIZE);
		*(__be32 *)p = cycle_lsn;
	}

	return BBTOB(len);
}
1598
/* Set the cache retention priority for a buffer in the global buffer cache. */
void
libxfs_buf_set_priority(
	struct xfs_buf	*bp,
	int		priority)
{
	cache_node_set_priority(libxfs_bcache, &bp->b_node, priority);
}
1606
/* Return the cache retention priority of a buffer. */
int
libxfs_buf_priority(
	struct xfs_buf	*bp)
{
	return cache_node_get_priority(&bp->b_node);
}
/*
 * Log a message about and stale a buffer that a caller has decided is corrupt.
 *
 * This function should be called for the kinds of metadata corruption that
 * cannot be detected from a verifier, such as incorrect inter-block
 * relationship data. Do /not/ call this function from a verifier function.
 *
 * The buffer must be XBF_DONE prior to the call. Afterwards, the buffer will
 * be marked stale, but b_error will not be set. The caller is responsible for
 * releasing the buffer or fixing it.
 */
void
__xfs_buf_mark_corrupt(
	struct xfs_buf	*bp,
	xfs_failaddr_t	fa)
{
	ASSERT(bp->b_flags & XBF_DONE);

	/* report the corruption (with @fa as the complaining code address) */
	xfs_buf_corruption_error(bp, fa);
	/* staling guarantees the bad contents are never written back */
	xfs_buf_stale(bp);
}