// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
 * All Rights Reserved.
 */

#include "libxfs_priv.h"
#include "init.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode_buf.h"
#include "xfs_inode_fork.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "libfrog/platform.h"

#include "libxfs.h"

static void libxfs_brelse(struct cache_node *node);

/*
 * Important design/architecture note:
 *
 * The userspace code that uses the buffer cache is much less constrained than
 * the kernel code. The userspace code is pretty nasty in places, especially
 * when it comes to buffer error handling. Very little of the userspace code
 * outside libxfs clears bp->b_error - very little code even checks it - so the
 * libxfs code is tripping on stale errors left by the userspace code.
 *
 * We can't clear errors or zero buffer contents in libxfs_buf_get-* like we do
 * in the kernel, because those functions are used by the libxfs_readbuf_*
 * functions and hence need to leave the buffers unchanged on cache hits. This
 * is actually the only way to gather a write error from a libxfs_writebuf()
 * call - you need to get the buffer again so you can check the bp->b_error
 * field - assuming that the buffer is still in the cache when you check, that
 * is.
 *
 * This is very different to the kernel code which does not release buffers on
 * a write so we can wait on IO and check errors. The kernel buffer cache also
 * guarantees a buffer of a known initial state from xfs_buf_get() even on a
 * cache hit.
 *
 * IOWs, userspace is behaving quite differently to the kernel and as a result
 * it leaks errors from reads, invalidations and writes through
 * libxfs_buf_get/libxfs_buf_read.
 *
 * The result of this is that until the userspace code outside libxfs is
 * cleaned up, functions that release buffers from userspace control (i.e.
 * libxfs_writebuf/libxfs_buf_relse) need to zero bp->b_error to prevent
 * propagation of stale errors into future buffer operations.
 */
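
/*
 * Illustrative sketch only (not compiled; function name hypothetical): under
 * the scheme described above, the sole way for a userspace caller to observe
 * a prior write error is to look the buffer up again and inspect b_error
 * before the buffer falls out of the cache. libxfs_buf_relse() then zeroes
 * b_error so the stale error cannot leak into the next user of the buffer.
 */
#if 0
static int
example_check_write_error(
	struct xfs_mount	*mp,
	xfs_daddr_t		daddr,
	int			bblen,
	const struct xfs_buf_ops *ops)
{
	struct xfs_buf		*bp;
	int			error;

	error = libxfs_buf_read(mp->m_ddev_targp, daddr, bblen, 0, &bp, ops);
	if (error)
		return error;
	error = bp->b_error;	/* stale error from an earlier write, if any */
	libxfs_buf_relse(bp);	/* clears bp->b_error on release */
	return error;
}
#endif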

#define BDSTRAT_SIZE (256 * 1024)

#define IO_BCOMPARE_CHECK

/* XXX: (dgc) Propagate errors, only exit if fail-on-error flag set */
int
libxfs_device_zero(struct xfs_buftarg *btp, xfs_daddr_t start, uint len)
{
	int		fd = btp->bt_bdev_fd;
	xfs_off_t	start_offset, end_offset, offset;
	ssize_t		zsize, bytes;
	size_t		len_bytes;
	char		*z;
	int		error;

	start_offset = LIBXFS_BBTOOFF64(start);

	/* try to use special zeroing methods, fall back to writes if needed */
	len_bytes = LIBXFS_BBTOOFF64(len);
	error = platform_zero_range(fd, start_offset, len_bytes);
	if (!error) {
		xfs_buftarg_trip_write(btp);
		return 0;
	}

	zsize = min(BDSTRAT_SIZE, BBTOB(len));
	if ((z = memalign(libxfs_device_alignment(), zsize)) == NULL) {
		fprintf(stderr,
			_("%s: %s can't memalign %d bytes: %s\n"),
			progname, __FUNCTION__, (int)zsize, strerror(errno));
		exit(1);
	}
	memset(z, 0, zsize);

	if ((lseek(fd, start_offset, SEEK_SET)) < 0) {
		fprintf(stderr, _("%s: %s seek to offset %llu failed: %s\n"),
			progname, __FUNCTION__,
			(unsigned long long)start_offset, strerror(errno));
		exit(1);
	}

	end_offset = LIBXFS_BBTOOFF64(start + len) - start_offset;
	for (offset = 0; offset < end_offset; ) {
		bytes = min((ssize_t)(end_offset - offset), zsize);
		if ((bytes = write(fd, z, bytes)) < 0) {
			fprintf(stderr, _("%s: %s write failed: %s\n"),
				progname, __FUNCTION__, strerror(errno));
			exit(1);
		} else if (bytes == 0) {
			fprintf(stderr, _("%s: %s not progressing?\n"),
				progname, __FUNCTION__);
			exit(1);
		}
		xfs_buftarg_trip_write(btp);
		offset += bytes;
	}
	free(z);
	return 0;
}

static void unmount_record(void *p)
{
	xlog_op_header_t *op = (xlog_op_header_t *)p;
	/* the data section must be 32 bit size aligned */
	struct {
		uint16_t magic;
		uint16_t pad1;
		uint32_t pad2; /* may as well make it 64 bits */
	} magic = { XLOG_UNMOUNT_TYPE, 0, 0 };

	memset(p, 0, BBSIZE);
	/* dummy tid to mark this as written from userspace */
	op->oh_tid = cpu_to_be32(0xb0c0d0d0);
	op->oh_len = cpu_to_be32(sizeof(magic));
	op->oh_clientid = XFS_LOG;
	op->oh_flags = XLOG_UNMOUNT_TRANS;
	op->oh_res2 = 0;

	/* and the data for this op */
	memcpy((char *)p + sizeof(xlog_op_header_t), &magic, sizeof(magic));
}

static char *next(
	char		*ptr,
	int		offset,
	void		*private)
{
	struct xfs_buf	*buf = (struct xfs_buf *)private;

	if (buf &&
	    (BBTOB(buf->b_length) < (int)(ptr - (char *)buf->b_addr) + offset))
		abort();

	return ptr + offset;
}

struct xfs_buf *
libxfs_getsb(
	struct xfs_mount	*mp)
{
	struct xfs_buf		*bp;

	libxfs_buf_read(mp->m_ddev_targp, XFS_SB_DADDR, XFS_FSS_TO_BB(mp, 1),
			0, &bp, &xfs_sb_buf_ops);
	return bp;
}

struct kmem_cache *xfs_buf_cache;

static struct cache_mru xfs_buf_freelist =
	{{&xfs_buf_freelist.cm_list, &xfs_buf_freelist.cm_list},
	 0, PTHREAD_MUTEX_INITIALIZER };

/*
 * The bufkey is used to pass the new buffer information to the cache object
 * allocation routine. Because discontiguous buffers need to pass different
 * information, we need fields to pass that information. However, because the
 * blkno and bblen are needed for the initial cache entry lookup (i.e. for
 * bcompare), we use the fact that the map/nmaps is non-null to switch to
 * discontiguous buffer initialisation instead of a contiguous buffer.
 */
struct xfs_bufkey {
	struct xfs_buftarg	*buftarg;
	xfs_daddr_t		blkno;
	unsigned int		bblen;
	struct xfs_buf_map	*map;
	int			nmaps;
};

/* 2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */
#define GOLDEN_RATIO_PRIME	0x9e37fffffffc0001UL
#define CACHE_LINE_SIZE		64
static unsigned int
libxfs_bhash(cache_key_t key, unsigned int hashsize, unsigned int hashshift)
{
	uint64_t	hashval = ((struct xfs_bufkey *)key)->blkno;
	uint64_t	tmp;

	tmp = hashval ^ (GOLDEN_RATIO_PRIME + hashval) / CACHE_LINE_SIZE;
	tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> hashshift);
	return tmp % hashsize;
}

static int
libxfs_bcompare(struct cache_node *node, cache_key_t key)
{
	struct xfs_buf		*bp = container_of(node, struct xfs_buf,
						   b_node);
	struct xfs_bufkey	*bkey = (struct xfs_bufkey *)key;

	if (bp->b_target->bt_bdev == bkey->buftarg->bt_bdev &&
	    bp->b_cache_key == bkey->blkno) {
		if (bp->b_length == bkey->bblen)
			return CACHE_HIT;
#ifdef IO_BCOMPARE_CHECK
		if (!(libxfs_bcache->c_flags & CACHE_MISCOMPARE_PURGE)) {
			fprintf(stderr,
	"%lx: Badness in key lookup (length)\n"
	"bp=(bno 0x%llx, len %u bytes) key=(bno 0x%llx, len %u bytes)\n",
				pthread_self(),
				(unsigned long long)xfs_buf_daddr(bp),
				BBTOB(bp->b_length),
				(unsigned long long)bkey->blkno,
				BBTOB(bkey->bblen));
		}
#endif
		return CACHE_PURGE;
	}
	return CACHE_MISS;
}

static void
__initbuf(struct xfs_buf *bp, struct xfs_buftarg *btp, xfs_daddr_t bno,
		unsigned int bytes)
{
	bp->b_flags = 0;
	bp->b_cache_key = bno;
	bp->b_length = BTOBB(bytes);
	bp->b_target = btp;
	bp->b_mount = btp->bt_mount;
	bp->b_error = 0;
	if (!bp->b_addr)
		bp->b_addr = memalign(libxfs_device_alignment(), bytes);
	if (!bp->b_addr) {
		fprintf(stderr,
			_("%s: %s can't memalign %u bytes: %s\n"),
			progname, __FUNCTION__, bytes,
			strerror(errno));
		exit(1);
	}
	memset(bp->b_addr, 0, bytes);
	pthread_mutex_init(&bp->b_lock, NULL);
	bp->b_holder = 0;
	bp->b_recur = 0;
	bp->b_ops = NULL;
	INIT_LIST_HEAD(&bp->b_li_list);

	if (!bp->b_maps)
		bp->b_maps = &bp->__b_map;

	if (bp->b_maps == &bp->__b_map) {
		bp->b_nmaps = 1;
		bp->b_maps[0].bm_bn = bno;
		bp->b_maps[0].bm_len = bp->b_length;
	}
}

static void
libxfs_initbuf(struct xfs_buf *bp, struct xfs_buftarg *btp, xfs_daddr_t bno,
		unsigned int bytes)
{
	__initbuf(bp, btp, bno, bytes);
}

static void
libxfs_initbuf_map(struct xfs_buf *bp, struct xfs_buftarg *btp,
		struct xfs_buf_map *map, int nmaps)
{
	unsigned int	bytes = 0;
	int		i;

	bytes = sizeof(struct xfs_buf_map) * nmaps;
	bp->b_maps = malloc(bytes);
	if (!bp->b_maps) {
		fprintf(stderr,
			_("%s: %s can't malloc %u bytes: %s\n"),
			progname, __FUNCTION__, bytes,
			strerror(errno));
		exit(1);
	}
	bp->b_nmaps = nmaps;

	bytes = 0;
	for (i = 0; i < nmaps; i++) {
		bp->b_maps[i].bm_bn = map[i].bm_bn;
		bp->b_maps[i].bm_len = map[i].bm_len;
		bytes += BBTOB(map[i].bm_len);
	}

	__initbuf(bp, btp, map[0].bm_bn, bytes);
	bp->b_flags |= LIBXFS_B_DISCONTIG;
}

static struct xfs_buf *
__libxfs_getbufr(int blen)
{
	struct xfs_buf	*bp;

	/*
	 * first look for a buffer that can be used as-is; if one cannot be
	 * found, see if there is a buffer on the freelist at all, and if so,
	 * free its data buffer and set b_addr to NULL before calling
	 * libxfs_initbuf.
	 */
	pthread_mutex_lock(&xfs_buf_freelist.cm_mutex);
	if (!list_empty(&xfs_buf_freelist.cm_list)) {
		list_for_each_entry(bp, &xfs_buf_freelist.cm_list, b_node.cn_mru) {
			if (bp->b_length == BTOBB(blen)) {
				list_del_init(&bp->b_node.cn_mru);
				break;
			}
		}
		if (&bp->b_node.cn_mru == &xfs_buf_freelist.cm_list) {
			bp = list_entry(xfs_buf_freelist.cm_list.next,
					struct xfs_buf, b_node.cn_mru);
			list_del_init(&bp->b_node.cn_mru);
			free(bp->b_addr);
			bp->b_addr = NULL;
			if (bp->b_maps != &bp->__b_map)
				free(bp->b_maps);
			bp->b_maps = NULL;
		}
	} else
		bp = kmem_cache_zalloc(xfs_buf_cache, 0);
	pthread_mutex_unlock(&xfs_buf_freelist.cm_mutex);
	bp->b_ops = NULL;
	if (bp->b_flags & LIBXFS_B_DIRTY)
		fprintf(stderr, "found dirty buffer (bulk) on free list!\n");

	return bp;
}

static struct xfs_buf *
libxfs_getbufr(struct xfs_buftarg *btp, xfs_daddr_t blkno, int bblen)
{
	struct xfs_buf	*bp;
	int		blen = BBTOB(bblen);

	bp = __libxfs_getbufr(blen);
	if (bp)
		libxfs_initbuf(bp, btp, blkno, blen);
	return bp;
}

static struct xfs_buf *
libxfs_getbufr_map(struct xfs_buftarg *btp, xfs_daddr_t blkno, int bblen,
		struct xfs_buf_map *map, int nmaps)
{
	struct xfs_buf	*bp;
	int		blen = BBTOB(bblen);

	if (!map || !nmaps) {
		fprintf(stderr,
			_("%s: %s invalid map %p or nmaps %d\n"),
			progname, __FUNCTION__, map, nmaps);
		exit(1);
	}

	if (blkno != map[0].bm_bn) {
		fprintf(stderr,
			_("%s: %s map blkno 0x%llx doesn't match key 0x%llx\n"),
			progname, __FUNCTION__, (long long)map[0].bm_bn,
			(long long)blkno);
		exit(1);
	}

	bp = __libxfs_getbufr(blen);
	if (bp)
		libxfs_initbuf_map(bp, btp, map, nmaps);
	return bp;
}

void
xfs_buf_lock(
	struct xfs_buf	*bp)
{
	if (use_xfs_buf_lock)
		pthread_mutex_lock(&bp->b_lock);
}

void
xfs_buf_unlock(
	struct xfs_buf	*bp)
{
	if (use_xfs_buf_lock)
		pthread_mutex_unlock(&bp->b_lock);
}

static int
__cache_lookup(
	struct xfs_bufkey	*key,
	unsigned int		flags,
	struct xfs_buf		**bpp)
{
	struct cache_node	*cn = NULL;
	struct xfs_buf		*bp;

	*bpp = NULL;

	cache_node_get(libxfs_bcache, key, &cn);
	if (!cn)
		return -ENOMEM;
	bp = container_of(cn, struct xfs_buf, b_node);

	if (use_xfs_buf_lock) {
		int		ret;

		ret = pthread_mutex_trylock(&bp->b_lock);
		if (ret) {
			ASSERT(ret == EAGAIN);
			if (flags & LIBXFS_GETBUF_TRYLOCK) {
				cache_node_put(libxfs_bcache, cn);
				return -EAGAIN;
			}

			if (pthread_equal(bp->b_holder, pthread_self())) {
				fprintf(stderr,
	_("Warning: recursive buffer locking at block %" PRIu64 " detected\n"),
					key->blkno);
				bp->b_recur++;
				*bpp = bp;
				return 0;
			} else {
				pthread_mutex_lock(&bp->b_lock);
			}
		}

		bp->b_holder = pthread_self();
	}

	cache_node_set_priority(libxfs_bcache, cn,
			cache_node_get_priority(cn) - CACHE_PREFETCH_PRIORITY);
	*bpp = bp;
	return 0;
}

static int
libxfs_getbuf_flags(
	struct xfs_buftarg	*btp,
	xfs_daddr_t		blkno,
	int			len,
	unsigned int		flags,
	struct xfs_buf		**bpp)
{
	struct xfs_bufkey	key = {NULL};
	int			ret;

	key.buftarg = btp;
	key.blkno = blkno;
	key.bblen = len;

	ret = __cache_lookup(&key, flags, bpp);
	if (ret)
		return ret;

	if (btp == btp->bt_mount->m_ddev_targp) {
		(*bpp)->b_pag = xfs_perag_get(btp->bt_mount,
				xfs_daddr_to_agno(btp->bt_mount, blkno));
	}

	return 0;
}

/*
 * Clean the buffer flags for libxfs_getbuf*(), which wants to return
 * an unused buffer with clean state. This prevents CRC errors on a
 * re-read of a corrupt block that was prefetched and freed. This
 * can happen with a massively corrupt directory that is discarded,
 * but whose blocks are then recycled into expanding lost+found.
 *
 * Note however that if the buffer's dirty (prefetch calls getbuf)
 * we'll leave the state alone because we don't want to discard blocks
 * that have been fixed.
 */
static void
reset_buf_state(
	struct xfs_buf	*bp)
{
	if (bp && !(bp->b_flags & LIBXFS_B_DIRTY))
		bp->b_flags &= ~(LIBXFS_B_UNCHECKED | LIBXFS_B_STALE |
				LIBXFS_B_UPTODATE);
}

static int
__libxfs_buf_get_map(
	struct xfs_buftarg	*btp,
	struct xfs_buf_map	*map,
	int			nmaps,
	int			flags,
	struct xfs_buf		**bpp)
{
	struct xfs_bufkey	key = {NULL};
	int			i;

	if (nmaps == 1)
		return libxfs_getbuf_flags(btp, map[0].bm_bn, map[0].bm_len,
				flags, bpp);

	key.buftarg = btp;
	key.blkno = map[0].bm_bn;
	for (i = 0; i < nmaps; i++) {
		key.bblen += map[i].bm_len;
	}
	key.map = map;
	key.nmaps = nmaps;

	return __cache_lookup(&key, flags, bpp);
}

int
libxfs_buf_get_map(
	struct xfs_buftarg	*btp,
	struct xfs_buf_map	*map,
	int			nmaps,
	int			flags,
	struct xfs_buf		**bpp)
{
	int			error;

	error = __libxfs_buf_get_map(btp, map, nmaps, flags, bpp);
	if (error)
		return error;

	reset_buf_state(*bpp);
	return 0;
}

void
libxfs_buf_relse(
	struct xfs_buf	*bp)
{
	/*
	 * ensure that any errors on this use of the buffer don't carry
	 * over to the next user.
	 */
	bp->b_error = 0;
	if (use_xfs_buf_lock) {
		if (bp->b_recur) {
			bp->b_recur--;
		} else {
			bp->b_holder = 0;
			pthread_mutex_unlock(&bp->b_lock);
		}
	}

	if (!list_empty(&bp->b_node.cn_hash))
		cache_node_put(libxfs_bcache, &bp->b_node);
	else if (--bp->b_node.cn_count == 0) {
		if (bp->b_flags & LIBXFS_B_DIRTY)
			libxfs_bwrite(bp);
		libxfs_brelse(&bp->b_node);
	}
}

static struct cache_node *
libxfs_balloc(
	cache_key_t		key)
{
	struct xfs_bufkey	*bufkey = (struct xfs_bufkey *)key;
	struct xfs_buf		*bp;

	if (bufkey->map)
		bp = libxfs_getbufr_map(bufkey->buftarg, bufkey->blkno,
				bufkey->bblen, bufkey->map, bufkey->nmaps);
	else
		bp = libxfs_getbufr(bufkey->buftarg, bufkey->blkno,
				bufkey->bblen);
	return &bp->b_node;
}


static int
__read_buf(int fd, void *buf, int len, off64_t offset, int flags)
{
	int	sts;

	sts = pread(fd, buf, len, offset);
	if (sts < 0) {
		int error = errno;
		fprintf(stderr, _("%s: read failed: %s\n"),
			progname, strerror(error));
		return -error;
	} else if (sts != len) {
		fprintf(stderr, _("%s: error - read only %d of %d bytes\n"),
			progname, sts, len);
		return -EIO;
	}
	return 0;
}

int
libxfs_readbufr(struct xfs_buftarg *btp, xfs_daddr_t blkno, struct xfs_buf *bp,
		int len, int flags)
{
	int	fd = btp->bt_bdev_fd;
	int	bytes = BBTOB(len);
	int	error;

	ASSERT(len <= bp->b_length);

	error = __read_buf(fd, bp->b_addr, bytes, LIBXFS_BBTOOFF64(blkno), flags);
	if (!error &&
	    bp->b_target->bt_bdev == btp->bt_bdev &&
	    bp->b_cache_key == blkno &&
	    bp->b_length == len)
		bp->b_flags |= LIBXFS_B_UPTODATE;
	bp->b_error = error;
	return error;
}

int
libxfs_readbuf_verify(
	struct xfs_buf		*bp,
	const struct xfs_buf_ops *ops)
{
	if (!ops)
		return bp->b_error;

	bp->b_ops = ops;
	bp->b_ops->verify_read(bp);
	bp->b_flags &= ~LIBXFS_B_UNCHECKED;
	return bp->b_error;
}

int
libxfs_readbufr_map(struct xfs_buftarg *btp, struct xfs_buf *bp, int flags)
{
	int	fd = btp->bt_bdev_fd;
	int	error = 0;
	void	*buf;
	int	i;

	buf = bp->b_addr;
	for (i = 0; i < bp->b_nmaps; i++) {
		off64_t	offset = LIBXFS_BBTOOFF64(bp->b_maps[i].bm_bn);
		int	len = BBTOB(bp->b_maps[i].bm_len);

		error = __read_buf(fd, buf, len, offset, flags);
		if (error) {
			bp->b_error = error;
			break;
		}
		buf += len;
	}

	if (!error)
		bp->b_flags |= LIBXFS_B_UPTODATE;
	return error;
}

int
libxfs_buf_read_map(
	struct xfs_buftarg	*btp,
	struct xfs_buf_map	*map,
	int			nmaps,
	int			flags,
	struct xfs_buf		**bpp,
	const struct xfs_buf_ops *ops)
{
	struct xfs_buf		*bp;
	bool			salvage = flags & LIBXFS_READBUF_SALVAGE;
	int			error = 0;

	*bpp = NULL;
	if (nmaps == 1)
		error = libxfs_getbuf_flags(btp, map[0].bm_bn, map[0].bm_len,
				0, &bp);
	else
		error = __libxfs_buf_get_map(btp, map, nmaps, 0, &bp);
	if (error)
		return error;

	/*
	 * If the buffer was prefetched, it is likely that it was not validated.
	 * Hence if we are supplied an ops function and the buffer is marked as
	 * unchecked, we need to validate it now.
	 *
	 * We do this verification even if the buffer is dirty - the
	 * verification is almost certainly going to fail the CRC check in this
	 * case as a dirty buffer has not had the CRC recalculated. However, we
	 * should not be dirtying unchecked buffers and therefore failing it
	 * here because it's dirty and unchecked indicates we've screwed up
	 * somewhere else.
	 *
	 * Note that if the caller passes in LIBXFS_READBUF_SALVAGE, that means
	 * they want the buffer even if it fails verification.
	 */
	bp->b_error = 0;
	if (bp->b_flags & (LIBXFS_B_UPTODATE | LIBXFS_B_DIRTY)) {
		if (bp->b_flags & LIBXFS_B_UNCHECKED)
			error = libxfs_readbuf_verify(bp, ops);
		if (error && !salvage)
			goto err;
		goto ok;
	}

	/*
	 * Set the ops on a cache miss (i.e. first physical read) as the
	 * verifier may change the ops to match the type of buffer it contains.
	 * A cache hit might reset the verifier to the original type if we set
	 * it again, but it won't get called again and set to match the buffer
	 * contents. *cough* xfs_da_node_buf_ops *cough*.
	 */
	if (nmaps == 1)
		error = libxfs_readbufr(btp, map[0].bm_bn, bp, map[0].bm_len,
				flags);
	else
		error = libxfs_readbufr_map(btp, bp, flags);
	if (error)
		goto err;

	error = libxfs_readbuf_verify(bp, ops);
	if (error && !salvage)
		goto err;

ok:
	*bpp = bp;
	return 0;
err:
	libxfs_buf_relse(bp);
	return error;
}

/* Allocate a raw uncached buffer. */
static inline struct xfs_buf *
libxfs_getbufr_uncached(
	struct xfs_buftarg	*targ,
	xfs_daddr_t		daddr,
	size_t			bblen)
{
	struct xfs_buf		*bp;

	bp = libxfs_getbufr(targ, daddr, bblen);
	if (!bp)
		return NULL;

	INIT_LIST_HEAD(&bp->b_node.cn_hash);
	bp->b_node.cn_count = 1;
	return bp;
}

/*
 * Allocate an uncached buffer that points nowhere. The refcount will be 1,
 * and the cache node hash list will be empty to indicate that it's uncached.
 */
int
libxfs_buf_get_uncached(
	struct xfs_buftarg	*targ,
	size_t			bblen,
	int			flags,
	struct xfs_buf		**bpp)
{
	*bpp = libxfs_getbufr_uncached(targ, XFS_BUF_DADDR_NULL, bblen);
	return *bpp != NULL ? 0 : -ENOMEM;
}

/*
 * Allocate and read an uncached buffer. The refcount will be 1, and the cache
 * node hash list will be empty to indicate that it's uncached.
 */
int
libxfs_buf_read_uncached(
	struct xfs_buftarg	*targ,
	xfs_daddr_t		daddr,
	size_t			bblen,
	int			flags,
	struct xfs_buf		**bpp,
	const struct xfs_buf_ops *ops)
{
	struct xfs_buf		*bp;
	int			error;

	*bpp = NULL;
	bp = libxfs_getbufr_uncached(targ, daddr, bblen);
	if (!bp)
		return -ENOMEM;

	error = libxfs_readbufr(targ, daddr, bp, bblen, flags);
	if (error)
		goto err;

	error = libxfs_readbuf_verify(bp, ops);
	if (error)
		goto err;

	*bpp = bp;
	return 0;
err:
	libxfs_buf_relse(bp);
	return error;
}
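
/*
 * Illustrative sketch only (not compiled; function name hypothetical): a
 * typical uncached read cycle. Uncached buffers are freed on final release
 * rather than being returned to the cache, so one get/read plus one relse
 * is the whole lifetime. A NULL ops pointer skips verification.
 */
#if 0
static int
example_read_one_block_uncached(
	struct xfs_buftarg	*targ,
	xfs_daddr_t		daddr)
{
	struct xfs_buf		*bp;
	int			error;

	error = libxfs_buf_read_uncached(targ, daddr, 1, 0, &bp, NULL);
	if (error)
		return error;
	/* ... inspect bp->b_addr here ... */
	libxfs_buf_relse(bp);		/* refcount hits zero, buffer freed */
	return 0;
}
#endif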

static int
__write_buf(int fd, void *buf, int len, off64_t offset, int flags)
{
	int	sts;

	sts = pwrite(fd, buf, len, offset);
	if (sts < 0) {
		int error = errno;
		fprintf(stderr, _("%s: pwrite failed: %s\n"),
			progname, strerror(error));
		return -error;
	} else if (sts != len) {
		fprintf(stderr, _("%s: error - pwrite only %d of %d bytes\n"),
			progname, sts, len);
		return -EIO;
	}
	return 0;
}

int
libxfs_bwrite(
	struct xfs_buf	*bp)
{
	int		fd = bp->b_target->bt_bdev_fd;

	/*
	 * we never write buffers that are marked stale. This indicates they
	 * contain data that has been invalidated, and even if the buffer is
	 * dirty it must *never* be written. Verifiers are wonderful for finding
	 * bugs like this. Make sure the error is obvious as to the cause.
	 */
	if (bp->b_flags & LIBXFS_B_STALE) {
		bp->b_error = -ESTALE;
		return bp->b_error;
	}

	/* Trigger the writeback hook if there is one. */
	if (bp->b_mount->m_buf_writeback_fn)
		bp->b_mount->m_buf_writeback_fn(bp);

	/*
	 * clear any pre-existing error status on the buffer. This can occur if
	 * the buffer is corrupt on disk and the repair process doesn't clear
	 * the error before fixing and writing it back.
	 */
	bp->b_error = 0;
	if (bp->b_ops) {
		bp->b_ops->verify_write(bp);
		if (bp->b_error) {
			fprintf(stderr,
	_("%s: write verifier failed on %s bno 0x%llx/0x%x\n"),
				__func__, bp->b_ops->name,
				(unsigned long long)xfs_buf_daddr(bp),
				bp->b_length);
			return bp->b_error;
		}
	}

	if (!(bp->b_flags & LIBXFS_B_DISCONTIG)) {
		bp->b_error = __write_buf(fd, bp->b_addr, BBTOB(bp->b_length),
				LIBXFS_BBTOOFF64(xfs_buf_daddr(bp)),
				bp->b_flags);
	} else {
		int	i;
		void	*buf = bp->b_addr;

		for (i = 0; i < bp->b_nmaps; i++) {
			off64_t	offset = LIBXFS_BBTOOFF64(bp->b_maps[i].bm_bn);
			int	len = BBTOB(bp->b_maps[i].bm_len);

			bp->b_error = __write_buf(fd, buf, len, offset,
					bp->b_flags);
			if (bp->b_error)
				break;
			buf += len;
		}
	}

	if (bp->b_error) {
		fprintf(stderr,
	_("%s: write failed on %s bno 0x%llx/0x%x, err=%d\n"),
			__func__, bp->b_ops ? bp->b_ops->name : "(unknown)",
			(unsigned long long)xfs_buf_daddr(bp),
			bp->b_length, -bp->b_error);
	} else {
		bp->b_flags |= LIBXFS_B_UPTODATE;
		bp->b_flags &= ~(LIBXFS_B_DIRTY | LIBXFS_B_UNCHECKED);
		xfs_buftarg_trip_write(bp->b_target);
	}
	return bp->b_error;
}

/*
 * Mark a buffer dirty. The dirty data will be written out when the cache
 * is flushed (or at release time if the buffer is uncached).
 */
void
libxfs_buf_mark_dirty(
	struct xfs_buf	*bp)
{
	/*
	 * Clear any error hanging over from reading the buffer. This prevents
	 * subsequent reads after this write from seeing stale errors.
	 */
	bp->b_error = 0;
	bp->b_flags &= ~LIBXFS_B_STALE;
	bp->b_flags |= LIBXFS_B_DIRTY;
}

/* Prepare a buffer to be sent to the MRU list. */
static inline void
libxfs_buf_prepare_mru(
	struct xfs_buf	*bp)
{
	if (bp->b_pag)
		xfs_perag_put(bp->b_pag);
	bp->b_pag = NULL;

	if (!(bp->b_flags & LIBXFS_B_DIRTY))
		return;

	/* Complain about (and remember) dropping dirty buffers. */
	fprintf(stderr, _("%s: Releasing dirty buffer to free list!\n"),
			progname);

	if (bp->b_error == -EFSCORRUPTED)
		bp->b_target->flags |= XFS_BUFTARG_CORRUPT_WRITE;
	bp->b_target->flags |= XFS_BUFTARG_LOST_WRITE;
}

static void
libxfs_brelse(
	struct cache_node	*node)
{
	struct xfs_buf		*bp = container_of(node, struct xfs_buf,
						   b_node);

	if (!bp)
		return;
	libxfs_buf_prepare_mru(bp);

	pthread_mutex_lock(&xfs_buf_freelist.cm_mutex);
	list_add(&bp->b_node.cn_mru, &xfs_buf_freelist.cm_list);
	pthread_mutex_unlock(&xfs_buf_freelist.cm_mutex);
}

static unsigned int
libxfs_bulkrelse(
	struct cache		*cache,
	struct list_head	*list)
{
	struct xfs_buf		*bp;
	int			count = 0;

	if (list_empty(list))
		return 0;

	list_for_each_entry(bp, list, b_node.cn_mru) {
		libxfs_buf_prepare_mru(bp);
		count++;
	}

	pthread_mutex_lock(&xfs_buf_freelist.cm_mutex);
	list_splice(list, &xfs_buf_freelist.cm_list);
	pthread_mutex_unlock(&xfs_buf_freelist.cm_mutex);

	return count;
}

/*
 * Free everything from the xfs_buf_freelist MRU, used at final teardown
 */
void
libxfs_bcache_free(void)
{
	struct list_head	*cm_list;
	struct xfs_buf		*bp, *next;

	cm_list = &xfs_buf_freelist.cm_list;
	list_for_each_entry_safe(bp, next, cm_list, b_node.cn_mru) {
		free(bp->b_addr);
		if (bp->b_maps != &bp->__b_map)
			free(bp->b_maps);
		kmem_cache_free(xfs_buf_cache, bp);
	}
}

/*
 * When a buffer is marked dirty, the error is cleared. Hence if we are trying
 * to flush a buffer prior to cache reclaim that has an error on it, it means
 * we've already tried to flush it and it failed. Prevent repeated corruption
 * errors from being reported by skipping such buffers - when the corruption is
 * fixed the buffer will be marked dirty again and we can write it again.
 */
static int
libxfs_bflush(
	struct cache_node	*node)
{
	struct xfs_buf		*bp = container_of(node, struct xfs_buf,
						   b_node);

	if (!bp->b_error && bp->b_flags & LIBXFS_B_DIRTY)
		return libxfs_bwrite(bp);
	return bp->b_error;
}

void
libxfs_bcache_purge(void)
{
	cache_purge(libxfs_bcache);
}

void
libxfs_bcache_flush(void)
{
	cache_flush(libxfs_bcache);
}

int
libxfs_bcache_overflowed(void)
{
	return cache_overflowed(libxfs_bcache);
}

struct cache_operations libxfs_bcache_operations = {
	.hash		= libxfs_bhash,
	.alloc		= libxfs_balloc,
	.flush		= libxfs_bflush,
	.relse		= libxfs_brelse,
	.compare	= libxfs_bcompare,
	.bulkrelse	= libxfs_bulkrelse
};
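
/*
 * These callbacks are handed to the generic cache at library init time (see
 * init.c); roughly (illustrative, argument names approximate):
 *
 *	libxfs_bcache = cache_init(flags, libxfs_bhash_size,
 *				   &libxfs_bcache_operations);
 */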

/*
 * Verify an on-disk magic value against the magic value specified in the
 * verifier structure. The verifier magic is in disk byte order so the caller is
 * expected to pass the value directly from disk.
 */
bool
xfs_verify_magic(
	struct xfs_buf		*bp,
	__be32			dmagic)
{
	struct xfs_mount	*mp = bp->b_mount;
	int			idx;

	idx = xfs_has_crc(mp);
	if (unlikely(WARN_ON(!bp->b_ops || !bp->b_ops->magic[idx])))
		return false;
	return dmagic == bp->b_ops->magic[idx];
}

/*
 * Verify an on-disk magic value against the magic value specified in the
 * verifier structure. The verifier magic is in disk byte order so the caller is
 * expected to pass the value directly from disk.
 */
bool
xfs_verify_magic16(
	struct xfs_buf		*bp,
	__be16			dmagic)
{
	struct xfs_mount	*mp = bp->b_mount;
	int			idx;

	idx = xfs_has_crc(mp);
	if (unlikely(WARN_ON(!bp->b_ops || !bp->b_ops->magic16[idx])))
		return false;
	return dmagic == bp->b_ops->magic16[idx];
}
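
/*
 * Illustrative sketch only (not compiled; a hypothetical read verifier, not
 * one of the real ops tables): a verifier would typically feed the raw
 * on-disk magic straight to xfs_verify_magic() and flag the buffer corrupt
 * on a mismatch.
 */
#if 0
static void
example_verify_read(
	struct xfs_buf	*bp)
{
	struct xfs_dsb	*dsb = bp->b_addr;

	if (!xfs_verify_magic(bp, dsb->sb_magicnum))
		xfs_verifier_error(bp, -EFSCORRUPTED, __this_address);
}
#endif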

/*
 * Inode cache stubs.
 */

struct kmem_cache		*xfs_inode_cache;
extern struct kmem_cache	*xfs_ili_cache;

int
libxfs_iget(
	struct xfs_mount	*mp,
	struct xfs_trans	*tp,
	xfs_ino_t		ino,
	uint			lock_flags,
	struct xfs_inode	**ipp)
{
	struct xfs_inode	*ip;
	struct xfs_buf		*bp;
	struct xfs_perag	*pag;
	int			error = 0;

	/* reject inode numbers outside existing AGs */
	if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
		return -EINVAL;

	ip = kmem_cache_zalloc(xfs_inode_cache, 0);
	if (!ip)
		return -ENOMEM;

	VFS_I(ip)->i_count = 1;
	ip->i_ino = ino;
	ip->i_mount = mp;
	ip->i_af.if_format = XFS_DINODE_FMT_EXTENTS;
	spin_lock_init(&VFS_I(ip)->i_lock);

	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
	error = xfs_imap(pag, tp, ip->i_ino, &ip->i_imap, 0);
	xfs_perag_put(pag);

	if (error)
		goto out_destroy;

	error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp);
	if (error)
		goto out_destroy;

	error = xfs_inode_from_disk(ip,
			xfs_buf_offset(bp, ip->i_imap.im_boffset));
	if (!error)
		xfs_buf_set_ref(bp, XFS_INO_REF);
	xfs_trans_brelse(tp, bp);

	if (error)
		goto out_destroy;

	*ipp = ip;
	return 0;

out_destroy:
	kmem_cache_free(xfs_inode_cache, ip);
	*ipp = NULL;
	return error;
}

static void
libxfs_idestroy(xfs_inode_t *ip)
{
	switch (VFS_I(ip)->i_mode & S_IFMT) {
	case S_IFREG:
	case S_IFDIR:
	case S_IFLNK:
		libxfs_idestroy_fork(&ip->i_df);
		break;
	}

	libxfs_ifork_zap_attr(ip);

	if (ip->i_cowfp) {
		libxfs_idestroy_fork(ip->i_cowfp);
		kmem_cache_free(xfs_ifork_cache, ip->i_cowfp);
	}
}

void
libxfs_irele(
	struct xfs_inode	*ip)
{
	VFS_I(ip)->i_count--;

	if (VFS_I(ip)->i_count == 0) {
		ASSERT(ip->i_itemp == NULL);
		libxfs_idestroy(ip);
		kmem_cache_free(xfs_inode_cache, ip);
	}
}

/*
 * Flush everything dirty in the kernel and disk write caches to stable media.
 * Returns 0 for success or a negative error code.
 */
int
libxfs_blkdev_issue_flush(
	struct xfs_buftarg	*btp)
{
	int			ret;

	if (btp->bt_bdev == 0)
		return 0;

	ret = platform_flush_device(btp->bt_bdev_fd, btp->bt_bdev);
	return ret ? -errno : 0;
}

/*
 * Write out a buffer list synchronously.
 *
 * This will take the @buffer_list, write all buffers out and wait for I/O
 * completion on all of the buffers. @buffer_list is consumed by the function,
 * so callers must have some other way of tracking buffers if they require such
 * functionality.
 */
int
xfs_buf_delwri_submit(
	struct list_head	*buffer_list)
{
	struct xfs_buf		*bp, *n;
	int			error = 0, error2;

	list_for_each_entry_safe(bp, n, buffer_list, b_list) {
		list_del_init(&bp->b_list);
		error2 = libxfs_bwrite(bp);
		if (!error)
			error = error2;
		libxfs_buf_relse(bp);
	}

	return error;
}
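
/*
 * Illustrative sketch only (not compiled; function name hypothetical): queue
 * buffers on a local list via their b_list member, then write them all out
 * in one call. The list is consumed, and each buffer is released, by
 * xfs_buf_delwri_submit().
 */
#if 0
static int
example_delwri(struct xfs_buf *bp1, struct xfs_buf *bp2)
{
	struct list_head	buffer_list;

	INIT_LIST_HEAD(&buffer_list);
	list_add_tail(&bp1->b_list, &buffer_list);
	list_add_tail(&bp2->b_list, &buffer_list);

	/* writes out and releases both buffers, returning the first error */
	return xfs_buf_delwri_submit(&buffer_list);
}
#endif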

/*
 * Cancel a delayed write list.
 *
 * Remove each buffer from the list, clear the delwri queue flag and drop the
 * associated buffer reference.
 */
void
xfs_buf_delwri_cancel(
	struct list_head	*list)
{
	struct xfs_buf		*bp;

	while (!list_empty(list)) {
		bp = list_first_entry(list, struct xfs_buf, b_list);

		list_del_init(&bp->b_list);
		libxfs_buf_relse(bp);
	}
}

/*
 * Format the log. The caller provides either a buftarg which is used to access
 * the log via buffers or a direct pointer to a buffer that encapsulates the
 * entire log.
 */
int
libxfs_log_clear(
	struct xfs_buftarg	*btp,
	char			*dptr,
	xfs_daddr_t		start,
	uint			length,		/* basic blocks */
	uuid_t			*fs_uuid,
	int			version,
	int			sunit,		/* bytes */
	int			fmt,
	int			cycle,
	bool			max)
{
	struct xfs_buf		*bp = NULL;
	int			len;
	xfs_lsn_t		lsn;
	xfs_lsn_t		tail_lsn;
	xfs_daddr_t		blk;
	xfs_daddr_t		end_blk;
	char			*ptr;

	if (((btp && dptr) || (!btp && !dptr)) ||
	    (btp && !btp->bt_bdev) || !fs_uuid)
		return -EINVAL;

	/* first zero the log */
	if (btp)
		libxfs_device_zero(btp, start, length);
	else
		memset(dptr, 0, BBTOB(length));

	/*
	 * Initialize the log record length and LSNs. XLOG_INIT_CYCLE is a
	 * special reset case where we only write a single record where the lsn
	 * and tail_lsn match. Otherwise, the record lsn starts at block 0 of
	 * the specified cycle and points tail_lsn at the last record of the
	 * previous cycle.
	 */
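	/*
	 * Worked example (illustrative numbers only): with cycle == 3,
	 * length == 1024 BBs and len == 2 BBs, this computes
	 * lsn = xlog_assign_lsn(3, 0) = (3ULL << 32) | 0 and
	 * tail_lsn = xlog_assign_lsn(2, 1022), i.e. the tail points at the
	 * last record of the previous cycle.
	 */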
	len = ((version == 2) && sunit) ? BTOBB(sunit) : 2;
	len = max(len, 2);
	lsn = xlog_assign_lsn(cycle, 0);
	if (cycle == XLOG_INIT_CYCLE)
		tail_lsn = lsn;
	else
		tail_lsn = xlog_assign_lsn(cycle - 1, length - len);

	/* write out the first log record */
	ptr = dptr;
	if (btp) {
		bp = libxfs_getbufr_uncached(btp, start, len);
		ptr = bp->b_addr;
	}
	libxfs_log_header(ptr, fs_uuid, version, sunit, fmt, lsn, tail_lsn,
			  next, bp);
	if (bp) {
		libxfs_buf_mark_dirty(bp);
		libxfs_buf_relse(bp);
	}

	/*
	 * There's nothing else to do if this is a log reset. The kernel detects
	 * the rest of the log is zeroed and starts at cycle 1.
	 */
	if (cycle == XLOG_INIT_CYCLE)
		return 0;

	/*
	 * Bump the record size for a full log format if the caller allows it.
	 * This is primarily for performance reasons and most callers don't care
	 * about record size since the log is clean after we're done.
	 */
	if (max)
		len = BTOBB(BDSTRAT_SIZE);

	/*
	 * Otherwise, fill everything beyond the initial record with records of
	 * the previous cycle so the kernel head/tail detection works correctly.
	 *
	 * We don't particularly care about the record size or content here.
	 * It's only important that the headers are in place such that the
	 * kernel finds 1.) a clean log and 2.) the correct current cycle value.
	 * Therefore, bump up the record size to the max to use larger I/Os and
	 * improve performance.
	 */
	cycle--;
	blk = start + len;
	if (dptr)
		dptr += BBTOB(len);
	end_blk = start + length;

	len = min(end_blk - blk, len);
	while (blk < end_blk) {
		lsn = xlog_assign_lsn(cycle, blk - start);
		tail_lsn = xlog_assign_lsn(cycle, blk - start - len);

		ptr = dptr;
		if (btp) {
			bp = libxfs_getbufr_uncached(btp, blk, len);
			ptr = bp->b_addr;
		}
		/*
		 * Note: pass the full buffer length as the sunit to initialize
		 * the entire buffer.
		 */
		libxfs_log_header(ptr, fs_uuid, version, BBTOB(len), fmt, lsn,
				  tail_lsn, next, bp);
		if (bp) {
			libxfs_buf_mark_dirty(bp);
			libxfs_buf_relse(bp);
		}

		blk += len;
		if (dptr)
			dptr += BBTOB(len);
		len = min(end_blk - blk, len);
	}

	return 0;
}

int
libxfs_log_header(
	char			*caddr,
	uuid_t			*fs_uuid,
	int			version,
	int			sunit,
	int			fmt,
	xfs_lsn_t		lsn,
	xfs_lsn_t		tail_lsn,
	libxfs_get_block_t	*nextfunc,
	void			*private)
{
	xlog_rec_header_t	*head = (xlog_rec_header_t *)caddr;
	char			*p = caddr;
	__be32			cycle_lsn;
	int			i, len;
	int			hdrs = 1;

	if (lsn == NULLCOMMITLSN)
		lsn = xlog_assign_lsn(XLOG_INIT_CYCLE, 0);
	if (tail_lsn == NULLCOMMITLSN)
		tail_lsn = lsn;

	len = ((version == 2) && sunit) ? BTOBB(sunit) : 1;

	memset(p, 0, BBSIZE);
	head->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM);
	head->h_cycle = cpu_to_be32(CYCLE_LSN(lsn));
	head->h_version = cpu_to_be32(version);
	head->h_crc = cpu_to_le32(0);
	head->h_prev_block = cpu_to_be32(-1);
	head->h_num_logops = cpu_to_be32(1);
	head->h_fmt = cpu_to_be32(fmt);
	head->h_size = cpu_to_be32(max(sunit, XLOG_BIG_RECORD_BSIZE));

	head->h_lsn = cpu_to_be64(lsn);
	head->h_tail_lsn = cpu_to_be64(tail_lsn);

	memcpy(&head->h_fs_uuid, fs_uuid, sizeof(uuid_t));

	/*
	 * The kernel expects to see either a log record header magic value or
	 * the LSN cycle at the top of every log block. The first word of each
	 * non-header block is copied to the record headers and replaced with
	 * the cycle value (see xlog_[un]pack_data() and xlog_get_cycle() for
	 * details).
	 *
	 * Even though we only ever write an unmount record (one block), we
	 * support writing log records up to the max log buffer size of 256k to
	 * improve log format performance. This means a record can require up
	 * to 8 headers (1 rec. header + 7 ext. headers) for the packed cycle
	 * data (each header supports 32k of data).
	 */
	cycle_lsn = CYCLE_LSN_DISK(head->h_lsn);
	if (version == 2 && sunit > XLOG_HEADER_CYCLE_SIZE) {
		hdrs = sunit / XLOG_HEADER_CYCLE_SIZE;
		if (sunit % XLOG_HEADER_CYCLE_SIZE)
			hdrs++;
	}
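	/* e.g. (illustrative) a 64k sunit needs 64k / 32k = 2 headers here */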

	/*
	 * A fixed number of extended headers is expected based on h_size. If
	 * required, format those now so the unmount record is located
	 * correctly.
	 *
	 * Since we only write an unmount record, we only need one h_cycle_data
	 * entry for the unmount record block. The subsequent record data
	 * blocks are zeroed, which means we can stamp them directly with the
	 * cycle and zero the rest of the cycle data in the extended headers.
	 */
	if (hdrs > 1) {
		for (i = 1; i < hdrs; i++) {
			p = nextfunc(p, BBSIZE, private);
			memset(p, 0, BBSIZE);
			/* xlog_rec_ext_header.xh_cycle */
			*(__be32 *)p = cycle_lsn;
		}
	}

	/*
	 * The total length is the max of the stripe unit or 2 basic block
	 * minimum (1 hdr blk + 1 data blk). The record length is the total
	 * minus however many header blocks are required.
	 */
	head->h_len = cpu_to_be32(max(BBTOB(2), sunit) - hdrs * BBSIZE);

	/*
	 * Write out the unmount record, pack the first word into the record
	 * header and stamp the block with the cycle.
	 */
	p = nextfunc(p, BBSIZE, private);
	unmount_record(p);

	head->h_cycle_data[0] = *(__be32 *)p;
	*(__be32 *)p = cycle_lsn;

	/*
	 * Finally, zero all remaining blocks in the record and stamp each with
	 * the cycle. We don't need to pack any of these blocks because the
	 * cycle data in the headers has already been zeroed.
	 */
	len = max(len, hdrs + 1);
	for (i = hdrs + 1; i < len; i++) {
		p = nextfunc(p, BBSIZE, private);
		memset(p, 0, BBSIZE);
		*(__be32 *)p = cycle_lsn;
	}

	return BBTOB(len);
}

void
libxfs_buf_set_priority(
	struct xfs_buf	*bp,
	int		priority)
{
	cache_node_set_priority(libxfs_bcache, &bp->b_node, priority);
}

int
libxfs_buf_priority(
	struct xfs_buf	*bp)
{
	return cache_node_get_priority(&bp->b_node);
}

/*
 * Log a message about and stale a buffer that a caller has decided is corrupt.
 *
 * This function should be called for the kinds of metadata corruption that
 * cannot be detected from a verifier, such as incorrect inter-block
 * relationship data. Do /not/ call this function from a verifier function.
 *
 * The buffer must be XBF_DONE prior to the call. Afterwards, the buffer will
 * be marked stale, but b_error will not be set. The caller is responsible for
 * releasing the buffer or fixing it.
 */
void
__xfs_buf_mark_corrupt(
	struct xfs_buf		*bp,
	xfs_failaddr_t		fa)
{
	ASSERT(bp->b_flags & XBF_DONE);

	xfs_buf_corruption_error(bp, fa);
	xfs_buf_stale(bp);
}