]> git.ipfire.org Git - thirdparty/xfsprogs-dev.git/blame - libxfs/rdwr.c
xfs_scrub: use datadev parallelization estimates for thread count
[thirdparty/xfsprogs-dev.git] / libxfs / rdwr.c
CommitLineData
959ef981 1// SPDX-License-Identifier: GPL-2.0
2bd0ea18 2/*
f1b058f9 3 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
da23017d 4 * All Rights Reserved.
2bd0ea18
NS
5 */
6
b626fb59 7
9c799827 8#include "libxfs_priv.h"
1aef52f8 9#include "init.h"
b626fb59
DC
10#include "xfs_fs.h"
11#include "xfs_shared.h"
12#include "xfs_format.h"
13#include "xfs_log_format.h"
14#include "xfs_trans_resv.h"
15#include "xfs_mount.h"
16#include "xfs_inode_buf.h"
17#include "xfs_inode_fork.h"
18#include "xfs_inode.h"
19#include "xfs_trans.h"
20
6b803e5a 21#include "libxfs.h" /* for LIBXFS_EXIT_ON_FAILURE */
2bd0ea18 22
6af7c1ea
DC
23/*
24 * Important design/architecture note:
25 *
26 * The userspace code that uses the buffer cache is much less constrained than
27 * the kernel code. The userspace code is pretty nasty in places, especially
28 * when it comes to buffer error handling. Very little of the userspace code
29 * outside libxfs clears bp->b_error - very little code even checks it - so the
30 * libxfs code is tripping on stale errors left by the userspace code.
31 *
32 * We can't clear errors or zero buffer contents in libxfs_getbuf-* like we do
33 * in the kernel, because those functions are used by the libxfs_readbuf_*
34 * functions and hence need to leave the buffers unchanged on cache hits. This
35 * is actually the only way to gather a write error from a libxfs_writebuf()
36 * call - you need to get the buffer again so you can check bp->b_error field -
37 * assuming that the buffer is still in the cache when you check, that is.
38 *
39 * This is very different to the kernel code which does not release buffers on a
40 * write so we can wait on IO and check errors. The kernel buffer cache also
41 * guarantees a buffer of a known initial state from xfs_buf_get() even on a
42 * cache hit.
43 *
44 * IOWs, userspace is behaving quite differently to the kernel and as a result
45 * it leaks errors from reads, invalidations and writes through
46 * libxfs_getbuf/libxfs_readbuf.
47 *
48 * The result of this is that until the userspace code outside libxfs is cleaned
49 * up, functions that release buffers from userspace control (i.e
50 * libxfs_writebuf/libxfs_putbuf) need to zero bp->b_error to prevent
51 * propagation of stale errors into future buffer operations.
52 */
53
5000d01d 54#define BDSTRAT_SIZE (256 * 1024)
2bd0ea18 55
2556c98b
BN
56#define IO_BCOMPARE_CHECK
57
9542ae13
DC
58/* XXX: (dgc) Propagate errors, only exit if fail-on-error flag set */
59int
75c8b434 60libxfs_device_zero(struct xfs_buftarg *btp, xfs_daddr_t start, uint len)
2bd0ea18 61{
3cc4d0db
NS
62 xfs_off_t start_offset, end_offset, offset;
63 ssize_t zsize, bytes;
2bd0ea18 64 char *z;
3cc4d0db 65 int fd;
2bd0ea18 66
3cc4d0db 67 zsize = min(BDSTRAT_SIZE, BBTOB(len));
b74a1f6a 68 if ((z = memalign(libxfs_device_alignment(), zsize)) == NULL) {
9440d84d
NS
69 fprintf(stderr,
70 _("%s: %s can't memalign %d bytes: %s\n"),
7dfd8291 71 progname, __FUNCTION__, (int)zsize, strerror(errno));
2bd0ea18
NS
72 exit(1);
73 }
3cc4d0db
NS
74 memset(z, 0, zsize);
75
75c8b434 76 fd = libxfs_device_to_fd(btp->dev);
cb5b3ef4 77 start_offset = LIBXFS_BBTOOFF64(start);
3cc4d0db 78
dc8878f4 79 if ((lseek(fd, start_offset, SEEK_SET)) < 0) {
3cc4d0db 80 fprintf(stderr, _("%s: %s seek to offset %llu failed: %s\n"),
7dfd8291
NS
81 progname, __FUNCTION__,
82 (unsigned long long)start_offset, strerror(errno));
3cc4d0db
NS
83 exit(1);
84 }
85
cb5b3ef4 86 end_offset = LIBXFS_BBTOOFF64(start + len) - start_offset;
3cc4d0db
NS
87 for (offset = 0; offset < end_offset; ) {
88 bytes = min((ssize_t)(end_offset - offset), zsize);
89 if ((bytes = write(fd, z, bytes)) < 0) {
90 fprintf(stderr, _("%s: %s write failed: %s\n"),
9440d84d 91 progname, __FUNCTION__, strerror(errno));
2bd0ea18 92 exit(1);
3cc4d0db
NS
93 } else if (bytes == 0) {
94 fprintf(stderr, _("%s: %s not progressing?\n"),
95 progname, __FUNCTION__);
96 exit(1);
2bd0ea18 97 }
3cc4d0db 98 offset += bytes;
2bd0ea18
NS
99 }
100 free(z);
9542ae13 101 return 0;
2bd0ea18
NS
102}
103
989b74bc 104static void unmount_record(void *p)
2bd0ea18 105{
989b74bc 106 xlog_op_header_t *op = (xlog_op_header_t *)p;
5000d01d
SL
107 /* the data section must be 32 bit size aligned */
108 struct {
14f8b681
DW
109 uint16_t magic;
110 uint16_t pad1;
111 uint32_t pad2; /* may as well make it 64 bits */
5000d01d
SL
112 } magic = { XLOG_UNMOUNT_TYPE, 0, 0 };
113
989b74bc 114 memset(p, 0, BBSIZE);
ad9b88eb
BF
115 /* dummy tid to mark this as written from userspace */
116 op->oh_tid = cpu_to_be32(0xb0c0d0d0);
5e656dbb
BN
117 op->oh_len = cpu_to_be32(sizeof(magic));
118 op->oh_clientid = XFS_LOG;
119 op->oh_flags = XLOG_UNMOUNT_TRANS;
120 op->oh_res2 = 0;
989b74bc
NS
121
122 /* and the data for this op */
1552a820 123 memcpy((char *)p + sizeof(xlog_op_header_t), &magic, sizeof(magic));
989b74bc
NS
124}
125
1c12a814
BF
126static char *next(
127 char *ptr,
128 int offset,
129 void *private)
989b74bc 130{
1c12a814 131 struct xfs_buf *buf = (struct xfs_buf *)private;
989b74bc 132
1c12a814 133 if (buf &&
135e4bfe 134 (buf->b_bcount < (int)(ptr - (char *)buf->b_addr) + offset))
989b74bc 135 abort();
1c12a814 136
989b74bc
NS
137 return ptr + offset;
138}
139
1c12a814
BF
140/*
141 * Format the log. The caller provides either a buftarg which is used to access
142 * the log via buffers or a direct pointer to a buffer that encapsulates the
143 * entire log.
144 */
989b74bc
NS
145int
146libxfs_log_clear(
75c8b434 147 struct xfs_buftarg *btp,
1c12a814 148 char *dptr,
989b74bc 149 xfs_daddr_t start,
1c12a814 150 uint length, /* basic blocks */
989b74bc
NS
151 uuid_t *fs_uuid,
152 int version,
1c12a814 153 int sunit, /* bytes */
0c12ba5f 154 int fmt,
571a78a7
BF
155 int cycle,
156 bool max)
989b74bc 157{
1c12a814 158 struct xfs_buf *bp = NULL;
989b74bc 159 int len;
0c12ba5f 160 xfs_lsn_t lsn;
0337f27c
BF
161 xfs_lsn_t tail_lsn;
162 xfs_daddr_t blk;
163 xfs_daddr_t end_blk;
1c12a814 164 char *ptr;
989b74bc 165
1c12a814
BF
166 if (((btp && dptr) || (!btp && !dptr)) ||
167 (btp && !btp->dev) || !fs_uuid)
2bd0ea18 168 return -EINVAL;
5000d01d
SL
169
170 /* first zero the log */
1c12a814
BF
171 if (btp)
172 libxfs_device_zero(btp, start, length);
173 else
174 memset(dptr, 0, BBTOB(length));
5000d01d 175
0337f27c
BF
176 /*
177 * Initialize the log record length and LSNs. XLOG_INIT_CYCLE is a
178 * special reset case where we only write a single record where the lsn
179 * and tail_lsn match. Otherwise, the record lsn starts at block 0 of
180 * the specified cycle and points tail_lsn at the last record of the
181 * previous cycle.
182 */
989b74bc 183 len = ((version == 2) && sunit) ? BTOBB(sunit) : 2;
68d16907 184 len = max(len, 2);
0337f27c
BF
185 lsn = xlog_assign_lsn(cycle, 0);
186 if (cycle == XLOG_INIT_CYCLE)
187 tail_lsn = lsn;
188 else
189 tail_lsn = xlog_assign_lsn(cycle - 1, length - len);
190
191 /* write out the first log record */
1c12a814
BF
192 ptr = dptr;
193 if (btp) {
194 bp = libxfs_getbufr(btp, start, len);
04338619 195 ptr = bp->b_addr;
1c12a814
BF
196 }
197 libxfs_log_header(ptr, fs_uuid, version, sunit, fmt, lsn, tail_lsn,
198 next, bp);
199 if (bp) {
200 bp->b_flags |= LIBXFS_B_DIRTY;
201 libxfs_putbufr(bp);
202 }
0337f27c
BF
203
204 /*
205 * There's nothing else to do if this is a log reset. The kernel detects
206 * the rest of the log is zeroed and starts at cycle 1.
207 */
208 if (cycle == XLOG_INIT_CYCLE)
209 return 0;
210
571a78a7
BF
211 /*
212 * Bump the record size for a full log format if the caller allows it.
213 * This is primarily for performance reasons and most callers don't care
214 * about record size since the log is clean after we're done.
215 */
216 if (max)
217 len = BTOBB(BDSTRAT_SIZE);
218
0337f27c
BF
219 /*
220 * Otherwise, fill everything beyond the initial record with records of
221 * the previous cycle so the kernel head/tail detection works correctly.
222 *
223 * We don't particularly care about the record size or content here.
224 * It's only important that the headers are in place such that the
225 * kernel finds 1.) a clean log and 2.) the correct current cycle value.
226 * Therefore, bump up the record size to the max to use larger I/Os and
227 * improve performance.
228 */
229 cycle--;
230 blk = start + len;
1c12a814
BF
231 if (dptr)
232 dptr += BBTOB(len);
0337f27c
BF
233 end_blk = start + length;
234
571a78a7 235 len = min(end_blk - blk, len);
0337f27c
BF
236 while (blk < end_blk) {
237 lsn = xlog_assign_lsn(cycle, blk - start);
238 tail_lsn = xlog_assign_lsn(cycle, blk - start - len);
239
1c12a814
BF
240 ptr = dptr;
241 if (btp) {
242 bp = libxfs_getbufr(btp, blk, len);
04338619 243 ptr = bp->b_addr;
1c12a814 244 }
0337f27c
BF
245 /*
246 * Note: pass the full buffer length as the sunit to initialize
247 * the entire buffer.
248 */
1c12a814
BF
249 libxfs_log_header(ptr, fs_uuid, version, BBTOB(len), fmt, lsn,
250 tail_lsn, next, bp);
251 if (bp) {
252 bp->b_flags |= LIBXFS_B_DIRTY;
253 libxfs_putbufr(bp);
254 }
0337f27c 255
0337f27c 256 blk += len;
1c12a814
BF
257 if (dptr)
258 dptr += BBTOB(len);
571a78a7 259 len = min(end_blk - blk, len);
0337f27c
BF
260 }
261
989b74bc
NS
262 return 0;
263}
5000d01d 264
989b74bc
NS
265int
266libxfs_log_header(
d60ba955 267 char *caddr,
989b74bc
NS
268 uuid_t *fs_uuid,
269 int version,
270 int sunit,
271 int fmt,
0c12ba5f
BF
272 xfs_lsn_t lsn,
273 xfs_lsn_t tail_lsn,
989b74bc
NS
274 libxfs_get_block_t *nextfunc,
275 void *private)
276{
277 xlog_rec_header_t *head = (xlog_rec_header_t *)caddr;
d60ba955 278 char *p = caddr;
5e656dbb 279 __be32 cycle_lsn;
989b74bc 280 int i, len;
20fbd459 281 int hdrs = 1;
989b74bc 282
0c12ba5f
BF
283 if (lsn == NULLCOMMITLSN)
284 lsn = xlog_assign_lsn(XLOG_INIT_CYCLE, 0);
285 if (tail_lsn == NULLCOMMITLSN)
286 tail_lsn = lsn;
287
989b74bc 288 len = ((version == 2) && sunit) ? BTOBB(sunit) : 1;
5000d01d 289
989b74bc 290 memset(p, 0, BBSIZE);
5e656dbb 291 head->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM);
0c12ba5f 292 head->h_cycle = cpu_to_be32(CYCLE_LSN(lsn));
5e656dbb 293 head->h_version = cpu_to_be32(version);
167d49cb 294 head->h_crc = cpu_to_le32(0);
5e656dbb
BN
295 head->h_prev_block = cpu_to_be32(-1);
296 head->h_num_logops = cpu_to_be32(1);
5e656dbb 297 head->h_fmt = cpu_to_be32(fmt);
68d16907 298 head->h_size = cpu_to_be32(max(sunit, XLOG_BIG_RECORD_BSIZE));
5000d01d 299
0c12ba5f
BF
300 head->h_lsn = cpu_to_be64(lsn);
301 head->h_tail_lsn = cpu_to_be64(tail_lsn);
5000d01d 302
6699422d 303 memcpy(&head->h_fs_uuid, fs_uuid, sizeof(uuid_t));
73bf5988 304
ad9b88eb 305 /*
20fbd459
BF
306 * The kernel expects to see either a log record header magic value or
307 * the LSN cycle at the top of every log block. The first word of each
308 * non-header block is copied to the record headers and replaced with
309 * the cycle value (see xlog_[un]pack_data() and xlog_get_cycle() for
310 * details).
311 *
312 * Even though we only ever write an unmount record (one block), we
313 * support writing log records up to the max log buffer size of 256k to
314 * improve log format performance. This means a record can require up
315 * to 8 headers (1 rec. header + 7 ext. headers) for the packed cycle
316 * data (each header supports 32k of data).
ad9b88eb 317 */
46eca962 318 cycle_lsn = CYCLE_LSN_DISK(head->h_lsn);
20fbd459
BF
319 if (version == 2 && sunit > XLOG_HEADER_CYCLE_SIZE) {
320 hdrs = sunit / XLOG_HEADER_CYCLE_SIZE;
321 if (sunit % XLOG_HEADER_CYCLE_SIZE)
322 hdrs++;
323 }
324
325 /*
326 * A fixed number of extended headers is expected based on h_size. If
327 * required, format those now so the unmount record is located
328 * correctly.
329 *
330 * Since we only write an unmount record, we only need one h_cycle_data
331 * entry for the unmount record block. The subsequent record data
332 * blocks are zeroed, which means we can stamp them directly with the
333 * cycle and zero the rest of the cycle data in the extended headers.
334 */
335 if (hdrs > 1) {
336 for (i = 1; i < hdrs; i++) {
337 p = nextfunc(p, BBSIZE, private);
338 memset(p, 0, BBSIZE);
339 /* xlog_rec_ext_header.xh_cycle */
340 *(__be32 *)p = cycle_lsn;
341 }
342 }
343
344 /*
345 * The total length is the max of the stripe unit or 2 basic block
346 * minimum (1 hdr blk + 1 data blk). The record length is the total
347 * minus however many header blocks are required.
348 */
68d16907 349 head->h_len = cpu_to_be32(max(BBTOB(2), sunit) - hdrs * BBSIZE);
20fbd459
BF
350
351 /*
352 * Write out the unmount record, pack the first word into the record
353 * header and stamp the block with the cycle.
354 */
355 p = nextfunc(p, BBSIZE, private);
356 unmount_record(p);
357
ad9b88eb
BF
358 head->h_cycle_data[0] = *(__be32 *)p;
359 *(__be32 *)p = cycle_lsn;
360
361 /*
20fbd459
BF
362 * Finally, zero all remaining blocks in the record and stamp each with
363 * the cycle. We don't need to pack any of these blocks because the
364 * cycle data in the headers has already been zeroed.
ad9b88eb 365 */
68d16907 366 len = max(len, hdrs + 1);
20fbd459 367 for (i = hdrs + 1; i < len; i++) {
989b74bc
NS
368 p = nextfunc(p, BBSIZE, private);
369 memset(p, 0, BBSIZE);
5e656dbb 370 *(__be32 *)p = cycle_lsn;
73bf5988 371 }
5000d01d 372
989b74bc 373 return BBTOB(len);
2bd0ea18
NS
374}
375
2556c98b
BN
/*
 * Simple I/O (buffer cache) interface
 */


#ifdef XFS_BUF_TRACING

/*
 * When tracing is enabled, the public names are macros that route through
 * these wrappers; undo the macros here so we can call (and declare) the real
 * implementations directly.
 */
#undef libxfs_readbuf
#undef libxfs_readbuf_map
#undef libxfs_writebuf
#undef libxfs_getbuf
#undef libxfs_getbuf_map
#undef libxfs_getbuf_flags
#undef libxfs_putbuf

xfs_buf_t *libxfs_readbuf(struct xfs_buftarg *, xfs_daddr_t, int, int,
			const struct xfs_buf_ops *);
xfs_buf_t *libxfs_readbuf_map(struct xfs_buftarg *, struct xfs_buf_map *,
			int, int, const struct xfs_buf_ops *);
int libxfs_writebuf(xfs_buf_t *, int);
xfs_buf_t *libxfs_getbuf(struct xfs_buftarg *, xfs_daddr_t, int);
xfs_buf_t *libxfs_getbuf_map(struct xfs_buftarg *, struct xfs_buf_map *,
			int, int);
xfs_buf_t *libxfs_getbuf_flags(struct xfs_buftarg *, xfs_daddr_t, int,
			unsigned int);
void libxfs_putbuf (xfs_buf_t *);

/* Record the call site on the buffer (no-op for a NULL buffer). */
#define __add_trace(bp, func, file, line)	\
do {						\
	if (bp) {				\
		(bp)->b_func = (func);		\
		(bp)->b_file = (file);		\
		(bp)->b_line = (line);		\
	}					\
} while (0)

xfs_buf_t *
libxfs_trace_readbuf(const char *func, const char *file, int line,
		struct xfs_buftarg *btp, xfs_daddr_t blkno, int len, int flags,
		const struct xfs_buf_ops *ops)
{
	xfs_buf_t	*bp = libxfs_readbuf(btp, blkno, len, flags, ops);

	__add_trace(bp, func, file, line);
	return bp;
}

xfs_buf_t *
libxfs_trace_readbuf_map(const char *func, const char *file, int line,
		struct xfs_buftarg *btp, struct xfs_buf_map *map, int nmaps,
		int flags, const struct xfs_buf_ops *ops)
{
	xfs_buf_t	*bp = libxfs_readbuf_map(btp, map, nmaps, flags, ops);

	__add_trace(bp, func, file, line);
	return bp;
}

int
libxfs_trace_writebuf(const char *func, const char *file, int line, xfs_buf_t *bp, int flags)
{
	__add_trace(bp, func, file, line);
	return libxfs_writebuf(bp, flags);
}

xfs_buf_t *
libxfs_trace_getbuf(const char *func, const char *file, int line,
		struct xfs_buftarg *btp, xfs_daddr_t blkno, int len)
{
	xfs_buf_t	*bp = libxfs_getbuf(btp, blkno, len);

	__add_trace(bp, func, file, line);
	return bp;
}

xfs_buf_t *
libxfs_trace_getbuf_map(const char *func, const char *file, int line,
		struct xfs_buftarg *btp, struct xfs_buf_map *map, int nmaps,
		int flags)
{
	xfs_buf_t	*bp = libxfs_getbuf_map(btp, map, nmaps, flags);

	__add_trace(bp, func, file, line);
	return bp;
}

xfs_buf_t *
libxfs_trace_getbuf_flags(const char *func, const char *file, int line,
		struct xfs_buftarg *btp, xfs_daddr_t blkno, int len, unsigned int flags)
{
	xfs_buf_t	*bp = libxfs_getbuf_flags(btp, blkno, len, flags);

	__add_trace(bp, func, file, line);
	return bp;
}

void
libxfs_trace_putbuf(const char *func, const char *file, int line, xfs_buf_t *bp)
{
	__add_trace(bp, func, file, line);
	libxfs_putbuf(bp);
}


#endif
476
477
f1b058f9
NS
478xfs_buf_t *
479libxfs_getsb(xfs_mount_t *mp, int flags)
480{
75c8b434
DC
481 return libxfs_readbuf(mp->m_ddev_targp, XFS_SB_DADDR,
482 XFS_FSS_TO_BB(mp, 1), flags, &xfs_sb_buf_ops);
f1b058f9
NS
483}
484
5e656dbb 485kmem_zone_t *xfs_buf_zone;
69ec88b5
BN
486
487static struct cache_mru xfs_buf_freelist =
488 {{&xfs_buf_freelist.cm_list, &xfs_buf_freelist.cm_list},
489 0, PTHREAD_MUTEX_INITIALIZER };
f1b058f9 490
a2ceac1f
DC
491/*
492 * The bufkey is used to pass the new buffer information to the cache object
493 * allocation routine. Because discontiguous buffers need to pass different
494 * information, we need fields to pass that information. However, because the
495 * blkno and bblen is needed for the initial cache entry lookup (i.e. for
496 * bcompare) the fact that the map/nmaps is non-null to switch to discontiguous
497 * buffer initialisation instead of a contiguous buffer.
498 */
499struct xfs_bufkey {
75c8b434 500 struct xfs_buftarg *buftarg;
a2ceac1f
DC
501 xfs_daddr_t blkno;
502 unsigned int bblen;
503 struct xfs_buf_map *map;
504 int nmaps;
505};
f1b058f9 506
602dcc0e
DC
507/* 2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */
508#define GOLDEN_RATIO_PRIME 0x9e37fffffffc0001UL
509#define CACHE_LINE_SIZE 64
f1b058f9 510static unsigned int
602dcc0e 511libxfs_bhash(cache_key_t key, unsigned int hashsize, unsigned int hashshift)
f1b058f9 512{
602dcc0e
DC
513 uint64_t hashval = ((struct xfs_bufkey *)key)->blkno;
514 uint64_t tmp;
515
516 tmp = hashval ^ (GOLDEN_RATIO_PRIME + hashval) / CACHE_LINE_SIZE;
517 tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> hashshift);
518 return tmp % hashsize;
f1b058f9
NS
519}
520
521static int
522libxfs_bcompare(struct cache_node *node, cache_key_t key)
523{
a2ceac1f
DC
524 struct xfs_buf *bp = (struct xfs_buf *)node;
525 struct xfs_bufkey *bkey = (struct xfs_bufkey *)key;
f1b058f9 526
75c8b434 527 if (bp->b_target->dev == bkey->buftarg->dev &&
ba9ecd40
DC
528 bp->b_bn == bkey->blkno) {
529 if (bp->b_bcount == BBTOB(bkey->bblen))
530 return CACHE_HIT;
531#ifdef IO_BCOMPARE_CHECK
532 if (!(libxfs_bcache->c_flags & CACHE_MISCOMPARE_PURGE)) {
533 fprintf(stderr,
534 "%lx: Badness in key lookup (length)\n"
535 "bp=(bno 0x%llx, len %u bytes) key=(bno 0x%llx, len %u bytes)\n",
536 pthread_self(),
537 (unsigned long long)bp->b_bn, (int)bp->b_bcount,
538 (unsigned long long)bkey->blkno,
539 BBTOB(bkey->bblen));
540 }
f1b058f9 541#endif
ba9ecd40
DC
542 return CACHE_PURGE;
543 }
544 return CACHE_MISS;
f1b058f9
NS
545}
546
e6b359b3 547static void
75c8b434
DC
548__initbuf(xfs_buf_t *bp, struct xfs_buftarg *btp, xfs_daddr_t bno,
549 unsigned int bytes)
e6b359b3
NS
550{
551 bp->b_flags = 0;
5dfa5cd2 552 bp->b_bn = bno;
e6b359b3 553 bp->b_bcount = bytes;
a2ceac1f 554 bp->b_length = BTOBB(bytes);
75c8b434 555 bp->b_target = btp;
a6a7776a 556 bp->b_error = 0;
69ec88b5
BN
557 if (!bp->b_addr)
558 bp->b_addr = memalign(libxfs_device_alignment(), bytes);
e6b359b3
NS
559 if (!bp->b_addr) {
560 fprintf(stderr,
561 _("%s: %s can't memalign %u bytes: %s\n"),
562 progname, __FUNCTION__, bytes,
563 strerror(errno));
564 exit(1);
565 }
bf43fd28 566 memset(bp->b_addr, 0, bytes);
2556c98b
BN
567#ifdef XFS_BUF_TRACING
568 list_head_init(&bp->b_lock_list);
569#endif
570 pthread_mutex_init(&bp->b_lock, NULL);
50722af1
CH
571 bp->b_holder = 0;
572 bp->b_recur = 0;
75c8b434 573 bp->b_ops = NULL;
2c6c6328
BF
574
575 if (!bp->b_maps) {
576 bp->b_nmaps = 1;
577 bp->b_maps = &bp->__b_map;
578 bp->b_maps[0].bm_bn = bp->b_bn;
579 bp->b_maps[0].bm_len = bp->b_length;
580 }
e6b359b3
NS
581}
582
a2ceac1f 583static void
75c8b434
DC
584libxfs_initbuf(xfs_buf_t *bp, struct xfs_buftarg *btp, xfs_daddr_t bno,
585 unsigned int bytes)
a2ceac1f 586{
75c8b434 587 __initbuf(bp, btp, bno, bytes);
a2ceac1f
DC
588}
589
590static void
75c8b434
DC
591libxfs_initbuf_map(xfs_buf_t *bp, struct xfs_buftarg *btp,
592 struct xfs_buf_map *map, int nmaps)
a2ceac1f
DC
593{
594 unsigned int bytes = 0;
595 int i;
596
597 bytes = sizeof(struct xfs_buf_map) * nmaps;
85428dd2
DC
598 bp->b_maps = malloc(bytes);
599 if (!bp->b_maps) {
a2ceac1f
DC
600 fprintf(stderr,
601 _("%s: %s can't malloc %u bytes: %s\n"),
602 progname, __FUNCTION__, bytes,
603 strerror(errno));
604 exit(1);
605 }
606 bp->b_nmaps = nmaps;
607
608 bytes = 0;
609 for ( i = 0; i < nmaps; i++) {
85428dd2
DC
610 bp->b_maps[i].bm_bn = map[i].bm_bn;
611 bp->b_maps[i].bm_len = map[i].bm_len;
a2ceac1f
DC
612 bytes += BBTOB(map[i].bm_len);
613 }
614
75c8b434 615 __initbuf(bp, btp, map[0].bm_bn, bytes);
a2ceac1f
DC
616 bp->b_flags |= LIBXFS_B_DISCONTIG;
617}
618
00ff2b10 619static xfs_buf_t *
a2ceac1f 620__libxfs_getbufr(int blen)
e6b359b3
NS
621{
622 xfs_buf_t *bp;
69ec88b5
BN
623
624 /*
625 * first look for a buffer that can be used as-is,
626 * if one cannot be found, see if there is a buffer,
ff1f79a7 627 * and if so, free its buffer and set b_addr to NULL
69ec88b5
BN
628 * before calling libxfs_initbuf.
629 */
630 pthread_mutex_lock(&xfs_buf_freelist.cm_mutex);
631 if (!list_empty(&xfs_buf_freelist.cm_list)) {
632 list_for_each_entry(bp, &xfs_buf_freelist.cm_list, b_node.cn_mru) {
633 if (bp->b_bcount == blen) {
634 list_del_init(&bp->b_node.cn_mru);
635 break;
636 }
637 }
638 if (&bp->b_node.cn_mru == &xfs_buf_freelist.cm_list) {
639 bp = list_entry(xfs_buf_freelist.cm_list.next,
640 xfs_buf_t, b_node.cn_mru);
641 list_del_init(&bp->b_node.cn_mru);
642 free(bp->b_addr);
643 bp->b_addr = NULL;
2c6c6328
BF
644 if (bp->b_maps != &bp->__b_map)
645 free(bp->b_maps);
85428dd2 646 bp->b_maps = NULL;
69ec88b5
BN
647 }
648 } else
5e656dbb 649 bp = kmem_zone_zalloc(xfs_buf_zone, 0);
69ec88b5 650 pthread_mutex_unlock(&xfs_buf_freelist.cm_mutex);
e0607266 651 bp->b_ops = NULL;
0a7942b3
DC
652 if (bp->b_flags & LIBXFS_B_DIRTY)
653 fprintf(stderr, "found dirty buffer (bulk) on free list!");
e6b359b3 654
a2ceac1f
DC
655 return bp;
656}
657
658xfs_buf_t *
75c8b434 659libxfs_getbufr(struct xfs_buftarg *btp, xfs_daddr_t blkno, int bblen)
a2ceac1f
DC
660{
661 xfs_buf_t *bp;
662 int blen = BBTOB(bblen);
663
664 bp =__libxfs_getbufr(blen);
665 if (bp)
75c8b434 666 libxfs_initbuf(bp, btp, blkno, blen);
2556c98b 667#ifdef IO_DEBUG
a2ceac1f 668 printf("%lx: %s: allocated %u bytes buffer, key=0x%llx(0x%llx), %p\n",
f63fd268 669 pthread_self(), __FUNCTION__, blen,
2556c98b
BN
670 (long long)LIBXFS_BBTOOFF64(blkno), (long long)blkno, bp);
671#endif
69ec88b5 672
e6b359b3
NS
673 return bp;
674}
675
00ff2b10 676static xfs_buf_t *
75c8b434 677libxfs_getbufr_map(struct xfs_buftarg *btp, xfs_daddr_t blkno, int bblen,
a2ceac1f
DC
678 struct xfs_buf_map *map, int nmaps)
679{
680 xfs_buf_t *bp;
681 int blen = BBTOB(bblen);
682
683 if (!map || !nmaps) {
684 fprintf(stderr,
685 _("%s: %s invalid map %p or nmaps %d\n"),
686 progname, __FUNCTION__, map, nmaps);
687 exit(1);
688 }
689
690 if (blkno != map[0].bm_bn) {
691 fprintf(stderr,
b47c8cae
NS
692 _("%s: %s map blkno 0x%llx doesn't match key 0x%llx\n"),
693 progname, __FUNCTION__, (long long)map[0].bm_bn,
694 (long long)blkno);
a2ceac1f
DC
695 exit(1);
696 }
697
698 bp =__libxfs_getbufr(blen);
699 if (bp)
75c8b434 700 libxfs_initbuf_map(bp, btp, map, nmaps);
a2ceac1f
DC
701#ifdef IO_DEBUG
702 printf("%lx: %s: allocated %u bytes buffer, key=0x%llx(0x%llx), %p\n",
703 pthread_self(), __FUNCTION__, blen,
704 (long long)LIBXFS_BBTOOFF64(blkno), (long long)blkno, bp);
705#endif
706
707 return bp;
708}
2556c98b
BN
709
710#ifdef XFS_BUF_TRACING
711struct list_head lock_buf_list = {&lock_buf_list, &lock_buf_list};
712int lock_buf_count = 0;
713#endif
e6b359b3 714
a2ceac1f
DC
715static struct xfs_buf *
716__cache_lookup(struct xfs_bufkey *key, unsigned int flags)
2bd0ea18 717{
a2ceac1f 718 struct xfs_buf *bp;
2556c98b 719
a2ceac1f 720 cache_node_get(libxfs_bcache, key, (struct cache_node **)&bp);
2ae22647
CH
721 if (!bp)
722 return NULL;
723
724 if (use_xfs_buf_lock) {
50722af1
CH
725 int ret;
726
727 ret = pthread_mutex_trylock(&bp->b_lock);
728 if (ret) {
729 ASSERT(ret == EAGAIN);
730 if (flags & LIBXFS_GETBUF_TRYLOCK)
731 goto out_put;
732
733 if (pthread_equal(bp->b_holder, pthread_self())) {
734 fprintf(stderr,
735 _("Warning: recursive buffer locking at block %" PRIu64 " detected\n"),
a2ceac1f 736 key->blkno);
50722af1
CH
737 bp->b_recur++;
738 return bp;
739 } else {
740 pthread_mutex_lock(&bp->b_lock);
2ae22647 741 }
2ae22647 742 }
50722af1
CH
743
744 bp->b_holder = pthread_self();
2ae22647
CH
745 }
746
747 cache_node_set_priority(libxfs_bcache, (struct cache_node *)bp,
748 cache_node_get_priority((struct cache_node *)bp) -
a040d7c9 749 CACHE_PREFETCH_PRIORITY);
2556c98b 750#ifdef XFS_BUF_TRACING
2ae22647
CH
751 pthread_mutex_lock(&libxfs_bcache->c_mutex);
752 lock_buf_count++;
753 list_add(&bp->b_lock_list, &lock_buf_list);
754 pthread_mutex_unlock(&libxfs_bcache->c_mutex);
2556c98b 755#endif
2bd0ea18 756#ifdef IO_DEBUG
a2ceac1f
DC
757 printf("%lx %s: hit buffer %p for bno = 0x%llx/0x%llx\n",
758 pthread_self(), __FUNCTION__,
759 bp, bp->b_bn, (long long)LIBXFS_BBTOOFF64(key->blkno));
2bd0ea18 760#endif
2556c98b 761
f1b058f9 762 return bp;
50722af1
CH
763out_put:
764 cache_node_put(libxfs_bcache, (struct cache_node *)bp);
765 return NULL;
f1b058f9
NS
766}
767
a2ceac1f 768struct xfs_buf *
75c8b434
DC
769libxfs_getbuf_flags(struct xfs_buftarg *btp, xfs_daddr_t blkno, int len,
770 unsigned int flags)
a2ceac1f 771{
3dd2705a 772 struct xfs_bufkey key = {NULL};
a2ceac1f 773
75c8b434 774 key.buftarg = btp;
a2ceac1f
DC
775 key.blkno = blkno;
776 key.bblen = len;
777
778 return __cache_lookup(&key, flags);
779}
780
e8ecd760
DW
781/*
782 * Clean the buffer flags for libxfs_getbuf*(), which wants to return
783 * an unused buffer with clean state. This prevents CRC errors on a
784 * re-read of a corrupt block that was prefetched and freed. This
785 * can happen with a massively corrupt directory that is discarded,
786 * but whose blocks are then recycled into expanding lost+found.
787 *
788 * Note however that if the buffer's dirty (prefetch calls getbuf)
789 * we'll leave the state alone because we don't want to discard blocks
790 * that have been fixed.
791 */
792static void
793reset_buf_state(
794 struct xfs_buf *bp)
795{
796 if (bp && !(bp->b_flags & LIBXFS_B_DIRTY))
797 bp->b_flags &= ~(LIBXFS_B_UNCHECKED | LIBXFS_B_STALE |
798 LIBXFS_B_UPTODATE);
799}
800
2ae22647 801struct xfs_buf *
75c8b434 802libxfs_getbuf(struct xfs_buftarg *btp, xfs_daddr_t blkno, int len)
2ae22647 803{
e8ecd760
DW
804 struct xfs_buf *bp;
805
806 bp = libxfs_getbuf_flags(btp, blkno, len, 0);
807 reset_buf_state(bp);
808 return bp;
2ae22647
CH
809}
810
e8ecd760
DW
811static struct xfs_buf *
812__libxfs_getbuf_map(struct xfs_buftarg *btp, struct xfs_buf_map *map,
813 int nmaps, int flags)
a2ceac1f 814{
3dd2705a 815 struct xfs_bufkey key = {NULL};
a2ceac1f
DC
816 int i;
817
f388124d
DC
818 if (nmaps == 1)
819 return libxfs_getbuf_flags(btp, map[0].bm_bn, map[0].bm_len,
820 flags);
821
75c8b434 822 key.buftarg = btp;
a2ceac1f
DC
823 key.blkno = map[0].bm_bn;
824 for (i = 0; i < nmaps; i++) {
825 key.bblen += map[i].bm_len;
826 }
827 key.map = map;
828 key.nmaps = nmaps;
829
7e3ab890 830 return __cache_lookup(&key, flags);
a2ceac1f
DC
831}
832
e8ecd760
DW
/* Get a mapped buffer with clean (reset) state. */
struct xfs_buf *
libxfs_getbuf_map(struct xfs_buftarg *btp, struct xfs_buf_map *map,
		int nmaps, int flags)
{
	struct xfs_buf	*bp;

	bp = __libxfs_getbuf_map(btp, map, nmaps, flags);
	reset_buf_state(bp);
	return bp;
}
843
f1b058f9
NS
844void
845libxfs_putbuf(xfs_buf_t *bp)
846{
cee99cfa
DC
847 /*
848 * ensure that any errors on this use of the buffer don't carry
849 * over to the next user.
850 */
851 bp->b_error = 0;
852
2556c98b
BN
853#ifdef XFS_BUF_TRACING
854 pthread_mutex_lock(&libxfs_bcache->c_mutex);
855 lock_buf_count--;
856 ASSERT(lock_buf_count >= 0);
857 list_del_init(&bp->b_lock_list);
858 pthread_mutex_unlock(&libxfs_bcache->c_mutex);
859#endif
50722af1
CH
860 if (use_xfs_buf_lock) {
861 if (bp->b_recur) {
862 bp->b_recur--;
863 } else {
864 bp->b_holder = 0;
865 pthread_mutex_unlock(&bp->b_lock);
866 }
867 }
6af7c1ea 868
a040d7c9 869 cache_node_put(libxfs_bcache, (struct cache_node *)bp);
f1b058f9
NS
870}
871
872void
873libxfs_purgebuf(xfs_buf_t *bp)
874{
3dd2705a 875 struct xfs_bufkey key = {NULL};
f1b058f9 876
75c8b434 877 key.buftarg = bp->b_target;
5dfa5cd2 878 key.blkno = bp->b_bn;
75c8b434 879 key.bblen = bp->b_length;
f1b058f9
NS
880
881 cache_node_purge(libxfs_bcache, &key, (struct cache_node *)bp);
882}
2bd0ea18 883
f1b058f9 884static struct cache_node *
2556c98b 885libxfs_balloc(cache_key_t key)
f1b058f9 886{
a2ceac1f 887 struct xfs_bufkey *bufkey = (struct xfs_bufkey *)key;
2556c98b 888
a2ceac1f
DC
889 if (bufkey->map)
890 return (struct cache_node *)
75c8b434 891 libxfs_getbufr_map(bufkey->buftarg,
a2ceac1f
DC
892 bufkey->blkno, bufkey->bblen,
893 bufkey->map, bufkey->nmaps);
75c8b434 894 return (struct cache_node *)libxfs_getbufr(bufkey->buftarg,
a2ceac1f 895 bufkey->blkno, bufkey->bblen);
2bd0ea18
NS
896}
897
a2ceac1f
DC
898
899static int
900__read_buf(int fd, void *buf, int len, off64_t offset, int flags)
2bd0ea18 901{
bcea58c7 902 int sts;
2bd0ea18 903
2f9a125c 904 sts = pread(fd, buf, len, offset);
bcea58c7 905 if (sts < 0) {
11202ec2 906 int error = errno;
9440d84d 907 fprintf(stderr, _("%s: read failed: %s\n"),
c3928e39 908 progname, strerror(error));
9440d84d 909 if (flags & LIBXFS_EXIT_ON_FAILURE)
2bd0ea18 910 exit(1);
11202ec2 911 return -error;
a2ceac1f 912 } else if (sts != len) {
bcea58c7 913 fprintf(stderr, _("%s: error - read only %d of %d bytes\n"),
a2ceac1f 914 progname, sts, len);
bcea58c7
CH
915 if (flags & LIBXFS_EXIT_ON_FAILURE)
916 exit(1);
12b53197 917 return -EIO;
2bd0ea18 918 }
a2ceac1f
DC
919 return 0;
920}
921
922int
75c8b434
DC
923libxfs_readbufr(struct xfs_buftarg *btp, xfs_daddr_t blkno, xfs_buf_t *bp,
924 int len, int flags)
a2ceac1f 925{
75c8b434 926 int fd = libxfs_device_to_fd(btp->dev);
a2ceac1f
DC
927 int bytes = BBTOB(len);
928 int error;
929
930 ASSERT(BBTOB(len) <= bp->b_bcount);
931
932 error = __read_buf(fd, bp->b_addr, bytes, LIBXFS_BBTOOFF64(blkno), flags);
933 if (!error &&
75c8b434 934 bp->b_target->dev == btp->dev &&
5dfa5cd2 935 bp->b_bn == blkno &&
f1b058f9
NS
936 bp->b_bcount == bytes)
937 bp->b_flags |= LIBXFS_B_UPTODATE;
a2ceac1f
DC
938#ifdef IO_DEBUG
939 printf("%lx: %s: read %u bytes, error %d, blkno=0x%llx(0x%llx), %p\n",
940 pthread_self(), __FUNCTION__, bytes, error,
941 (long long)LIBXFS_BBTOOFF64(blkno), (long long)blkno, bp);
942#endif
943 return error;
2bd0ea18
NS
944}
945
adbb3573
DC
946void
947libxfs_readbuf_verify(struct xfs_buf *bp, const struct xfs_buf_ops *ops)
948{
949 if (!ops)
950 return;
951 bp->b_ops = ops;
952 bp->b_ops->verify_read(bp);
953 bp->b_flags &= ~LIBXFS_B_UNCHECKED;
954}
955
956
2bd0ea18 957xfs_buf_t *
75c8b434
DC
958libxfs_readbuf(struct xfs_buftarg *btp, xfs_daddr_t blkno, int len, int flags,
959 const struct xfs_buf_ops *ops)
2bd0ea18 960{
f1b058f9 961 xfs_buf_t *bp;
2bd0ea18
NS
962 int error;
963
e8ecd760 964 bp = libxfs_getbuf_flags(btp, blkno, len, 0);
75c8b434
DC
965 if (!bp)
966 return NULL;
adbb3573
DC
967
968 /*
969 * if the buffer was prefetched, it is likely that it was not validated.
970 * Hence if we are supplied an ops function and the buffer is marked as
971 * unchecked, we need to validate it now.
972 *
973 * We do this verification even if the buffer is dirty - the
974 * verification is almost certainly going to fail the CRC check in this
975 * case as a dirty buffer has not had the CRC recalculated. However, we
976 * should not be dirtying unchecked buffers and therefore failing it
977 * here because it's dirty and unchecked indicates we've screwed up
978 * somewhere else.
979 */
980 bp->b_error = 0;
981 if ((bp->b_flags & (LIBXFS_B_UPTODATE|LIBXFS_B_DIRTY))) {
982 if (bp->b_flags & LIBXFS_B_UNCHECKED)
983 libxfs_readbuf_verify(bp, ops);
75c8b434 984 return bp;
adbb3573 985 }
75c8b434
DC
986
987 /*
adbb3573
DC
988 * Set the ops on a cache miss (i.e. first physical read) as the
989 * verifier may change the ops to match the type of buffer it contains.
75c8b434
DC
990 * A cache hit might reset the verifier to the original type if we set
991 * it again, but it won't get called again and set to match the buffer
992 * contents. *cough* xfs_da_node_buf_ops *cough*.
993 */
75c8b434
DC
994 error = libxfs_readbufr(btp, blkno, bp, len, flags);
995 if (error)
996 bp->b_error = error;
adbb3573
DC
997 else
998 libxfs_readbuf_verify(bp, ops);
f1b058f9 999 return bp;
2bd0ea18
NS
1000}
1001
800db1c1 1002int
6d5e5ee0 1003libxfs_readbufr_map(struct xfs_buftarg *btp, struct xfs_buf *bp, int flags)
a2ceac1f 1004{
d0bbcbcb 1005 int fd;
800db1c1 1006 int error = 0;
04338619 1007 void *buf;
800db1c1 1008 int i;
75c8b434 1009
75c8b434 1010 fd = libxfs_device_to_fd(btp->dev);
a2ceac1f
DC
1011 buf = bp->b_addr;
1012 for (i = 0; i < bp->b_nmaps; i++) {
85428dd2
DC
1013 off64_t offset = LIBXFS_BBTOOFF64(bp->b_maps[i].bm_bn);
1014 int len = BBTOB(bp->b_maps[i].bm_len);
a2ceac1f 1015
a2ceac1f
DC
1016 error = __read_buf(fd, buf, len, offset, flags);
1017 if (error) {
1018 bp->b_error = error;
1019 break;
1020 }
1021 buf += len;
a2ceac1f
DC
1022 }
1023
64eb960f 1024 if (!error)
800db1c1
DC
1025 bp->b_flags |= LIBXFS_B_UPTODATE;
1026#ifdef IO_DEBUG
15028317
DW
1027 printf("%lx: %s: read %lu bytes, error %d, blkno=%llu(%llu), %p\n",
1028 pthread_self(), __FUNCTION__, buf - (char *)bp->b_addr, error,
1029 (long long)LIBXFS_BBTOOFF64(bp->b_bn), (long long)bp->b_bn, bp);
800db1c1
DC
1030#endif
1031 return error;
1032}
1033
1034struct xfs_buf *
1035libxfs_readbuf_map(struct xfs_buftarg *btp, struct xfs_buf_map *map, int nmaps,
1036 int flags, const struct xfs_buf_ops *ops)
1037{
1038 struct xfs_buf *bp;
1039 int error = 0;
1040
1041 if (nmaps == 1)
1042 return libxfs_readbuf(btp, map[0].bm_bn, map[0].bm_len,
1043 flags, ops);
1044
e8ecd760 1045 bp = __libxfs_getbuf_map(btp, map, nmaps, 0);
800db1c1
DC
1046 if (!bp)
1047 return NULL;
1048
1049 bp->b_error = 0;
adbb3573
DC
1050 if ((bp->b_flags & (LIBXFS_B_UPTODATE|LIBXFS_B_DIRTY))) {
1051 if (bp->b_flags & LIBXFS_B_UNCHECKED)
1052 libxfs_readbuf_verify(bp, ops);
800db1c1 1053 return bp;
75c8b434 1054 }
adbb3573
DC
1055 error = libxfs_readbufr_map(btp, bp, flags);
1056 if (!error)
1057 libxfs_readbuf_verify(bp, ops);
1058
15028317 1059#ifdef IO_DEBUGX
a2ceac1f
DC
1060 printf("%lx: %s: read %lu bytes, error %d, blkno=%llu(%llu), %p\n",
1061 pthread_self(), __FUNCTION__, buf - (char *)bp->b_addr, error,
5dfa5cd2 1062 (long long)LIBXFS_BBTOOFF64(bp->b_bn), (long long)bp->b_bn, bp);
a2ceac1f
DC
1063#endif
1064 return bp;
1065}
1066
1067static int
1068__write_buf(int fd, void *buf, int len, off64_t offset, int flags)
2bd0ea18
NS
1069{
1070 int sts;
2bd0ea18 1071
2f9a125c 1072 sts = pwrite(fd, buf, len, offset);
2bd0ea18 1073 if (sts < 0) {
11202ec2 1074 int error = errno;
2f9a125c 1075 fprintf(stderr, _("%s: pwrite failed: %s\n"),
c3928e39 1076 progname, strerror(error));
a2ceac1f 1077 if (flags & LIBXFS_B_EXIT)
2bd0ea18 1078 exit(1);
11202ec2 1079 return -error;
a2ceac1f 1080 } else if (sts != len) {
2f9a125c 1081 fprintf(stderr, _("%s: error - pwrite only %d of %d bytes\n"),
a2ceac1f
DC
1082 progname, sts, len);
1083 if (flags & LIBXFS_B_EXIT)
2bd0ea18 1084 exit(1);
12b53197 1085 return -EIO;
2bd0ea18 1086 }
a2ceac1f
DC
1087 return 0;
1088}
1089
1090int
1091libxfs_writebufr(xfs_buf_t *bp)
1092{
75c8b434 1093 int fd = libxfs_device_to_fd(bp->b_target->dev);
a2ceac1f 1094
75c8b434
DC
1095 /*
1096 * we never write buffers that are marked stale. This indicates they
1097 * contain data that has been invalidated, and even if the buffer is
1098 * dirty it must *never* be written. Verifiers are wonderful for finding
1099 * bugs like this. Make sure the error is obvious as to the cause.
1100 */
1101 if (bp->b_flags & LIBXFS_B_STALE) {
12b53197 1102 bp->b_error = -ESTALE;
75c8b434
DC
1103 return bp->b_error;
1104 }
1105
1106 /*
1107 * clear any pre-existing error status on the buffer. This can occur if
1108 * the buffer is corrupt on disk and the repair process doesn't clear
1109 * the error before fixing and writing it back.
1110 */
1111 bp->b_error = 0;
1112 if (bp->b_ops) {
1113 bp->b_ops->verify_write(bp);
1114 if (bp->b_error) {
1115 fprintf(stderr,
a3fac935
ES
1116 _("%s: write verifer failed on %s bno 0x%llx/0x%x\n"),
1117 __func__, bp->b_ops->name,
1118 (long long)bp->b_bn, bp->b_bcount);
75c8b434
DC
1119 return bp->b_error;
1120 }
1121 }
1122
a2ceac1f 1123 if (!(bp->b_flags & LIBXFS_B_DISCONTIG)) {
e8f1e8aa 1124 bp->b_error = __write_buf(fd, bp->b_addr, bp->b_bcount,
5dfa5cd2 1125 LIBXFS_BBTOOFF64(bp->b_bn), bp->b_flags);
a2ceac1f
DC
1126 } else {
1127 int i;
04338619 1128 void *buf = bp->b_addr;
a2ceac1f
DC
1129
1130 for (i = 0; i < bp->b_nmaps; i++) {
85428dd2
DC
1131 off64_t offset = LIBXFS_BBTOOFF64(bp->b_maps[i].bm_bn);
1132 int len = BBTOB(bp->b_maps[i].bm_len);
a2ceac1f 1133
e8f1e8aa
DC
1134 bp->b_error = __write_buf(fd, buf, len, offset,
1135 bp->b_flags);
1136 if (bp->b_error)
a2ceac1f 1137 break;
a2ceac1f 1138 buf += len;
a2ceac1f
DC
1139 }
1140 }
1141
f1b058f9 1142#ifdef IO_DEBUG
d085fb48 1143 printf("%lx: %s: wrote %u bytes, blkno=%llu(%llu), %p, error %d\n",
2556c98b 1144 pthread_self(), __FUNCTION__, bp->b_bcount,
5dfa5cd2 1145 (long long)LIBXFS_BBTOOFF64(bp->b_bn),
e8f1e8aa 1146 (long long)bp->b_bn, bp, bp->b_error);
f1b058f9 1147#endif
e8f1e8aa 1148 if (!bp->b_error) {
a2ceac1f 1149 bp->b_flags |= LIBXFS_B_UPTODATE;
adbb3573
DC
1150 bp->b_flags &= ~(LIBXFS_B_DIRTY | LIBXFS_B_EXIT |
1151 LIBXFS_B_UNCHECKED);
a2ceac1f 1152 }
e8f1e8aa 1153 return bp->b_error;
2bd0ea18
NS
1154}
1155
1156int
f1b058f9 1157libxfs_writebuf_int(xfs_buf_t *bp, int flags)
2bd0ea18 1158{
203d38cc
DC
1159 /*
1160 * Clear any error hanging over from reading the buffer. This prevents
1161 * subsequent reads after this write from seeing stale errors.
1162 */
1163 bp->b_error = 0;
6af7c1ea 1164 bp->b_flags &= ~LIBXFS_B_STALE;
f1b058f9
NS
1165 bp->b_flags |= (LIBXFS_B_DIRTY | flags);
1166 return 0;
1167}
1168
1169int
1170libxfs_writebuf(xfs_buf_t *bp, int flags)
1171{
e0607266
DC
1172#ifdef IO_DEBUG
1173 printf("%lx: %s: dirty blkno=%llu(%llu)\n",
1174 pthread_self(), __FUNCTION__,
1175 (long long)LIBXFS_BBTOOFF64(bp->b_bn),
1176 (long long)bp->b_bn);
1177#endif
203d38cc
DC
1178 /*
1179 * Clear any error hanging over from reading the buffer. This prevents
1180 * subsequent reads after this write from seeing stale errors.
1181 */
1182 bp->b_error = 0;
6af7c1ea 1183 bp->b_flags &= ~LIBXFS_B_STALE;
f1b058f9
NS
1184 bp->b_flags |= (LIBXFS_B_DIRTY | flags);
1185 libxfs_putbuf(bp);
1186 return 0;
2bd0ea18
NS
1187}
1188
57c9fccb 1189void
f1b058f9 1190libxfs_iomove(xfs_buf_t *bp, uint boff, int len, void *data, int flags)
57c9fccb 1191{
f1b058f9
NS
1192#ifdef IO_DEBUG
1193 if (boff + len > bp->b_bcount) {
2556c98b 1194 printf("Badness, iomove out of range!\n"
a2ceac1f 1195 "bp=(bno 0x%llx, bytes %u) range=(boff %u, bytes %u)\n",
5dfa5cd2 1196 (long long)bp->b_bn, bp->b_bcount, boff, len);
57c9fccb 1197 abort();
f1b058f9
NS
1198 }
1199#endif
57c9fccb
NS
1200 switch (flags) {
1201 case LIBXFS_BZERO:
f1b058f9 1202 memset(bp->b_addr + boff, 0, len);
57c9fccb
NS
1203 break;
1204 case LIBXFS_BREAD:
f1b058f9 1205 memcpy(data, bp->b_addr + boff, len);
57c9fccb
NS
1206 break;
1207 case LIBXFS_BWRITE:
f1b058f9 1208 memcpy(bp->b_addr + boff, data, len);
57c9fccb
NS
1209 break;
1210 }
1211}
1212
33165ec3 1213static void
0a7942b3
DC
1214libxfs_brelse(
1215 struct cache_node *node)
33165ec3 1216{
0a7942b3 1217 struct xfs_buf *bp = (struct xfs_buf *)node;
33165ec3 1218
0a7942b3
DC
1219 if (!bp)
1220 return;
1221 if (bp->b_flags & LIBXFS_B_DIRTY)
1222 fprintf(stderr,
1223 "releasing dirty buffer to free list!");
1224
1225 pthread_mutex_lock(&xfs_buf_freelist.cm_mutex);
1226 list_add(&bp->b_node.cn_mru, &xfs_buf_freelist.cm_list);
1227 pthread_mutex_unlock(&xfs_buf_freelist.cm_mutex);
33165ec3
BN
1228}
1229
e08f5594 1230static unsigned int
69ec88b5 1231libxfs_bulkrelse(
0a7942b3
DC
1232 struct cache *cache,
1233 struct list_head *list)
2556c98b 1234{
69ec88b5 1235 xfs_buf_t *bp;
e08f5594 1236 int count = 0;
2556c98b 1237
69ec88b5 1238 if (list_empty(list))
e08f5594 1239 return 0 ;
69ec88b5
BN
1240
1241 list_for_each_entry(bp, list, b_node.cn_mru) {
2556c98b 1242 if (bp->b_flags & LIBXFS_B_DIRTY)
0a7942b3
DC
1243 fprintf(stderr,
1244 "releasing dirty buffer (bulk) to free list!");
e08f5594 1245 count++;
2556c98b 1246 }
69ec88b5
BN
1247
1248 pthread_mutex_lock(&xfs_buf_freelist.cm_mutex);
0b90dda6 1249 list_splice(list, &xfs_buf_freelist.cm_list);
69ec88b5 1250 pthread_mutex_unlock(&xfs_buf_freelist.cm_mutex);
e08f5594
BN
1251
1252 return count;
69ec88b5
BN
1253}
1254
864028ed
ES
1255/*
1256 * Free everything from the xfs_buf_freelist MRU, used at final teardown
1257 */
1258void
1259libxfs_bcache_free(void)
1260{
1261 struct list_head *cm_list;
1262 xfs_buf_t *bp, *next;
1263
1264 cm_list = &xfs_buf_freelist.cm_list;
1265 list_for_each_entry_safe(bp, next, cm_list, b_node.cn_mru) {
1266 free(bp->b_addr);
1267 if (bp->b_maps != &bp->__b_map)
1268 free(bp->b_maps);
1269 kmem_zone_free(xfs_buf_zone, bp);
1270 }
1271}
1272
e8f1e8aa
DC
1273/*
1274 * When a buffer is marked dirty, the error is cleared. Hence if we are trying
1275 * to flush a buffer prior to cache reclaim that has an error on it it means
1276 * we've already tried to flush it and it failed. Prevent repeated corruption
1277 * errors from being reported by skipping such buffers - when the corruption is
1278 * fixed the buffer will be marked dirty again and we can write it again.
1279 */
0a7942b3
DC
1280static int
1281libxfs_bflush(
1282 struct cache_node *node)
69ec88b5 1283{
0a7942b3 1284 struct xfs_buf *bp = (struct xfs_buf *)node;
69ec88b5 1285
e8f1e8aa 1286 if (!bp->b_error && bp->b_flags & LIBXFS_B_DIRTY)
0a7942b3 1287 return libxfs_writebufr(bp);
e8f1e8aa 1288 return bp->b_error;
2556c98b
BN
1289}
1290
1291void
1292libxfs_putbufr(xfs_buf_t *bp)
1293{
0a7942b3
DC
1294 if (bp->b_flags & LIBXFS_B_DIRTY)
1295 libxfs_writebufr(bp);
2556c98b
BN
1296 libxfs_brelse((struct cache_node *)bp);
1297}
1298
1299
f1b058f9
NS
1300void
1301libxfs_bcache_purge(void)
1302{
1303 cache_purge(libxfs_bcache);
1304}
1305
e8cb94ee 1306void
33165ec3
BN
1307libxfs_bcache_flush(void)
1308{
1309 cache_flush(libxfs_bcache);
1310}
1311
2556c98b
BN
1312int
1313libxfs_bcache_overflowed(void)
1314{
1315 return cache_overflowed(libxfs_bcache);
1316}
1317
f1b058f9 1318struct cache_operations libxfs_bcache_operations = {
bd9cc49a
ES
1319 .hash = libxfs_bhash,
1320 .alloc = libxfs_balloc,
1321 .flush = libxfs_bflush,
1322 .relse = libxfs_brelse,
1323 .compare = libxfs_bcompare,
1324 .bulkrelse = libxfs_bulkrelse
f1b058f9
NS
1325};
1326
2bd0ea18 1327
f1b058f9 1328/*
3a19fb7d 1329 * Inode cache stubs.
f1b058f9
NS
1330 */
1331
bf0e024f 1332kmem_zone_t *xfs_inode_zone;
5e656dbb 1333extern kmem_zone_t *xfs_ili_zone;
f1b058f9 1334
20e882d4
DW
1335/*
1336 * If there are inline format data / attr forks attached to this inode,
1337 * make sure they're not corrupt.
1338 */
1339bool
1340libxfs_inode_verify_forks(
12ac6e04
DW
1341 struct xfs_inode *ip,
1342 struct xfs_ifork_ops *ops)
20e882d4 1343{
1d3bac1f 1344 struct xfs_ifork *ifp;
20e882d4
DW
1345 xfs_failaddr_t fa;
1346
12ac6e04
DW
1347 if (!ops)
1348 return true;
1349
1350 fa = xfs_ifork_verify_data(ip, ops);
20e882d4 1351 if (fa) {
1d3bac1f
DW
1352 ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
1353 xfs_inode_verifier_error(ip, -EFSCORRUPTED, "data fork",
1354 ifp->if_u1.if_data, ifp->if_bytes, fa);
20e882d4
DW
1355 return false;
1356 }
1357
12ac6e04 1358 fa = xfs_ifork_verify_attr(ip, ops);
20e882d4 1359 if (fa) {
1d3bac1f
DW
1360 ifp = XFS_IFORK_PTR(ip, XFS_ATTR_FORK);
1361 xfs_inode_verifier_error(ip, -EFSCORRUPTED, "attr fork",
1362 ifp ? ifp->if_u1.if_data : NULL,
1363 ifp ? ifp->if_bytes : 0, fa);
20e882d4
DW
1364 return false;
1365 }
1366 return true;
1367}
1368
2bd0ea18 1369int
12ac6e04
DW
1370libxfs_iget(
1371 struct xfs_mount *mp,
1372 struct xfs_trans *tp,
1373 xfs_ino_t ino,
1374 uint lock_flags,
1375 struct xfs_inode **ipp,
1376 struct xfs_ifork_ops *ifork_ops)
2bd0ea18 1377{
12ac6e04
DW
1378 struct xfs_inode *ip;
1379 int error = 0;
2bd0ea18 1380
3a19fb7d
CH
1381 ip = kmem_zone_zalloc(xfs_inode_zone, 0);
1382 if (!ip)
12b53197 1383 return -ENOMEM;
2bd0ea18 1384
3a19fb7d
CH
1385 ip->i_ino = ino;
1386 ip->i_mount = mp;
81f8132a 1387 error = xfs_iread(mp, tp, ip, 0);
3a19fb7d
CH
1388 if (error) {
1389 kmem_zone_free(xfs_inode_zone, ip);
1390 *ipp = NULL;
1391 return error;
1392 }
f1b058f9 1393
12ac6e04 1394 if (!libxfs_inode_verify_forks(ip, ifork_ops)) {
31845e4c 1395 libxfs_irele(ip);
20e882d4
DW
1396 return -EFSCORRUPTED;
1397 }
1398
ff105f75
DC
1399 /*
1400 * set up the inode ops structure that the libxfs code relies on
1401 */
e37bf53c 1402 if (XFS_ISDIR(ip))
ff105f75
DC
1403 ip->d_ops = mp->m_dir_inode_ops;
1404 else
1405 ip->d_ops = mp->m_nondir_inode_ops;
1406
3a19fb7d
CH
1407 *ipp = ip;
1408 return 0;
f1b058f9
NS
1409}
1410
1411static void
014e5f6d
ES
1412libxfs_idestroy(xfs_inode_t *ip)
1413{
e37bf53c 1414 switch (VFS_I(ip)->i_mode & S_IFMT) {
014e5f6d
ES
1415 case S_IFREG:
1416 case S_IFDIR:
1417 case S_IFLNK:
1418 libxfs_idestroy_fork(ip, XFS_DATA_FORK);
1419 break;
1420 }
1421 if (ip->i_afp)
1422 libxfs_idestroy_fork(ip, XFS_ATTR_FORK);
a90b9ad2
DW
1423 if (ip->i_cowfp)
1424 xfs_idestroy_fork(ip, XFS_COW_FORK);
014e5f6d
ES
1425}
1426
2bd0ea18 1427void
31845e4c
DW
1428libxfs_irele(
1429 struct xfs_inode *ip)
2bd0ea18 1430{
3a19fb7d
CH
1431 if (ip->i_itemp)
1432 kmem_zone_free(xfs_ili_zone, ip->i_itemp);
1433 ip->i_itemp = NULL;
1434 libxfs_idestroy(ip);
1435 kmem_zone_free(xfs_inode_zone, ip);
2bd0ea18 1436}