// SPDX-License-Identifier: GPL-2.0+
/*
 * Copyright (C) 2018 Oracle.  All Rights Reserved.
 * Author: Darrick J. Wong <darrick.wong@oracle.com>
 */
#include "xfs.h"
#include <stdint.h>
#include <stdlib.h>
#include <sys/statvfs.h>
#include "libfrog/ptvar.h"
#include "libfrog/workqueue.h"
#include "libfrog/paths.h"
#include "xfs_scrub.h"
#include "common.h"
#include "counter.h"
#include "disk.h"
#include "read_verify.h"
/*
 * Manages the data block read verification phase.  The caller schedules
 * verification requests, which are then scheduled to be run by a thread
 * pool worker.  Adjacent (or nearly adjacent) requests can be combined
 * to reduce overhead when free space fragmentation is high.  The thread
 * pool takes care of issuing multiple IOs to the device, if possible.
 */
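/*
 * Rough usage sketch (based on the functions defined below): a caller
 * creates a pool with read_verify_pool_alloc(), feeds byte ranges to
 * read_verify_schedule_io(), then calls read_verify_force_io() to push out
 * any per-thread stashed requests and read_verify_pool_flush() to wait for
 * the workers.  read_verify_bytes() reports how much has been verified,
 * read_verify_pool_abort() cancels outstanding work, and
 * read_verify_pool_destroy() frees everything.
 */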
/*
 * Perform all IO in 32M chunks.  This cannot exceed 65536 sectors
 * because that's the biggest SCSI VERIFY(16) we dare to send.
 */
#define RVP_IO_MAX_SIZE			(33554432)
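/* (With 512-byte sectors, 65536 sectors works out to 33554432 bytes, i.e. 32 MiB.) */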
/*
 * If we're running in the background then we perform IO in 128k chunks
 * to reduce the load on the IO subsystem.
 */
#define RVP_BACKGROUND_IO_MAX_SIZE	(131072)
/* What's the real maximum IO size? */
static inline unsigned int
rvp_io_max_size(void)
{
	return bg_mode > 0 ? RVP_BACKGROUND_IO_MAX_SIZE : RVP_IO_MAX_SIZE;
}
/* Tolerate 64k holes in adjacent read verify requests. */
#define RVP_IO_BATCH_LOCALITY	(65536)
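/*
 * (This tolerance is applied in read_verify_schedule_io() below when deciding
 * whether a new request can be merged into the per-thread stashed request.)
 */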
/* A single read verification request. */
struct read_verify {
	void			*io_end_arg;	/* passed to the io error callback */
	uint64_t		io_start;	/* bytes */
	uint64_t		io_length;	/* bytes */
};
struct read_verify_pool {
	struct workqueue	wq;		/* thread pool */
	struct scrub_ctx	*ctx;		/* scrub context */
	void			*readbuf;	/* read buffer */
	struct ptcounter	*verified_bytes;
	struct ptvar		*rvstate;	/* combines read requests */
	struct disk		*disk;		/* which disk? */
	read_verify_ioerr_fn_t	ioerr_fn;	/* io error callback */
	size_t			miniosz;	/* minimum io size, bytes */

	/*
	 * Store a runtime error code here so that we can stop the pool and
	 * return it to the caller.
	 */
	int			runtime_error;
};
/*
 * Create a thread pool to run read verifiers.
 *
 * @disk is the disk we want to verify.
 * @miniosz is the minimum size of an IO to expect (in bytes).
 * @ioerr_fn will be called when IO errors occur.
 * @submitter_threads is the number of threads that may be sending verify
 * requests at any given time.
 */
int
read_verify_pool_alloc(
	struct scrub_ctx		*ctx,
	struct disk			*disk,
	size_t				miniosz,
	read_verify_ioerr_fn_t		ioerr_fn,
	unsigned int			submitter_threads,
	struct read_verify_pool		**prvp)
{
	struct read_verify_pool		*rvp;
	unsigned int			verifier_threads = disk_heads(disk);
	int				ret;

	/*
	 * The minimum IO size must be a multiple of the disk sector size
	 * and a factor of the max io size.
	 */
	if (miniosz % disk->d_lbasize)
		return EINVAL;
	if (rvp_io_max_size() % miniosz)
		return EINVAL;

	rvp = calloc(1, sizeof(struct read_verify_pool));
	if (!rvp)
		return errno;

	ret = posix_memalign((void **)&rvp->readbuf, page_size,
			rvp_io_max_size());
	if (ret)
		goto out_free;
	ret = ptcounter_alloc(verifier_threads, &rvp->verified_bytes);
	if (ret)
		goto out_buf;
	rvp->miniosz = miniosz;
	rvp->ctx = ctx;
	rvp->disk = disk;
	rvp->ioerr_fn = ioerr_fn;
	ret = -ptvar_alloc(submitter_threads, sizeof(struct read_verify),
			&rvp->rvstate);
	if (ret)
		goto out_counter;
	ret = -workqueue_create(&rvp->wq, (struct xfs_mount *)rvp,
			verifier_threads == 1 ? 0 : verifier_threads);
	if (ret)
		goto out_rvstate;
	*prvp = rvp;
	return 0;

out_rvstate:
	ptvar_free(rvp->rvstate);
out_counter:
	ptcounter_free(rvp->verified_bytes);
out_buf:
	free(rvp->readbuf);
out_free:
	free(rvp);
	return ret;
}
/* Abort all verification work. */
void
read_verify_pool_abort(
	struct read_verify_pool		*rvp)
{
	if (!rvp->runtime_error)
		rvp->runtime_error = ECANCELED;
	workqueue_terminate(&rvp->wq);
}
/* Finish up any read verification work. */
int
read_verify_pool_flush(
	struct read_verify_pool		*rvp)
{
	return -workqueue_terminate(&rvp->wq);
}
/* Finish up any read verification work and tear it down. */
void
read_verify_pool_destroy(
	struct read_verify_pool		*rvp)
{
	workqueue_destroy(&rvp->wq);
	ptvar_free(rvp->rvstate);
	ptcounter_free(rvp->verified_bytes);
	free(rvp->readbuf);
	free(rvp);
}
/*
 * Issue a read-verify IO in big batches.
 */
static void
read_verify(
	struct workqueue		*wq,
	xfs_agnumber_t			agno,
	void				*arg)
{
	struct read_verify		*rv = arg;
	struct read_verify_pool		*rvp;
	unsigned long long		verified = 0;
	ssize_t				io_max_size;
	ssize_t				sz;
	ssize_t				len;
	int				read_error;
	int				ret;

	rvp = (struct read_verify_pool *)wq->wq_ctx;
	if (rvp->runtime_error)
		return;

	io_max_size = rvp_io_max_size();

	while (rv->io_length > 0) {
		len = min(rv->io_length, io_max_size);
		dbg_printf("diskverify %d %"PRIu64" %zu\n", rvp->disk->d_fd,
				rv->io_start, len);
		sz = disk_read_verify(rvp->disk, rvp->readbuf, rv->io_start,
				len);
		if (sz == len && io_max_size < rvp->miniosz) {
			/*
			 * If the verify request was 100% successful and less
			 * than a single block in length, we were trying to
			 * read to the end of a block after a short read.  That
			 * suggests there's something funny with this device,
			 * so single-step our way through the rest of the @rv
			 * range.
			 */
			io_max_size = rvp->miniosz;
		} else if (sz < 0) {
			read_error = errno;

			/* Runtime error, bail out... */
			if (read_error != EIO && read_error != EILSEQ) {
				rvp->runtime_error = read_error;
				break;
			}

			/*
			 * A direct read encountered an error while performing
			 * a multi-block read.  Reduce the transfer size to a
			 * single block so that we can identify the exact range
			 * of bad blocks and good blocks.  We single-step all
			 * the way to the end of the @rv range, (re)starting
			 * with the block that just failed.
			 */
			if (io_max_size > rvp->miniosz) {
				io_max_size = rvp->miniosz;
				continue;
			}

			/*
			 * A direct read hit an error while we were stepping
			 * through single blocks.  Mark everything bad from
			 * io_start to the next miniosz block.
			 */
			sz = rvp->miniosz - (rv->io_start % rvp->miniosz);
			dbg_printf("IOERR %d @ %"PRIu64" %zu err %d\n",
					rvp->disk->d_fd, rv->io_start, sz,
					read_error);
			rvp->ioerr_fn(rvp->ctx, rvp->disk, rv->io_start, sz,
					read_error, rv->io_end_arg);
		} else if (sz < len) {
			/*
			 * A short direct read suggests that we might have hit
			 * an IO error midway through the read but still had to
			 * return the number of bytes that were actually read.
			 *
			 * We need to force an EIO, so try reading the rest of
			 * the block (if it was a partial block read) or the
			 * next full block.
			 */
			io_max_size = rvp->miniosz - (sz % rvp->miniosz);
			dbg_printf("SHORT %d READ @ %"PRIu64" %zu try for %zd\n",
					rvp->disk->d_fd, rv->io_start, sz,
					io_max_size);
		} else {
			/* We should never get back more bytes than we asked. */
			assert(sz == len);
		}

		verified += sz;
		rv->io_start += sz;
		rv->io_length -= sz;
	}

	free(rv);
	ret = ptcounter_add(rvp->verified_bytes, verified);
	if (ret)
		rvp->runtime_error = ret;
}
/* Queue a read verify request. */
static int
read_verify_queue(
	struct read_verify_pool		*rvp,
	struct read_verify		*rv)
{
	struct read_verify		*tmp;
	int				ret;

	dbg_printf("verify fd %d start %"PRIu64" len %"PRIu64"\n",
			rvp->disk->d_fd, rv->io_start, rv->io_length);

	/* Worker thread saw a runtime error, don't queue more. */
	if (rvp->runtime_error)
		return rvp->runtime_error;

	/* Otherwise clone the request and queue the copy. */
	tmp = malloc(sizeof(struct read_verify));
	if (!tmp) {
		rvp->runtime_error = errno;
		return errno;
	}

	memcpy(tmp, rv, sizeof(*tmp));

	ret = -workqueue_add(&rvp->wq, read_verify, 0, tmp);
	if (ret) {
		free(tmp);
		rvp->runtime_error = ret;
	}
	return ret;
}
/*
 * Issue an IO request.  We'll batch subsequent requests if they're
 * within 64k of each other
 */
int
read_verify_schedule_io(
	struct read_verify_pool		*rvp,
	uint64_t			start,
	uint64_t			length,
	void				*end_arg)
{
	struct read_verify		*rv;
	uint64_t			req_end;
	uint64_t			rv_end;
	int				ret;

	assert(rvp->readbuf);

	/* Round up and down to the start of a miniosz chunk. */
	start &= ~(rvp->miniosz - 1);
	length = roundup(length, rvp->miniosz);

	rv = ptvar_get(rvp->rvstate, &ret);
	if (ret)
		return -ret;
	req_end = start + length;
	rv_end = rv->io_start + rv->io_length;

	/*
	 * If we have a stashed IO, we haven't changed fds, the error
	 * reporting is the same, and the two extents are close,
	 * we can combine them.
	 */
	if (rv->io_length > 0 &&
	    end_arg == rv->io_end_arg &&
	    ((start >= rv->io_start && start <= rv_end + RVP_IO_BATCH_LOCALITY) ||
	     (rv->io_start >= start &&
	      rv->io_start <= req_end + RVP_IO_BATCH_LOCALITY))) {
		rv->io_start = min(rv->io_start, start);
		rv->io_length = max(req_end, rv_end) - rv->io_start;
	} else {
		/* Otherwise, issue the stashed IO (if there is one) */
		if (rv->io_length > 0) {
			int	res;

			res = read_verify_queue(rvp, rv);
			if (res)
				return res;
		}

		/* Stash the new IO. */
		rv->io_start = start;
		rv->io_length = length;
		rv->io_end_arg = end_arg;
	}

	return 0;
}
/* Force any per-thread stashed IOs into the verifier. */
static int
force_one_io(
	struct ptvar			*ptv,
	void				*data,
	void				*foreach_arg)
{
	struct read_verify_pool		*rvp = foreach_arg;
	struct read_verify		*rv = data;

	if (rv->io_length == 0)
		return 0;

	return -read_verify_queue(rvp, rv);
}
/* Force any stashed IOs into the verifier. */
int
read_verify_force_io(
	struct read_verify_pool		*rvp)
{
	assert(rvp->readbuf);

	return -ptvar_foreach(rvp->rvstate, force_one_io, rvp);
}
/* How many bytes has this process verified? */
int
read_verify_bytes(
	struct read_verify_pool		*rvp,
	uint64_t			*bytes_checked)
{
	return ptcounter_value(rvp->verified_bytes, bytes_checked);
}