]>
Commit | Line | Data |
---|---|---|
959ef981 | 1 | // SPDX-License-Identifier: GPL-2.0+ |
2000470d DW |
2 | /* |
3 | * Copyright (C) 2018 Oracle. All Rights Reserved. | |
2000470d | 4 | * Author: Darrick J. Wong <darrick.wong@oracle.com> |
2000470d | 5 | */ |
a440f877 | 6 | #include "xfs.h" |
2000470d | 7 | #include <stdint.h> |
2000470d DW |
8 | #include <stdlib.h> |
9 | #include <sys/statvfs.h> | |
14051909 | 10 | #include "libfrog/ptvar.h" |
56598728 | 11 | #include "libfrog/workqueue.h" |
42b4c8e8 | 12 | #include "libfrog/paths.h" |
2000470d DW |
13 | #include "xfs_scrub.h" |
14 | #include "common.h" | |
15 | #include "counter.h" | |
16 | #include "disk.h" | |
17 | #include "read_verify.h" | |
ed60d210 | 18 | #include "progress.h" |
2000470d DW |
19 | |
20 | /* | |
21 | * Read Verify Pool | |
22 | * | |
23 | * Manages the data block read verification phase. The caller schedules | |
24 | * verification requests, which are then scheduled to be run by a thread | |
25 | * pool worker. Adjacent (or nearly adjacent) requests can be combined | |
26 | * to reduce overhead when free space fragmentation is high. The thread | |
27 | * pool takes care of issuing multiple IOs to the device, if possible. | |
28 | */ | |
29 | ||
30 | /* | |
31 | * Perform all IO in 32M chunks. This cannot exceed 65536 sectors | |
32 | * because that's the biggest SCSI VERIFY(16) we dare to send. | |
33 | */ | |
34 | #define RVP_IO_MAX_SIZE (33554432) | |
16dbab1a DW |
35 | |
36 | /* | |
37 | * If we're running in the background then we perform IO in 128k chunks | |
38 | * to reduce the load on the IO subsystem. | |
39 | */ | |
40 | #define RVP_BACKGROUND_IO_MAX_SIZE (131072) | |
41 | ||
42 | /* What's the real maximum IO size? */ | |
43 | static inline unsigned int | |
44 | rvp_io_max_size(void) | |
45 | { | |
46 | return bg_mode > 0 ? RVP_BACKGROUND_IO_MAX_SIZE : RVP_IO_MAX_SIZE; | |
47 | } | |
2000470d DW |
48 | |
49 | /* Tolerate 64k holes in adjacent read verify requests. */ | |
50 | #define RVP_IO_BATCH_LOCALITY (65536) | |
51 | ||
41c08606 DW |
/*
 * A single read-verify request: a contiguous byte range to be read and
 * checked.  Requests are stashed per-thread (see rvstate) so that nearby
 * ranges can be merged before being queued.
 */
struct read_verify {
	void			*io_end_arg;	/* opaque arg for the ioerr callback */
	struct disk		*io_disk;	/* NOTE(review): not referenced in this
						 * file's visible code (workers use
						 * rvp->disk) — confirm before removal */
	uint64_t		io_start;	/* bytes */
	uint64_t		io_length;	/* bytes */
};
58 | ||
2000470d DW |
/* State for one pool of read-verify worker threads, all serving one disk. */
struct read_verify_pool {
	struct workqueue	wq;		/* thread pool */
	struct scrub_ctx	*ctx;		/* scrub context */
	void			*readbuf;	/* read buffer */
	struct ptcounter	*verified_bytes; /* per-thread count of bytes
						  * verified without error */
	struct ptvar		*rvstate;	/* combines read requests */
	struct disk		*disk;		/* which disk? */
	read_verify_ioerr_fn_t	ioerr_fn;	/* io error callback */
	size_t			miniosz;	/* minimum io size, bytes */

	/*
	 * Store a runtime error code here so that we can stop the pool and
	 * return it to the caller.
	 */
	int			runtime_error;
};
75 | ||
41c08606 DW |
/*
 * Create a thread pool to run read verifiers.
 *
 * @disk is the disk we want to verify.
 * @miniosz is the minimum size of an IO to expect (in bytes).
 * @ioerr_fn will be called when IO errors occur.
 * @submitter_threads is the number of threads that may be sending verify
 * requests at any given time.
 * @prvp receives the new pool on success.
 *
 * Returns 0 or a positive errno-style error code.
 */
int
read_verify_pool_alloc(
	struct scrub_ctx		*ctx,
	struct disk			*disk,
	size_t				miniosz,
	read_verify_ioerr_fn_t		ioerr_fn,
	unsigned int			submitter_threads,
	struct read_verify_pool		**prvp)
{
	struct read_verify_pool		*rvp;
	unsigned int			verifier_threads = disk_heads(disk);
	int				ret;

	/*
	 * The minimum IO size must be a multiple of the disk sector size
	 * and a factor of the max io size.
	 */
	if (miniosz % disk->d_lbasize)
		return EINVAL;
	if (rvp_io_max_size() % miniosz)
		return EINVAL;

	rvp = calloc(1, sizeof(struct read_verify_pool));
	if (!rvp)
		return errno;

	/* Read buffer must be page-aligned, presumably for direct IO. */
	ret = posix_memalign((void **)&rvp->readbuf, page_size,
			rvp_io_max_size());
	if (ret)
		goto out_free;
	ret = ptcounter_alloc(verifier_threads, &rvp->verified_bytes);
	if (ret)
		goto out_buf;
	rvp->miniosz = miniosz;
	rvp->ctx = ctx;
	rvp->disk = disk;
	rvp->ioerr_fn = ioerr_fn;
	/* libfrog calls are negated: they appear to return negative codes. */
	ret = -ptvar_alloc(submitter_threads, sizeof(struct read_verify),
			&rvp->rvstate);
	if (ret)
		goto out_counter;
	/*
	 * With a single head, pass 0 so the workqueue runs items
	 * in the caller's context — TODO confirm against libfrog/workqueue.
	 */
	ret = -workqueue_create(&rvp->wq, (struct xfs_mount *)rvp,
			verifier_threads == 1 ? 0 : verifier_threads);
	if (ret)
		goto out_rvstate;
	*prvp = rvp;
	return 0;

out_rvstate:
	ptvar_free(rvp->rvstate);
out_counter:
	ptcounter_free(rvp->verified_bytes);
out_buf:
	free(rvp->readbuf);
out_free:
	free(rvp);
	return ret;
}
143 | ||
4cd869e5 DW |
/*
 * Abort all verification work.
 *
 * Stash ECANCELED in the pool (unless a real runtime error already got
 * there first) so that queued workers bail out, then terminate the
 * workqueue.  NOTE(review): workqueue_terminate's return value is
 * ignored on this path — presumably acceptable during abort; confirm.
 */
void
read_verify_pool_abort(
	struct read_verify_pool	*rvp)
{
	if (!rvp->runtime_error)
		rvp->runtime_error = ECANCELED;
	workqueue_terminate(&rvp->wq);
}
153 | ||
2000470d | 154 | /* Finish up any read verification work. */ |
8cab77d3 | 155 | int |
2000470d DW |
156 | read_verify_pool_flush( |
157 | struct read_verify_pool *rvp) | |
158 | { | |
baed134d | 159 | return -workqueue_terminate(&rvp->wq); |
2000470d DW |
160 | } |
161 | ||
/*
 * Finish up any read verification work and tear it down.
 *
 * Frees every resource acquired by read_verify_pool_alloc; the workqueue
 * is destroyed first so no worker can touch the buffers being freed.
 * NOTE(review): callers are presumably expected to flush or abort before
 * destroying — confirm with call sites.
 */
void
read_verify_pool_destroy(
	struct read_verify_pool	*rvp)
{
	workqueue_destroy(&rvp->wq);
	ptvar_free(rvp->rvstate);
	ptcounter_free(rvp->verified_bytes);
	free(rvp->readbuf);
	free(rvp);
}
173 | ||
/*
 * Issue a read-verify IO in big batches.
 *
 * Workqueue worker: consumes (and frees) one struct read_verify request.
 * Reads proceed in io_max_size chunks; on a media error (EIO/EILSEQ) the
 * chunk size drops to miniosz so the bad range can be pinpointed and
 * reported via ioerr_fn.  Any other error stops the pool by setting
 * rvp->runtime_error.  @agno is unused here; the workqueue function
 * signature requires it.
 */
static void
read_verify(
	struct workqueue		*wq,
	xfs_agnumber_t			agno,
	void				*arg)
{
	struct read_verify		*rv = arg;
	struct read_verify_pool		*rvp;
	unsigned long long		verified = 0;
	ssize_t				io_max_size;
	ssize_t				sz;
	ssize_t				len;
	int				read_error;
	int				ret;

	rvp = (struct read_verify_pool *)wq->wq_ctx;
	/* Another worker already hit a fatal error; don't issue more IO. */
	if (rvp->runtime_error)
		return;

	io_max_size = rvp_io_max_size();

	while (rv->io_length > 0) {
		read_error = 0;
		len = min(rv->io_length, io_max_size);
		dbg_printf("diskverify %d %"PRIu64" %zu\n", rvp->disk->d_fd,
				rv->io_start, len);
		sz = disk_read_verify(rvp->disk, rvp->readbuf, rv->io_start,
				len);
		if (sz == len && io_max_size < rvp->miniosz) {
			/*
			 * If the verify request was 100% successful and less
			 * than a single block in length, we were trying to
			 * read to the end of a block after a short read.  That
			 * suggests there's something funny with this device,
			 * so single-step our way through the rest of the @rv
			 * range.
			 */
			io_max_size = rvp->miniosz;
		} else if (sz < 0) {
			read_error = errno;

			/* Runtime error, bail out... */
			if (read_error != EIO && read_error != EILSEQ) {
				rvp->runtime_error = read_error;
				return;
			}

			/*
			 * A direct read encountered an error while performing
			 * a multi-block read.  Reduce the transfer size to a
			 * single block so that we can identify the exact range
			 * of bad blocks and good blocks.  We single-step all
			 * the way to the end of the @rv range, (re)starting
			 * with the block that just failed.
			 */
			if (io_max_size > rvp->miniosz) {
				io_max_size = rvp->miniosz;
				continue;
			}

			/*
			 * A direct read hit an error while we were stepping
			 * through single blocks.  Mark everything bad from
			 * io_start to the next miniosz block.
			 */
			sz = rvp->miniosz - (rv->io_start % rvp->miniosz);
			dbg_printf("IOERR %d @ %"PRIu64" %zu err %d\n",
					rvp->disk->d_fd, rv->io_start, sz,
					read_error);
			rvp->ioerr_fn(rvp->ctx, rvp->disk, rv->io_start, sz,
					read_error, rv->io_end_arg);
		} else if (sz < len) {
			/*
			 * A short direct read suggests that we might have hit
			 * an IO error midway through the read but still had to
			 * return the number of bytes that were actually read.
			 *
			 * We need to force an EIO, so try reading the rest of
			 * the block (if it was a partial block read) or the
			 * next full block.
			 */
			io_max_size = rvp->miniosz - (sz % rvp->miniosz);
			dbg_printf("SHORT %d READ @ %"PRIu64" %zu try for %zd\n",
					rvp->disk->d_fd, rv->io_start, sz,
					io_max_size);
		} else {
			/* We should never get back more bytes than we asked. */
			assert(sz == len);
		}

		/* Advance past whatever we just read (or declared bad). */
		progress_add(sz);
		if (read_error == 0)
			verified += sz;
		rv->io_start += sz;
		rv->io_length -= sz;
		background_sleep();
	}

	free(rv);
	ret = ptcounter_add(rvp->verified_bytes, verified);
	if (ret)
		rvp->runtime_error = ret;
}
280 | ||
281 | /* Queue a read verify request. */ | |
8cab77d3 | 282 | static int |
2000470d DW |
283 | read_verify_queue( |
284 | struct read_verify_pool *rvp, | |
285 | struct read_verify *rv) | |
286 | { | |
287 | struct read_verify *tmp; | |
288 | bool ret; | |
289 | ||
290 | dbg_printf("verify fd %d start %"PRIu64" len %"PRIu64"\n", | |
f1bb1696 | 291 | rvp->disk->d_fd, rv->io_start, rv->io_length); |
2000470d | 292 | |
5c657f1e DW |
293 | /* Worker thread saw a runtime error, don't queue more. */ |
294 | if (rvp->runtime_error) | |
8cab77d3 | 295 | return rvp->runtime_error; |
5c657f1e DW |
296 | |
297 | /* Otherwise clone the request and queue the copy. */ | |
2000470d DW |
298 | tmp = malloc(sizeof(struct read_verify)); |
299 | if (!tmp) { | |
5c657f1e | 300 | rvp->runtime_error = errno; |
8cab77d3 | 301 | return errno; |
2000470d | 302 | } |
5c657f1e | 303 | |
2000470d DW |
304 | memcpy(tmp, rv, sizeof(*tmp)); |
305 | ||
baed134d | 306 | ret = -workqueue_add(&rvp->wq, read_verify, 0, tmp); |
2000470d | 307 | if (ret) { |
2000470d | 308 | free(tmp); |
5c657f1e | 309 | rvp->runtime_error = ret; |
8cab77d3 | 310 | return ret; |
2000470d | 311 | } |
8cab77d3 | 312 | |
2000470d | 313 | rv->io_length = 0; |
8cab77d3 | 314 | return 0; |
2000470d DW |
315 | } |
316 | ||
/*
 * Issue an IO request.  We'll batch subsequent requests if they're
 * within 64k of each other
 *
 * Returns 0 or a positive errno-style error code.  The range is widened
 * to miniosz granularity, then either merged into this thread's stashed
 * request or swapped with it (queueing the old one).
 */
int
read_verify_schedule_io(
	struct read_verify_pool		*rvp,
	uint64_t			start,
	uint64_t			length,
	void				*end_arg)
{
	struct read_verify		*rv;
	uint64_t			req_end;
	uint64_t			rv_end;
	int				ret;

	assert(rvp->readbuf);

	/*
	 * Round up and down to the start of a miniosz chunk.
	 * NOTE(review): the mask trick assumes miniosz is a power of two;
	 * read_verify_pool_alloc only checks that it is a multiple of the
	 * sector size — confirm callers always pass a power of two.
	 */
	start &= ~(rvp->miniosz - 1);
	length = roundup(length, rvp->miniosz);

	/* Per-submitter-thread stashed request; no locking needed. */
	rv = ptvar_get(rvp->rvstate, &ret);
	if (ret)
		return -ret;
	req_end = start + length;
	rv_end = rv->io_start + rv->io_length;

	/*
	 * If we have a stashed IO, we haven't changed fds, the error
	 * reporting is the same, and the two extents are close,
	 * we can combine them.
	 */
	if (rv->io_length > 0 &&
	    end_arg == rv->io_end_arg &&
	    ((start >= rv->io_start && start <= rv_end + RVP_IO_BATCH_LOCALITY) ||
	     (rv->io_start >= start &&
	      rv->io_start <= req_end + RVP_IO_BATCH_LOCALITY))) {
		rv->io_start = min(rv->io_start, start);
		rv->io_length = max(req_end, rv_end) - rv->io_start;
	} else {
		/* Otherwise, issue the stashed IO (if there is one) */
		if (rv->io_length > 0) {
			int	res;

			res = read_verify_queue(rvp, rv);
			if (res)
				return res;
		}

		/* Stash the new IO. */
		rv->io_start = start;
		rv->io_length = length;
		rv->io_end_arg = end_arg;
	}

	return 0;
}
375 | ||
22d658ec DW |
376 | /* Force any per-thread stashed IOs into the verifier. */ |
377 | static int | |
378 | force_one_io( | |
379 | struct ptvar *ptv, | |
380 | void *data, | |
381 | void *foreach_arg) | |
382 | { | |
383 | struct read_verify_pool *rvp = foreach_arg; | |
384 | struct read_verify *rv = data; | |
385 | ||
386 | if (rv->io_length == 0) | |
387 | return 0; | |
388 | ||
2f4422f4 | 389 | return -read_verify_queue(rvp, rv); |
22d658ec DW |
390 | } |
391 | ||
2000470d | 392 | /* Force any stashed IOs into the verifier. */ |
8cab77d3 | 393 | int |
2000470d | 394 | read_verify_force_io( |
41c08606 | 395 | struct read_verify_pool *rvp) |
2000470d | 396 | { |
2000470d | 397 | assert(rvp->readbuf); |
2000470d | 398 | |
2f4422f4 | 399 | return -ptvar_foreach(rvp->rvstate, force_one_io, rvp); |
2000470d DW |
400 | } |
401 | ||
/*
 * How many bytes has this process verified?
 *
 * Sums the per-thread counters into *@bytes_checked.  Returns whatever
 * ptcounter_value returns (0 on success, presumably).
 */
int
read_verify_bytes(
	struct read_verify_pool	*rvp,
	uint64_t		*bytes_checked)
{
	return ptcounter_value(rvp->verified_bytes, bytes_checked);
}