]>
Commit | Line | Data |
---|---|---|
1 | // SPDX-License-Identifier: GPL-2.0-or-later | |
2 | /* | |
3 | * Copyright (C) 2018-2024 Oracle. All Rights Reserved. | |
4 | * Author: Darrick J. Wong <djwong@kernel.org> | |
5 | */ | |
6 | #include "xfs.h" | |
7 | #include <stdint.h> | |
8 | #include <dirent.h> | |
9 | #include <sys/statvfs.h> | |
10 | #include <linux/fsmap.h> | |
11 | #include "handle.h" | |
12 | #include "libfrog/paths.h" | |
13 | #include "libfrog/workqueue.h" | |
14 | #include "xfs_scrub.h" | |
15 | #include "common.h" | |
16 | #include "libfrog/bitmap.h" | |
17 | #include "disk.h" | |
18 | #include "filemap.h" | |
19 | #include "fscounters.h" | |
20 | #include "inodes.h" | |
21 | #include "read_verify.h" | |
22 | #include "spacemap.h" | |
23 | #include "vfs.h" | |
24 | #include "common.h" | |
25 | #include "libfrog/bulkstat.h" | |
26 | ||
27 | /* | |
28 | * Phase 6: Verify data file integrity. | |
29 | * | |
30 | * Identify potential data block extents with GETFSMAP, then feed those | |
31 | * extents to the read-verify pool to get the verify commands batched, | |
32 | * issued, and (if there are problems) reported back to us. If there | |
33 | * are errors, we'll record the bad regions and (if available) use rmap | |
34 | * to tell us if metadata are now corrupt. Otherwise, we'll scan the | |
35 | * whole directory tree looking for files that overlap the bad regions | |
36 | * and report the paths of the now corrupt files. | |
37 | */ | |
38 | ||
39 | /* Verify disk blocks with GETFSMAP */ | |
40 | ||
/*
 * State shared across the media verification pass: one read-verify pool
 * per device, the bad-range bitmaps for the two devices tracked by
 * bitmap_for_disk(), and per-device flags recording that a zero-length
 * error was reported (later described to the user as a truncated device).
 */
struct media_verify_state {
	struct read_verify_pool	*rvp_data;	/* pool for the data device */
	struct read_verify_pool	*rvp_log;	/* pool for the log device */
	struct read_verify_pool	*rvp_realtime;	/* pool for the rt device */
	struct bitmap		*d_bad;		/* bytes */
	struct bitmap		*r_bad;		/* bytes */
	bool			d_trunc:1;	/* data device truncated */
	bool			r_trunc:1;	/* rt device truncated */
	bool			l_trunc:1;	/* log device truncated */
};
51 | ||
52 | /* Find the fd for a given device identifier. */ | |
53 | static struct read_verify_pool * | |
54 | dev_to_pool( | |
55 | struct scrub_ctx *ctx, | |
56 | struct media_verify_state *vs, | |
57 | dev_t dev) | |
58 | { | |
59 | if (ctx->mnt.fsgeom.rtstart) { | |
60 | if (dev == XFS_DEV_DATA) | |
61 | return vs->rvp_data; | |
62 | if (dev == XFS_DEV_LOG) | |
63 | return vs->rvp_log; | |
64 | if (dev == XFS_DEV_RT) | |
65 | return vs->rvp_realtime; | |
66 | } else { | |
67 | if (dev == ctx->fsinfo.fs_datadev) | |
68 | return vs->rvp_data; | |
69 | if (dev == ctx->fsinfo.fs_logdev) | |
70 | return vs->rvp_log; | |
71 | if (dev == ctx->fsinfo.fs_rtdev) | |
72 | return vs->rvp_realtime; | |
73 | } | |
74 | abort(); | |
75 | } | |
76 | ||
77 | /* Find the device major/minor for a given file descriptor. */ | |
78 | static dev_t | |
79 | disk_to_dev( | |
80 | struct scrub_ctx *ctx, | |
81 | struct disk *disk) | |
82 | { | |
83 | if (ctx->mnt.fsgeom.rtstart) { | |
84 | if (disk == ctx->datadev) | |
85 | return XFS_DEV_DATA; | |
86 | if (disk == ctx->logdev) | |
87 | return XFS_DEV_LOG; | |
88 | if (disk == ctx->rtdev) | |
89 | return XFS_DEV_RT; | |
90 | } else { | |
91 | if (disk == ctx->datadev) | |
92 | return ctx->fsinfo.fs_datadev; | |
93 | if (disk == ctx->logdev) | |
94 | return ctx->fsinfo.fs_logdev; | |
95 | if (disk == ctx->rtdev) | |
96 | return ctx->fsinfo.fs_rtdev; | |
97 | } | |
98 | abort(); | |
99 | } | |
100 | ||
101 | /* Find the incore bad blocks bitmap for a given disk. */ | |
102 | static struct bitmap * | |
103 | bitmap_for_disk( | |
104 | struct scrub_ctx *ctx, | |
105 | struct disk *disk, | |
106 | struct media_verify_state *vs) | |
107 | { | |
108 | if (disk == ctx->datadev) | |
109 | return vs->d_bad; | |
110 | if (disk == ctx->rtdev) | |
111 | return vs->r_bad; | |
112 | return NULL; | |
113 | } | |
114 | ||
/* Context handed to report_ioerr() while walking one disk's bad ranges. */
struct disk_ioerr_report {
	struct scrub_ctx	*ctx;	/* scrub context */
	struct disk		*disk;	/* disk whose errors are being reported */
};
119 | ||
/* One row of the special owner lookup table. */
struct owner_decode {
	uint64_t		owner;	/* special owner code */
	const char		*descr;	/* description; NULL ends the table */
};
124 | ||
125 | static const struct owner_decode special_owners[] = { | |
126 | {XFS_FMR_OWN_FREE, "free space"}, | |
127 | {XFS_FMR_OWN_UNKNOWN, "unknown owner"}, | |
128 | {XFS_FMR_OWN_FS, "static FS metadata"}, | |
129 | {XFS_FMR_OWN_LOG, "journalling log"}, | |
130 | {XFS_FMR_OWN_AG, "per-AG metadata"}, | |
131 | {XFS_FMR_OWN_INOBT, "inode btree blocks"}, | |
132 | {XFS_FMR_OWN_INODES, "inodes"}, | |
133 | {XFS_FMR_OWN_REFC, "refcount btree"}, | |
134 | {XFS_FMR_OWN_COW, "CoW staging"}, | |
135 | {XFS_FMR_OWN_DEFECTIVE, "bad blocks"}, | |
136 | {0, NULL}, | |
137 | }; | |
138 | ||
139 | /* Decode a special owner. */ | |
140 | static const char * | |
141 | decode_special_owner( | |
142 | uint64_t owner) | |
143 | { | |
144 | const struct owner_decode *od = special_owners; | |
145 | ||
146 | while (od->descr) { | |
147 | if (od->owner == owner) | |
148 | return od->descr; | |
149 | od++; | |
150 | } | |
151 | ||
152 | return NULL; | |
153 | } | |
154 | ||
155 | /* Routines to translate bad physical extents into file paths and offsets. */ | |
156 | ||
/* Context for reporting the bad ranges that overlap one file's mappings. */
struct badfile_report {
	struct scrub_ctx		*ctx;	/* scrub context */
	const char			*descr;	/* description of the file */
	struct media_verify_state	*vs;	/* holds the bad-range bitmaps */
	struct file_bmap		*bmap;	/* mapping currently examined */
};
163 | ||
164 | /* Report on bad extents found during a media scan. */ | |
165 | static int | |
166 | report_badfile( | |
167 | uint64_t start, | |
168 | uint64_t length, | |
169 | void *arg) | |
170 | { | |
171 | struct badfile_report *br = arg; | |
172 | unsigned long long bad_offset; | |
173 | unsigned long long bad_length; | |
174 | ||
175 | /* Clamp the bad region to the file mapping. */ | |
176 | if (start < br->bmap->bm_physical) { | |
177 | length -= br->bmap->bm_physical - start; | |
178 | start = br->bmap->bm_physical; | |
179 | } | |
180 | length = min(length, br->bmap->bm_length); | |
181 | ||
182 | /* Figure out how far into the bmap is the bad mapping and report it. */ | |
183 | bad_offset = start - br->bmap->bm_physical; | |
184 | bad_length = min(start + length, | |
185 | br->bmap->bm_physical + br->bmap->bm_length) - start; | |
186 | ||
187 | str_unfixable_error(br->ctx, br->descr, | |
188 | _("media error at data offset %llu length %llu."), | |
189 | br->bmap->bm_offset + bad_offset, bad_length); | |
190 | return 0; | |
191 | } | |
192 | ||
193 | /* Report if this extent overlaps a bad region. */ | |
194 | static int | |
195 | report_data_loss( | |
196 | struct scrub_ctx *ctx, | |
197 | int fd, | |
198 | int whichfork, | |
199 | struct fsxattr *fsx, | |
200 | struct file_bmap *bmap, | |
201 | void *arg) | |
202 | { | |
203 | struct badfile_report *br = arg; | |
204 | struct media_verify_state *vs = br->vs; | |
205 | struct bitmap *bmp; | |
206 | ||
207 | br->bmap = bmap; | |
208 | ||
209 | /* Only report errors for real extents. */ | |
210 | if (bmap->bm_flags & (BMV_OF_PREALLOC | BMV_OF_DELALLOC)) | |
211 | return 0; | |
212 | ||
213 | if (fsx->fsx_xflags & FS_XFLAG_REALTIME) | |
214 | bmp = vs->r_bad; | |
215 | else | |
216 | bmp = vs->d_bad; | |
217 | ||
218 | return -bitmap_iterate_range(bmp, bmap->bm_physical, bmap->bm_length, | |
219 | report_badfile, br); | |
220 | } | |
221 | ||
222 | /* Report if the extended attribute data overlaps a bad region. */ | |
223 | static int | |
224 | report_attr_loss( | |
225 | struct scrub_ctx *ctx, | |
226 | int fd, | |
227 | int whichfork, | |
228 | struct fsxattr *fsx, | |
229 | struct file_bmap *bmap, | |
230 | void *arg) | |
231 | { | |
232 | struct badfile_report *br = arg; | |
233 | struct media_verify_state *vs = br->vs; | |
234 | struct bitmap *bmp = vs->d_bad; | |
235 | ||
236 | /* Complain about attr fork extents that don't look right. */ | |
237 | if (bmap->bm_flags & (BMV_OF_PREALLOC | BMV_OF_DELALLOC)) { | |
238 | str_info(ctx, br->descr, | |
239 | _("found unexpected unwritten/delalloc attr fork extent.")); | |
240 | return 0; | |
241 | } | |
242 | ||
243 | if (fsx->fsx_xflags & FS_XFLAG_REALTIME) { | |
244 | str_info(ctx, br->descr, | |
245 | _("found unexpected realtime attr fork extent.")); | |
246 | return 0; | |
247 | } | |
248 | ||
249 | if (bitmap_test(bmp, bmap->bm_physical, bmap->bm_length)) | |
250 | str_corrupt(ctx, br->descr, | |
251 | _("media error in extended attribute data.")); | |
252 | ||
253 | return 0; | |
254 | } | |
255 | ||
256 | /* Iterate the extent mappings of a file to report errors. */ | |
257 | static int | |
258 | report_fd_loss( | |
259 | struct scrub_ctx *ctx, | |
260 | const char *descr, | |
261 | int fd, | |
262 | void *arg) | |
263 | { | |
264 | struct badfile_report br = { | |
265 | .ctx = ctx, | |
266 | .vs = arg, | |
267 | .descr = descr, | |
268 | }; | |
269 | struct file_bmap key = {0}; | |
270 | int ret; | |
271 | ||
272 | /* data fork */ | |
273 | ret = scrub_iterate_filemaps(ctx, fd, XFS_DATA_FORK, &key, | |
274 | report_data_loss, &br); | |
275 | if (ret) { | |
276 | str_liberror(ctx, ret, descr); | |
277 | return ret; | |
278 | } | |
279 | ||
280 | /* attr fork */ | |
281 | ret = scrub_iterate_filemaps(ctx, fd, XFS_ATTR_FORK, &key, | |
282 | report_attr_loss, &br); | |
283 | if (ret) { | |
284 | str_liberror(ctx, ret, descr); | |
285 | return ret; | |
286 | } | |
287 | ||
288 | return 0; | |
289 | } | |
290 | ||
291 | /* Report read verify errors in unlinked (but still open) files. */ | |
292 | static int | |
293 | report_inode_loss( | |
294 | struct scrub_ctx *ctx, | |
295 | struct xfs_handle *handle, | |
296 | struct xfs_bulkstat *bstat, | |
297 | void *arg) | |
298 | { | |
299 | char descr[DESCR_BUFSZ]; | |
300 | int fd; | |
301 | int error, err2; | |
302 | ||
303 | /* Ignore linked files and things we can't open. */ | |
304 | if (bstat->bs_nlink != 0) | |
305 | return 0; | |
306 | if (!S_ISREG(bstat->bs_mode) && !S_ISDIR(bstat->bs_mode)) | |
307 | return 0; | |
308 | ||
309 | scrub_render_ino_descr(ctx, descr, DESCR_BUFSZ, | |
310 | bstat->bs_ino, bstat->bs_gen, _("(unlinked)")); | |
311 | ||
312 | /* Try to open the inode. */ | |
313 | fd = scrub_open_handle(handle); | |
314 | if (fd < 0) { | |
315 | /* Handle is stale, try again. */ | |
316 | if (errno == ESTALE) | |
317 | return ESTALE; | |
318 | ||
319 | str_error(ctx, descr, | |
320 | _("Could not open to report read errors: %s."), | |
321 | strerror(errno)); | |
322 | return 0; | |
323 | } | |
324 | ||
325 | /* Go find the badness. */ | |
326 | error = report_fd_loss(ctx, descr, fd, arg); | |
327 | ||
328 | err2 = close(fd); | |
329 | if (err2) | |
330 | str_errno(ctx, descr); | |
331 | ||
332 | return error; | |
333 | } | |
334 | ||
/*
 * Scan a directory for matches in the read verify error list.  This is
 * the directory callback handed to scan_fs_tree(); it simply checks the
 * directory's own mappings through report_fd_loss().
 */
static int
report_dir_loss(
	struct scrub_ctx	*ctx,
	const char		*path,
	int			dir_fd,
	void			*arg)
{
	int			ret;

	ret = report_fd_loss(ctx, path, dir_fd, arg);
	return ret;
}
345 | ||
346 | /* | |
347 | * Scan the inode associated with a directory entry for matches with | |
348 | * the read verify error list. | |
349 | */ | |
350 | static int | |
351 | report_dirent_loss( | |
352 | struct scrub_ctx *ctx, | |
353 | const char *path, | |
354 | int dir_fd, | |
355 | struct dirent *dirent, | |
356 | struct stat *sb, | |
357 | void *arg) | |
358 | { | |
359 | int fd; | |
360 | int error, err2; | |
361 | ||
362 | /* Ignore things we can't open. */ | |
363 | if (!S_ISREG(sb->st_mode) && !S_ISDIR(sb->st_mode)) | |
364 | return 0; | |
365 | ||
366 | /* Ignore . and .. */ | |
367 | if (!strcmp(".", dirent->d_name) || !strcmp("..", dirent->d_name)) | |
368 | return 0; | |
369 | ||
370 | /* | |
371 | * If we were given a dirent, open the associated file under | |
372 | * dir_fd for badblocks scanning. If dirent is NULL, then it's | |
373 | * the directory itself we want to scan. | |
374 | */ | |
375 | fd = openat(dir_fd, dirent->d_name, | |
376 | O_RDONLY | O_NOATIME | O_NOFOLLOW | O_NOCTTY); | |
377 | if (fd < 0) { | |
378 | char descr[PATH_MAX + 1]; | |
379 | ||
380 | if (errno == ENOENT) | |
381 | return 0; | |
382 | ||
383 | snprintf(descr, PATH_MAX, "%s/%s", path, dirent->d_name); | |
384 | descr[PATH_MAX] = 0; | |
385 | ||
386 | str_error(ctx, descr, | |
387 | _("Could not open to report read errors: %s."), | |
388 | strerror(errno)); | |
389 | return 0; | |
390 | } | |
391 | ||
392 | /* Go find the badness. */ | |
393 | error = report_fd_loss(ctx, path, fd, arg); | |
394 | ||
395 | err2 = close(fd); | |
396 | if (err2) | |
397 | str_errno(ctx, path); | |
398 | if (!error && err2) | |
399 | error = err2; | |
400 | ||
401 | return error; | |
402 | } | |
403 | ||
/* A bad physical range, passed from report_ioerr() to report_ioerr_fsmap(). */
struct ioerr_filerange {
	uint64_t		physical;	/* start of the bad range */
	uint64_t		length;		/* length of the bad range */
};
408 | ||
409 | /* | |
410 | * If reverse mapping and parent pointers are enabled, we can map media errors | |
411 | * directly back to a filename and a file position without needing to walk the | |
412 | * directory tree. | |
413 | */ | |
414 | static inline bool | |
415 | can_use_pptrs( | |
416 | const struct scrub_ctx *ctx) | |
417 | { | |
418 | return (ctx->mnt.fsgeom.flags & XFS_FSOP_GEOM_FLAGS_PARENT) && | |
419 | (ctx->mnt.fsgeom.flags & XFS_FSOP_GEOM_FLAGS_RMAPBT); | |
420 | } | |
421 | ||
422 | /* Use a fsmap to report metadata lost to a media error. */ | |
423 | static int | |
424 | report_ioerr_fsmap( | |
425 | struct scrub_ctx *ctx, | |
426 | struct fsmap *map, | |
427 | void *arg) | |
428 | { | |
429 | const char *type; | |
430 | struct xfs_bulkstat bs = { }; | |
431 | char buf[DESCR_BUFSZ]; | |
432 | struct ioerr_filerange *fr = arg; | |
433 | uint64_t err_off; | |
434 | int ret; | |
435 | ||
436 | /* Don't care about unwritten extents. */ | |
437 | if (map->fmr_flags & FMR_OF_PREALLOC) | |
438 | return 0; | |
439 | ||
440 | if (fr->physical > map->fmr_physical) | |
441 | err_off = fr->physical - map->fmr_physical; | |
442 | else | |
443 | err_off = 0; | |
444 | ||
445 | /* Report special owners */ | |
446 | if (map->fmr_flags & FMR_OF_SPECIAL_OWNER) { | |
447 | snprintf(buf, DESCR_BUFSZ, _("disk offset %"PRIu64), | |
448 | (uint64_t)map->fmr_physical + err_off); | |
449 | type = decode_special_owner(map->fmr_owner); | |
450 | /* | |
451 | * On filesystems that don't store reverse mappings, the | |
452 | * GETFSMAP call returns OWNER_UNKNOWN for allocated space. | |
453 | * We'll have to let the directory tree walker find the file | |
454 | * that lost data. | |
455 | */ | |
456 | if (!(ctx->mnt.fsgeom.flags & XFS_FSOP_GEOM_FLAGS_RMAPBT) && | |
457 | map->fmr_owner == XFS_FMR_OWN_UNKNOWN) { | |
458 | str_info(ctx, buf, _("media error detected.")); | |
459 | } else { | |
460 | str_corrupt(ctx, buf, _("media error in %s."), type); | |
461 | } | |
462 | } | |
463 | ||
464 | if (can_use_pptrs(ctx)) { | |
465 | ret = -xfrog_bulkstat_single(&ctx->mnt, map->fmr_owner, 0, &bs); | |
466 | if (ret) | |
467 | str_liberror(ctx, ret, | |
468 | _("bulkstat for media error report")); | |
469 | } | |
470 | ||
471 | /* Report extent maps */ | |
472 | if (map->fmr_flags & FMR_OF_EXTENT_MAP) { | |
473 | bool attr = (map->fmr_flags & FMR_OF_ATTR_FORK); | |
474 | ||
475 | scrub_render_ino_descr(ctx, buf, DESCR_BUFSZ, | |
476 | map->fmr_owner, bs.bs_gen, " %s", | |
477 | attr ? _("extended attribute") : | |
478 | _("file data")); | |
479 | str_corrupt(ctx, buf, _("media error in extent map")); | |
480 | } | |
481 | ||
482 | /* | |
483 | * If directory parent pointers are available, use that to find the | |
484 | * pathname to a file, and report that path as having lost its | |
485 | * extended attributes, or the precise offset of the lost file data. | |
486 | */ | |
487 | if (!can_use_pptrs(ctx)) | |
488 | return 0; | |
489 | ||
490 | scrub_render_ino_descr(ctx, buf, DESCR_BUFSZ, map->fmr_owner, | |
491 | bs.bs_gen, NULL); | |
492 | ||
493 | if (map->fmr_flags & FMR_OF_ATTR_FORK) { | |
494 | str_corrupt(ctx, buf, _("media error in extended attributes")); | |
495 | return 0; | |
496 | } | |
497 | ||
498 | str_unfixable_error(ctx, buf, | |
499 | _("media error at data offset %llu length %llu."), | |
500 | err_off, fr->length); | |
501 | return 0; | |
502 | } | |
503 | ||
504 | /* | |
505 | * For a range of bad blocks, visit each space mapping that overlaps the bad | |
506 | * range so that we can report lost metadata. | |
507 | */ | |
508 | static int | |
509 | report_ioerr( | |
510 | uint64_t start, | |
511 | uint64_t length, | |
512 | void *arg) | |
513 | { | |
514 | struct fsmap keys[2] = { }; | |
515 | struct ioerr_filerange fr = { | |
516 | .physical = start, | |
517 | .length = length, | |
518 | }; | |
519 | struct disk_ioerr_report *dioerr = arg; | |
520 | ||
521 | /* Go figure out which blocks are bad from the fsmap. */ | |
522 | keys[0].fmr_device = disk_to_dev(dioerr->ctx, dioerr->disk); | |
523 | keys[0].fmr_physical = start; | |
524 | keys[1].fmr_device = keys[0].fmr_device; | |
525 | keys[1].fmr_physical = start + length - 1; | |
526 | keys[1].fmr_owner = ULLONG_MAX; | |
527 | keys[1].fmr_offset = ULLONG_MAX; | |
528 | keys[1].fmr_flags = UINT_MAX; | |
529 | return -scrub_iterate_fsmap(dioerr->ctx, keys, report_ioerr_fsmap, | |
530 | &fr); | |
531 | } | |
532 | ||
533 | /* Report all the media errors found on a disk. */ | |
534 | static int | |
535 | report_disk_ioerrs( | |
536 | struct scrub_ctx *ctx, | |
537 | struct disk *disk, | |
538 | struct media_verify_state *vs) | |
539 | { | |
540 | struct disk_ioerr_report dioerr = { | |
541 | .ctx = ctx, | |
542 | .disk = disk, | |
543 | }; | |
544 | struct bitmap *tree; | |
545 | ||
546 | if (!disk) | |
547 | return 0; | |
548 | tree = bitmap_for_disk(ctx, disk, vs); | |
549 | if (!tree) | |
550 | return 0; | |
551 | return -bitmap_iterate(tree, report_ioerr, &dioerr); | |
552 | } | |
553 | ||
554 | /* Given bad extent lists for the data & rtdev, find bad files. */ | |
555 | static int | |
556 | report_all_media_errors( | |
557 | struct scrub_ctx *ctx, | |
558 | struct media_verify_state *vs) | |
559 | { | |
560 | int ret; | |
561 | ||
562 | if (vs->d_trunc) | |
563 | str_corrupt(ctx, ctx->mntpoint, _("data device truncated")); | |
564 | if (vs->l_trunc) | |
565 | str_corrupt(ctx, ctx->mntpoint, _("log device truncated")); | |
566 | if (vs->r_trunc) | |
567 | str_corrupt(ctx, ctx->mntpoint, _("rt device truncated")); | |
568 | ||
569 | ret = report_disk_ioerrs(ctx, ctx->datadev, vs); | |
570 | if (ret) { | |
571 | str_liberror(ctx, ret, _("walking datadev io errors")); | |
572 | return ret; | |
573 | } | |
574 | ||
575 | ret = report_disk_ioerrs(ctx, ctx->rtdev, vs); | |
576 | if (ret) { | |
577 | str_liberror(ctx, ret, _("walking rtdev io errors")); | |
578 | return ret; | |
579 | } | |
580 | ||
581 | /* | |
582 | * Scan the directory tree to get file paths if we didn't already use | |
583 | * directory parent pointers to report the loss. If parent pointers | |
584 | * are enabled, report_ioerr_fsmap will have already reported file | |
585 | * paths that have lost file data and xattrs. | |
586 | */ | |
587 | if (can_use_pptrs(ctx)) | |
588 | return 0; | |
589 | ||
590 | ret = scan_fs_tree(ctx, report_dir_loss, report_dirent_loss, vs); | |
591 | if (ret) | |
592 | return ret; | |
593 | ||
594 | /* Scan for unlinked files. */ | |
595 | return scrub_scan_user_files(ctx, report_inode_loss, vs); | |
596 | } | |
597 | ||
598 | /* Schedule a read-verify of a (data block) extent. */ | |
599 | static int | |
600 | check_rmap( | |
601 | struct scrub_ctx *ctx, | |
602 | struct fsmap *map, | |
603 | void *arg) | |
604 | { | |
605 | struct media_verify_state *vs = arg; | |
606 | struct read_verify_pool *rvp; | |
607 | int ret; | |
608 | ||
609 | rvp = dev_to_pool(ctx, vs, map->fmr_device); | |
610 | ||
611 | dbg_printf("rmap dev %d:%d phys %"PRIu64" owner %"PRId64 | |
612 | " offset %"PRIu64" len %"PRIu64" flags 0x%x\n", | |
613 | major(map->fmr_device), minor(map->fmr_device), | |
614 | (uint64_t)map->fmr_physical, (int64_t)map->fmr_owner, | |
615 | (uint64_t)map->fmr_offset, (uint64_t)map->fmr_length, | |
616 | map->fmr_flags); | |
617 | ||
618 | /* "Unknown" extents should be verified; they could be data. */ | |
619 | if ((map->fmr_flags & FMR_OF_SPECIAL_OWNER) && | |
620 | map->fmr_owner == XFS_FMR_OWN_UNKNOWN) | |
621 | map->fmr_flags &= ~FMR_OF_SPECIAL_OWNER; | |
622 | ||
623 | /* | |
624 | * We only care about read-verifying data extents that have been | |
625 | * written to disk. This means we can skip "special" owners | |
626 | * (metadata), xattr blocks, unwritten extents, and extent maps. | |
627 | * These should all get checked elsewhere in the scrubber. | |
628 | */ | |
629 | if (map->fmr_flags & (FMR_OF_PREALLOC | FMR_OF_ATTR_FORK | | |
630 | FMR_OF_EXTENT_MAP | FMR_OF_SPECIAL_OWNER)) | |
631 | return 0; | |
632 | ||
633 | /* XXX: Filter out directory data blocks. */ | |
634 | ||
635 | /* Schedule the read verify command for (eventual) running. */ | |
636 | ret = read_verify_schedule_io(rvp, map->fmr_physical, map->fmr_length, | |
637 | vs); | |
638 | if (ret) { | |
639 | str_liberror(ctx, ret, _("scheduling media verify command")); | |
640 | return ret; | |
641 | } | |
642 | ||
643 | return 0; | |
644 | } | |
645 | ||
/*
 * Wait for read/verify actions to finish, add the number of bytes the pool
 * checked to *@bytes_checked, and tear the pool down.
 *
 * A NULL @rvp is a no-op.  The pool is destroyed on every path, so the
 * caller must not touch @rvp afterwards.  Returns 0 or the first error
 * encountered; *@bytes_checked is only updated on success.
 */
static int
clean_pool(
	struct read_verify_pool	*rvp,
	unsigned long long	*bytes_checked)
{
	uint64_t		pool_checked;
	int			ret;

	if (!rvp)
		return 0;

	ret = read_verify_force_io(rvp);
	if (ret) {
		/*
		 * Don't leak the pool: abort it before destroying it, which
		 * is the same sequence phase6_func uses on its error paths.
		 */
		read_verify_pool_abort(rvp);
		goto out_destroy;
	}

	ret = read_verify_pool_flush(rvp);
	if (ret)
		goto out_destroy;

	ret = read_verify_bytes(rvp, &pool_checked);
	if (ret)
		goto out_destroy;

	*bytes_checked += pool_checked;
out_destroy:
	read_verify_pool_destroy(rvp);
	return ret;
}
675 | ||
676 | /* Remember a media error for later. */ | |
677 | static void | |
678 | remember_ioerr( | |
679 | struct scrub_ctx *ctx, | |
680 | struct disk *disk, | |
681 | uint64_t start, | |
682 | uint64_t length, | |
683 | int error, | |
684 | void *arg) | |
685 | { | |
686 | struct media_verify_state *vs = arg; | |
687 | struct bitmap *tree; | |
688 | int ret; | |
689 | ||
690 | if (!length) { | |
691 | if (disk == ctx->datadev) | |
692 | vs->d_trunc = true; | |
693 | else if (disk == ctx->logdev) | |
694 | vs->l_trunc = true; | |
695 | else if (disk == ctx->rtdev) | |
696 | vs->r_trunc = true; | |
697 | return; | |
698 | } | |
699 | ||
700 | tree = bitmap_for_disk(ctx, disk, vs); | |
701 | if (!tree) { | |
702 | str_liberror(ctx, ENOENT, _("finding bad block bitmap")); | |
703 | return; | |
704 | } | |
705 | ||
706 | ret = -bitmap_set(tree, start, length); | |
707 | if (ret) | |
708 | str_liberror(ctx, ret, _("setting bad block bitmap")); | |
709 | } | |
710 | ||
711 | /* | |
712 | * Read verify all the file data blocks in a filesystem. Since XFS doesn't | |
713 | * do data checksums, we trust that the underlying storage will pass back | |
714 | * an IO error if it can't retrieve whatever we previously stored there. | |
715 | * If we hit an IO error, we'll record the bad blocks in a bitmap and then | |
716 | * scan the extent maps of the entire fs tree to figure (and the unlinked | |
717 | * inodes) out which files are now broken. | |
718 | */ | |
719 | int | |
720 | phase6_func( | |
721 | struct scrub_ctx *ctx) | |
722 | { | |
723 | struct media_verify_state vs = { NULL }; | |
724 | int ret, ret2, ret3; | |
725 | ||
726 | ret = -bitmap_alloc(&vs.d_bad); | |
727 | if (ret) { | |
728 | str_liberror(ctx, ret, _("creating datadev badblock bitmap")); | |
729 | return ret; | |
730 | } | |
731 | ||
732 | ret = -bitmap_alloc(&vs.r_bad); | |
733 | if (ret) { | |
734 | str_liberror(ctx, ret, _("creating realtime badblock bitmap")); | |
735 | goto out_dbad; | |
736 | } | |
737 | ||
738 | ret = read_verify_pool_alloc(ctx, ctx->datadev, | |
739 | ctx->mnt.fsgeom.blocksize, remember_ioerr, | |
740 | scrub_nproc(ctx), &vs.rvp_data); | |
741 | if (ret) { | |
742 | str_liberror(ctx, ret, _("creating datadev media verifier")); | |
743 | goto out_rbad; | |
744 | } | |
745 | if (ctx->logdev) { | |
746 | ret = read_verify_pool_alloc(ctx, ctx->logdev, | |
747 | ctx->mnt.fsgeom.blocksize, remember_ioerr, | |
748 | scrub_nproc(ctx), &vs.rvp_log); | |
749 | if (ret) { | |
750 | str_liberror(ctx, ret, | |
751 | _("creating logdev media verifier")); | |
752 | goto out_datapool; | |
753 | } | |
754 | } | |
755 | if (ctx->rtdev) { | |
756 | ret = read_verify_pool_alloc(ctx, ctx->rtdev, | |
757 | ctx->mnt.fsgeom.blocksize, remember_ioerr, | |
758 | scrub_nproc(ctx), &vs.rvp_realtime); | |
759 | if (ret) { | |
760 | str_liberror(ctx, ret, | |
761 | _("creating rtdev media verifier")); | |
762 | goto out_logpool; | |
763 | } | |
764 | } | |
765 | ret = scrub_scan_all_spacemaps(ctx, check_rmap, &vs); | |
766 | if (ret) | |
767 | goto out_rtpool; | |
768 | ||
769 | ret = clean_pool(vs.rvp_data, &ctx->bytes_checked); | |
770 | if (ret) | |
771 | str_liberror(ctx, ret, _("flushing datadev verify pool")); | |
772 | ||
773 | ret2 = clean_pool(vs.rvp_log, &ctx->bytes_checked); | |
774 | if (ret2) | |
775 | str_liberror(ctx, ret2, _("flushing logdev verify pool")); | |
776 | ||
777 | ret3 = clean_pool(vs.rvp_realtime, &ctx->bytes_checked); | |
778 | if (ret3) | |
779 | str_liberror(ctx, ret3, _("flushing rtdev verify pool")); | |
780 | ||
781 | /* | |
782 | * If the verify flush didn't work or we found no bad blocks, we're | |
783 | * done! No errors detected. | |
784 | */ | |
785 | if (ret || ret2 || ret3) | |
786 | goto out_rbad; | |
787 | if (bitmap_empty(vs.d_bad) && bitmap_empty(vs.r_bad)) | |
788 | goto out_rbad; | |
789 | ||
790 | /* Scan the whole dir tree to see what matches the bad extents. */ | |
791 | ret = report_all_media_errors(ctx, &vs); | |
792 | ||
793 | bitmap_free(&vs.r_bad); | |
794 | bitmap_free(&vs.d_bad); | |
795 | return ret; | |
796 | ||
797 | out_rtpool: | |
798 | if (vs.rvp_realtime) { | |
799 | read_verify_pool_abort(vs.rvp_realtime); | |
800 | read_verify_pool_destroy(vs.rvp_realtime); | |
801 | } | |
802 | out_logpool: | |
803 | if (vs.rvp_log) { | |
804 | read_verify_pool_abort(vs.rvp_log); | |
805 | read_verify_pool_destroy(vs.rvp_log); | |
806 | } | |
807 | out_datapool: | |
808 | read_verify_pool_abort(vs.rvp_data); | |
809 | read_verify_pool_destroy(vs.rvp_data); | |
810 | out_rbad: | |
811 | bitmap_free(&vs.r_bad); | |
812 | out_dbad: | |
813 | bitmap_free(&vs.d_bad); | |
814 | return ret; | |
815 | } | |
816 | ||
817 | /* Estimate how much work we're going to do. */ | |
818 | int | |
819 | phase6_estimate( | |
820 | struct scrub_ctx *ctx, | |
821 | uint64_t *items, | |
822 | unsigned int *nr_threads, | |
823 | int *rshift) | |
824 | { | |
825 | unsigned long long d_blocks; | |
826 | unsigned long long d_bfree; | |
827 | unsigned long long r_blocks; | |
828 | unsigned long long r_bfree; | |
829 | unsigned long long dontcare; | |
830 | int ret; | |
831 | ||
832 | ret = scrub_scan_estimate_blocks(ctx, &d_blocks, &d_bfree, &r_blocks, | |
833 | &r_bfree, &dontcare); | |
834 | if (ret) { | |
835 | str_liberror(ctx, ret, _("estimating verify work")); | |
836 | return ret; | |
837 | } | |
838 | ||
839 | *items = cvt_off_fsb_to_b(&ctx->mnt, | |
840 | (d_blocks - d_bfree) + (r_blocks - r_bfree)); | |
841 | ||
842 | /* | |
843 | * Each read-verify pool starts a thread pool, and each worker thread | |
844 | * can contribute to the progress counter. Hence we need to set | |
845 | * nr_threads appropriately to handle that many threads. | |
846 | */ | |
847 | *nr_threads = disk_heads(ctx->datadev); | |
848 | if (ctx->rtdev) | |
849 | *nr_threads += disk_heads(ctx->rtdev); | |
850 | if (ctx->logdev) | |
851 | *nr_threads += disk_heads(ctx->logdev); | |
852 | *rshift = 20; | |
853 | return 0; | |
854 | } |