]>
Commit | Line | Data |
---|---|---|
959ef981 | 1 | // SPDX-License-Identifier: GPL-2.0+ |
b364a9c0 DW |
2 | /* |
3 | * Copyright (C) 2018 Oracle. All Rights Reserved. | |
b364a9c0 | 4 | * Author: Darrick J. Wong <darrick.wong@oracle.com> |
b364a9c0 | 5 | */ |
a440f877 | 6 | #include "xfs.h" |
b364a9c0 | 7 | #include <stdint.h> |
b364a9c0 DW |
8 | #include <dirent.h> |
9 | #include <sys/statvfs.h> | |
b364a9c0 DW |
10 | #include "handle.h" |
11 | #include "path.h" | |
b364a9c0 DW |
12 | #include "workqueue.h" |
13 | #include "xfs_scrub.h" | |
14 | #include "common.h" | |
15 | #include "bitmap.h" | |
16 | #include "disk.h" | |
17 | #include "filemap.h" | |
ed60d210 | 18 | #include "fscounters.h" |
b364a9c0 DW |
19 | #include "inodes.h" |
20 | #include "read_verify.h" | |
21 | #include "spacemap.h" | |
22 | #include "vfs.h" | |
23 | ||
24 | /* | |
25 | * Phase 6: Verify data file integrity. | |
26 | * | |
27 | * Identify potential data block extents with GETFSMAP, then feed those | |
28 | * extents to the read-verify pool to get the verify commands batched, | |
29 | * issued, and (if there are problems) reported back to us. If there | |
30 | * are errors, we'll record the bad regions and (if available) use rmap | |
31 | * to tell us if metadata are now corrupt. Otherwise, we'll scan the | |
32 | * whole directory tree looking for files that overlap the bad regions | |
33 | * and report the paths of the now corrupt files. | |
34 | */ | |
35 | ||
36 | /* Find the fd for a given device identifier. */ | |
37 | static struct disk * | |
38 | xfs_dev_to_disk( | |
39 | struct scrub_ctx *ctx, | |
40 | dev_t dev) | |
41 | { | |
42 | if (dev == ctx->fsinfo.fs_datadev) | |
43 | return ctx->datadev; | |
44 | else if (dev == ctx->fsinfo.fs_logdev) | |
45 | return ctx->logdev; | |
46 | else if (dev == ctx->fsinfo.fs_rtdev) | |
47 | return ctx->rtdev; | |
48 | abort(); | |
49 | } | |
50 | ||
51 | /* Find the device major/minor for a given file descriptor. */ | |
52 | static dev_t | |
53 | xfs_disk_to_dev( | |
54 | struct scrub_ctx *ctx, | |
55 | struct disk *disk) | |
56 | { | |
57 | if (disk == ctx->datadev) | |
58 | return ctx->fsinfo.fs_datadev; | |
59 | else if (disk == ctx->logdev) | |
60 | return ctx->fsinfo.fs_logdev; | |
61 | else if (disk == ctx->rtdev) | |
62 | return ctx->fsinfo.fs_rtdev; | |
63 | abort(); | |
64 | } | |
65 | ||
66 | struct owner_decode { | |
67 | uint64_t owner; | |
68 | const char *descr; | |
69 | }; | |
70 | ||
71 | static const struct owner_decode special_owners[] = { | |
72 | {XFS_FMR_OWN_FREE, "free space"}, | |
73 | {XFS_FMR_OWN_UNKNOWN, "unknown owner"}, | |
74 | {XFS_FMR_OWN_FS, "static FS metadata"}, | |
75 | {XFS_FMR_OWN_LOG, "journalling log"}, | |
76 | {XFS_FMR_OWN_AG, "per-AG metadata"}, | |
77 | {XFS_FMR_OWN_INOBT, "inode btree blocks"}, | |
78 | {XFS_FMR_OWN_INODES, "inodes"}, | |
79 | {XFS_FMR_OWN_REFC, "refcount btree"}, | |
80 | {XFS_FMR_OWN_COW, "CoW staging"}, | |
81 | {XFS_FMR_OWN_DEFECTIVE, "bad blocks"}, | |
82 | {0, NULL}, | |
83 | }; | |
84 | ||
85 | /* Decode a special owner. */ | |
86 | static const char * | |
87 | xfs_decode_special_owner( | |
88 | uint64_t owner) | |
89 | { | |
90 | const struct owner_decode *od = special_owners; | |
91 | ||
92 | while (od->descr) { | |
93 | if (od->owner == owner) | |
94 | return od->descr; | |
95 | od++; | |
96 | } | |
97 | ||
98 | return NULL; | |
99 | } | |
100 | ||
101 | /* Routines to translate bad physical extents into file paths and offsets. */ | |
102 | ||
/*
 * Bad-region bitmaps passed to the file mapping callbacks.  Both bitmaps
 * are expressed in bytes.
 */
struct xfs_verify_error_info {
	struct bitmap		*d_bad;	/* bad ranges, used for non-realtime files */
	struct bitmap		*r_bad;	/* bad ranges, used for realtime files */
};
107 | ||
108 | /* Report if this extent overlaps a bad region. */ | |
109 | static bool | |
110 | xfs_report_verify_inode_bmap( | |
111 | struct scrub_ctx *ctx, | |
112 | const char *descr, | |
113 | int fd, | |
114 | int whichfork, | |
115 | struct fsxattr *fsx, | |
116 | struct xfs_bmap *bmap, | |
117 | void *arg) | |
118 | { | |
119 | struct xfs_verify_error_info *vei = arg; | |
120 | struct bitmap *bmp; | |
121 | ||
122 | /* Only report errors for real extents. */ | |
123 | if (bmap->bm_flags & (BMV_OF_PREALLOC | BMV_OF_DELALLOC)) | |
124 | return true; | |
125 | ||
126 | if (fsx->fsx_xflags & FS_XFLAG_REALTIME) | |
127 | bmp = vei->r_bad; | |
128 | else | |
129 | bmp = vei->d_bad; | |
130 | ||
131 | if (!bitmap_test(bmp, bmap->bm_physical, bmap->bm_length)) | |
132 | return true; | |
133 | ||
134 | str_error(ctx, descr, | |
135 | _("offset %llu failed read verification."), bmap->bm_offset); | |
136 | return true; | |
137 | } | |
138 | ||
139 | /* Iterate the extent mappings of a file to report errors. */ | |
140 | static bool | |
141 | xfs_report_verify_fd( | |
142 | struct scrub_ctx *ctx, | |
143 | const char *descr, | |
144 | int fd, | |
145 | void *arg) | |
146 | { | |
147 | struct xfs_bmap key = {0}; | |
148 | bool moveon; | |
149 | ||
150 | /* data fork */ | |
151 | moveon = xfs_iterate_filemaps(ctx, descr, fd, XFS_DATA_FORK, &key, | |
152 | xfs_report_verify_inode_bmap, arg); | |
153 | if (!moveon) | |
154 | return false; | |
155 | ||
156 | /* attr fork */ | |
157 | moveon = xfs_iterate_filemaps(ctx, descr, fd, XFS_ATTR_FORK, &key, | |
158 | xfs_report_verify_inode_bmap, arg); | |
159 | if (!moveon) | |
160 | return false; | |
161 | return true; | |
162 | } | |
163 | ||
164 | /* Report read verify errors in unlinked (but still open) files. */ | |
165 | static int | |
166 | xfs_report_verify_inode( | |
167 | struct scrub_ctx *ctx, | |
168 | struct xfs_handle *handle, | |
169 | struct xfs_bstat *bstat, | |
170 | void *arg) | |
171 | { | |
172 | char descr[DESCR_BUFSZ]; | |
b364a9c0 DW |
173 | bool moveon; |
174 | int fd; | |
175 | int error; | |
176 | ||
177 | snprintf(descr, DESCR_BUFSZ, _("inode %"PRIu64" (unlinked)"), | |
178 | (uint64_t)bstat->bs_ino); | |
179 | ||
180 | /* Ignore linked files and things we can't open. */ | |
181 | if (bstat->bs_nlink != 0) | |
182 | return 0; | |
183 | if (!S_ISREG(bstat->bs_mode) && !S_ISDIR(bstat->bs_mode)) | |
184 | return 0; | |
185 | ||
186 | /* Try to open the inode. */ | |
187 | fd = xfs_open_handle(handle); | |
188 | if (fd < 0) { | |
189 | error = errno; | |
190 | if (error == ESTALE) | |
191 | return error; | |
192 | ||
bb5dbd06 DW |
193 | str_info(ctx, descr, |
194 | _("Disappeared during read error reporting.")); | |
b364a9c0 DW |
195 | return error; |
196 | } | |
197 | ||
198 | /* Go find the badness. */ | |
199 | moveon = xfs_report_verify_fd(ctx, descr, fd, arg); | |
6c05cc5d DW |
200 | error = close(fd); |
201 | if (error) | |
202 | str_errno(ctx, descr); | |
b364a9c0 DW |
203 | |
204 | return moveon ? 0 : XFS_ITERATE_INODES_ABORT; | |
205 | } | |
206 | ||
/*
 * Directory callback for the tree scan: check the directory's own extent
 * mappings against the read verify error list.  Returns whatever
 * xfs_report_verify_fd returns.
 */
static bool
xfs_report_verify_dir(
	struct scrub_ctx	*ctx,
	const char		*path,
	int			dir_fd,
	void			*arg)
{
	bool			keep_going;

	keep_going = xfs_report_verify_fd(ctx, path, dir_fd, arg);
	return keep_going;
}
217 | ||
218 | /* | |
219 | * Scan the inode associated with a directory entry for matches with | |
220 | * the read verify error list. | |
221 | */ | |
222 | static bool | |
223 | xfs_report_verify_dirent( | |
224 | struct scrub_ctx *ctx, | |
225 | const char *path, | |
226 | int dir_fd, | |
227 | struct dirent *dirent, | |
228 | struct stat *sb, | |
229 | void *arg) | |
230 | { | |
231 | bool moveon; | |
232 | int fd; | |
6c05cc5d | 233 | int error; |
b364a9c0 DW |
234 | |
235 | /* Ignore things we can't open. */ | |
236 | if (!S_ISREG(sb->st_mode) && !S_ISDIR(sb->st_mode)) | |
237 | return true; | |
238 | ||
239 | /* Ignore . and .. */ | |
240 | if (!strcmp(".", dirent->d_name) || !strcmp("..", dirent->d_name)) | |
241 | return true; | |
242 | ||
243 | /* | |
244 | * If we were given a dirent, open the associated file under | |
245 | * dir_fd for badblocks scanning. If dirent is NULL, then it's | |
246 | * the directory itself we want to scan. | |
247 | */ | |
248 | fd = openat(dir_fd, dirent->d_name, | |
249 | O_RDONLY | O_NOATIME | O_NOFOLLOW | O_NOCTTY); | |
250 | if (fd < 0) | |
251 | return true; | |
252 | ||
253 | /* Go find the badness. */ | |
254 | moveon = xfs_report_verify_fd(ctx, path, fd, arg); | |
255 | if (moveon) | |
256 | goto out; | |
257 | ||
258 | out: | |
6c05cc5d DW |
259 | error = close(fd); |
260 | if (error) | |
261 | str_errno(ctx, path); | |
b364a9c0 DW |
262 | return moveon; |
263 | } | |
264 | ||
265 | /* Given bad extent lists for the data & rtdev, find bad files. */ | |
266 | static bool | |
267 | xfs_report_verify_errors( | |
268 | struct scrub_ctx *ctx, | |
269 | struct bitmap *d_bad, | |
270 | struct bitmap *r_bad) | |
271 | { | |
272 | struct xfs_verify_error_info vei; | |
273 | bool moveon; | |
274 | ||
275 | vei.d_bad = d_bad; | |
276 | vei.r_bad = r_bad; | |
277 | ||
278 | /* Scan the directory tree to get file paths. */ | |
279 | moveon = scan_fs_tree(ctx, xfs_report_verify_dir, | |
280 | xfs_report_verify_dirent, &vei); | |
281 | if (!moveon) | |
282 | return false; | |
283 | ||
284 | /* Scan for unlinked files. */ | |
285 | return xfs_scan_all_inodes(ctx, xfs_report_verify_inode, &vei); | |
286 | } | |
287 | ||
288 | /* Verify disk blocks with GETFSMAP */ | |
289 | ||
/*
 * State shared by the space map walk and the read-verify I/O error
 * callback.  Both bitmaps are expressed in bytes.
 */
struct xfs_verify_extent {
	struct read_verify_pool	*readverify;	/* pool that issues the reads */
	struct bitmap		*d_bad;		/* bad ranges, data device */
	struct bitmap		*r_bad;		/* bad ranges, realtime device */
};
295 | ||
296 | /* Report an IO error resulting from read-verify based off getfsmap. */ | |
297 | static bool | |
298 | xfs_check_rmap_error_report( | |
299 | struct scrub_ctx *ctx, | |
300 | const char *descr, | |
301 | struct fsmap *map, | |
302 | void *arg) | |
303 | { | |
304 | const char *type; | |
305 | char buf[32]; | |
306 | uint64_t err_physical = *(uint64_t *)arg; | |
307 | uint64_t err_off; | |
308 | ||
309 | if (err_physical > map->fmr_physical) | |
310 | err_off = err_physical - map->fmr_physical; | |
311 | else | |
312 | err_off = 0; | |
313 | ||
314 | snprintf(buf, 32, _("disk offset %"PRIu64), | |
315 | (uint64_t)BTOBB(map->fmr_physical + err_off)); | |
316 | ||
317 | if (map->fmr_flags & FMR_OF_SPECIAL_OWNER) { | |
318 | type = xfs_decode_special_owner(map->fmr_owner); | |
319 | str_error(ctx, buf, | |
320 | _("%s failed read verification."), | |
321 | type); | |
322 | } | |
323 | ||
324 | /* | |
325 | * XXX: If we had a getparent() call we could report IO errors | |
326 | * efficiently. Until then, we'll have to scan the dir tree | |
327 | * to find the bad file's pathname. | |
328 | */ | |
329 | ||
330 | return true; | |
331 | } | |
332 | ||
333 | /* | |
334 | * Remember a read error for later, and see if rmap will tell us about the | |
335 | * owner ahead of time. | |
336 | */ | |
337 | static void | |
338 | xfs_check_rmap_ioerr( | |
339 | struct scrub_ctx *ctx, | |
340 | struct disk *disk, | |
341 | uint64_t start, | |
342 | uint64_t length, | |
343 | int error, | |
344 | void *arg) | |
345 | { | |
346 | struct fsmap keys[2]; | |
347 | char descr[DESCR_BUFSZ]; | |
348 | struct xfs_verify_extent *ve = arg; | |
349 | struct bitmap *tree; | |
350 | dev_t dev; | |
351 | bool moveon; | |
352 | ||
353 | dev = xfs_disk_to_dev(ctx, disk); | |
354 | ||
355 | /* | |
356 | * If we don't have parent pointers, save the bad extent for | |
357 | * later rescanning. | |
358 | */ | |
359 | if (dev == ctx->fsinfo.fs_datadev) | |
360 | tree = ve->d_bad; | |
361 | else if (dev == ctx->fsinfo.fs_rtdev) | |
362 | tree = ve->r_bad; | |
363 | else | |
364 | tree = NULL; | |
365 | if (tree) { | |
366 | moveon = bitmap_set(tree, start, length); | |
367 | if (!moveon) | |
368 | str_errno(ctx, ctx->mntpoint); | |
369 | } | |
370 | ||
371 | snprintf(descr, DESCR_BUFSZ, _("dev %d:%d ioerr @ %"PRIu64":%"PRIu64" "), | |
372 | major(dev), minor(dev), start, length); | |
373 | ||
374 | /* Go figure out which blocks are bad from the fsmap. */ | |
375 | memset(keys, 0, sizeof(struct fsmap) * 2); | |
376 | keys->fmr_device = dev; | |
377 | keys->fmr_physical = start; | |
378 | (keys + 1)->fmr_device = dev; | |
379 | (keys + 1)->fmr_physical = start + length - 1; | |
380 | (keys + 1)->fmr_owner = ULLONG_MAX; | |
381 | (keys + 1)->fmr_offset = ULLONG_MAX; | |
382 | (keys + 1)->fmr_flags = UINT_MAX; | |
383 | xfs_iterate_fsmap(ctx, descr, keys, xfs_check_rmap_error_report, | |
384 | &start); | |
385 | } | |
386 | ||
387 | /* Schedule a read-verify of a (data block) extent. */ | |
388 | static bool | |
389 | xfs_check_rmap( | |
390 | struct scrub_ctx *ctx, | |
391 | const char *descr, | |
392 | struct fsmap *map, | |
393 | void *arg) | |
394 | { | |
395 | struct xfs_verify_extent *ve = arg; | |
396 | struct disk *disk; | |
397 | ||
398 | dbg_printf("rmap dev %d:%d phys %"PRIu64" owner %"PRId64 | |
399 | " offset %"PRIu64" len %"PRIu64" flags 0x%x\n", | |
400 | major(map->fmr_device), minor(map->fmr_device), | |
401 | (uint64_t)map->fmr_physical, (int64_t)map->fmr_owner, | |
402 | (uint64_t)map->fmr_offset, (uint64_t)map->fmr_length, | |
403 | map->fmr_flags); | |
404 | ||
405 | /* "Unknown" extents should be verified; they could be data. */ | |
406 | if ((map->fmr_flags & FMR_OF_SPECIAL_OWNER) && | |
407 | map->fmr_owner == XFS_FMR_OWN_UNKNOWN) | |
408 | map->fmr_flags &= ~FMR_OF_SPECIAL_OWNER; | |
409 | ||
410 | /* | |
411 | * We only care about read-verifying data extents that have been | |
412 | * written to disk. This means we can skip "special" owners | |
413 | * (metadata), xattr blocks, unwritten extents, and extent maps. | |
414 | * These should all get checked elsewhere in the scrubber. | |
415 | */ | |
416 | if (map->fmr_flags & (FMR_OF_PREALLOC | FMR_OF_ATTR_FORK | | |
417 | FMR_OF_EXTENT_MAP | FMR_OF_SPECIAL_OWNER)) | |
418 | goto out; | |
419 | ||
420 | /* XXX: Filter out directory data blocks. */ | |
421 | ||
422 | /* Schedule the read verify command for (eventual) running. */ | |
423 | disk = xfs_dev_to_disk(ctx, map->fmr_device); | |
424 | ||
41c08606 DW |
425 | read_verify_schedule_io(ve->readverify, disk, map->fmr_physical, |
426 | map->fmr_length, ve); | |
b364a9c0 DW |
427 | |
428 | out: | |
429 | /* Is this the last extent? Fire off the read. */ | |
430 | if (map->fmr_flags & FMR_OF_LAST) | |
41c08606 | 431 | read_verify_force_io(ve->readverify); |
b364a9c0 DW |
432 | |
433 | return true; | |
434 | } | |
435 | ||
436 | /* | |
437 | * Read verify all the file data blocks in a filesystem. Since XFS doesn't | |
438 | * do data checksums, we trust that the underlying storage will pass back | |
439 | * an IO error if it can't retrieve whatever we previously stored there. | |
440 | * If we hit an IO error, we'll record the bad blocks in a bitmap and then | |
441 | * scan the extent maps of the entire fs tree to figure (and the unlinked | |
442 | * inodes) out which files are now broken. | |
443 | */ | |
444 | bool | |
445 | xfs_scan_blocks( | |
446 | struct scrub_ctx *ctx) | |
447 | { | |
448 | struct xfs_verify_extent ve; | |
449 | bool moveon; | |
450 | ||
b364a9c0 DW |
451 | moveon = bitmap_init(&ve.d_bad); |
452 | if (!moveon) { | |
453 | str_errno(ctx, ctx->mntpoint); | |
41c08606 | 454 | goto out; |
b364a9c0 DW |
455 | } |
456 | ||
457 | moveon = bitmap_init(&ve.r_bad); | |
458 | if (!moveon) { | |
459 | str_errno(ctx, ctx->mntpoint); | |
460 | goto out_dbad; | |
461 | } | |
462 | ||
463 | ve.readverify = read_verify_pool_init(ctx, ctx->geo.blocksize, | |
41c08606 DW |
464 | xfs_check_rmap_ioerr, disk_heads(ctx->datadev), |
465 | scrub_nproc(ctx)); | |
b364a9c0 DW |
466 | if (!ve.readverify) { |
467 | moveon = false; | |
82377bde | 468 | str_info(ctx, ctx->mntpoint, |
b364a9c0 DW |
469 | _("Could not create media verifier.")); |
470 | goto out_rbad; | |
471 | } | |
472 | moveon = xfs_scan_all_spacemaps(ctx, xfs_check_rmap, &ve); | |
473 | if (!moveon) | |
474 | goto out_pool; | |
475 | read_verify_pool_flush(ve.readverify); | |
476 | ctx->bytes_checked += read_verify_bytes(ve.readverify); | |
477 | read_verify_pool_destroy(ve.readverify); | |
478 | ||
479 | /* Scan the whole dir tree to see what matches the bad extents. */ | |
480 | if (!bitmap_empty(ve.d_bad) || !bitmap_empty(ve.r_bad)) | |
481 | moveon = xfs_report_verify_errors(ctx, ve.d_bad, ve.r_bad); | |
482 | ||
483 | bitmap_free(&ve.r_bad); | |
484 | bitmap_free(&ve.d_bad); | |
b364a9c0 DW |
485 | return moveon; |
486 | ||
487 | out_pool: | |
488 | read_verify_pool_destroy(ve.readverify); | |
489 | out_rbad: | |
490 | bitmap_free(&ve.r_bad); | |
491 | out_dbad: | |
492 | bitmap_free(&ve.d_bad); | |
41c08606 | 493 | out: |
b364a9c0 DW |
494 | return moveon; |
495 | } | |
ed60d210 DW |
496 | |
497 | /* Estimate how much work we're going to do. */ | |
498 | bool | |
499 | xfs_estimate_verify_work( | |
500 | struct scrub_ctx *ctx, | |
501 | uint64_t *items, | |
502 | unsigned int *nr_threads, | |
503 | int *rshift) | |
504 | { | |
505 | unsigned long long d_blocks; | |
506 | unsigned long long d_bfree; | |
507 | unsigned long long r_blocks; | |
508 | unsigned long long r_bfree; | |
509 | unsigned long long f_files; | |
510 | unsigned long long f_free; | |
511 | bool moveon; | |
512 | ||
513 | moveon = xfs_scan_estimate_blocks(ctx, &d_blocks, &d_bfree, | |
514 | &r_blocks, &r_bfree, &f_files, &f_free); | |
515 | if (!moveon) | |
516 | return moveon; | |
517 | ||
518 | *items = ((d_blocks - d_bfree) + (r_blocks - r_bfree)) << ctx->blocklog; | |
519 | *nr_threads = disk_heads(ctx->datadev); | |
520 | *rshift = 20; | |
521 | return moveon; | |
522 | } |