]>
Commit | Line | Data |
---|---|---|
b364a9c0 DW |
1 | /* |
2 | * Copyright (C) 2018 Oracle. All Rights Reserved. | |
3 | * | |
4 | * Author: Darrick J. Wong <darrick.wong@oracle.com> | |
5 | * | |
6 | * This program is free software; you can redistribute it and/or | |
7 | * modify it under the terms of the GNU General Public License | |
8 | * as published by the Free Software Foundation; either version 2 | |
9 | * of the License, or (at your option) any later version. | |
10 | * | |
11 | * This program is distributed in the hope that it would be useful, | |
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | * GNU General Public License for more details. | |
15 | * | |
16 | * You should have received a copy of the GNU General Public License | |
17 | * along with this program; if not, write the Free Software Foundation, | |
18 | * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. | |
19 | */ | |
20 | #include <stdio.h> | |
21 | #include <stdint.h> | |
22 | #include <stdbool.h> | |
23 | #include <dirent.h> | |
24 | #include <sys/statvfs.h> | |
25 | #include "xfs.h" | |
26 | #include "xfs_fs.h" | |
27 | #include "handle.h" | |
28 | #include "path.h" | |
29 | #include "ptvar.h" | |
30 | #include "workqueue.h" | |
31 | #include "xfs_scrub.h" | |
32 | #include "common.h" | |
33 | #include "bitmap.h" | |
34 | #include "disk.h" | |
35 | #include "filemap.h" | |
36 | #include "inodes.h" | |
37 | #include "read_verify.h" | |
38 | #include "spacemap.h" | |
39 | #include "vfs.h" | |
40 | ||
41 | /* | |
42 | * Phase 6: Verify data file integrity. | |
43 | * | |
44 | * Identify potential data block extents with GETFSMAP, then feed those | |
45 | * extents to the read-verify pool to get the verify commands batched, | |
46 | * issued, and (if there are problems) reported back to us. If there | |
47 | * are errors, we'll record the bad regions and (if available) use rmap | |
48 | * to tell us if metadata are now corrupt. Otherwise, we'll scan the | |
49 | * whole directory tree looking for files that overlap the bad regions | |
50 | * and report the paths of the now corrupt files. | |
51 | */ | |
52 | ||
53 | /* Find the fd for a given device identifier. */ | |
54 | static struct disk * | |
55 | xfs_dev_to_disk( | |
56 | struct scrub_ctx *ctx, | |
57 | dev_t dev) | |
58 | { | |
59 | if (dev == ctx->fsinfo.fs_datadev) | |
60 | return ctx->datadev; | |
61 | else if (dev == ctx->fsinfo.fs_logdev) | |
62 | return ctx->logdev; | |
63 | else if (dev == ctx->fsinfo.fs_rtdev) | |
64 | return ctx->rtdev; | |
65 | abort(); | |
66 | } | |
67 | ||
68 | /* Find the device major/minor for a given file descriptor. */ | |
69 | static dev_t | |
70 | xfs_disk_to_dev( | |
71 | struct scrub_ctx *ctx, | |
72 | struct disk *disk) | |
73 | { | |
74 | if (disk == ctx->datadev) | |
75 | return ctx->fsinfo.fs_datadev; | |
76 | else if (disk == ctx->logdev) | |
77 | return ctx->fsinfo.fs_logdev; | |
78 | else if (disk == ctx->rtdev) | |
79 | return ctx->fsinfo.fs_rtdev; | |
80 | abort(); | |
81 | } | |
82 | ||
/* One row of the special-owner lookup table. */
struct owner_decode {
	uint64_t		owner;	/* owner code to match */
	const char		*descr;	/* text for that owner; NULL ends the table */
};
87 | ||
88 | static const struct owner_decode special_owners[] = { | |
89 | {XFS_FMR_OWN_FREE, "free space"}, | |
90 | {XFS_FMR_OWN_UNKNOWN, "unknown owner"}, | |
91 | {XFS_FMR_OWN_FS, "static FS metadata"}, | |
92 | {XFS_FMR_OWN_LOG, "journalling log"}, | |
93 | {XFS_FMR_OWN_AG, "per-AG metadata"}, | |
94 | {XFS_FMR_OWN_INOBT, "inode btree blocks"}, | |
95 | {XFS_FMR_OWN_INODES, "inodes"}, | |
96 | {XFS_FMR_OWN_REFC, "refcount btree"}, | |
97 | {XFS_FMR_OWN_COW, "CoW staging"}, | |
98 | {XFS_FMR_OWN_DEFECTIVE, "bad blocks"}, | |
99 | {0, NULL}, | |
100 | }; | |
101 | ||
102 | /* Decode a special owner. */ | |
103 | static const char * | |
104 | xfs_decode_special_owner( | |
105 | uint64_t owner) | |
106 | { | |
107 | const struct owner_decode *od = special_owners; | |
108 | ||
109 | while (od->descr) { | |
110 | if (od->owner == owner) | |
111 | return od->descr; | |
112 | od++; | |
113 | } | |
114 | ||
115 | return NULL; | |
116 | } | |
117 | ||
118 | /* Routines to translate bad physical extents into file paths and offsets. */ | |
119 | ||
/* Bad-region bitmaps handed to the file mapping walkers. */
struct xfs_verify_error_info {
	struct bitmap		*d_bad;	/* bad data device regions, bytes */
	struct bitmap		*r_bad;	/* bad realtime device regions, bytes */
};
124 | ||
125 | /* Report if this extent overlaps a bad region. */ | |
126 | static bool | |
127 | xfs_report_verify_inode_bmap( | |
128 | struct scrub_ctx *ctx, | |
129 | const char *descr, | |
130 | int fd, | |
131 | int whichfork, | |
132 | struct fsxattr *fsx, | |
133 | struct xfs_bmap *bmap, | |
134 | void *arg) | |
135 | { | |
136 | struct xfs_verify_error_info *vei = arg; | |
137 | struct bitmap *bmp; | |
138 | ||
139 | /* Only report errors for real extents. */ | |
140 | if (bmap->bm_flags & (BMV_OF_PREALLOC | BMV_OF_DELALLOC)) | |
141 | return true; | |
142 | ||
143 | if (fsx->fsx_xflags & FS_XFLAG_REALTIME) | |
144 | bmp = vei->r_bad; | |
145 | else | |
146 | bmp = vei->d_bad; | |
147 | ||
148 | if (!bitmap_test(bmp, bmap->bm_physical, bmap->bm_length)) | |
149 | return true; | |
150 | ||
151 | str_error(ctx, descr, | |
152 | _("offset %llu failed read verification."), bmap->bm_offset); | |
153 | return true; | |
154 | } | |
155 | ||
156 | /* Iterate the extent mappings of a file to report errors. */ | |
157 | static bool | |
158 | xfs_report_verify_fd( | |
159 | struct scrub_ctx *ctx, | |
160 | const char *descr, | |
161 | int fd, | |
162 | void *arg) | |
163 | { | |
164 | struct xfs_bmap key = {0}; | |
165 | bool moveon; | |
166 | ||
167 | /* data fork */ | |
168 | moveon = xfs_iterate_filemaps(ctx, descr, fd, XFS_DATA_FORK, &key, | |
169 | xfs_report_verify_inode_bmap, arg); | |
170 | if (!moveon) | |
171 | return false; | |
172 | ||
173 | /* attr fork */ | |
174 | moveon = xfs_iterate_filemaps(ctx, descr, fd, XFS_ATTR_FORK, &key, | |
175 | xfs_report_verify_inode_bmap, arg); | |
176 | if (!moveon) | |
177 | return false; | |
178 | return true; | |
179 | } | |
180 | ||
181 | /* Report read verify errors in unlinked (but still open) files. */ | |
182 | static int | |
183 | xfs_report_verify_inode( | |
184 | struct scrub_ctx *ctx, | |
185 | struct xfs_handle *handle, | |
186 | struct xfs_bstat *bstat, | |
187 | void *arg) | |
188 | { | |
189 | char descr[DESCR_BUFSZ]; | |
190 | char buf[DESCR_BUFSZ]; | |
191 | bool moveon; | |
192 | int fd; | |
193 | int error; | |
194 | ||
195 | snprintf(descr, DESCR_BUFSZ, _("inode %"PRIu64" (unlinked)"), | |
196 | (uint64_t)bstat->bs_ino); | |
197 | ||
198 | /* Ignore linked files and things we can't open. */ | |
199 | if (bstat->bs_nlink != 0) | |
200 | return 0; | |
201 | if (!S_ISREG(bstat->bs_mode) && !S_ISDIR(bstat->bs_mode)) | |
202 | return 0; | |
203 | ||
204 | /* Try to open the inode. */ | |
205 | fd = xfs_open_handle(handle); | |
206 | if (fd < 0) { | |
207 | error = errno; | |
208 | if (error == ESTALE) | |
209 | return error; | |
210 | ||
211 | str_warn(ctx, descr, "%s", strerror_r(error, buf, DESCR_BUFSZ)); | |
212 | return error; | |
213 | } | |
214 | ||
215 | /* Go find the badness. */ | |
216 | moveon = xfs_report_verify_fd(ctx, descr, fd, arg); | |
217 | close(fd); | |
218 | ||
219 | return moveon ? 0 : XFS_ITERATE_INODES_ABORT; | |
220 | } | |
221 | ||
/*
 * Directory callback for the tree walk: check the mappings of the
 * directory itself against the read verify error list.
 */
static bool
xfs_report_verify_dir(
	struct scrub_ctx	*ctx,
	const char		*path,
	int			dir_fd,
	void			*arg)
{
	bool			keep_going;

	keep_going = xfs_report_verify_fd(ctx, path, dir_fd, arg);
	return keep_going;
}
232 | ||
233 | /* | |
234 | * Scan the inode associated with a directory entry for matches with | |
235 | * the read verify error list. | |
236 | */ | |
237 | static bool | |
238 | xfs_report_verify_dirent( | |
239 | struct scrub_ctx *ctx, | |
240 | const char *path, | |
241 | int dir_fd, | |
242 | struct dirent *dirent, | |
243 | struct stat *sb, | |
244 | void *arg) | |
245 | { | |
246 | bool moveon; | |
247 | int fd; | |
248 | ||
249 | /* Ignore things we can't open. */ | |
250 | if (!S_ISREG(sb->st_mode) && !S_ISDIR(sb->st_mode)) | |
251 | return true; | |
252 | ||
253 | /* Ignore . and .. */ | |
254 | if (!strcmp(".", dirent->d_name) || !strcmp("..", dirent->d_name)) | |
255 | return true; | |
256 | ||
257 | /* | |
258 | * If we were given a dirent, open the associated file under | |
259 | * dir_fd for badblocks scanning. If dirent is NULL, then it's | |
260 | * the directory itself we want to scan. | |
261 | */ | |
262 | fd = openat(dir_fd, dirent->d_name, | |
263 | O_RDONLY | O_NOATIME | O_NOFOLLOW | O_NOCTTY); | |
264 | if (fd < 0) | |
265 | return true; | |
266 | ||
267 | /* Go find the badness. */ | |
268 | moveon = xfs_report_verify_fd(ctx, path, fd, arg); | |
269 | if (moveon) | |
270 | goto out; | |
271 | ||
272 | out: | |
273 | close(fd); | |
274 | ||
275 | return moveon; | |
276 | } | |
277 | ||
278 | /* Given bad extent lists for the data & rtdev, find bad files. */ | |
279 | static bool | |
280 | xfs_report_verify_errors( | |
281 | struct scrub_ctx *ctx, | |
282 | struct bitmap *d_bad, | |
283 | struct bitmap *r_bad) | |
284 | { | |
285 | struct xfs_verify_error_info vei; | |
286 | bool moveon; | |
287 | ||
288 | vei.d_bad = d_bad; | |
289 | vei.r_bad = r_bad; | |
290 | ||
291 | /* Scan the directory tree to get file paths. */ | |
292 | moveon = scan_fs_tree(ctx, xfs_report_verify_dir, | |
293 | xfs_report_verify_dirent, &vei); | |
294 | if (!moveon) | |
295 | return false; | |
296 | ||
297 | /* Scan for unlinked files. */ | |
298 | return xfs_scan_all_inodes(ctx, xfs_report_verify_inode, &vei); | |
299 | } | |
300 | ||
301 | /* Verify disk blocks with GETFSMAP */ | |
302 | ||
/* State shared by the space map walk and the read-verify IO error handler. */
struct xfs_verify_extent {
	struct read_verify_pool	*readverify;	/* pool that runs the verify IO */
	struct ptvar		*rvstate;	/* struct read_verify state from ptvar */
	struct bitmap		*d_bad;		/* bad data device regions, bytes */
	struct bitmap		*r_bad;		/* bad realtime device regions, bytes */
};
309 | ||
310 | /* Report an IO error resulting from read-verify based off getfsmap. */ | |
311 | static bool | |
312 | xfs_check_rmap_error_report( | |
313 | struct scrub_ctx *ctx, | |
314 | const char *descr, | |
315 | struct fsmap *map, | |
316 | void *arg) | |
317 | { | |
318 | const char *type; | |
319 | char buf[32]; | |
320 | uint64_t err_physical = *(uint64_t *)arg; | |
321 | uint64_t err_off; | |
322 | ||
323 | if (err_physical > map->fmr_physical) | |
324 | err_off = err_physical - map->fmr_physical; | |
325 | else | |
326 | err_off = 0; | |
327 | ||
328 | snprintf(buf, 32, _("disk offset %"PRIu64), | |
329 | (uint64_t)BTOBB(map->fmr_physical + err_off)); | |
330 | ||
331 | if (map->fmr_flags & FMR_OF_SPECIAL_OWNER) { | |
332 | type = xfs_decode_special_owner(map->fmr_owner); | |
333 | str_error(ctx, buf, | |
334 | _("%s failed read verification."), | |
335 | type); | |
336 | } | |
337 | ||
338 | /* | |
339 | * XXX: If we had a getparent() call we could report IO errors | |
340 | * efficiently. Until then, we'll have to scan the dir tree | |
341 | * to find the bad file's pathname. | |
342 | */ | |
343 | ||
344 | return true; | |
345 | } | |
346 | ||
347 | /* | |
348 | * Remember a read error for later, and see if rmap will tell us about the | |
349 | * owner ahead of time. | |
350 | */ | |
351 | static void | |
352 | xfs_check_rmap_ioerr( | |
353 | struct scrub_ctx *ctx, | |
354 | struct disk *disk, | |
355 | uint64_t start, | |
356 | uint64_t length, | |
357 | int error, | |
358 | void *arg) | |
359 | { | |
360 | struct fsmap keys[2]; | |
361 | char descr[DESCR_BUFSZ]; | |
362 | struct xfs_verify_extent *ve = arg; | |
363 | struct bitmap *tree; | |
364 | dev_t dev; | |
365 | bool moveon; | |
366 | ||
367 | dev = xfs_disk_to_dev(ctx, disk); | |
368 | ||
369 | /* | |
370 | * If we don't have parent pointers, save the bad extent for | |
371 | * later rescanning. | |
372 | */ | |
373 | if (dev == ctx->fsinfo.fs_datadev) | |
374 | tree = ve->d_bad; | |
375 | else if (dev == ctx->fsinfo.fs_rtdev) | |
376 | tree = ve->r_bad; | |
377 | else | |
378 | tree = NULL; | |
379 | if (tree) { | |
380 | moveon = bitmap_set(tree, start, length); | |
381 | if (!moveon) | |
382 | str_errno(ctx, ctx->mntpoint); | |
383 | } | |
384 | ||
385 | snprintf(descr, DESCR_BUFSZ, _("dev %d:%d ioerr @ %"PRIu64":%"PRIu64" "), | |
386 | major(dev), minor(dev), start, length); | |
387 | ||
388 | /* Go figure out which blocks are bad from the fsmap. */ | |
389 | memset(keys, 0, sizeof(struct fsmap) * 2); | |
390 | keys->fmr_device = dev; | |
391 | keys->fmr_physical = start; | |
392 | (keys + 1)->fmr_device = dev; | |
393 | (keys + 1)->fmr_physical = start + length - 1; | |
394 | (keys + 1)->fmr_owner = ULLONG_MAX; | |
395 | (keys + 1)->fmr_offset = ULLONG_MAX; | |
396 | (keys + 1)->fmr_flags = UINT_MAX; | |
397 | xfs_iterate_fsmap(ctx, descr, keys, xfs_check_rmap_error_report, | |
398 | &start); | |
399 | } | |
400 | ||
401 | /* Schedule a read-verify of a (data block) extent. */ | |
402 | static bool | |
403 | xfs_check_rmap( | |
404 | struct scrub_ctx *ctx, | |
405 | const char *descr, | |
406 | struct fsmap *map, | |
407 | void *arg) | |
408 | { | |
409 | struct xfs_verify_extent *ve = arg; | |
410 | struct disk *disk; | |
411 | ||
412 | dbg_printf("rmap dev %d:%d phys %"PRIu64" owner %"PRId64 | |
413 | " offset %"PRIu64" len %"PRIu64" flags 0x%x\n", | |
414 | major(map->fmr_device), minor(map->fmr_device), | |
415 | (uint64_t)map->fmr_physical, (int64_t)map->fmr_owner, | |
416 | (uint64_t)map->fmr_offset, (uint64_t)map->fmr_length, | |
417 | map->fmr_flags); | |
418 | ||
419 | /* "Unknown" extents should be verified; they could be data. */ | |
420 | if ((map->fmr_flags & FMR_OF_SPECIAL_OWNER) && | |
421 | map->fmr_owner == XFS_FMR_OWN_UNKNOWN) | |
422 | map->fmr_flags &= ~FMR_OF_SPECIAL_OWNER; | |
423 | ||
424 | /* | |
425 | * We only care about read-verifying data extents that have been | |
426 | * written to disk. This means we can skip "special" owners | |
427 | * (metadata), xattr blocks, unwritten extents, and extent maps. | |
428 | * These should all get checked elsewhere in the scrubber. | |
429 | */ | |
430 | if (map->fmr_flags & (FMR_OF_PREALLOC | FMR_OF_ATTR_FORK | | |
431 | FMR_OF_EXTENT_MAP | FMR_OF_SPECIAL_OWNER)) | |
432 | goto out; | |
433 | ||
434 | /* XXX: Filter out directory data blocks. */ | |
435 | ||
436 | /* Schedule the read verify command for (eventual) running. */ | |
437 | disk = xfs_dev_to_disk(ctx, map->fmr_device); | |
438 | ||
439 | read_verify_schedule_io(ve->readverify, ptvar_get(ve->rvstate), disk, | |
440 | map->fmr_physical, map->fmr_length, ve); | |
441 | ||
442 | out: | |
443 | /* Is this the last extent? Fire off the read. */ | |
444 | if (map->fmr_flags & FMR_OF_LAST) | |
445 | read_verify_force_io(ve->readverify, ptvar_get(ve->rvstate)); | |
446 | ||
447 | return true; | |
448 | } | |
449 | ||
450 | /* | |
451 | * Read verify all the file data blocks in a filesystem. Since XFS doesn't | |
452 | * do data checksums, we trust that the underlying storage will pass back | |
453 | * an IO error if it can't retrieve whatever we previously stored there. | |
454 | * If we hit an IO error, we'll record the bad blocks in a bitmap and then | |
455 | * scan the extent maps of the entire fs tree to figure (and the unlinked | |
456 | * inodes) out which files are now broken. | |
457 | */ | |
458 | bool | |
459 | xfs_scan_blocks( | |
460 | struct scrub_ctx *ctx) | |
461 | { | |
462 | struct xfs_verify_extent ve; | |
463 | bool moveon; | |
464 | ||
465 | ve.rvstate = ptvar_init(scrub_nproc(ctx), sizeof(struct read_verify)); | |
466 | if (!ve.rvstate) { | |
467 | str_errno(ctx, ctx->mntpoint); | |
468 | return false; | |
469 | } | |
470 | ||
471 | moveon = bitmap_init(&ve.d_bad); | |
472 | if (!moveon) { | |
473 | str_errno(ctx, ctx->mntpoint); | |
474 | goto out_ve; | |
475 | } | |
476 | ||
477 | moveon = bitmap_init(&ve.r_bad); | |
478 | if (!moveon) { | |
479 | str_errno(ctx, ctx->mntpoint); | |
480 | goto out_dbad; | |
481 | } | |
482 | ||
483 | ve.readverify = read_verify_pool_init(ctx, ctx->geo.blocksize, | |
484 | xfs_check_rmap_ioerr, disk_heads(ctx->datadev)); | |
485 | if (!ve.readverify) { | |
486 | moveon = false; | |
487 | str_error(ctx, ctx->mntpoint, | |
488 | _("Could not create media verifier.")); | |
489 | goto out_rbad; | |
490 | } | |
491 | moveon = xfs_scan_all_spacemaps(ctx, xfs_check_rmap, &ve); | |
492 | if (!moveon) | |
493 | goto out_pool; | |
494 | read_verify_pool_flush(ve.readverify); | |
495 | ctx->bytes_checked += read_verify_bytes(ve.readverify); | |
496 | read_verify_pool_destroy(ve.readverify); | |
497 | ||
498 | /* Scan the whole dir tree to see what matches the bad extents. */ | |
499 | if (!bitmap_empty(ve.d_bad) || !bitmap_empty(ve.r_bad)) | |
500 | moveon = xfs_report_verify_errors(ctx, ve.d_bad, ve.r_bad); | |
501 | ||
502 | bitmap_free(&ve.r_bad); | |
503 | bitmap_free(&ve.d_bad); | |
504 | ptvar_free(ve.rvstate); | |
505 | return moveon; | |
506 | ||
507 | out_pool: | |
508 | read_verify_pool_destroy(ve.readverify); | |
509 | out_rbad: | |
510 | bitmap_free(&ve.r_bad); | |
511 | out_dbad: | |
512 | bitmap_free(&ve.d_bad); | |
513 | out_ve: | |
514 | ptvar_free(ve.rvstate); | |
515 | return moveon; | |
516 | } |