]> git.ipfire.org Git - thirdparty/xfsprogs-dev.git/blame - scrub/phase6.c
libfrog: introduce xfs_fd to wrap an fd to a file on an xfs filesystem
[thirdparty/xfsprogs-dev.git] / scrub / phase6.c
CommitLineData
959ef981 1// SPDX-License-Identifier: GPL-2.0+
b364a9c0
DW
2/*
3 * Copyright (C) 2018 Oracle. All Rights Reserved.
b364a9c0 4 * Author: Darrick J. Wong <darrick.wong@oracle.com>
b364a9c0 5 */
a440f877 6#include "xfs.h"
b364a9c0 7#include <stdint.h>
b364a9c0
DW
8#include <dirent.h>
9#include <sys/statvfs.h>
b364a9c0
DW
10#include "handle.h"
11#include "path.h"
b364a9c0
DW
12#include "workqueue.h"
13#include "xfs_scrub.h"
14#include "common.h"
15#include "bitmap.h"
16#include "disk.h"
17#include "filemap.h"
ed60d210 18#include "fscounters.h"
b364a9c0
DW
19#include "inodes.h"
20#include "read_verify.h"
21#include "spacemap.h"
22#include "vfs.h"
23
24/*
25 * Phase 6: Verify data file integrity.
26 *
27 * Identify potential data block extents with GETFSMAP, then feed those
28 * extents to the read-verify pool to get the verify commands batched,
29 * issued, and (if there are problems) reported back to us. If there
30 * are errors, we'll record the bad regions and (if available) use rmap
31 * to tell us if metadata are now corrupt. Otherwise, we'll scan the
32 * whole directory tree looking for files that overlap the bad regions
33 * and report the paths of the now corrupt files.
34 */
35
f1bb1696
DW
36/* Verify disk blocks with GETFSMAP */
37
/*
 * State shared by the media verification callbacks: one read-verify pool
 * per device, plus bitmaps of the regions that failed verification.
 */
struct media_verify_state {
	struct read_verify_pool	*rvp_data;	/* pool for the data device */
	struct read_verify_pool	*rvp_log;	/* pool for the log device */
	struct read_verify_pool	*rvp_realtime;	/* pool for the realtime device */
	struct bitmap		*d_bad;		/* bad data device regions, in bytes */
	struct bitmap		*r_bad;		/* bad realtime device regions, in bytes */
};
45
b364a9c0 46/* Find the fd for a given device identifier. */
f1bb1696
DW
47static struct read_verify_pool *
48xfs_dev_to_pool(
49 struct scrub_ctx *ctx,
557f98d7 50 struct media_verify_state *vs,
f1bb1696 51 dev_t dev)
b364a9c0
DW
52{
53 if (dev == ctx->fsinfo.fs_datadev)
557f98d7 54 return vs->rvp_data;
b364a9c0 55 else if (dev == ctx->fsinfo.fs_logdev)
557f98d7 56 return vs->rvp_log;
b364a9c0 57 else if (dev == ctx->fsinfo.fs_rtdev)
557f98d7 58 return vs->rvp_realtime;
b364a9c0
DW
59 abort();
60}
61
62/* Find the device major/minor for a given file descriptor. */
63static dev_t
64xfs_disk_to_dev(
65 struct scrub_ctx *ctx,
66 struct disk *disk)
67{
68 if (disk == ctx->datadev)
69 return ctx->fsinfo.fs_datadev;
70 else if (disk == ctx->logdev)
71 return ctx->fsinfo.fs_logdev;
72 else if (disk == ctx->rtdev)
73 return ctx->fsinfo.fs_rtdev;
74 abort();
75}
76
/* Maps a special fsmap owner code to a human-readable description. */
struct owner_decode {
	uint64_t		owner;	/* owner code to match */
	const char		*descr;	/* description; NULL ends a table */
};
81
82static const struct owner_decode special_owners[] = {
83 {XFS_FMR_OWN_FREE, "free space"},
84 {XFS_FMR_OWN_UNKNOWN, "unknown owner"},
85 {XFS_FMR_OWN_FS, "static FS metadata"},
86 {XFS_FMR_OWN_LOG, "journalling log"},
87 {XFS_FMR_OWN_AG, "per-AG metadata"},
88 {XFS_FMR_OWN_INOBT, "inode btree blocks"},
89 {XFS_FMR_OWN_INODES, "inodes"},
90 {XFS_FMR_OWN_REFC, "refcount btree"},
91 {XFS_FMR_OWN_COW, "CoW staging"},
92 {XFS_FMR_OWN_DEFECTIVE, "bad blocks"},
93 {0, NULL},
94};
95
96/* Decode a special owner. */
97static const char *
98xfs_decode_special_owner(
99 uint64_t owner)
100{
101 const struct owner_decode *od = special_owners;
102
103 while (od->descr) {
104 if (od->owner == owner)
105 return od->descr;
106 od++;
107 }
108
109 return NULL;
110}
111
112/* Routines to translate bad physical extents into file paths and offsets. */
113
b364a9c0
DW
114/* Report if this extent overlaps a bad region. */
115static bool
116xfs_report_verify_inode_bmap(
117 struct scrub_ctx *ctx,
118 const char *descr,
119 int fd,
120 int whichfork,
121 struct fsxattr *fsx,
122 struct xfs_bmap *bmap,
123 void *arg)
124{
ed5f9cc7 125 struct media_verify_state *vs = arg;
b364a9c0
DW
126 struct bitmap *bmp;
127
128 /* Only report errors for real extents. */
129 if (bmap->bm_flags & (BMV_OF_PREALLOC | BMV_OF_DELALLOC))
130 return true;
131
132 if (fsx->fsx_xflags & FS_XFLAG_REALTIME)
ed5f9cc7 133 bmp = vs->r_bad;
b364a9c0 134 else
ed5f9cc7 135 bmp = vs->d_bad;
b364a9c0
DW
136
137 if (!bitmap_test(bmp, bmap->bm_physical, bmap->bm_length))
138 return true;
139
140 str_error(ctx, descr,
141_("offset %llu failed read verification."), bmap->bm_offset);
142 return true;
143}
144
145/* Iterate the extent mappings of a file to report errors. */
146static bool
147xfs_report_verify_fd(
148 struct scrub_ctx *ctx,
149 const char *descr,
150 int fd,
151 void *arg)
152{
153 struct xfs_bmap key = {0};
154 bool moveon;
155
156 /* data fork */
157 moveon = xfs_iterate_filemaps(ctx, descr, fd, XFS_DATA_FORK, &key,
158 xfs_report_verify_inode_bmap, arg);
159 if (!moveon)
160 return false;
161
162 /* attr fork */
163 moveon = xfs_iterate_filemaps(ctx, descr, fd, XFS_ATTR_FORK, &key,
164 xfs_report_verify_inode_bmap, arg);
165 if (!moveon)
166 return false;
167 return true;
168}
169
170/* Report read verify errors in unlinked (but still open) files. */
171static int
172xfs_report_verify_inode(
173 struct scrub_ctx *ctx,
174 struct xfs_handle *handle,
175 struct xfs_bstat *bstat,
176 void *arg)
177{
178 char descr[DESCR_BUFSZ];
b364a9c0
DW
179 bool moveon;
180 int fd;
181 int error;
182
183 snprintf(descr, DESCR_BUFSZ, _("inode %"PRIu64" (unlinked)"),
184 (uint64_t)bstat->bs_ino);
185
186 /* Ignore linked files and things we can't open. */
187 if (bstat->bs_nlink != 0)
188 return 0;
189 if (!S_ISREG(bstat->bs_mode) && !S_ISDIR(bstat->bs_mode))
190 return 0;
191
192 /* Try to open the inode. */
193 fd = xfs_open_handle(handle);
194 if (fd < 0) {
195 error = errno;
196 if (error == ESTALE)
197 return error;
198
bb5dbd06
DW
199 str_info(ctx, descr,
200_("Disappeared during read error reporting."));
b364a9c0
DW
201 return error;
202 }
203
204 /* Go find the badness. */
205 moveon = xfs_report_verify_fd(ctx, descr, fd, arg);
6c05cc5d
DW
206 error = close(fd);
207 if (error)
208 str_errno(ctx, descr);
b364a9c0
DW
209
210 return moveon ? 0 : XFS_ITERATE_INODES_ABORT;
211}
212
/*
 * Scan a directory for matches in the read verify error list.
 *
 * Directory callback for the tree walk; checks the extent mappings of the
 * directory open at @dir_fd and returns whether the walk should continue.
 */
static bool
xfs_report_verify_dir(
	struct scrub_ctx	*ctx,
	const char		*path,
	int			dir_fd,
	void			*arg)
{
	bool			moveon;

	moveon = xfs_report_verify_fd(ctx, path, dir_fd, arg);
	return moveon;
}
223
224/*
225 * Scan the inode associated with a directory entry for matches with
226 * the read verify error list.
227 */
228static bool
229xfs_report_verify_dirent(
230 struct scrub_ctx *ctx,
231 const char *path,
232 int dir_fd,
233 struct dirent *dirent,
234 struct stat *sb,
235 void *arg)
236{
237 bool moveon;
238 int fd;
6c05cc5d 239 int error;
b364a9c0
DW
240
241 /* Ignore things we can't open. */
242 if (!S_ISREG(sb->st_mode) && !S_ISDIR(sb->st_mode))
243 return true;
244
245 /* Ignore . and .. */
246 if (!strcmp(".", dirent->d_name) || !strcmp("..", dirent->d_name))
247 return true;
248
249 /*
250 * If we were given a dirent, open the associated file under
251 * dir_fd for badblocks scanning. If dirent is NULL, then it's
252 * the directory itself we want to scan.
253 */
254 fd = openat(dir_fd, dirent->d_name,
255 O_RDONLY | O_NOATIME | O_NOFOLLOW | O_NOCTTY);
256 if (fd < 0)
257 return true;
258
259 /* Go find the badness. */
260 moveon = xfs_report_verify_fd(ctx, path, fd, arg);
261 if (moveon)
262 goto out;
263
264out:
6c05cc5d
DW
265 error = close(fd);
266 if (error)
267 str_errno(ctx, path);
b364a9c0
DW
268 return moveon;
269}
270
271/* Given bad extent lists for the data & rtdev, find bad files. */
272static bool
273xfs_report_verify_errors(
274 struct scrub_ctx *ctx,
ed5f9cc7 275 struct media_verify_state *vs)
b364a9c0 276{
b364a9c0
DW
277 bool moveon;
278
b364a9c0
DW
279 /* Scan the directory tree to get file paths. */
280 moveon = scan_fs_tree(ctx, xfs_report_verify_dir,
ed5f9cc7 281 xfs_report_verify_dirent, vs);
b364a9c0
DW
282 if (!moveon)
283 return false;
284
285 /* Scan for unlinked files. */
ed5f9cc7 286 return xfs_scan_all_inodes(ctx, xfs_report_verify_inode, vs);
b364a9c0
DW
287}
288
b364a9c0
DW
289/* Report an IO error resulting from read-verify based off getfsmap. */
290static bool
291xfs_check_rmap_error_report(
292 struct scrub_ctx *ctx,
293 const char *descr,
294 struct fsmap *map,
295 void *arg)
296{
297 const char *type;
298 char buf[32];
299 uint64_t err_physical = *(uint64_t *)arg;
300 uint64_t err_off;
301
302 if (err_physical > map->fmr_physical)
303 err_off = err_physical - map->fmr_physical;
304 else
305 err_off = 0;
306
307 snprintf(buf, 32, _("disk offset %"PRIu64),
308 (uint64_t)BTOBB(map->fmr_physical + err_off));
309
310 if (map->fmr_flags & FMR_OF_SPECIAL_OWNER) {
311 type = xfs_decode_special_owner(map->fmr_owner);
312 str_error(ctx, buf,
313_("%s failed read verification."),
314 type);
315 }
316
317 /*
318 * XXX: If we had a getparent() call we could report IO errors
319 * efficiently. Until then, we'll have to scan the dir tree
320 * to find the bad file's pathname.
321 */
322
323 return true;
324}
325
326/*
327 * Remember a read error for later, and see if rmap will tell us about the
328 * owner ahead of time.
329 */
330static void
331xfs_check_rmap_ioerr(
332 struct scrub_ctx *ctx,
333 struct disk *disk,
334 uint64_t start,
335 uint64_t length,
336 int error,
337 void *arg)
338{
339 struct fsmap keys[2];
340 char descr[DESCR_BUFSZ];
557f98d7 341 struct media_verify_state *vs = arg;
b364a9c0
DW
342 struct bitmap *tree;
343 dev_t dev;
b364a9c0
DW
344
345 dev = xfs_disk_to_dev(ctx, disk);
346
347 /*
348 * If we don't have parent pointers, save the bad extent for
349 * later rescanning.
350 */
351 if (dev == ctx->fsinfo.fs_datadev)
557f98d7 352 tree = vs->d_bad;
b364a9c0 353 else if (dev == ctx->fsinfo.fs_rtdev)
557f98d7 354 tree = vs->r_bad;
b364a9c0
DW
355 else
356 tree = NULL;
357 if (tree) {
93ab49dd
DW
358 errno = -bitmap_set(tree, start, length);
359 if (errno)
b364a9c0
DW
360 str_errno(ctx, ctx->mntpoint);
361 }
362
363 snprintf(descr, DESCR_BUFSZ, _("dev %d:%d ioerr @ %"PRIu64":%"PRIu64" "),
364 major(dev), minor(dev), start, length);
365
366 /* Go figure out which blocks are bad from the fsmap. */
367 memset(keys, 0, sizeof(struct fsmap) * 2);
368 keys->fmr_device = dev;
369 keys->fmr_physical = start;
370 (keys + 1)->fmr_device = dev;
371 (keys + 1)->fmr_physical = start + length - 1;
372 (keys + 1)->fmr_owner = ULLONG_MAX;
373 (keys + 1)->fmr_offset = ULLONG_MAX;
374 (keys + 1)->fmr_flags = UINT_MAX;
375 xfs_iterate_fsmap(ctx, descr, keys, xfs_check_rmap_error_report,
376 &start);
377}
378
379/* Schedule a read-verify of a (data block) extent. */
380static bool
381xfs_check_rmap(
382 struct scrub_ctx *ctx,
383 const char *descr,
384 struct fsmap *map,
385 void *arg)
386{
557f98d7 387 struct media_verify_state *vs = arg;
f1bb1696
DW
388 struct read_verify_pool *rvp;
389
557f98d7 390 rvp = xfs_dev_to_pool(ctx, vs, map->fmr_device);
b364a9c0
DW
391
392 dbg_printf("rmap dev %d:%d phys %"PRIu64" owner %"PRId64
393 " offset %"PRIu64" len %"PRIu64" flags 0x%x\n",
394 major(map->fmr_device), minor(map->fmr_device),
395 (uint64_t)map->fmr_physical, (int64_t)map->fmr_owner,
396 (uint64_t)map->fmr_offset, (uint64_t)map->fmr_length,
397 map->fmr_flags);
398
399 /* "Unknown" extents should be verified; they could be data. */
400 if ((map->fmr_flags & FMR_OF_SPECIAL_OWNER) &&
401 map->fmr_owner == XFS_FMR_OWN_UNKNOWN)
402 map->fmr_flags &= ~FMR_OF_SPECIAL_OWNER;
403
404 /*
405 * We only care about read-verifying data extents that have been
406 * written to disk. This means we can skip "special" owners
407 * (metadata), xattr blocks, unwritten extents, and extent maps.
408 * These should all get checked elsewhere in the scrubber.
409 */
410 if (map->fmr_flags & (FMR_OF_PREALLOC | FMR_OF_ATTR_FORK |
411 FMR_OF_EXTENT_MAP | FMR_OF_SPECIAL_OWNER))
412 goto out;
413
414 /* XXX: Filter out directory data blocks. */
415
416 /* Schedule the read verify command for (eventual) running. */
557f98d7 417 read_verify_schedule_io(rvp, map->fmr_physical, map->fmr_length, vs);
b364a9c0
DW
418
419out:
420 /* Is this the last extent? Fire off the read. */
421 if (map->fmr_flags & FMR_OF_LAST)
f1bb1696 422 read_verify_force_io(rvp);
b364a9c0
DW
423
424 return true;
425}
426
f1bb1696
DW
/*
 * Wait for read/verify actions to finish, then return # bytes checked.
 *
 * Flushes and destroys @rvp, so the pool must not be used afterwards.
 * A NULL pool is allowed and counts as zero bytes.
 */
static uint64_t
clean_pool(
	struct read_verify_pool	*rvp)
{
	uint64_t		bytes_verified = 0;

	if (rvp) {
		read_verify_pool_flush(rvp);
		bytes_verified = read_verify_bytes(rvp);
		read_verify_pool_destroy(rvp);
	}

	return bytes_verified;
}
442
b364a9c0
DW
443/*
444 * Read verify all the file data blocks in a filesystem. Since XFS doesn't
445 * do data checksums, we trust that the underlying storage will pass back
446 * an IO error if it can't retrieve whatever we previously stored there.
447 * If we hit an IO error, we'll record the bad blocks in a bitmap and then
448 * scan the extent maps of the entire fs tree to figure (and the unlinked
449 * inodes) out which files are now broken.
450 */
451bool
452xfs_scan_blocks(
453 struct scrub_ctx *ctx)
454{
557f98d7 455 struct media_verify_state vs = { NULL };
93ab49dd 456 bool moveon = false;
b364a9c0 457
93ab49dd
DW
458 errno = -bitmap_init(&vs.d_bad);
459 if (errno) {
b364a9c0 460 str_errno(ctx, ctx->mntpoint);
41c08606 461 goto out;
b364a9c0
DW
462 }
463
93ab49dd
DW
464 errno = -bitmap_init(&vs.r_bad);
465 if (errno) {
b364a9c0
DW
466 str_errno(ctx, ctx->mntpoint);
467 goto out_dbad;
468 }
469
557f98d7 470 vs.rvp_data = read_verify_pool_init(ctx, ctx->datadev,
3f9efb2e 471 ctx->mnt.fsgeom.blocksize, xfs_check_rmap_ioerr,
41c08606 472 scrub_nproc(ctx));
557f98d7 473 if (!vs.rvp_data) {
82377bde 474 str_info(ctx, ctx->mntpoint,
f1bb1696 475_("Could not create data device media verifier."));
b364a9c0
DW
476 goto out_rbad;
477 }
f1bb1696 478 if (ctx->logdev) {
557f98d7 479 vs.rvp_log = read_verify_pool_init(ctx, ctx->logdev,
3f9efb2e 480 ctx->mnt.fsgeom.blocksize, xfs_check_rmap_ioerr,
f1bb1696 481 scrub_nproc(ctx));
557f98d7 482 if (!vs.rvp_log) {
f1bb1696
DW
483 str_info(ctx, ctx->mntpoint,
484 _("Could not create log device media verifier."));
485 goto out_datapool;
486 }
487 }
488 if (ctx->rtdev) {
557f98d7 489 vs.rvp_realtime = read_verify_pool_init(ctx, ctx->rtdev,
3f9efb2e 490 ctx->mnt.fsgeom.blocksize, xfs_check_rmap_ioerr,
f1bb1696 491 scrub_nproc(ctx));
557f98d7 492 if (!vs.rvp_realtime) {
f1bb1696
DW
493 str_info(ctx, ctx->mntpoint,
494 _("Could not create realtime device media verifier."));
495 goto out_logpool;
496 }
497 }
557f98d7 498 moveon = xfs_scan_all_spacemaps(ctx, xfs_check_rmap, &vs);
b364a9c0 499 if (!moveon)
f1bb1696 500 goto out_rtpool;
557f98d7
DW
501 ctx->bytes_checked += clean_pool(vs.rvp_data);
502 ctx->bytes_checked += clean_pool(vs.rvp_log);
503 ctx->bytes_checked += clean_pool(vs.rvp_realtime);
b364a9c0
DW
504
505 /* Scan the whole dir tree to see what matches the bad extents. */
557f98d7 506 if (!bitmap_empty(vs.d_bad) || !bitmap_empty(vs.r_bad))
ed5f9cc7 507 moveon = xfs_report_verify_errors(ctx, &vs);
b364a9c0 508
557f98d7
DW
509 bitmap_free(&vs.r_bad);
510 bitmap_free(&vs.d_bad);
b364a9c0
DW
511 return moveon;
512
f1bb1696 513out_rtpool:
557f98d7
DW
514 if (vs.rvp_realtime)
515 read_verify_pool_destroy(vs.rvp_realtime);
f1bb1696 516out_logpool:
557f98d7
DW
517 if (vs.rvp_log)
518 read_verify_pool_destroy(vs.rvp_log);
f1bb1696 519out_datapool:
557f98d7 520 read_verify_pool_destroy(vs.rvp_data);
b364a9c0 521out_rbad:
557f98d7 522 bitmap_free(&vs.r_bad);
b364a9c0 523out_dbad:
557f98d7 524 bitmap_free(&vs.d_bad);
41c08606 525out:
b364a9c0
DW
526 return moveon;
527}
ed60d210
DW
528
529/* Estimate how much work we're going to do. */
530bool
531xfs_estimate_verify_work(
532 struct scrub_ctx *ctx,
533 uint64_t *items,
534 unsigned int *nr_threads,
535 int *rshift)
536{
537 unsigned long long d_blocks;
538 unsigned long long d_bfree;
539 unsigned long long r_blocks;
540 unsigned long long r_bfree;
541 unsigned long long f_files;
542 unsigned long long f_free;
543 bool moveon;
544
545 moveon = xfs_scan_estimate_blocks(ctx, &d_blocks, &d_bfree,
546 &r_blocks, &r_bfree, &f_files, &f_free);
547 if (!moveon)
548 return moveon;
549
550 *items = ((d_blocks - d_bfree) + (r_blocks - r_bfree)) << ctx->blocklog;
551 *nr_threads = disk_heads(ctx->datadev);
552 *rshift = 20;
553 return moveon;
554}