// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2011, 2012 STRATO.  All rights reserved.
 */

#include <linux/blkdev.h>
#include <linux/ratelimit.h>
#include <linux/sched/mm.h>
#include <crypto/hash.h>
#include "ordered-data.h"
#include "transaction.h"
#include "extent_io.h"
#include "dev-replace.h"
#include "block-group.h"
#include "accessors.h"
#include "file-item.h"
#include "raid-stripe-tree.h"
/*
 * This is only the first step towards a full-featured scrub. It reads all
 * extent and super block and verifies the checksums. In case a bad checksum
 * is found or the extent cannot be read, good data will be written back if
 * any can be found.
 *
 * Future enhancements:
 *  - In case an unrepairable extent is encountered, track which files are
 *    affected and report them
 *  - track and record media errors, throw out bad devices
 *  - add a mode to also read unallocated space
 */
/*
 * The following value only influences the performance.
 *
 * This determines how many stripes would be submitted in one go,
 * which is 512KiB (BTRFS_STRIPE_LEN * SCRUB_STRIPES_PER_GROUP).
 */
#define SCRUB_STRIPES_PER_GROUP		8

/*
 * How many groups we have for each sctx.
 *
 * This would be 8M per device, the same value as the old scrub in-flight bios
 * size limit.
 */
#define SCRUB_GROUPS_PER_SCTX		16

#define SCRUB_TOTAL_STRIPES		(SCRUB_GROUPS_PER_SCTX * SCRUB_STRIPES_PER_GROUP)
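
/*
 * Worked numbers (derived from the two comments above, assuming the usual
 * 64KiB BTRFS_STRIPE_LEN): one group is 8 * 64KiB = 512KiB of reads, and a
 * fully populated sctx keeps 16 * 512KiB = 8MiB in flight per device.
 */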
/*
 * The following value times PAGE_SIZE needs to be large enough to match the
 * largest node/leaf/sector size that shall be supported.
 */
#define SCRUB_MAX_SECTORS_PER_BLOCK	(BTRFS_MAX_METADATA_BLOCKSIZE / SZ_4K)
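
/*
 * Sanity note (assumption, not from the original source): with the 64KiB
 * BTRFS_MAX_METADATA_BLOCKSIZE this evaluates to 16 sectors per block.
 */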
/* Represent one sector and its needed info to verify the content. */
struct scrub_sector_verification {
	bool is_metadata;

	union {
		/*
		 * Csum pointer for data csum verification. Should point to a
		 * sector csum inside scrub_stripe::csums.
		 *
		 * NULL if this data sector has no csum.
		 */
		u8 *csum;

		/*
		 * Extra info for metadata verification. All sectors inside a
		 * tree block share the same generation.
		 */
		u64 generation;
	};
};
enum scrub_stripe_flags {
	/* Set when @mirror_num, @dev, @physical and @logical are set. */
	SCRUB_STRIPE_FLAG_INITIALIZED,

	/* Set when the read-repair is finished. */
	SCRUB_STRIPE_FLAG_REPAIR_DONE,

	/*
	 * Set for data stripes if it's triggered from P/Q stripe.
	 * During such scrub, we should not report errors in data stripes, nor
	 * update the accounting.
	 */
	SCRUB_STRIPE_FLAG_NO_REPORT,
};
#define SCRUB_STRIPE_PAGES		(BTRFS_STRIPE_LEN / PAGE_SIZE)
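
/*
 * Illustrative only: with the 64KiB BTRFS_STRIPE_LEN implied above and 4KiB
 * pages (assumption) this is 16 pages per stripe; larger pages shrink it.
 */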
/*
 * Represent one contiguous range with a length of BTRFS_STRIPE_LEN.
 */
struct scrub_stripe {
	struct scrub_ctx *sctx;
	struct btrfs_block_group *bg;

	struct page *pages[SCRUB_STRIPE_PAGES];
	struct scrub_sector_verification *sectors;

	struct btrfs_device *dev;
	u64 logical;
	u64 physical;

	u16 mirror_num;

	/* Should be BTRFS_STRIPE_LEN / sectorsize. */
	u16 nr_sectors;

	/*
	 * How many data/meta extents are in this stripe. Only for scrub status
	 * reporting purposes.
	 */
	u16 nr_data_extents;
	u16 nr_meta_extents;

	atomic_t pending_io;
	wait_queue_head_t io_wait;
	wait_queue_head_t repair_wait;

	/*
	 * Indicate the states of the stripe. Bits are defined in
	 * scrub_stripe_flags enum.
	 */
	unsigned long state;

	/* Indicate which sectors are covered by extent items. */
	unsigned long extent_sector_bitmap;

	/*
	 * The errors hit during the initial read of the stripe.
	 *
	 * Would be utilized for error reporting and repair.
	 *
	 * The remaining init_nr_* records the number of errors hit, only used
	 * by error reporting.
	 */
	unsigned long init_error_bitmap;
	unsigned int init_nr_io_errors;
	unsigned int init_nr_csum_errors;
	unsigned int init_nr_meta_errors;

	/*
	 * The following error bitmaps are all for the current status.
	 * Every time we submit a new read, these bitmaps may be updated.
	 *
	 * error_bitmap = io_error_bitmap | csum_error_bitmap | meta_error_bitmap;
	 *
	 * IO and csum errors can happen for both metadata and data.
	 */
	unsigned long error_bitmap;
	unsigned long io_error_bitmap;
	unsigned long csum_error_bitmap;
	unsigned long meta_error_bitmap;

	/* For writeback (repair or replace) error reporting. */
	unsigned long write_error_bitmap;

	/* Writeback can be concurrent, thus we need to protect the bitmap. */
	spinlock_t write_error_lock;

	/*
	 * Checksum for the whole stripe if this stripe is inside a data block
	 * group.
	 */
	u8 *csums;

	struct work_struct work;
};
struct scrub_ctx {
	struct scrub_stripe	stripes[SCRUB_TOTAL_STRIPES];
	struct scrub_stripe	*raid56_data_stripes;
	struct btrfs_fs_info	*fs_info;
	struct btrfs_path	extent_path;
	struct btrfs_path	csum_path;
	int			first_free;
	int			cur_stripe;
	atomic_t		cancel_req;
	int			readonly;

	/* State of IO submission throttling affecting the associated device */
	ktime_t			throttle_deadline;
	u64			throttle_sent;

	int			is_dev_replace;
	u64			write_pointer;

	struct mutex		wr_lock;
	struct btrfs_device	*wr_tgtdev;

	struct btrfs_scrub_progress stat;
	spinlock_t		stat_lock;

	/*
	 * Use a ref counter to avoid use-after-free issues. Scrub workers
	 * decrement bios_in_flight and workers_pending and then do a wakeup
	 * on the list_wait wait queue. We must ensure the main scrub task
	 * doesn't free the scrub context before or while the workers are
	 * doing the wakeup() call.
	 */
	refcount_t		refs;
};
struct scrub_warning {
	struct btrfs_path	*path;
	u64			extent_item_size;
	const char		*errstr;
	u64			physical;
	u64			logical;
	struct btrfs_device	*dev;
};
static void release_scrub_stripe(struct scrub_stripe *stripe)
{
	for (int i = 0; i < SCRUB_STRIPE_PAGES; i++) {
		if (stripe->pages[i])
			__free_page(stripe->pages[i]);
		stripe->pages[i] = NULL;
	}
	kfree(stripe->sectors);
	kfree(stripe->csums);
	stripe->sectors = NULL;
	stripe->csums = NULL;
}
static int init_scrub_stripe(struct btrfs_fs_info *fs_info,
			     struct scrub_stripe *stripe)
{
	int ret;

	memset(stripe, 0, sizeof(*stripe));

	stripe->nr_sectors = BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits;

	init_waitqueue_head(&stripe->io_wait);
	init_waitqueue_head(&stripe->repair_wait);
	atomic_set(&stripe->pending_io, 0);
	spin_lock_init(&stripe->write_error_lock);

	ret = btrfs_alloc_page_array(SCRUB_STRIPE_PAGES, stripe->pages);
	if (ret < 0)
		goto error;

	stripe->sectors = kcalloc(stripe->nr_sectors,
				  sizeof(struct scrub_sector_verification),
				  GFP_KERNEL);
	if (!stripe->sectors)
		goto error;

	stripe->csums = kcalloc(BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits,
				fs_info->csum_size, GFP_KERNEL);
	if (!stripe->csums)
		goto error;
	return 0;
error:
	release_scrub_stripe(stripe);
	return -ENOMEM;
}
static void wait_scrub_stripe_io(struct scrub_stripe *stripe)
{
	wait_event(stripe->io_wait, atomic_read(&stripe->pending_io) == 0);
}

static void scrub_put_ctx(struct scrub_ctx *sctx);

static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
{
	while (atomic_read(&fs_info->scrub_pause_req)) {
		mutex_unlock(&fs_info->scrub_lock);
		wait_event(fs_info->scrub_pause_wait,
			   atomic_read(&fs_info->scrub_pause_req) == 0);
		mutex_lock(&fs_info->scrub_lock);
	}
}

static void scrub_pause_on(struct btrfs_fs_info *fs_info)
{
	atomic_inc(&fs_info->scrubs_paused);
	wake_up(&fs_info->scrub_pause_wait);
}

static void scrub_pause_off(struct btrfs_fs_info *fs_info)
{
	mutex_lock(&fs_info->scrub_lock);
	__scrub_blocked_if_needed(fs_info);
	atomic_dec(&fs_info->scrubs_paused);
	mutex_unlock(&fs_info->scrub_lock);

	wake_up(&fs_info->scrub_pause_wait);
}

static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
{
	scrub_pause_on(fs_info);
	scrub_pause_off(fs_info);
}

static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
{
	int i;

	if (!sctx)
		return;

	for (i = 0; i < SCRUB_TOTAL_STRIPES; i++)
		release_scrub_stripe(&sctx->stripes[i]);

	kvfree(sctx);
}

static void scrub_put_ctx(struct scrub_ctx *sctx)
{
	if (refcount_dec_and_test(&sctx->refs))
		scrub_free_ctx(sctx);
}
static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
		struct btrfs_fs_info *fs_info, int is_dev_replace)
{
	struct scrub_ctx *sctx;
	int i;

	/*
	 * Since sctx has inline 128 stripes, it can go beyond 64K easily.  Use
	 * kvzalloc().
	 */
	sctx = kvzalloc(sizeof(*sctx), GFP_KERNEL);
	if (!sctx)
		return ERR_PTR(-ENOMEM);
	refcount_set(&sctx->refs, 1);
	sctx->is_dev_replace = is_dev_replace;
	sctx->fs_info = fs_info;
	sctx->extent_path.search_commit_root = 1;
	sctx->extent_path.skip_locking = 1;
	sctx->csum_path.search_commit_root = 1;
	sctx->csum_path.skip_locking = 1;
	for (i = 0; i < SCRUB_TOTAL_STRIPES; i++) {
		int ret;

		ret = init_scrub_stripe(fs_info, &sctx->stripes[i]);
		if (ret < 0)
			goto nomem;
		sctx->stripes[i].sctx = sctx;
	}
	sctx->first_free = 0;
	atomic_set(&sctx->cancel_req, 0);

	spin_lock_init(&sctx->stat_lock);
	sctx->throttle_deadline = 0;

	mutex_init(&sctx->wr_lock);
	if (is_dev_replace) {
		WARN_ON(!fs_info->dev_replace.tgtdev);
		sctx->wr_tgtdev = fs_info->dev_replace.tgtdev;
	}

	return sctx;

nomem:
	scrub_free_ctx(sctx);
	return ERR_PTR(-ENOMEM);
}
static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes,
				     u64 root, void *warn_ctx)
{
	u32 nlink;
	int ret;
	int i;
	unsigned nofs_flag;
	struct extent_buffer *eb;
	struct btrfs_inode_item *inode_item;
	struct scrub_warning *swarn = warn_ctx;
	struct btrfs_fs_info *fs_info = swarn->dev->fs_info;
	struct inode_fs_paths *ipath = NULL;
	struct btrfs_root *local_root;
	struct btrfs_key key;

	local_root = btrfs_get_fs_root(fs_info, root, true);
	if (IS_ERR(local_root)) {
		ret = PTR_ERR(local_root);
		goto err;
	}

	/*
	 * this makes the path point to (inum INODE_ITEM ioff)
	 */
	key.objectid = inum;
	key.type = BTRFS_INODE_ITEM_KEY;
	key.offset = 0;

	ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0);
	if (ret) {
		btrfs_put_root(local_root);
		btrfs_release_path(swarn->path);
		goto err;
	}

	eb = swarn->path->nodes[0];
	inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
					struct btrfs_inode_item);
	nlink = btrfs_inode_nlink(eb, inode_item);
	btrfs_release_path(swarn->path);

	/*
	 * init_path might indirectly call vmalloc, or use GFP_KERNEL. Scrub
	 * uses GFP_NOFS in this context, so we keep it consistent but it does
	 * not seem to be strictly necessary.
	 */
	nofs_flag = memalloc_nofs_save();
	ipath = init_ipath(4096, local_root, swarn->path);
	memalloc_nofs_restore(nofs_flag);
	if (IS_ERR(ipath)) {
		btrfs_put_root(local_root);
		ret = PTR_ERR(ipath);
		ipath = NULL;
		goto err;
	}
	ret = paths_from_inode(inum, ipath);

	if (ret < 0)
		goto err;

	/*
	 * we deliberately ignore the bit ipath might have been too small to
	 * hold all of the paths here
	 */
	for (i = 0; i < ipath->fspath->elem_cnt; ++i)
		btrfs_warn_in_rcu(fs_info,
"%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %u, links %u (path: %s)",
				  swarn->errstr, swarn->logical,
				  btrfs_dev_name(swarn->dev),
				  swarn->physical,
				  root, inum, offset,
				  fs_info->sectorsize, nlink,
				  (char *)(unsigned long)ipath->fspath->val[i]);

	btrfs_put_root(local_root);
	free_ipath(ipath);
	return 0;

err:
	btrfs_warn_in_rcu(fs_info,
			  "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d",
			  swarn->errstr, swarn->logical,
			  btrfs_dev_name(swarn->dev),
			  swarn->physical,
			  root, inum, offset, ret);

	free_ipath(ipath);
	return 0;
}
static void scrub_print_common_warning(const char *errstr, struct btrfs_device *dev,
				       bool is_super, u64 logical, u64 physical)
{
	struct btrfs_fs_info *fs_info = dev->fs_info;
	struct btrfs_path *path;
	struct btrfs_key found_key;
	struct extent_buffer *eb;
	struct btrfs_extent_item *ei;
	struct scrub_warning swarn;
	u64 flags = 0;
	u32 item_size;
	int ret;

	/* Super block error, no need to search extent tree. */
	if (is_super) {
		btrfs_warn_in_rcu(fs_info, "%s on device %s, physical %llu",
				  errstr, btrfs_dev_name(dev), physical);
		return;
	}
	path = btrfs_alloc_path();
	if (!path)
		return;

	swarn.physical = physical;
	swarn.logical = logical;
	swarn.errstr = errstr;
	swarn.dev = NULL;

	ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
				  &flags);
	if (ret < 0)
		goto out;

	swarn.extent_item_size = found_key.offset;

	eb = path->nodes[0];
	ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
	item_size = btrfs_item_size(eb, path->slots[0]);

	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
		unsigned long ptr = 0;
		u8 ref_level;
		u64 ref_root;

		while (true) {
			ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
						      item_size, &ref_root,
						      &ref_level);
			if (ret < 0) {
				btrfs_warn(fs_info,
			"failed to resolve tree backref for logical %llu: %d",
					   swarn.logical, ret);
				break;
			}
			if (ret > 0)
				break;
			btrfs_warn_in_rcu(fs_info,
"%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu",
					  errstr, swarn.logical, btrfs_dev_name(dev),
					  swarn.physical, (ref_level ? "node" : "leaf"),
					  ref_level, ref_root);
		}
		btrfs_release_path(path);
	} else {
		struct btrfs_backref_walk_ctx ctx = { 0 };

		btrfs_release_path(path);

		ctx.bytenr = found_key.objectid;
		ctx.extent_item_pos = swarn.logical - found_key.objectid;
		ctx.fs_info = fs_info;

		swarn.path = path;
		swarn.dev = dev;

		iterate_extent_inodes(&ctx, true, scrub_print_warning_inode, &swarn);
	}

out:
	btrfs_free_path(path);
}
static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical)
{
	int ret = 0;
	u64 length;

	if (!btrfs_is_zoned(sctx->fs_info))
		return 0;

	if (!btrfs_dev_is_sequential(sctx->wr_tgtdev, physical))
		return 0;

	if (sctx->write_pointer < physical) {
		length = physical - sctx->write_pointer;

		ret = btrfs_zoned_issue_zeroout(sctx->wr_tgtdev,
						sctx->write_pointer, length);
		if (!ret)
			sctx->write_pointer = physical;
	}
	return ret;
}
static struct page *scrub_stripe_get_page(struct scrub_stripe *stripe, int sector_nr)
{
	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
	int page_index = (sector_nr << fs_info->sectorsize_bits) >> PAGE_SHIFT;

	return stripe->pages[page_index];
}

static unsigned int scrub_stripe_get_page_offset(struct scrub_stripe *stripe,
						 int sector_nr)
{
	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;

	return offset_in_page(sector_nr << fs_info->sectorsize_bits);
}
static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr)
{
	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
	const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits;
	const u64 logical = stripe->logical + (sector_nr << fs_info->sectorsize_bits);
	const struct page *first_page = scrub_stripe_get_page(stripe, sector_nr);
	const unsigned int first_off = scrub_stripe_get_page_offset(stripe, sector_nr);
	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
	u8 on_disk_csum[BTRFS_CSUM_SIZE];
	u8 calculated_csum[BTRFS_CSUM_SIZE];
	struct btrfs_header *header;

	/*
	 * Here we don't have a good way to attach the pages (and subpages)
	 * to a dummy extent buffer, thus we have to directly grab the members
	 * from the pages.
	 */
	header = (struct btrfs_header *)(page_address(first_page) + first_off);
	memcpy(on_disk_csum, header->csum, fs_info->csum_size);

	if (logical != btrfs_stack_header_bytenr(header)) {
		bitmap_set(&stripe->csum_error_bitmap, sector_nr, sectors_per_tree);
		bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
		btrfs_warn_rl(fs_info,
		"tree block %llu mirror %u has bad bytenr, has %llu want %llu",
			      logical, stripe->mirror_num,
			      btrfs_stack_header_bytenr(header), logical);
		return;
	}
	if (memcmp(header->fsid, fs_info->fs_devices->metadata_uuid,
		   BTRFS_FSID_SIZE) != 0) {
		bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
		bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
		btrfs_warn_rl(fs_info,
		"tree block %llu mirror %u has bad fsid, has %pU want %pU",
			      logical, stripe->mirror_num,
			      header->fsid, fs_info->fs_devices->fsid);
		return;
	}
	if (memcmp(header->chunk_tree_uuid, fs_info->chunk_tree_uuid,
		   BTRFS_UUID_SIZE) != 0) {
		bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
		bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
		btrfs_warn_rl(fs_info,
		"tree block %llu mirror %u has bad chunk tree uuid, has %pU want %pU",
			      logical, stripe->mirror_num,
			      header->chunk_tree_uuid, fs_info->chunk_tree_uuid);
		return;
	}

	/* Now check tree block csum. */
	shash->tfm = fs_info->csum_shash;
	crypto_shash_init(shash);
	crypto_shash_update(shash, page_address(first_page) + first_off +
			    BTRFS_CSUM_SIZE, fs_info->sectorsize - BTRFS_CSUM_SIZE);

	for (int i = sector_nr + 1; i < sector_nr + sectors_per_tree; i++) {
		struct page *page = scrub_stripe_get_page(stripe, i);
		unsigned int page_off = scrub_stripe_get_page_offset(stripe, i);

		crypto_shash_update(shash, page_address(page) + page_off,
				    fs_info->sectorsize);
	}

	crypto_shash_final(shash, calculated_csum);
	if (memcmp(calculated_csum, on_disk_csum, fs_info->csum_size) != 0) {
		bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
		bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
		btrfs_warn_rl(fs_info,
		"tree block %llu mirror %u has bad csum, has " CSUM_FMT " want " CSUM_FMT,
			      logical, stripe->mirror_num,
			      CSUM_FMT_VALUE(fs_info->csum_size, on_disk_csum),
			      CSUM_FMT_VALUE(fs_info->csum_size, calculated_csum));
		return;
	}
	if (stripe->sectors[sector_nr].generation !=
	    btrfs_stack_header_generation(header)) {
		bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
		bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
		btrfs_warn_rl(fs_info,
		"tree block %llu mirror %u has bad generation, has %llu want %llu",
			      logical, stripe->mirror_num,
			      btrfs_stack_header_generation(header),
			      stripe->sectors[sector_nr].generation);
		return;
	}
	bitmap_clear(&stripe->error_bitmap, sector_nr, sectors_per_tree);
	bitmap_clear(&stripe->csum_error_bitmap, sector_nr, sectors_per_tree);
	bitmap_clear(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
}
static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr)
{
	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
	struct scrub_sector_verification *sector = &stripe->sectors[sector_nr];
	const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits;
	struct page *page = scrub_stripe_get_page(stripe, sector_nr);
	unsigned int pgoff = scrub_stripe_get_page_offset(stripe, sector_nr);
	u8 csum_buf[BTRFS_CSUM_SIZE];
	int ret;

	ASSERT(sector_nr >= 0 && sector_nr < stripe->nr_sectors);

	/* Sector not utilized, skip it. */
	if (!test_bit(sector_nr, &stripe->extent_sector_bitmap))
		return;

	/* IO error, no need to check. */
	if (test_bit(sector_nr, &stripe->io_error_bitmap))
		return;

	/* Metadata, verify the full tree block. */
	if (sector->is_metadata) {
		/*
		 * Check if the tree block crosses the stripe boundary.  If
		 * crossed the boundary, we cannot verify it but only give a
		 * warning.
		 *
		 * This can only happen on a very old filesystem where chunks
		 * are not ensured to be stripe aligned.
		 */
		if (unlikely(sector_nr + sectors_per_tree > stripe->nr_sectors)) {
			btrfs_warn_rl(fs_info,
			"tree block at %llu crosses stripe boundary %llu",
				      stripe->logical +
				      (sector_nr << fs_info->sectorsize_bits),
				      stripe->logical);
			return;
		}
		scrub_verify_one_metadata(stripe, sector_nr);
		return;
	}

	/*
	 * Data is easier, we just verify the data csum (if we have it).  For
	 * cases without csum, we have no other choice but to trust it.
	 */
	if (!sector->csum) {
		clear_bit(sector_nr, &stripe->error_bitmap);
		return;
	}

	ret = btrfs_check_sector_csum(fs_info, page, pgoff, csum_buf, sector->csum);
	if (ret < 0) {
		set_bit(sector_nr, &stripe->csum_error_bitmap);
		set_bit(sector_nr, &stripe->error_bitmap);
	} else {
		clear_bit(sector_nr, &stripe->csum_error_bitmap);
		clear_bit(sector_nr, &stripe->error_bitmap);
	}
}
/* Verify specified sectors of a stripe. */
static void scrub_verify_one_stripe(struct scrub_stripe *stripe, unsigned long bitmap)
{
	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
	const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits;
	int sector_nr;

	for_each_set_bit(sector_nr, &bitmap, stripe->nr_sectors) {
		scrub_verify_one_sector(stripe, sector_nr);
		if (stripe->sectors[sector_nr].is_metadata)
			sector_nr += sectors_per_tree - 1;
	}
}
static int calc_sector_number(struct scrub_stripe *stripe, struct bio_vec *first_bvec)
{
	int i;

	for (i = 0; i < stripe->nr_sectors; i++) {
		if (scrub_stripe_get_page(stripe, i) == first_bvec->bv_page &&
		    scrub_stripe_get_page_offset(stripe, i) == first_bvec->bv_offset)
			break;
	}
	ASSERT(i < stripe->nr_sectors);
	return i;
}
/*
 * Repair read is different to the regular read:
 *
 * - Only reads the failed sectors
 * - May have extra blocksize limits
 */
static void scrub_repair_read_endio(struct btrfs_bio *bbio)
{
	struct scrub_stripe *stripe = bbio->private;
	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
	struct bio_vec *bvec;
	int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio));
	u32 bio_size = 0;
	int i;

	ASSERT(sector_nr < stripe->nr_sectors);

	bio_for_each_bvec_all(bvec, &bbio->bio, i)
		bio_size += bvec->bv_len;

	if (bbio->bio.bi_status) {
		bitmap_set(&stripe->io_error_bitmap, sector_nr,
			   bio_size >> fs_info->sectorsize_bits);
		bitmap_set(&stripe->error_bitmap, sector_nr,
			   bio_size >> fs_info->sectorsize_bits);
	} else {
		bitmap_clear(&stripe->io_error_bitmap, sector_nr,
			     bio_size >> fs_info->sectorsize_bits);
	}
	bio_put(&bbio->bio);
	if (atomic_dec_and_test(&stripe->pending_io))
		wake_up(&stripe->io_wait);
}
static int calc_next_mirror(int mirror, int num_copies)
{
	ASSERT(mirror <= num_copies);
	return (mirror + 1 > num_copies) ? 1 : mirror + 1;
}
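
/*
 * Illustration (not from the original source): with num_copies == 3 the
 * rotation goes 1 -> 2 -> 3 -> 1, so callers can visit every mirror by
 * starting at stripe->mirror_num and stopping once it wraps back around.
 */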
static void scrub_stripe_submit_repair_read(struct scrub_stripe *stripe,
					    int mirror, int blocksize, bool wait)
{
	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
	struct btrfs_bio *bbio = NULL;
	const unsigned long old_error_bitmap = stripe->error_bitmap;
	int i;

	ASSERT(stripe->mirror_num >= 1);
	ASSERT(atomic_read(&stripe->pending_io) == 0);

	for_each_set_bit(i, &old_error_bitmap, stripe->nr_sectors) {
		struct page *page;
		int pgoff;
		int ret;

		page = scrub_stripe_get_page(stripe, i);
		pgoff = scrub_stripe_get_page_offset(stripe, i);

		/* The current sector cannot be merged, submit the bio. */
		if (bbio && ((i > 0 && !test_bit(i - 1, &stripe->error_bitmap)) ||
			     bbio->bio.bi_iter.bi_size >= blocksize)) {
			ASSERT(bbio->bio.bi_iter.bi_size);
			atomic_inc(&stripe->pending_io);
			btrfs_submit_bio(bbio, mirror);
			if (wait)
				wait_scrub_stripe_io(stripe);
			bbio = NULL;
		}

		if (!bbio) {
			bbio = btrfs_bio_alloc(stripe->nr_sectors, REQ_OP_READ,
					       fs_info, scrub_repair_read_endio, stripe);
			bbio->bio.bi_iter.bi_sector = (stripe->logical +
				(i << fs_info->sectorsize_bits)) >> SECTOR_SHIFT;
		}

		ret = bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff);
		ASSERT(ret == fs_info->sectorsize);
	}
	if (bbio) {
		ASSERT(bbio->bio.bi_iter.bi_size);
		atomic_inc(&stripe->pending_io);
		btrfs_submit_bio(bbio, mirror);
		if (wait)
			wait_scrub_stripe_io(stripe);
	}
}
static void scrub_stripe_report_errors(struct scrub_ctx *sctx,
				       struct scrub_stripe *stripe)
{
	static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
				      DEFAULT_RATELIMIT_BURST);
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	struct btrfs_device *dev = NULL;
	u64 physical = 0;
	int nr_data_sectors = 0;
	int nr_meta_sectors = 0;
	int nr_nodatacsum_sectors = 0;
	int nr_repaired_sectors = 0;
	int sector_nr;

	if (test_bit(SCRUB_STRIPE_FLAG_NO_REPORT, &stripe->state))
		return;

	/*
	 * Init needed infos for error reporting.
	 *
	 * Although our scrub_stripe infrastructure is mostly based on btrfs_submit_bio()
	 * thus no need for dev/physical, error reporting still needs dev and physical.
	 */
	if (!bitmap_empty(&stripe->init_error_bitmap, stripe->nr_sectors)) {
		u64 mapped_len = fs_info->sectorsize;
		struct btrfs_io_context *bioc = NULL;
		int stripe_index = stripe->mirror_num - 1;
		int ret;

		/* For scrub, our mirror_num should always start at 1. */
		ASSERT(stripe->mirror_num >= 1);
		ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
				      stripe->logical, &mapped_len, &bioc,
				      NULL, NULL);
		/*
		 * If we failed, dev will be NULL, and later detailed reports
		 * will just be skipped.
		 */
		if (ret < 0)
			goto skip;
		physical = bioc->stripes[stripe_index].physical;
		dev = bioc->stripes[stripe_index].dev;
		btrfs_put_bioc(bioc);
	}

skip:
	for_each_set_bit(sector_nr, &stripe->extent_sector_bitmap, stripe->nr_sectors) {
		bool repaired = false;

		if (stripe->sectors[sector_nr].is_metadata) {
			nr_meta_sectors++;
		} else {
			nr_data_sectors++;
			if (!stripe->sectors[sector_nr].csum)
				nr_nodatacsum_sectors++;
		}

		if (test_bit(sector_nr, &stripe->init_error_bitmap) &&
		    !test_bit(sector_nr, &stripe->error_bitmap)) {
			nr_repaired_sectors++;
			repaired = true;
		}

		/* Good sector from the beginning, nothing need to be done. */
		if (!test_bit(sector_nr, &stripe->init_error_bitmap))
			continue;

		/*
		 * Report error for the corrupted sectors.  If repaired, just
		 * output the repaired message.
		 */
		if (repaired) {
			if (dev) {
				btrfs_err_rl_in_rcu(fs_info,
			"fixed up error at logical %llu on dev %s physical %llu",
					    stripe->logical, btrfs_dev_name(dev),
					    physical);
			} else {
				btrfs_err_rl_in_rcu(fs_info,
			"fixed up error at logical %llu on mirror %u",
					    stripe->logical, stripe->mirror_num);
			}
			continue;
		}

		/* The remaining are all for unrepaired. */
		if (dev) {
			btrfs_err_rl_in_rcu(fs_info,
	"unable to fixup (regular) error at logical %llu on dev %s physical %llu",
				    stripe->logical, btrfs_dev_name(dev),
				    physical);
		} else {
			btrfs_err_rl_in_rcu(fs_info,
	"unable to fixup (regular) error at logical %llu on mirror %u",
				    stripe->logical, stripe->mirror_num);
		}

		if (test_bit(sector_nr, &stripe->io_error_bitmap))
			if (__ratelimit(&rs) && dev)
				scrub_print_common_warning("i/o error", dev, false,
						     stripe->logical, physical);
		if (test_bit(sector_nr, &stripe->csum_error_bitmap))
			if (__ratelimit(&rs) && dev)
				scrub_print_common_warning("checksum error", dev, false,
						     stripe->logical, physical);
		if (test_bit(sector_nr, &stripe->meta_error_bitmap))
			if (__ratelimit(&rs) && dev)
				scrub_print_common_warning("header error", dev, false,
						     stripe->logical, physical);
	}

	spin_lock(&sctx->stat_lock);
	sctx->stat.data_extents_scrubbed += stripe->nr_data_extents;
	sctx->stat.tree_extents_scrubbed += stripe->nr_meta_extents;
	sctx->stat.data_bytes_scrubbed += nr_data_sectors << fs_info->sectorsize_bits;
	sctx->stat.tree_bytes_scrubbed += nr_meta_sectors << fs_info->sectorsize_bits;
	sctx->stat.no_csum += nr_nodatacsum_sectors;
	sctx->stat.read_errors += stripe->init_nr_io_errors;
	sctx->stat.csum_errors += stripe->init_nr_csum_errors;
	sctx->stat.verify_errors += stripe->init_nr_meta_errors;
	sctx->stat.uncorrectable_errors +=
		bitmap_weight(&stripe->error_bitmap, stripe->nr_sectors);
	sctx->stat.corrected_errors += nr_repaired_sectors;
	spin_unlock(&sctx->stat_lock);
}
static void scrub_write_sectors(struct scrub_ctx *sctx, struct scrub_stripe *stripe,
				unsigned long write_bitmap, bool dev_replace);
/*
 * The main entrance for all read related scrub work, including:
 *
 * - Wait for the initial read to finish
 * - Verify and locate any bad sectors
 * - Go through the remaining mirrors and try to read as large blocksize as
 *   possible
 * - Go through all mirrors (including the failed mirror) sector-by-sector
 * - Submit writeback for repaired sectors
 *
 * Writeback for dev-replace does not happen here, it needs extra
 * synchronization for zoned devices.
 */
static void scrub_stripe_read_repair_worker(struct work_struct *work)
{
	struct scrub_stripe *stripe = container_of(work, struct scrub_stripe, work);
	struct scrub_ctx *sctx = stripe->sctx;
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	int num_copies = btrfs_num_copies(fs_info, stripe->bg->start,
					  stripe->bg->length);
	int mirror;
	int i;

	ASSERT(stripe->mirror_num > 0);

	wait_scrub_stripe_io(stripe);
	scrub_verify_one_stripe(stripe, stripe->extent_sector_bitmap);
	/* Save the initial failed bitmap for later repair and report usage. */
	stripe->init_error_bitmap = stripe->error_bitmap;
	stripe->init_nr_io_errors = bitmap_weight(&stripe->io_error_bitmap,
						  stripe->nr_sectors);
	stripe->init_nr_csum_errors = bitmap_weight(&stripe->csum_error_bitmap,
						    stripe->nr_sectors);
	stripe->init_nr_meta_errors = bitmap_weight(&stripe->meta_error_bitmap,
						    stripe->nr_sectors);

	if (bitmap_empty(&stripe->init_error_bitmap, stripe->nr_sectors))
		goto out;

	/*
	 * Try all remaining mirrors.
	 *
	 * Here we still try to read as large block as possible, as this is
	 * faster and we have extra safety nets to rely on.
	 */
	for (mirror = calc_next_mirror(stripe->mirror_num, num_copies);
	     mirror != stripe->mirror_num;
	     mirror = calc_next_mirror(mirror, num_copies)) {
		const unsigned long old_error_bitmap = stripe->error_bitmap;

		scrub_stripe_submit_repair_read(stripe, mirror,
						BTRFS_STRIPE_LEN, false);
		wait_scrub_stripe_io(stripe);
		scrub_verify_one_stripe(stripe, old_error_bitmap);
		if (bitmap_empty(&stripe->error_bitmap, stripe->nr_sectors))
			goto out;
	}

	/*
	 * Last safety net, try re-checking all mirrors, including the failed
	 * one, sector-by-sector.
	 *
	 * As if one sector failed the drive's internal csum, the whole read
	 * containing the offending sector would be marked as error.
	 * Thus here we do sector-by-sector read.
	 *
	 * This can be slow, thus we only try it as the last resort.
	 */

	for (i = 0, mirror = stripe->mirror_num;
	     i < num_copies;
	     i++, mirror = calc_next_mirror(mirror, num_copies)) {
		const unsigned long old_error_bitmap = stripe->error_bitmap;

		scrub_stripe_submit_repair_read(stripe, mirror,
						fs_info->sectorsize, true);
		wait_scrub_stripe_io(stripe);
		scrub_verify_one_stripe(stripe, old_error_bitmap);
		if (bitmap_empty(&stripe->error_bitmap, stripe->nr_sectors))
			goto out;
	}
out:
	/*
	 * Submit the repaired sectors.  For zoned case, we cannot do repair
	 * in-place, but queue the bg to be relocated.
	 */
	if (btrfs_is_zoned(fs_info)) {
		if (!bitmap_empty(&stripe->error_bitmap, stripe->nr_sectors))
			btrfs_repair_one_zone(fs_info, sctx->stripes[0].bg->start);
	} else if (!sctx->readonly) {
		unsigned long repaired;

		bitmap_andnot(&repaired, &stripe->init_error_bitmap,
			      &stripe->error_bitmap, stripe->nr_sectors);
		scrub_write_sectors(sctx, stripe, repaired, false);
		wait_scrub_stripe_io(stripe);
	}

	scrub_stripe_report_errors(sctx, stripe);
	set_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state);
	wake_up(&stripe->repair_wait);
}
static void scrub_read_endio(struct btrfs_bio *bbio)
{
	struct scrub_stripe *stripe = bbio->private;

	if (bbio->bio.bi_status) {
		bitmap_set(&stripe->io_error_bitmap, 0, stripe->nr_sectors);
		bitmap_set(&stripe->error_bitmap, 0, stripe->nr_sectors);
	} else {
		bitmap_clear(&stripe->io_error_bitmap, 0, stripe->nr_sectors);
	}
	bio_put(&bbio->bio);
	if (atomic_dec_and_test(&stripe->pending_io)) {
		wake_up(&stripe->io_wait);
		INIT_WORK(&stripe->work, scrub_stripe_read_repair_worker);
		queue_work(stripe->bg->fs_info->scrub_workers, &stripe->work);
	}
}
static void scrub_write_endio(struct btrfs_bio *bbio)
{
	struct scrub_stripe *stripe = bbio->private;
	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
	struct bio_vec *bvec;
	int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio));
	u32 bio_size = 0;
	int i;

	bio_for_each_bvec_all(bvec, &bbio->bio, i)
		bio_size += bvec->bv_len;

	if (bbio->bio.bi_status) {
		unsigned long flags;

		spin_lock_irqsave(&stripe->write_error_lock, flags);
		bitmap_set(&stripe->write_error_bitmap, sector_nr,
			   bio_size >> fs_info->sectorsize_bits);
		spin_unlock_irqrestore(&stripe->write_error_lock, flags);
	}
	bio_put(&bbio->bio);

	if (atomic_dec_and_test(&stripe->pending_io))
		wake_up(&stripe->io_wait);
}
static void scrub_submit_write_bio(struct scrub_ctx *sctx,
				   struct scrub_stripe *stripe,
				   struct btrfs_bio *bbio, bool dev_replace)
{
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	u32 bio_len = bbio->bio.bi_iter.bi_size;
	u32 bio_off = (bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT) -
		      stripe->logical;

	fill_writer_pointer_gap(sctx, stripe->physical + bio_off);
	atomic_inc(&stripe->pending_io);
	btrfs_submit_repair_write(bbio, stripe->mirror_num, dev_replace);
	if (!btrfs_is_zoned(fs_info))
		return;
	/*
	 * For zoned writeback, queue depth must be 1, thus we must wait for
	 * the write to finish before the next write.
	 */
	wait_scrub_stripe_io(stripe);

	/*
	 * And also need to update the write pointer if write finished
	 * successfully.
	 */
	if (!test_bit(bio_off >> fs_info->sectorsize_bits,
		      &stripe->write_error_bitmap))
		sctx->write_pointer += bio_len;
}
/*
 * Submit the write bio(s) for the sectors specified by @write_bitmap.
 *
 * Here we utilize btrfs_submit_repair_write(), which has some extra benefits:
 *
 * - Only needs logical bytenr and mirror_num
 *   Just like the scrub read path
 *
 * - Would only result in writes to the specified mirror
 *   Unlike the regular writeback path, which would write back to all stripes
 *
 * - Handle dev-replace and read-repair writeback differently
 */
static void scrub_write_sectors(struct scrub_ctx *sctx, struct scrub_stripe *stripe,
				unsigned long write_bitmap, bool dev_replace)
{
	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
	struct btrfs_bio *bbio = NULL;
	int sector_nr;

	for_each_set_bit(sector_nr, &write_bitmap, stripe->nr_sectors) {
		struct page *page = scrub_stripe_get_page(stripe, sector_nr);
		unsigned int pgoff = scrub_stripe_get_page_offset(stripe, sector_nr);
		int ret;

		/* We should only writeback sectors covered by an extent. */
		ASSERT(test_bit(sector_nr, &stripe->extent_sector_bitmap));

		/* Cannot merge with previous sector, submit the current one. */
		if (bbio && sector_nr && !test_bit(sector_nr - 1, &write_bitmap)) {
			scrub_submit_write_bio(sctx, stripe, bbio, dev_replace);
			bbio = NULL;
		}
		if (!bbio) {
			bbio = btrfs_bio_alloc(stripe->nr_sectors, REQ_OP_WRITE,
					       fs_info, scrub_write_endio, stripe);
			bbio->bio.bi_iter.bi_sector = (stripe->logical +
				(sector_nr << fs_info->sectorsize_bits)) >>
				SECTOR_SHIFT;
		}
		ret = bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff);
		ASSERT(ret == fs_info->sectorsize);
	}
	if (bbio)
		scrub_submit_write_bio(sctx, stripe, bbio, dev_replace);
}
/*
 * Throttling of IO submission, bandwidth-limit based, the timeslice is 1
 * second.  Limit can be set via /sys/fs/UUID/devinfo/devid/scrub_speed_max.
 */
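/*
 * Worked example (illustrative numbers, not from the original source): with
 * scrub_speed_max set to 64MiB/s the code below picks div = 4, so each epoch
 * lasts 1000ms / 4 = 250ms and allows 64MiB / 4 = 16MiB of submitted IO
 * before the submitter sleeps until the deadline.
 */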
static void scrub_throttle_dev_io(struct scrub_ctx *sctx, struct btrfs_device *device,
				  unsigned int bio_size)
{
	const int time_slice = 1000;
	s64 delta;
	ktime_t now;
	u32 div;
	u64 bwlimit;

	bwlimit = READ_ONCE(device->scrub_speed_max);
	if (bwlimit == 0)
		return;

	/*
	 * Slice is divided into intervals when the IO is submitted, adjust by
	 * bwlimit and maximum of 64 intervals.
	 */
	div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024)));
	div = min_t(u32, 64, div);

	/* Start new epoch, set deadline */
	now = ktime_get();
	if (sctx->throttle_deadline == 0) {
		sctx->throttle_deadline = ktime_add_ms(now, time_slice / div);
		sctx->throttle_sent = 0;
	}

	/* Still in the time to send? */
	if (ktime_before(now, sctx->throttle_deadline)) {
		/* If current bio is within the limit, send it */
		sctx->throttle_sent += bio_size;
		if (sctx->throttle_sent <= div_u64(bwlimit, div))
			return;

		/* We're over the limit, sleep until the rest of the slice */
		delta = ktime_ms_delta(sctx->throttle_deadline, now);
	} else {
		/* New request after deadline, start new epoch */
		delta = 0;
	}

	if (delta > 0) {
		long timeout;

		timeout = div_u64(delta * HZ, 1000);
		schedule_timeout_interruptible(timeout);
	}

	/* Next call will start the deadline period */
	sctx->throttle_deadline = 0;
}
/*
 * Given a physical address, this will calculate its logical offset.
 * If this is a parity stripe, it will return the most left data stripe's
 * logical offset.
 *
 * Return 0 if it is a data stripe, 1 means parity stripe.
 */
static int get_raid56_logic_offset(u64 physical, int num,
				   struct map_lookup *map, u64 *offset,
				   u64 *stripe_start)
{
	int i;
	int j = 0;
	u64 last_offset;
	u32 stripe_nr;
	u32 rot;
	const int data_stripes = nr_data_stripes(map);

	last_offset = (physical - map->stripes[num].physical) * data_stripes;
	if (stripe_start)
		*stripe_start = last_offset;

	*offset = last_offset;
	for (i = 0; i < data_stripes; i++) {
		u32 stripe_index;

		*offset = last_offset + btrfs_stripe_nr_to_offset(i);

		stripe_nr = (u32)(*offset >> BTRFS_STRIPE_LEN_SHIFT) / data_stripes;

		/* Work out the disk rotation on this stripe-set */
		rot = stripe_nr % map->num_stripes;
		/* calculate which stripe this data locates */
		rot += i;
		stripe_index = rot % map->num_stripes;
		if (stripe_index == num)
			return 0;
		if (stripe_index < num)
			j++;
	}
	*offset = last_offset + btrfs_stripe_nr_to_offset(j);
	return 1;
}
/*
 * Return 0 if the extent item range covers any byte of the range.
 * Return <0 if the extent item is before @search_start.
 * Return >0 if the extent item is after @search_start + @search_len.
 */
static int compare_extent_item_range(struct btrfs_path *path,
				     u64 search_start, u64 search_len)
{
	struct btrfs_fs_info *fs_info = path->nodes[0]->fs_info;
	u64 len;
	struct btrfs_key key;

	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
	ASSERT(key.type == BTRFS_EXTENT_ITEM_KEY ||
	       key.type == BTRFS_METADATA_ITEM_KEY);
	if (key.type == BTRFS_METADATA_ITEM_KEY)
		len = fs_info->nodesize;
	else
		len = key.offset;

	if (key.objectid + len <= search_start)
		return -1;
	if (key.objectid >= search_start + search_len)
		return 1;
	return 0;
}
/*
 * Locate one extent item which covers any byte in range
 * [@search_start, @search_start + @search_length)
 *
 * If the path is not initialized, we will initialize the search by doing
 * a btrfs_search_slot().
 * If the path is already initialized, we will use the path as the initial
 * slot, to avoid duplicated btrfs_search_slot() calls.
 *
 * NOTE: If an extent item starts before @search_start, we will still
 * return the extent item. This is for data extent crossing stripe boundary.
 *
 * Return 0 if we found such extent item, and @path will point to the extent item.
 * Return >0 if no such extent item can be found, and @path will be released.
 * Return <0 if hit fatal error, and @path will be released.
 */
static int find_first_extent_item(struct btrfs_root *extent_root,
				  struct btrfs_path *path,
				  u64 search_start, u64 search_len)
{
	struct btrfs_fs_info *fs_info = extent_root->fs_info;
	struct btrfs_key key;
	int ret;

	/* Continue using the existing path */
	if (path->nodes[0])
		goto search_forward;

	if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
		key.type = BTRFS_METADATA_ITEM_KEY;
	else
		key.type = BTRFS_EXTENT_ITEM_KEY;
	key.objectid = search_start;
	key.offset = (u64)-1;

	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
	if (ret < 0)
		return ret;

	ASSERT(ret > 0);
	/*
	 * Here we intentionally pass 0 as @min_objectid, as there could be
	 * an extent item starting before @search_start.
	 */
	ret = btrfs_previous_extent_item(extent_root, path, 0);
	if (ret < 0)
		return ret;
	/*
	 * No matter whether we have found an extent item, the next loop will
	 * properly do every check on the key.
	 */
search_forward:
	while (true) {
		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
		if (key.objectid >= search_start + search_len)
			break;
		if (key.type != BTRFS_METADATA_ITEM_KEY &&
		    key.type != BTRFS_EXTENT_ITEM_KEY)
			goto next;

		ret = compare_extent_item_range(path, search_start, search_len);
		if (ret == 0)
			return ret;
		if (ret > 0)
			break;
next:
		path->slots[0]++;
		if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
			ret = btrfs_next_leaf(extent_root, path);
			if (ret) {
				/* Either no more items or a fatal error */
				btrfs_release_path(path);
				return ret;
			}
		}
	}
	btrfs_release_path(path);
	return 1;
}
static void get_extent_info(struct btrfs_path *path, u64 *extent_start_ret,
			    u64 *size_ret, u64 *flags_ret, u64 *generation_ret)
{
	struct btrfs_key key;
	struct btrfs_extent_item *ei;

	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
	ASSERT(key.type == BTRFS_METADATA_ITEM_KEY ||
	       key.type == BTRFS_EXTENT_ITEM_KEY);
	*extent_start_ret = key.objectid;
	if (key.type == BTRFS_METADATA_ITEM_KEY)
		*size_ret = path->nodes[0]->fs_info->nodesize;
	else
		*size_ret = key.offset;
	ei = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_extent_item);
	*flags_ret = btrfs_extent_flags(path->nodes[0], ei);
	*generation_ret = btrfs_extent_generation(path->nodes[0], ei);
}
static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical,
					u64 physical, u64 physical_end)
{
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	int ret = 0;

	if (!btrfs_is_zoned(fs_info))
		return 0;

	mutex_lock(&sctx->wr_lock);
	if (sctx->write_pointer < physical_end) {
		ret = btrfs_sync_zone_write_pointer(sctx->wr_tgtdev, logical,
						    physical,
						    sctx->write_pointer);
		if (ret)
			btrfs_err(fs_info,
				  "zoned: failed to recover write pointer");
	}
	mutex_unlock(&sctx->wr_lock);
	btrfs_dev_clear_zone_empty(sctx->wr_tgtdev, physical);

	return ret;
}
static void fill_one_extent_info(struct btrfs_fs_info *fs_info,
				 struct scrub_stripe *stripe,
				 u64 extent_start, u64 extent_len,
				 u64 extent_flags, u64 extent_gen)
{
	for (u64 cur_logical = max(stripe->logical, extent_start);
	     cur_logical < min(stripe->logical + BTRFS_STRIPE_LEN,
			       extent_start + extent_len);
	     cur_logical += fs_info->sectorsize) {
		const int nr_sector = (cur_logical - stripe->logical) >>
				      fs_info->sectorsize_bits;
		struct scrub_sector_verification *sector =
						&stripe->sectors[nr_sector];

		set_bit(nr_sector, &stripe->extent_sector_bitmap);
		if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
			sector->is_metadata = true;
			sector->generation = extent_gen;
		}
	}
}
static void scrub_stripe_reset_bitmaps(struct scrub_stripe *stripe)
{
	stripe->extent_sector_bitmap = 0;
	stripe->init_error_bitmap = 0;
	stripe->init_nr_io_errors = 0;
	stripe->init_nr_csum_errors = 0;
	stripe->init_nr_meta_errors = 0;
	stripe->error_bitmap = 0;
	stripe->io_error_bitmap = 0;
	stripe->csum_error_bitmap = 0;
	stripe->meta_error_bitmap = 0;
}
/*
 * Locate one stripe which has at least one extent in its range.
 *
 * Return 0 if found such stripe, and store its info into @stripe.
 * Return >0 if there is no such stripe in the specified range.
 * Return <0 for error.
 */
static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg,
					struct btrfs_path *extent_path,
					struct btrfs_path *csum_path,
					struct btrfs_device *dev, u64 physical,
					int mirror_num, u64 logical_start,
					u32 logical_len,
					struct scrub_stripe *stripe)
{
	struct btrfs_fs_info *fs_info = bg->fs_info;
	struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bg->start);
	struct btrfs_root *csum_root = btrfs_csum_root(fs_info, bg->start);
	const u64 logical_end = logical_start + logical_len;
	u64 cur_logical = logical_start;
	u64 stripe_end;
	u64 extent_start;
	u64 extent_len;
	u64 extent_flags;
	u64 extent_gen;
	int ret;

	memset(stripe->sectors, 0, sizeof(struct scrub_sector_verification) *
				   stripe->nr_sectors);
	scrub_stripe_reset_bitmaps(stripe);

	/* The range must be inside the bg. */
	ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length);

	ret = find_first_extent_item(extent_root, extent_path, logical_start,
				     logical_len);
	/* Either error or not found. */
	if (ret)
		goto out;
	get_extent_info(extent_path, &extent_start, &extent_len, &extent_flags,
			&extent_gen);
	if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
		stripe->nr_meta_extents++;
	if (extent_flags & BTRFS_EXTENT_FLAG_DATA)
		stripe->nr_data_extents++;
	cur_logical = max(extent_start, cur_logical);

	/*
	 * Round down to stripe boundary.
	 *
	 * The extra calculation against bg->start is to handle block groups
	 * whose logical bytenr is not BTRFS_STRIPE_LEN aligned.
	 */
	stripe->logical = round_down(cur_logical - bg->start, BTRFS_STRIPE_LEN) +
			  bg->start;
	stripe->physical = physical + stripe->logical - logical_start;
	stripe->dev = dev;
	stripe->bg = bg;
	stripe->mirror_num = mirror_num;
	stripe_end = stripe->logical + BTRFS_STRIPE_LEN - 1;

	/* Fill the first extent info into stripe->sectors[] array. */
	fill_one_extent_info(fs_info, stripe, extent_start, extent_len,
			     extent_flags, extent_gen);
	cur_logical = extent_start + extent_len;

	/* Fill the extent info for the remaining sectors. */
	while (cur_logical <= stripe_end) {
		ret = find_first_extent_item(extent_root, extent_path, cur_logical,
					     stripe_end - cur_logical + 1);
		if (ret < 0)
			goto out;
		if (ret > 0) {
			ret = 0;
			break;
		}
		get_extent_info(extent_path, &extent_start, &extent_len,
				&extent_flags, &extent_gen);
		if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
			stripe->nr_meta_extents++;
		if (extent_flags & BTRFS_EXTENT_FLAG_DATA)
			stripe->nr_data_extents++;
		fill_one_extent_info(fs_info, stripe, extent_start, extent_len,
				     extent_flags, extent_gen);
		cur_logical = extent_start + extent_len;
	}

	/* Now fill the data csum. */
	if (bg->flags & BTRFS_BLOCK_GROUP_DATA) {
		int sector_nr;
		unsigned long csum_bitmap = 0;

		/* Csum space should have already been allocated. */
		ASSERT(stripe->csums);

		/*
		 * Our csum bitmap should be large enough, as BTRFS_STRIPE_LEN
		 * should contain at most 16 sectors.
		 */
		ASSERT(BITS_PER_LONG >= BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits);

		ret = btrfs_lookup_csums_bitmap(csum_root, csum_path,
						stripe->logical, stripe_end,
						stripe->csums, &csum_bitmap);
		if (ret < 0)
			goto out;
		if (ret > 0)
			ret = 0;

		for_each_set_bit(sector_nr, &csum_bitmap, stripe->nr_sectors) {
			stripe->sectors[sector_nr].csum = stripe->csums +
				sector_nr * fs_info->csum_size;
		}
	}
	set_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state);
out:
	return ret;
}
static void scrub_reset_stripe(struct scrub_stripe *stripe)
{
	scrub_stripe_reset_bitmaps(stripe);

	stripe->nr_meta_extents = 0;
	stripe->nr_data_extents = 0;
	stripe->state = 0;

	for (int i = 0; i < stripe->nr_sectors; i++) {
		stripe->sectors[i].is_metadata = false;
		stripe->sectors[i].csum = NULL;
		stripe->sectors[i].generation = 0;
	}
}
static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx,
					    struct scrub_stripe *stripe)
{
	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
	struct btrfs_bio *bbio = NULL;
	u64 stripe_len = BTRFS_STRIPE_LEN;
	int mirror = stripe->mirror_num;
	int i;

	atomic_inc(&stripe->pending_io);

	for_each_set_bit(i, &stripe->extent_sector_bitmap, stripe->nr_sectors) {
		struct page *page = scrub_stripe_get_page(stripe, i);
		unsigned int pgoff = scrub_stripe_get_page_offset(stripe, i);

		/* The current sector cannot be merged, submit the bio. */
		if (bbio &&
		    ((i > 0 &&
		      !test_bit(i - 1, &stripe->extent_sector_bitmap)) ||
		     bbio->bio.bi_iter.bi_size >= stripe_len)) {
			ASSERT(bbio->bio.bi_iter.bi_size);
			atomic_inc(&stripe->pending_io);
			btrfs_submit_bio(bbio, mirror);
			bbio = NULL;
		}

		if (!bbio) {
			struct btrfs_io_stripe io_stripe = {};
			struct btrfs_io_context *bioc = NULL;
			const u64 logical = stripe->logical +
					    (i << fs_info->sectorsize_bits);
			int err;

			bbio = btrfs_bio_alloc(stripe->nr_sectors, REQ_OP_READ,
					       fs_info, scrub_read_endio, stripe);
			bbio->bio.bi_iter.bi_sector = logical >> SECTOR_SHIFT;

			io_stripe.is_scrub = true;
			err = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
					      &stripe_len, &bioc, &io_stripe,
					      &mirror);
			btrfs_put_bioc(bioc);
			if (err < 0) {
				btrfs_bio_end_io(bbio,
						 errno_to_blk_status(err));
				return;
			}
		}

		__bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff);
	}

	if (bbio) {
		ASSERT(bbio->bio.bi_iter.bi_size);
		atomic_inc(&stripe->pending_io);
		btrfs_submit_bio(bbio, mirror);
	}

	if (atomic_dec_and_test(&stripe->pending_io)) {
		wake_up(&stripe->io_wait);
		INIT_WORK(&stripe->work, scrub_stripe_read_repair_worker);
		queue_work(stripe->bg->fs_info->scrub_workers, &stripe->work);
	}
}
static void scrub_submit_initial_read(struct scrub_ctx *sctx,
				      struct scrub_stripe *stripe)
{
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	struct btrfs_bio *bbio;
	int mirror = stripe->mirror_num;

	ASSERT(stripe->mirror_num > 0);
	ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state));

	if (btrfs_need_stripe_tree_update(fs_info, stripe->bg->flags)) {
		scrub_submit_extent_sector_read(sctx, stripe);
		return;
	}

	bbio = btrfs_bio_alloc(SCRUB_STRIPE_PAGES, REQ_OP_READ, fs_info,
			       scrub_read_endio, stripe);

	/* Read the whole stripe. */
	bbio->bio.bi_iter.bi_sector = stripe->logical >> SECTOR_SHIFT;
	for (int i = 0; i < BTRFS_STRIPE_LEN >> PAGE_SHIFT; i++) {
		int ret;

		ret = bio_add_page(&bbio->bio, stripe->pages[i], PAGE_SIZE, 0);
		/* We should have allocated enough bio vectors. */
		ASSERT(ret == PAGE_SIZE);
	}
	atomic_inc(&stripe->pending_io);

	/*
	 * For dev-replace, either user asks to avoid the source dev, or
	 * the device is missing, we try the next mirror instead.
	 */
	if (sctx->is_dev_replace &&
	    (fs_info->dev_replace.cont_reading_from_srcdev_mode ==
	     BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID ||
	     !stripe->dev->bdev)) {
		int num_copies = btrfs_num_copies(fs_info, stripe->bg->start,
						  stripe->bg->length);

		mirror = calc_next_mirror(mirror, num_copies);
	}
	btrfs_submit_bio(bbio, mirror);
}
static bool stripe_has_metadata_error(struct scrub_stripe *stripe)
{
	int i;

	for_each_set_bit(i, &stripe->error_bitmap, stripe->nr_sectors) {
		if (stripe->sectors[i].is_metadata) {
			struct btrfs_fs_info *fs_info = stripe->bg->fs_info;

			btrfs_err(fs_info,
			"stripe %llu has unrepaired metadata sector at %llu",
				  stripe->logical,
				  stripe->logical + (i << fs_info->sectorsize_bits));
			return true;
		}
	}
	return false;
}
static void submit_initial_group_read(struct scrub_ctx *sctx,
				      unsigned int first_slot,
				      unsigned int nr_stripes)
{
	struct blk_plug plug;

	ASSERT(first_slot < SCRUB_TOTAL_STRIPES);
	ASSERT(first_slot + nr_stripes <= SCRUB_TOTAL_STRIPES);

	scrub_throttle_dev_io(sctx, sctx->stripes[0].dev,
			      btrfs_stripe_nr_to_offset(nr_stripes));
	blk_start_plug(&plug);
	for (int i = 0; i < nr_stripes; i++) {
		struct scrub_stripe *stripe = &sctx->stripes[first_slot + i];

		/* Those stripes should be initialized. */
		ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state));
		scrub_submit_initial_read(sctx, stripe);
	}
	blk_finish_plug(&plug);
}
static int flush_scrub_stripes(struct scrub_ctx *sctx)
{
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	struct scrub_stripe *stripe;
	const int nr_stripes = sctx->cur_stripe;
	int ret = 0;

	if (!nr_stripes)
		return 0;

	ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &sctx->stripes[0].state));

	/* Submit the stripes which are populated but not submitted. */
	if (nr_stripes % SCRUB_STRIPES_PER_GROUP) {
		const int first_slot = round_down(nr_stripes, SCRUB_STRIPES_PER_GROUP);

		submit_initial_group_read(sctx, first_slot, nr_stripes - first_slot);
	}

	for (int i = 0; i < nr_stripes; i++) {
		stripe = &sctx->stripes[i];

		wait_event(stripe->repair_wait,
			   test_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state));
	}

	/* Submit for dev-replace. */
	if (sctx->is_dev_replace) {
		/*
		 * For dev-replace, if we know there is something wrong with
		 * metadata, we should immediately abort.
		 */
		for (int i = 0; i < nr_stripes; i++) {
			if (stripe_has_metadata_error(&sctx->stripes[i])) {
				ret = -EIO;
				goto out;
			}
		}
		for (int i = 0; i < nr_stripes; i++) {
			unsigned long good;

			stripe = &sctx->stripes[i];

			ASSERT(stripe->dev == fs_info->dev_replace.srcdev);

			bitmap_andnot(&good, &stripe->extent_sector_bitmap,
				      &stripe->error_bitmap, stripe->nr_sectors);
			scrub_write_sectors(sctx, stripe, good, true);
		}
	}

	/* Wait for the above writebacks to finish. */
	for (int i = 0; i < nr_stripes; i++) {
		stripe = &sctx->stripes[i];

		wait_scrub_stripe_io(stripe);
		scrub_reset_stripe(stripe);
	}
out:
	sctx->cur_stripe = 0;
	return ret;
}
static void raid56_scrub_wait_endio(struct bio *bio)
{
	complete(bio->bi_private);
}
static int queue_scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group *bg,
			      struct btrfs_device *dev, int mirror_num,
			      u64 logical, u32 length, u64 physical,
			      u64 *found_logical_ret)
{
	struct scrub_stripe *stripe;
	int ret;

	/*
	 * There should always be one slot left, as caller filling the last
	 * slot should flush them all.
	 */
	ASSERT(sctx->cur_stripe < SCRUB_TOTAL_STRIPES);

	stripe = &sctx->stripes[sctx->cur_stripe];
	scrub_reset_stripe(stripe);
	ret = scrub_find_fill_first_stripe(bg, &sctx->extent_path,
					   &sctx->csum_path, dev, physical,
					   mirror_num, logical, length, stripe);
	/* Either >0 as no more extents or <0 for error. */
	if (ret)
		return ret;
	if (found_logical_ret)
		*found_logical_ret = stripe->logical;
	sctx->cur_stripe++;

	/* We filled one group, submit it. */
	if (sctx->cur_stripe % SCRUB_STRIPES_PER_GROUP == 0) {
		const int first_slot = sctx->cur_stripe - SCRUB_STRIPES_PER_GROUP;

		submit_initial_group_read(sctx, first_slot, SCRUB_STRIPES_PER_GROUP);
	}

	/* Last slot used, flush them all. */
	if (sctx->cur_stripe == SCRUB_TOTAL_STRIPES)
		return flush_scrub_stripes(sctx);
	return 0;
}
static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx,
				      struct btrfs_device *scrub_dev,
				      struct btrfs_block_group *bg,
				      struct map_lookup *map,
				      u64 full_stripe_start)
{
	DECLARE_COMPLETION_ONSTACK(io_done);
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	struct btrfs_raid_bio *rbio;
	struct btrfs_io_context *bioc = NULL;
	struct btrfs_path extent_path = { 0 };
	struct btrfs_path csum_path = { 0 };
	struct bio *bio;
	struct scrub_stripe *stripe;
	bool all_empty = true;
	const int data_stripes = nr_data_stripes(map);
	unsigned long extent_bitmap = 0;
	u64 length = btrfs_stripe_nr_to_offset(data_stripes);
	int ret;

	ASSERT(sctx->raid56_data_stripes);

	/*
	 * For data stripe search, we cannot re-use the same extent/csum paths,
	 * as the data stripe bytenr may be smaller than previous extent.  Thus
	 * we have to use our own extent/csum paths.
	 */
	extent_path.search_commit_root = 1;
	extent_path.skip_locking = 1;
	csum_path.search_commit_root = 1;
	csum_path.skip_locking = 1;

	for (int i = 0; i < data_stripes; i++) {
		int stripe_index;
		int rot;
		u64 physical;

		stripe = &sctx->raid56_data_stripes[i];
		rot = div_u64(full_stripe_start - bg->start,
			      data_stripes) >> BTRFS_STRIPE_LEN_SHIFT;
		stripe_index = (i + rot) % map->num_stripes;
		physical = map->stripes[stripe_index].physical +
			   btrfs_stripe_nr_to_offset(rot);

		scrub_reset_stripe(stripe);
		set_bit(SCRUB_STRIPE_FLAG_NO_REPORT, &stripe->state);
		ret = scrub_find_fill_first_stripe(bg, &extent_path, &csum_path,
				map->stripes[stripe_index].dev, physical, 1,
				full_stripe_start + btrfs_stripe_nr_to_offset(i),
				BTRFS_STRIPE_LEN, stripe);
		if (ret < 0)
			goto out;
		/*
		 * No extent in this data stripe, need to manually mark them
		 * initialized to make later read submission happy.
		 */
		if (ret > 0) {
			stripe->logical = full_stripe_start +
					  btrfs_stripe_nr_to_offset(i);
			stripe->dev = map->stripes[stripe_index].dev;
			stripe->mirror_num = 1;
			set_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state);
		}
	}

	/* Check if all data stripes are empty. */
	for (int i = 0; i < data_stripes; i++) {
		stripe = &sctx->raid56_data_stripes[i];
		if (!bitmap_empty(&stripe->extent_sector_bitmap, stripe->nr_sectors)) {
			all_empty = false;
			break;
		}
	}
	if (all_empty) {
		ret = 0;
		goto out;
	}

	for (int i = 0; i < data_stripes; i++) {
		stripe = &sctx->raid56_data_stripes[i];
		scrub_submit_initial_read(sctx, stripe);
	}
	for (int i = 0; i < data_stripes; i++) {
		stripe = &sctx->raid56_data_stripes[i];

		wait_event(stripe->repair_wait,
			   test_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state));
	}
	/* For now, no zoned support for RAID56. */
	ASSERT(!btrfs_is_zoned(sctx->fs_info));

	/*
	 * Now all data stripes are properly verified.  Check if we have any
	 * unrepaired, if so abort immediately or we could further corrupt the
	 * P/Q stripes.
	 *
	 * During the loop, also populate extent_bitmap.
	 */
	for (int i = 0; i < data_stripes; i++) {
		unsigned long error;

		stripe = &sctx->raid56_data_stripes[i];

		/*
		 * We should only check the errors where there is an extent.
		 * As we may hit an empty data stripe while it's missing.
		 */
		bitmap_and(&error, &stripe->error_bitmap,
			   &stripe->extent_sector_bitmap, stripe->nr_sectors);
		if (!bitmap_empty(&error, stripe->nr_sectors)) {
			btrfs_err(fs_info,
"unrepaired sectors detected, full stripe %llu data stripe %u errors %*pbl",
				  full_stripe_start, i, stripe->nr_sectors,
				  &error);
			ret = -EIO;
			goto out;
		}
		bitmap_or(&extent_bitmap, &extent_bitmap,
			  &stripe->extent_sector_bitmap, stripe->nr_sectors);
	}

	/* Now we can check and regenerate the P/Q stripe. */
	bio = bio_alloc(NULL, 1, REQ_OP_READ, GFP_NOFS);
	bio->bi_iter.bi_sector = full_stripe_start >> SECTOR_SHIFT;
	bio->bi_private = &io_done;
	bio->bi_end_io = raid56_scrub_wait_endio;

	btrfs_bio_counter_inc_blocked(fs_info);
	ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, full_stripe_start,
			      &length, &bioc, NULL, NULL);
	if (ret < 0) {
		btrfs_put_bioc(bioc);
		btrfs_bio_counter_dec(fs_info);
		goto out;
	}
	rbio = raid56_parity_alloc_scrub_rbio(bio, bioc, scrub_dev, &extent_bitmap,
				BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits);
	btrfs_put_bioc(bioc);
	if (!rbio) {
		ret = -ENOMEM;
		btrfs_bio_counter_dec(fs_info);
		goto out;
	}
	/* Use the recovered stripes as cache to avoid read them from disk again. */
	for (int i = 0; i < data_stripes; i++) {
		stripe = &sctx->raid56_data_stripes[i];

		raid56_parity_cache_data_pages(rbio, stripe->pages,
				full_stripe_start + (i << BTRFS_STRIPE_LEN_SHIFT));
	}
	raid56_parity_submit_scrub_rbio(rbio);
	wait_for_completion_io(&io_done);
	ret = blk_status_to_errno(bio->bi_status);
	bio_put(bio);
	btrfs_bio_counter_dec(fs_info);

	btrfs_release_path(&extent_path);
	btrfs_release_path(&csum_path);
out:
	return ret;
}

/*
 * Scrub one range which can only have a simple mirror based profile.
 * (Including all ranges in SINGLE/DUP/RAID1/RAID1C*, and each stripe in
 *  RAID0/RAID10).
 *
 * Since we may need to handle a subset of a block group, we need the
 * @logical_start and @logical_length parameters.
 */
static int scrub_simple_mirror(struct scrub_ctx *sctx,
			       struct btrfs_block_group *bg,
			       struct map_lookup *map,
			       u64 logical_start, u64 logical_length,
			       struct btrfs_device *device,
			       u64 physical, int mirror_num)
{
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	const u64 logical_end = logical_start + logical_length;
	u64 cur_logical = logical_start;
	int ret = 0;

	/* The range must be inside the bg. */
	ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length);

	/* Go through each extent item inside the logical range. */
	while (cur_logical < logical_end) {
		u64 found_logical = U64_MAX;
		u64 cur_physical = physical + cur_logical - logical_start;

		/* Canceled? */
		if (atomic_read(&fs_info->scrub_cancel_req) ||
		    atomic_read(&sctx->cancel_req)) {
			ret = -ECANCELED;
			break;
		}
		/* Paused? */
		if (atomic_read(&fs_info->scrub_pause_req)) {
			/* Push queued extents */
			scrub_blocked_if_needed(fs_info);
		}
		/* Block group removed? */
		spin_lock(&bg->lock);
		if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags)) {
			spin_unlock(&bg->lock);
			ret = 0;
			break;
		}
		spin_unlock(&bg->lock);

		ret = queue_scrub_stripe(sctx, bg, device, mirror_num,
					 cur_logical, logical_end - cur_logical,
					 cur_physical, &found_logical);
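		/*
		 * queue_scrub_stripe() returns 0 and sets @found_logical to the
		 * logical bytenr of the queued stripe, > 0 when there is no
		 * more extent in the remaining range, and < 0 on error.
		 */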
		if (ret > 0) {
			/* No more extent, just update the accounting. */
			sctx->stat.last_physical = physical + logical_length;
			ret = 0;
			break;
		}
		if (ret < 0)
			break;

		cur_logical = found_logical + BTRFS_STRIPE_LEN;

		/* Don't hold the CPU for too long. */
		cond_resched();
	}
	return ret;
}

/* Calculate the full stripe length for simple stripe based profiles. */
static u64 simple_stripe_full_stripe_len(const struct map_lookup *map)
{
	ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
			    BTRFS_BLOCK_GROUP_RAID10));

	return btrfs_stripe_nr_to_offset(map->num_stripes / map->sub_stripes);
}

/* Get the logical bytenr for the stripe. */
static u64 simple_stripe_get_logical(struct map_lookup *map,
				     struct btrfs_block_group *bg,
				     int stripe_index)
{
	ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
			    BTRFS_BLOCK_GROUP_RAID10));
	ASSERT(stripe_index < map->num_stripes);

	/*
	 * (stripe_index / sub_stripes) gives how many data stripes we need to
	 * skip.
	 */
	return btrfs_stripe_nr_to_offset(stripe_index / map->sub_stripes) +
	       bg->start;
}

/* Get the mirror number for the stripe. */
static int simple_stripe_mirror_num(struct map_lookup *map, int stripe_index)
{
	ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
			    BTRFS_BLOCK_GROUP_RAID10));
	ASSERT(stripe_index < map->num_stripes);

	/* For RAID0, it's fixed to 1, for RAID10 it's 0,1,0,1... */
	return stripe_index % map->sub_stripes + 1;
}
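
/*
 * Example: for a RAID10 chunk with num_stripes = 4 and sub_stripes = 2,
 * device stripes 2 and 3 mirror each other at logical
 * bg->start + BTRFS_STRIPE_LEN, with mirror numbers 1 and 2 respectively.
 */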

static int scrub_simple_stripe(struct scrub_ctx *sctx,
			       struct btrfs_block_group *bg,
			       struct map_lookup *map,
			       struct btrfs_device *device,
			       int stripe_index)
{
	const u64 logical_increment = simple_stripe_full_stripe_len(map);
	const u64 orig_logical = simple_stripe_get_logical(map, bg, stripe_index);
	const u64 orig_physical = map->stripes[stripe_index].physical;
	const int mirror_num = simple_stripe_mirror_num(map, stripe_index);
	u64 cur_logical = orig_logical;
	u64 cur_physical = orig_physical;
	int ret = 0;

	while (cur_logical < bg->start + bg->length) {
		/*
		 * Inside each stripe, RAID0 is just SINGLE, and RAID10 is
		 * just RAID1, so we can reuse scrub_simple_mirror() to scrub
		 * this stripe.
		 */
		ret = scrub_simple_mirror(sctx, bg, map, cur_logical,
					  BTRFS_STRIPE_LEN, device, cur_physical,
					  mirror_num);
		if (ret)
			return ret;
		/* Skip to next stripe which belongs to the target device. */
		cur_logical += logical_increment;
		/* For physical offset, we just go to next stripe. */
		cur_physical += BTRFS_STRIPE_LEN;
	}
	return ret;
}
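
/*
 * Example: for a 2-disk RAID0 chunk, logical_increment is 2 * BTRFS_STRIPE_LEN
 * (128K), so each device scrubs every other 64K stripe of the logical range
 * while its physical cursor advances by one BTRFS_STRIPE_LEN per iteration.
 */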

static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
					   struct btrfs_block_group *bg,
					   struct extent_map *em,
					   struct btrfs_device *scrub_dev,
					   int stripe_index)
{
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	struct map_lookup *map = em->map_lookup;
	const u64 profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK;
	const u64 chunk_logical = bg->start;
	int ret;
	int ret2;
	u64 physical = map->stripes[stripe_index].physical;
	const u64 dev_stripe_len = btrfs_calc_stripe_length(em);
	const u64 physical_end = physical + dev_stripe_len;
	u64 logical;
	u64 logic_end;
	/* The logical increment after finishing one stripe. */
	u64 increment;
	/* Offset inside the chunk. */
	u64 offset;
	u64 stripe_logical;
	int stop_loop = 0;

	/* Extent_path should be released by now. */
	ASSERT(sctx->extent_path.nodes[0] == NULL);

	scrub_blocked_if_needed(fs_info);

	if (sctx->is_dev_replace &&
	    btrfs_dev_is_sequential(sctx->wr_tgtdev, physical)) {
		mutex_lock(&sctx->wr_lock);
		sctx->write_pointer = physical;
		mutex_unlock(&sctx->wr_lock);
	}

	/* Prepare the extra data stripes used by RAID56. */
	if (profile & BTRFS_BLOCK_GROUP_RAID56_MASK) {
		ASSERT(sctx->raid56_data_stripes == NULL);

		sctx->raid56_data_stripes = kcalloc(nr_data_stripes(map),
						    sizeof(struct scrub_stripe),
						    GFP_KERNEL);
		if (!sctx->raid56_data_stripes) {
			ret = -ENOMEM;
			goto out;
		}
		for (int i = 0; i < nr_data_stripes(map); i++) {
			ret = init_scrub_stripe(fs_info,
						&sctx->raid56_data_stripes[i]);
			if (ret < 0)
				goto out;
			sctx->raid56_data_stripes[i].bg = bg;
			sctx->raid56_data_stripes[i].sctx = sctx;
		}
	}
	/*
	 * There used to be a big double loop to handle all profiles using the
	 * same routine, which grows larger and more gross over time.
	 *
	 * So here we handle each profile differently, so simpler profiles
	 * have simpler scrubbing functions.
	 */
	if (!(profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10 |
			 BTRFS_BLOCK_GROUP_RAID56_MASK))) {
		/*
		 * Above check rules out all complex profiles, the remaining
		 * profiles are SINGLE|DUP|RAID1|RAID1C*, which is simple
		 * mirrored duplication without stripe.
		 *
		 * Only @physical and @mirror_num need to be calculated using
		 * @stripe_index.
		 */
		ret = scrub_simple_mirror(sctx, bg, map, bg->start, bg->length,
				scrub_dev, map->stripes[stripe_index].physical,
				stripe_index + 1);
		offset = 0;
		goto out;
	}
	if (profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) {
		ret = scrub_simple_stripe(sctx, bg, map, scrub_dev, stripe_index);
		offset = btrfs_stripe_nr_to_offset(stripe_index / map->sub_stripes);
		goto out;
	}

	/* Only RAID56 goes through the old code. */
	ASSERT(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK);
	ret = 0;

	/* Calculate the logical end of the stripe. */
	get_raid56_logic_offset(physical_end, stripe_index,
				map, &logic_end, NULL);
	logic_end += chunk_logical;

	/* Initialize @offset in case we need to go to the out: label. */
	get_raid56_logic_offset(physical, stripe_index, map, &offset, NULL);
	increment = btrfs_stripe_nr_to_offset(nr_data_stripes(map));
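	/*
	 * Example: on a 4-disk RAID6 chunk (2 data stripes), finishing one 64K
	 * device stripe advances the logical cursor by 2 * BTRFS_STRIPE_LEN =
	 * 128K, since each device holds only one stripe of every full stripe.
	 */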

	/*
	 * Due to the rotation, for RAID56 it's better to iterate each stripe
	 * using their physical offset.
	 */
	while (physical < physical_end) {
		ret = get_raid56_logic_offset(physical, stripe_index, map,
					      &logical, &stripe_logical);
		logical += chunk_logical;
		if (ret) {
			/* It is a parity stripe. */
			stripe_logical += chunk_logical;
			ret = scrub_raid56_parity_stripe(sctx, scrub_dev, bg,
							 map, stripe_logical);
			if (ret)
				goto out;
			goto next;
		}

		/*
		 * Now we're at a data stripe, scrub each extent in the range.
		 *
		 * At this stage, if we ignore the repair part, inside each data
		 * stripe it is no different than SINGLE profile.
		 * We can reuse scrub_simple_mirror() here, as the repair part
		 * is still based on @mirror_num.
		 */
		ret = scrub_simple_mirror(sctx, bg, map, logical, BTRFS_STRIPE_LEN,
					  scrub_dev, physical, 1);
		if (ret < 0)
			goto out;
next:
		logical += increment;
		physical += BTRFS_STRIPE_LEN;
		spin_lock(&sctx->stat_lock);
		if (stop_loop)
			sctx->stat.last_physical =
				map->stripes[stripe_index].physical + dev_stripe_len;
		else
			sctx->stat.last_physical = physical;
		spin_unlock(&sctx->stat_lock);
		if (stop_loop)
			break;
	}
out:
	ret2 = flush_scrub_stripes(sctx);
	if (!ret)
		ret = ret2;
	btrfs_release_path(&sctx->extent_path);
	btrfs_release_path(&sctx->csum_path);

	if (sctx->raid56_data_stripes) {
		for (int i = 0; i < nr_data_stripes(map); i++)
			release_scrub_stripe(&sctx->raid56_data_stripes[i]);
		kfree(sctx->raid56_data_stripes);
		sctx->raid56_data_stripes = NULL;
	}

	if (sctx->is_dev_replace && ret >= 0) {
		ret2 = sync_write_pointer_for_zoned(sctx,
				chunk_logical + offset,
				map->stripes[stripe_index].physical,
				physical_end);
		if (ret2)
			ret = ret2;
	}

	return ret < 0 ? ret : 0;
}
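
/*
 * Scrub the part of the chunk (block group) @bg that is backed by the device
 * extent of @scrub_dev at @dev_offset.  The extent map is only needed to find
 * which stripe of the chunk that device extent belongs to.
 */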
static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
					  struct btrfs_block_group *bg,
					  struct btrfs_device *scrub_dev,
					  u64 dev_offset,
					  u64 dev_extent_len)
{
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	struct extent_map_tree *map_tree = &fs_info->mapping_tree;
	struct map_lookup *map;
	struct extent_map *em;
	int i;
	int ret = 0;

	read_lock(&map_tree->lock);
	em = lookup_extent_mapping(map_tree, bg->start, bg->length);
	read_unlock(&map_tree->lock);

	if (!em) {
		/*
		 * Might have been an unused block group deleted by the cleaner
		 * kthread or relocation.
		 */
		spin_lock(&bg->lock);
		if (!test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags))
			ret = -EINVAL;
		spin_unlock(&bg->lock);

		return ret;
	}
	if (em->start != bg->start)
		goto out;
	if (em->len < dev_extent_len)
		goto out;

	map = em->map_lookup;
	for (i = 0; i < map->num_stripes; ++i) {
		if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
		    map->stripes[i].physical == dev_offset) {
			ret = scrub_stripe(sctx, bg, em, scrub_dev, i);
			if (ret)
				goto out;
		}
	}
out:
	free_extent_map(em);

	return ret;
}

static int finish_extent_writes_for_zoned(struct btrfs_root *root,
					  struct btrfs_block_group *cache)
{
	struct btrfs_fs_info *fs_info = cache->fs_info;
	struct btrfs_trans_handle *trans;

	if (!btrfs_is_zoned(fs_info))
		return 0;

	btrfs_wait_block_group_reservations(cache);
	btrfs_wait_nocow_writers(cache);
	btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start, cache->length);

	trans = btrfs_join_transaction(root);
	if (IS_ERR(trans))
		return PTR_ERR(trans);
	return btrfs_commit_transaction(trans);
}
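
/*
 * Walk all device extent items of @scrub_dev overlapping the physical range
 * [start, end) in the commit root of the device tree, and scrub the block
 * group backing each of them.
 */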
static noinline_for_stack
int scrub_enumerate_chunks(struct scrub_ctx *sctx,
			   struct btrfs_device *scrub_dev, u64 start, u64 end)
{
	struct btrfs_dev_extent *dev_extent = NULL;
	struct btrfs_path *path;
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	struct btrfs_root *root = fs_info->dev_root;
	u64 chunk_offset;
	int ret = 0;
	int ro_set;
	int slot;
	struct extent_buffer *l;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_block_group *cache;
	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->reada = READA_FORWARD;
	path->search_commit_root = 1;
	path->skip_locking = 1;

	key.objectid = scrub_dev->devid;
	key.offset = 0ull;
	key.type = BTRFS_DEV_EXTENT_KEY;

	while (1) {
		u64 dev_extent_len;

		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0)
			break;
		if (ret > 0) {
			if (path->slots[0] >=
			    btrfs_header_nritems(path->nodes[0])) {
				ret = btrfs_next_leaf(root, path);
				if (ret < 0)
					break;
				if (ret > 0) {
					ret = 0;
					break;
				}
			} else {
				ret = 0;
			}
		}

		l = path->nodes[0];
		slot = path->slots[0];

		btrfs_item_key_to_cpu(l, &found_key, slot);

		if (found_key.objectid != scrub_dev->devid)
			break;

		if (found_key.type != BTRFS_DEV_EXTENT_KEY)
			break;

		if (found_key.offset >= end)
			break;

		if (found_key.offset < key.offset)
			break;

		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
		dev_extent_len = btrfs_dev_extent_length(l, dev_extent);

		if (found_key.offset + dev_extent_len <= start)
			goto skip;

		chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);

		/*
		 * Get a reference on the corresponding block group to prevent
		 * the chunk from going away while we scrub it.
		 */
		cache = btrfs_lookup_block_group(fs_info, chunk_offset);

		/*
		 * Some chunks are removed but not committed to disk yet,
		 * continue scrubbing.
		 */
		if (!cache)
			goto skip;

		ASSERT(cache->start <= chunk_offset);
		/*
		 * We are using the commit root to search for device extents, so
		 * that means we could have found a device extent item from a
		 * block group that was deleted in the current transaction. The
		 * logical start offset of the deleted block group, stored at
		 * @chunk_offset, might be part of the logical address range of
		 * a new block group (which uses different physical extents).
		 * In this case btrfs_lookup_block_group() has returned the new
		 * block group, and its start address is less than @chunk_offset.
		 *
		 * We skip such new block groups, because it's pointless to
		 * process them, as we won't find their extents because we search
		 * for them using the commit root of the extent tree. For a device
		 * replace it's also fine to skip it, we won't miss copying them
		 * to the target device because we have the write duplication
		 * setup through the regular write path (by btrfs_map_block()),
		 * and we have committed a transaction when we started the device
		 * replace, right after setting up the device replace state.
		 */
		if (cache->start < chunk_offset) {
			btrfs_put_block_group(cache);
			goto skip;
		}

		if (sctx->is_dev_replace && btrfs_is_zoned(fs_info)) {
			if (!test_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags)) {
				btrfs_put_block_group(cache);
				goto skip;
			}
		}

		/*
		 * Make sure that while we are scrubbing the corresponding block
		 * group doesn't get its logical address and its device extents
		 * reused for another block group, which can possibly be of a
		 * different type and different profile. We do this to prevent
		 * false error detections and crashes due to bogus attempts to
		 * repair extents.
		 */
		spin_lock(&cache->lock);
		if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &cache->runtime_flags)) {
			spin_unlock(&cache->lock);
			btrfs_put_block_group(cache);
			goto skip;
		}
		btrfs_freeze_block_group(cache);
		spin_unlock(&cache->lock);

		/*
		 * We need to call btrfs_inc_block_group_ro() with scrubs_paused,
		 * to avoid deadlock caused by:
		 * btrfs_inc_block_group_ro()
		 * -> btrfs_wait_for_commit()
		 * -> btrfs_commit_transaction()
		 * -> btrfs_scrub_pause()
		 */
		scrub_pause_on(fs_info);

		/*
		 * Don't do chunk preallocation for scrub.
		 *
		 * This is especially important for SYSTEM bgs, or we can hit
		 * -EFBIG from btrfs_finish_chunk_alloc() like:
		 * 1. The only SYSTEM bg is marked RO.
		 *    Since SYSTEM bg is small, that's pretty common.
		 * 2. New SYSTEM bg will be allocated
		 *    Because the regular version would allocate a new chunk.
		 * 3. New SYSTEM bg is empty and will get cleaned up
		 *    Before cleanup really happens, it's marked RO again.
		 * 4. Empty SYSTEM bg gets scrubbed
		 *    We go back to 2.
		 *
		 * This can easily boost the amount of SYSTEM chunks if cleaner
		 * thread can't be triggered fast enough, and use up all space
		 * of btrfs_super_block::sys_chunk_array
		 *
		 * While for dev replace, we need to try our best to mark block
		 * group RO, to prevent race between:
		 * - Write duplication
		 *   Contains latest data
		 * - Scrub copy
		 *   Contains data from commit tree
		 *
		 * If target block group is not marked RO, nocow writes can
		 * be overwritten by scrub copy, causing data corruption.
		 * So for dev-replace, it's not allowed to continue if a block
		 * group is not RO.
		 */
		ret = btrfs_inc_block_group_ro(cache, sctx->is_dev_replace);
		if (!ret && sctx->is_dev_replace) {
			ret = finish_extent_writes_for_zoned(root, cache);
			if (ret) {
				btrfs_dec_block_group_ro(cache);
				scrub_pause_off(fs_info);
				btrfs_put_block_group(cache);
				break;
			}
		}

		if (ret == 0) {
			ro_set = 1;
		} else if (ret == -ENOSPC && !sctx->is_dev_replace &&
			   !(cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK)) {
			/*
			 * btrfs_inc_block_group_ro return -ENOSPC when it
			 * failed in creating new chunk for metadata.
			 * It is not a problem for scrub, because
			 * metadata are always cowed, and our scrub paused
			 * commit_transactions.
			 *
			 * For RAID56 chunks, we have to mark them read-only
			 * for scrub, as later we would use our own cache
			 * out of RAID56 realm.
			 * Thus we want the RAID56 bg to be marked RO to
			 * prevent RMW from screwing up our cache.
			 */
			ro_set = 0;
		} else if (ret == -ETXTBSY) {
			btrfs_warn(fs_info,
		   "skipping scrub of block group %llu due to active swapfile",
				   cache->start);
			scrub_pause_off(fs_info);
			ret = 0;
			goto skip_unfreeze;
		} else {
			btrfs_warn(fs_info,
				   "failed setting block group ro: %d", ret);
			btrfs_unfreeze_block_group(cache);
			btrfs_put_block_group(cache);
			scrub_pause_off(fs_info);
			break;
		}

		/*
		 * Now the target block is marked RO, wait for nocow writes to
		 * finish before dev-replace.
		 * COW is fine, as COW never overwrites extents in commit tree.
		 */
		if (sctx->is_dev_replace) {
			btrfs_wait_nocow_writers(cache);
			btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start,
					cache->length);
		}

		scrub_pause_off(fs_info);
		down_write(&dev_replace->rwsem);
		dev_replace->cursor_right = found_key.offset + dev_extent_len;
		dev_replace->cursor_left = found_key.offset;
		dev_replace->item_needs_writeback = 1;
		up_write(&dev_replace->rwsem);

		ret = scrub_chunk(sctx, cache, scrub_dev, found_key.offset,
				  dev_extent_len);
		if (sctx->is_dev_replace &&
		    !btrfs_finish_block_group_to_copy(dev_replace->srcdev,
						      cache, found_key.offset))
			ro_set = 0;

		down_write(&dev_replace->rwsem);
		dev_replace->cursor_left = dev_replace->cursor_right;
		dev_replace->item_needs_writeback = 1;
		up_write(&dev_replace->rwsem);

		if (ro_set)
			btrfs_dec_block_group_ro(cache);

		/*
		 * We might have prevented the cleaner kthread from deleting
		 * this block group if it was already unused because we raced
		 * and set it to RO mode first. So add it back to the unused
		 * list, otherwise it might not ever be deleted unless a manual
		 * balance is triggered or it becomes used and unused again.
		 */
		spin_lock(&cache->lock);
		if (!test_bit(BLOCK_GROUP_FLAG_REMOVED, &cache->runtime_flags) &&
		    !cache->ro && cache->reserved == 0 && cache->used == 0) {
			spin_unlock(&cache->lock);
			if (btrfs_test_opt(fs_info, DISCARD_ASYNC))
				btrfs_discard_queue_work(&fs_info->discard_ctl,
							 cache);
			else
				btrfs_mark_bg_unused(cache);
		} else {
			spin_unlock(&cache->lock);
		}
skip_unfreeze:
		btrfs_unfreeze_block_group(cache);
		btrfs_put_block_group(cache);
		if (ret)
			break;
		if (sctx->is_dev_replace &&
		    atomic64_read(&dev_replace->num_write_errors) > 0) {
			ret = -EIO;
			break;
		}
		if (sctx->stat.malloc_errors > 0) {
			ret = -ENOMEM;
			break;
		}
skip:
		key.offset = found_key.offset + dev_extent_len;
		btrfs_release_path(path);
	}

	btrfs_free_path(path);

	return ret;
}

static int scrub_one_super(struct scrub_ctx *sctx, struct btrfs_device *dev,
			   struct page *page, u64 physical, u64 generation)
{
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	struct bio_vec bvec;
	struct bio bio;
	struct btrfs_super_block *sb = page_address(page);
	int ret;

	bio_init(&bio, dev->bdev, &bvec, 1, REQ_OP_READ);
	bio.bi_iter.bi_sector = physical >> SECTOR_SHIFT;
	__bio_add_page(&bio, page, BTRFS_SUPER_INFO_SIZE, 0);
	ret = submit_bio_wait(&bio);
	bio_uninit(&bio);

	if (ret < 0)
		return ret;
	ret = btrfs_check_super_csum(fs_info, sb);
	if (ret != 0) {
		btrfs_err_rl(fs_info,
			"super block at physical %llu devid %llu has bad csum",
			physical, dev->devid);
		return -EIO;
	}
	if (btrfs_super_generation(sb) != generation) {
		btrfs_err_rl(fs_info,
"super block at physical %llu devid %llu has bad generation %llu expect %llu",
			     physical, dev->devid,
			     btrfs_super_generation(sb), generation);
		return -EUCLEAN;
	}

	return btrfs_validate_super(fs_info, sb, -1);
}
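
/*
 * Verify every on-disk super block copy of @scrub_dev.  Failures only bump
 * sctx->stat.super_errors here; the actual fix is the forced transaction
 * commit done at the end of btrfs_scrub_dev().
 */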
static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
					   struct btrfs_device *scrub_dev)
{
	int i;
	u64 bytenr;
	u64 gen;
	int ret = 0;
	struct page *page;
	struct btrfs_fs_info *fs_info = sctx->fs_info;

	if (BTRFS_FS_ERROR(fs_info))
		return -EROFS;

	page = alloc_page(GFP_KERNEL);
	if (!page) {
		spin_lock(&sctx->stat_lock);
		sctx->stat.malloc_errors++;
		spin_unlock(&sctx->stat_lock);
		return -ENOMEM;
	}

	/* Seed devices of a new filesystem have their own generation. */
	if (scrub_dev->fs_devices != fs_info->fs_devices)
		gen = scrub_dev->generation;
	else
		gen = btrfs_get_last_trans_committed(fs_info);

	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
		bytenr = btrfs_sb_offset(i);
		if (bytenr + BTRFS_SUPER_INFO_SIZE >
		    scrub_dev->commit_total_bytes)
			break;
		if (!btrfs_check_super_location(scrub_dev, bytenr))
			continue;

		ret = scrub_one_super(sctx, scrub_dev, page, bytenr, gen);
		if (ret) {
			spin_lock(&sctx->stat_lock);
			sctx->stat.super_errors++;
			spin_unlock(&sctx->stat_lock);
		}
	}
	__free_page(page);
	return 0;
}

static void scrub_workers_put(struct btrfs_fs_info *fs_info)
{
	if (refcount_dec_and_mutex_lock(&fs_info->scrub_workers_refcnt,
					&fs_info->scrub_lock)) {
		struct workqueue_struct *scrub_workers = fs_info->scrub_workers;

		fs_info->scrub_workers = NULL;
		mutex_unlock(&fs_info->scrub_lock);

		if (scrub_workers)
			destroy_workqueue(scrub_workers);
	}
}

/*
 * Get a reference count on fs_info->scrub_workers. Start the workqueue if
 * necessary.
 */
static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info)
{
	struct workqueue_struct *scrub_workers = NULL;
	unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
	int max_active = fs_info->thread_pool_size;

	if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt))
		return 0;

	scrub_workers = alloc_workqueue("btrfs-scrub", flags, max_active);
	if (!scrub_workers)
		return -ENOMEM;

	mutex_lock(&fs_info->scrub_lock);
	if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) {
		ASSERT(fs_info->scrub_workers == NULL);
		fs_info->scrub_workers = scrub_workers;
		refcount_set(&fs_info->scrub_workers_refcnt, 1);
		mutex_unlock(&fs_info->scrub_lock);
		return 0;
	}
	/* Other thread raced in and created the workers for us */
	refcount_inc(&fs_info->scrub_workers_refcnt);
	mutex_unlock(&fs_info->scrub_lock);

	destroy_workqueue(scrub_workers);
	return 0;
}
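
/*
 * Callers must pair scrub_workers_get() with scrub_workers_put(); the
 * workqueue is shared by all concurrently running scrubs and is only
 * destroyed when the last reference is dropped.
 */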

int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
		    u64 end, struct btrfs_scrub_progress *progress,
		    int readonly, int is_dev_replace)
{
	struct btrfs_dev_lookup_args args = { .devid = devid };
	struct scrub_ctx *sctx;
	int ret;
	struct btrfs_device *dev;
	unsigned int nofs_flag;
	bool need_commit = false;

	if (btrfs_fs_closing(fs_info))
		return -EAGAIN;

	/* At mount time we have ensured nodesize is in the range of [4K, 64K]. */
	ASSERT(fs_info->nodesize <= BTRFS_STRIPE_LEN);

	/*
	 * SCRUB_MAX_SECTORS_PER_BLOCK is calculated using the largest possible
	 * value (max nodesize / min sectorsize), thus nodesize should always
	 * be fine.
	 */
	ASSERT(fs_info->nodesize <=
	       SCRUB_MAX_SECTORS_PER_BLOCK << fs_info->sectorsize_bits);

	/* Allocate outside of device_list_mutex */
	sctx = scrub_setup_ctx(fs_info, is_dev_replace);
	if (IS_ERR(sctx))
		return PTR_ERR(sctx);

	ret = scrub_workers_get(fs_info);
	if (ret)
		goto out_free_ctx;

	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	dev = btrfs_find_device(fs_info->fs_devices, &args);
	if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) &&
		     !is_dev_replace)) {
		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
		ret = -ENODEV;
		goto out;
	}

	if (!is_dev_replace && !readonly &&
	    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
		btrfs_err_in_rcu(fs_info,
			"scrub on devid %llu: filesystem on %s is not writable",
				 devid, btrfs_dev_name(dev));
		ret = -EROFS;
		goto out;
	}

	mutex_lock(&fs_info->scrub_lock);
	if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
	    test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state)) {
		mutex_unlock(&fs_info->scrub_lock);
		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
		ret = -EIO;
		goto out;
	}

	down_read(&fs_info->dev_replace.rwsem);
	if (dev->scrub_ctx ||
	    (!is_dev_replace &&
	     btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
		up_read(&fs_info->dev_replace.rwsem);
		mutex_unlock(&fs_info->scrub_lock);
		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
		ret = -EINPROGRESS;
		goto out;
	}
	up_read(&fs_info->dev_replace.rwsem);

	sctx->readonly = readonly;
	dev->scrub_ctx = sctx;
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);

	/*
	 * By checking @scrub_pause_req here, we can avoid a race between
	 * committing a transaction and scrubbing.
	 */
	__scrub_blocked_if_needed(fs_info);
	atomic_inc(&fs_info->scrubs_running);
	mutex_unlock(&fs_info->scrub_lock);

	/*
	 * In order to avoid deadlock with reclaim when there is a transaction
	 * trying to pause scrub, make sure we use GFP_NOFS for all the
	 * allocations done at btrfs_scrub_sectors() and scrub_sectors_for_parity()
	 * invoked by our callees. The pausing request is done when the
	 * transaction commit starts, and it blocks the transaction until scrub
	 * is paused (done at specific points at scrub_stripe() or right above
	 * before incrementing fs_info->scrubs_running).
	 */
	nofs_flag = memalloc_nofs_save();
	if (!is_dev_replace) {
		u64 old_super_errors;

		spin_lock(&sctx->stat_lock);
		old_super_errors = sctx->stat.super_errors;
		spin_unlock(&sctx->stat_lock);

		btrfs_info(fs_info, "scrub: started on devid %llu", devid);
		/*
		 * By holding the device list mutex, we can kick off writing
		 * supers in log tree sync.
		 */
		mutex_lock(&fs_info->fs_devices->device_list_mutex);
		ret = scrub_supers(sctx, dev);
		mutex_unlock(&fs_info->fs_devices->device_list_mutex);

		spin_lock(&sctx->stat_lock);
		/*
		 * Super block errors found, but we can not commit transaction
		 * at current context, since btrfs_commit_transaction() needs
		 * to pause the current running scrub (hold by ourselves).
		 */
		if (sctx->stat.super_errors > old_super_errors && !sctx->readonly)
			need_commit = true;
		spin_unlock(&sctx->stat_lock);
	}

	if (!ret)
		ret = scrub_enumerate_chunks(sctx, dev, start, end);
	memalloc_nofs_restore(nofs_flag);

	atomic_dec(&fs_info->scrubs_running);
	wake_up(&fs_info->scrub_pause_wait);

	if (progress)
		memcpy(progress, &sctx->stat, sizeof(*progress));

	if (!is_dev_replace)
		btrfs_info(fs_info, "scrub: %s on devid %llu with status: %d",
			ret ? "not finished" : "finished", devid, ret);

	mutex_lock(&fs_info->scrub_lock);
	dev->scrub_ctx = NULL;
	mutex_unlock(&fs_info->scrub_lock);

	scrub_workers_put(fs_info);
	scrub_put_ctx(sctx);

	/*
	 * We found some super block errors before, now try to force a
	 * transaction commit, as scrub has finished.
	 */
	if (need_commit) {
		struct btrfs_trans_handle *trans;

		trans = btrfs_start_transaction(fs_info->tree_root, 0);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			btrfs_err(fs_info,
	"scrub: failed to start transaction to fix super block errors: %d", ret);
			return ret;
		}
		ret = btrfs_commit_transaction(trans);
		if (ret < 0)
			btrfs_err(fs_info,
	"scrub: failed to commit transaction to fix super block errors: %d", ret);
	}
	return ret;
out:
	scrub_workers_put(fs_info);
out_free_ctx:
	scrub_free_ctx(sctx);

	return ret;
}
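
/*
 * btrfs_scrub_pause() and btrfs_scrub_continue() bracket a transaction
 * commit: pause raises scrub_pause_req and waits until every running scrub
 * has parked itself, continue drops the request and wakes the waiters up.
 */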
void btrfs_scrub_pause(struct btrfs_fs_info *fs_info)
{
	mutex_lock(&fs_info->scrub_lock);
	atomic_inc(&fs_info->scrub_pause_req);
	while (atomic_read(&fs_info->scrubs_paused) !=
	       atomic_read(&fs_info->scrubs_running)) {
		mutex_unlock(&fs_info->scrub_lock);
		wait_event(fs_info->scrub_pause_wait,
			   atomic_read(&fs_info->scrubs_paused) ==
			   atomic_read(&fs_info->scrubs_running));
		mutex_lock(&fs_info->scrub_lock);
	}
	mutex_unlock(&fs_info->scrub_lock);
}

void btrfs_scrub_continue(struct btrfs_fs_info *fs_info)
{
	atomic_dec(&fs_info->scrub_pause_req);
	wake_up(&fs_info->scrub_pause_wait);
}

int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
{
	mutex_lock(&fs_info->scrub_lock);
	if (!atomic_read(&fs_info->scrubs_running)) {
		mutex_unlock(&fs_info->scrub_lock);
		return -ENOTCONN;
	}

	atomic_inc(&fs_info->scrub_cancel_req);
	while (atomic_read(&fs_info->scrubs_running)) {
		mutex_unlock(&fs_info->scrub_lock);
		wait_event(fs_info->scrub_pause_wait,
			   atomic_read(&fs_info->scrubs_running) == 0);
		mutex_lock(&fs_info->scrub_lock);
	}
	atomic_dec(&fs_info->scrub_cancel_req);
	mutex_unlock(&fs_info->scrub_lock);

	return 0;
}

int btrfs_scrub_cancel_dev(struct btrfs_device *dev)
{
	struct btrfs_fs_info *fs_info = dev->fs_info;
	struct scrub_ctx *sctx;

	mutex_lock(&fs_info->scrub_lock);
	sctx = dev->scrub_ctx;
	if (!sctx) {
		mutex_unlock(&fs_info->scrub_lock);
		return -ENOTCONN;
	}
	atomic_inc(&sctx->cancel_req);
	while (dev->scrub_ctx) {
		mutex_unlock(&fs_info->scrub_lock);
		wait_event(fs_info->scrub_pause_wait,
			   dev->scrub_ctx == NULL);
		mutex_lock(&fs_info->scrub_lock);
	}
	mutex_unlock(&fs_info->scrub_lock);

	return 0;
}

int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
			 struct btrfs_scrub_progress *progress)
{
	struct btrfs_dev_lookup_args args = { .devid = devid };
	struct btrfs_device *dev;
	struct scrub_ctx *sctx = NULL;

	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	dev = btrfs_find_device(fs_info->fs_devices, &args);
	if (dev)
		sctx = dev->scrub_ctx;
	if (sctx)
		memcpy(progress, &sctx->stat, sizeof(*progress));
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);

	return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
}