From f02b384767b6547431f9d74147db8db34b0ebd29 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 29 Nov 2012 18:21:21 -0800 Subject: [PATCH] 3.6-stable patches added patches: md-raid10-close-race-that-lose-writes-lost-when-replacement-completes.patch --- ...ites-lost-when-replacement-completes.patch | 184 ++++++++++++++++++ queue-3.6/series | 1 + 2 files changed, 185 insertions(+) create mode 100644 queue-3.6/md-raid10-close-race-that-lose-writes-lost-when-replacement-completes.patch diff --git a/queue-3.6/md-raid10-close-race-that-lose-writes-lost-when-replacement-completes.patch b/queue-3.6/md-raid10-close-race-that-lose-writes-lost-when-replacement-completes.patch new file mode 100644 index 00000000000..2b541648029 --- /dev/null +++ b/queue-3.6/md-raid10-close-race-that-lose-writes-lost-when-replacement-completes.patch @@ -0,0 +1,184 @@ +From e7c0c3fa29280d62aa5e11101a674bb3064bd791 Mon Sep 17 00:00:00 2001 +From: NeilBrown +Date: Thu, 22 Nov 2012 14:42:49 +1100 +Subject: md/raid10: close race that lose writes lost when replacement completes. + +From: NeilBrown + +commit e7c0c3fa29280d62aa5e11101a674bb3064bd791 upstream. + +When a replacement operation completes there is a small window +when the original device is marked 'faulty' and the replacement +still looks like a replacement. The faulty should be removed and +the replacement moved in place very quickly, but it isn't instant. + +So the code that writes out to the array must handle the possibility that +the only working device for some slot is the replacement - but it +doesn't. If the primary device is faulty it just gives up. This +can lead to corruption. + +So make the code more robust: if either the primary or the +replacement is present and working, write to them. Only when +neither are present do we give up. + +This bug has been present since replacement was introduced in +3.3, so it is suitable for any -stable kernel since then. 
+ +Reported-by: "George Spelvin" +Cc: stable@vger.kernel.org +Signed-off-by: NeilBrown +Signed-off-by: George Spelvin +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/md/raid10.c | 114 ++++++++++++++++++++++++++-------------------------- + 1 file changed, 59 insertions(+), 55 deletions(-) + +--- a/drivers/md/raid10.c ++++ b/drivers/md/raid10.c +@@ -1287,18 +1287,21 @@ retry_write: + blocked_rdev = rrdev; + break; + } ++ if (rdev && (test_bit(Faulty, &rdev->flags) ++ || test_bit(Unmerged, &rdev->flags))) ++ rdev = NULL; + if (rrdev && (test_bit(Faulty, &rrdev->flags) + || test_bit(Unmerged, &rrdev->flags))) + rrdev = NULL; + + r10_bio->devs[i].bio = NULL; + r10_bio->devs[i].repl_bio = NULL; +- if (!rdev || test_bit(Faulty, &rdev->flags) || +- test_bit(Unmerged, &rdev->flags)) { ++ ++ if (!rdev && !rrdev) { + set_bit(R10BIO_Degraded, &r10_bio->state); + continue; + } +- if (test_bit(WriteErrorSeen, &rdev->flags)) { ++ if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) { + sector_t first_bad; + sector_t dev_sector = r10_bio->devs[i].addr; + int bad_sectors; +@@ -1340,8 +1343,10 @@ retry_write: + max_sectors = good_sectors; + } + } +- r10_bio->devs[i].bio = bio; +- atomic_inc(&rdev->nr_pending); ++ if (rdev) { ++ r10_bio->devs[i].bio = bio; ++ atomic_inc(&rdev->nr_pending); ++ } + if (rrdev) { + r10_bio->devs[i].repl_bio = bio; + atomic_inc(&rrdev->nr_pending); +@@ -1397,58 +1402,57 @@ retry_write: + for (i = 0; i < conf->copies; i++) { + struct bio *mbio; + int d = r10_bio->devs[i].devnum; +- if (!r10_bio->devs[i].bio) +- continue; + +- mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); +- md_trim_bio(mbio, r10_bio->sector - bio->bi_sector, +- max_sectors); +- r10_bio->devs[i].bio = mbio; +- +- mbio->bi_sector = (r10_bio->devs[i].addr+ +- choose_data_offset(r10_bio, +- conf->mirrors[d].rdev)); +- mbio->bi_bdev = conf->mirrors[d].rdev->bdev; +- mbio->bi_end_io = raid10_end_write_request; +- mbio->bi_rw = WRITE | do_sync | do_fua; +- mbio->bi_private = r10_bio; +- +- 
atomic_inc(&r10_bio->remaining); +- spin_lock_irqsave(&conf->device_lock, flags); +- bio_list_add(&conf->pending_bio_list, mbio); +- conf->pending_count++; +- spin_unlock_irqrestore(&conf->device_lock, flags); +- if (!mddev_check_plugged(mddev)) +- md_wakeup_thread(mddev->thread); +- +- if (!r10_bio->devs[i].repl_bio) +- continue; ++ if (r10_bio->devs[i].bio) { ++ struct md_rdev *rdev = conf->mirrors[d].rdev; ++ mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); ++ md_trim_bio(mbio, r10_bio->sector - bio->bi_sector, ++ max_sectors); ++ r10_bio->devs[i].bio = mbio; ++ ++ mbio->bi_sector = (r10_bio->devs[i].addr + ++ choose_data_offset(r10_bio, rdev)); ++ mbio->bi_bdev = rdev->bdev; ++ mbio->bi_end_io = raid10_end_write_request; ++ mbio->bi_rw = WRITE | do_sync | do_fua; ++ mbio->bi_private = r10_bio; ++ ++ atomic_inc(&r10_bio->remaining); ++ spin_lock_irqsave(&conf->device_lock, flags); ++ bio_list_add(&conf->pending_bio_list, mbio); ++ conf->pending_count++; ++ spin_unlock_irqrestore(&conf->device_lock, flags); ++ if (!mddev_check_plugged(mddev)) ++ md_wakeup_thread(mddev->thread); ++ } + +- mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); +- md_trim_bio(mbio, r10_bio->sector - bio->bi_sector, +- max_sectors); +- r10_bio->devs[i].repl_bio = mbio; +- +- /* We are actively writing to the original device +- * so it cannot disappear, so the replacement cannot +- * become NULL here +- */ +- mbio->bi_sector = (r10_bio->devs[i].addr + +- choose_data_offset( +- r10_bio, +- conf->mirrors[d].replacement)); +- mbio->bi_bdev = conf->mirrors[d].replacement->bdev; +- mbio->bi_end_io = raid10_end_write_request; +- mbio->bi_rw = WRITE | do_sync | do_fua; +- mbio->bi_private = r10_bio; +- +- atomic_inc(&r10_bio->remaining); +- spin_lock_irqsave(&conf->device_lock, flags); +- bio_list_add(&conf->pending_bio_list, mbio); +- conf->pending_count++; +- spin_unlock_irqrestore(&conf->device_lock, flags); +- if (!mddev_check_plugged(mddev)) +- md_wakeup_thread(mddev->thread); ++ if 
(r10_bio->devs[i].repl_bio) { ++ struct md_rdev *rdev = conf->mirrors[d].replacement; ++ if (rdev == NULL) { ++ /* Replacement just got moved to main 'rdev' */ ++ smp_mb(); ++ rdev = conf->mirrors[d].rdev; ++ } ++ mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); ++ md_trim_bio(mbio, r10_bio->sector - bio->bi_sector, ++ max_sectors); ++ r10_bio->devs[i].repl_bio = mbio; ++ ++ mbio->bi_sector = (r10_bio->devs[i].addr + ++ choose_data_offset(r10_bio, rdev)); ++ mbio->bi_bdev = rdev->bdev; ++ mbio->bi_end_io = raid10_end_write_request; ++ mbio->bi_rw = WRITE | do_sync | do_fua; ++ mbio->bi_private = r10_bio; ++ ++ atomic_inc(&r10_bio->remaining); ++ spin_lock_irqsave(&conf->device_lock, flags); ++ bio_list_add(&conf->pending_bio_list, mbio); ++ conf->pending_count++; ++ spin_unlock_irqrestore(&conf->device_lock, flags); ++ if (!mddev_check_plugged(mddev)) ++ md_wakeup_thread(mddev->thread); ++ } + } + + /* Don't remove the bias on 'remaining' (one_write_done) until diff --git a/queue-3.6/series b/queue-3.6/series index f8b4c650161..707a6bdaf54 100644 --- a/queue-3.6/series +++ b/queue-3.6/series @@ -51,3 +51,4 @@ mpi-fix-compilation-on-mips-with-gcc-4.4-and-newer.patch ext4-remove-erroneous-ext4_superblock_csum_set-in-update_backups.patch powerpc-eeh-lock-module-while-handling-eeh-event.patch mmc-sdhci-s3c-fix-the-wrong-number-of-max-bus-clocks.patch +md-raid10-close-race-that-lose-writes-lost-when-replacement-completes.patch -- 2.47.3