]>
Commit | Line | Data |
---|---|---|
8b77cbea GKH |
1 | From 2446dba03f9dabe0b477a126cbeb377854785b47 Mon Sep 17 00:00:00 2001 |
2 | From: NeilBrown <neilb@suse.de> | |
3 | Date: Thu, 31 Jul 2014 10:16:29 +1000 | |
4 | Subject: md/raid1,raid10: always abort recover on write error. | |
5 | ||
6 | From: NeilBrown <neilb@suse.de> | |
7 | ||
8 | commit 2446dba03f9dabe0b477a126cbeb377854785b47 upstream. | |
9 | ||
10 | Currently we don't abort recovery on a write error if the write error | |
11 | to the recovering device was triggered by normal IO (as opposed to | |
12 | recovery IO). | |
13 | ||
14 | This means that for one bitmap region, the recovery might write to the | |
15 | recovering device for a few sectors, then not bother for subsequent | |
16 | sectors (as it never writes to failed devices). In this case | |
17 | the bitmap bit will be cleared, but it really shouldn't. | |
18 | ||
19 | The result is that if the recovering device fails and is then re-added | |
20 | (after fixing whatever hardware problem triggered the failure), | |
21 | the second recovery won't redo the region it was in the middle of, | |
22 | so some of the device will not be recovered properly. | |
23 | ||
24 | If we abort the recovery, the region being processed will be cancelled | |
25 | (bit not cleared) and the whole region will be retried. | |
26 | ||
27 | As the bug can result in data corruption the patch is suitable for | |
28 | -stable. For kernels prior to 3.11 there is a conflict in raid10.c | |
29 | which will require care. | |
30 | ||
31 | Original-from: jiao hui <jiaohui@bwstor.com.cn> | |
32 | Reported-and-tested-by: jiao hui <jiaohui@bwstor.com.cn> | |
33 | Signed-off-by: NeilBrown <neilb@suse.de> | |
34 | Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> | |
35 | ||
36 | --- | |
37 | drivers/md/raid1.c | 8 ++++---- | |
38 | drivers/md/raid10.c | 11 +++++------ | |
39 | 2 files changed, 9 insertions(+), 10 deletions(-) | |
40 | ||
41 | --- a/drivers/md/raid1.c | |
42 | +++ b/drivers/md/raid1.c | |
43 | @@ -1501,12 +1501,12 @@ static void error(struct mddev *mddev, s | |
44 | mddev->degraded++; | |
45 | set_bit(Faulty, &rdev->flags); | |
46 | spin_unlock_irqrestore(&conf->device_lock, flags); | |
47 | - /* | |
48 | - * if recovery is running, make sure it aborts. | |
49 | - */ | |
50 | - set_bit(MD_RECOVERY_INTR, &mddev->recovery); | |
51 | } else | |
52 | set_bit(Faulty, &rdev->flags); | |
53 | + /* | |
54 | + * if recovery is running, make sure it aborts. | |
55 | + */ | |
56 | + set_bit(MD_RECOVERY_INTR, &mddev->recovery); | |
57 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | |
58 | printk(KERN_ALERT | |
59 | "md/raid1:%s: Disk failure on %s, disabling device.\n" | |
60 | --- a/drivers/md/raid10.c | |
61 | +++ b/drivers/md/raid10.c | |
62 | @@ -1684,13 +1684,12 @@ static void error(struct mddev *mddev, s | |
63 | spin_unlock_irqrestore(&conf->device_lock, flags); | |
64 | return; | |
65 | } | |
66 | - if (test_and_clear_bit(In_sync, &rdev->flags)) { | |
67 | + if (test_and_clear_bit(In_sync, &rdev->flags)) | |
68 | mddev->degraded++; | |
69 | - /* | |
70 | - * if recovery is running, make sure it aborts. | |
71 | - */ | |
72 | - set_bit(MD_RECOVERY_INTR, &mddev->recovery); | |
73 | - } | |
74 | + /* | |
75 | + * If recovery is running, make sure it aborts. | |
76 | + */ | |
77 | + set_bit(MD_RECOVERY_INTR, &mddev->recovery); | |
78 | set_bit(Blocked, &rdev->flags); | |
79 | set_bit(Faulty, &rdev->flags); | |
80 | set_bit(MD_CHANGE_DEVS, &mddev->flags); |