]> git.ipfire.org Git - thirdparty/linux.git/commitdiff
md/raid5: avoid R5_Overlap races while breaking stripe batches
authorChen Cheng <chencheng@fnnas.com>
Fri, 19 Jun 2026 04:10:13 +0000 (12:10 +0800)
committerYu Kuai <yukuai@fygo.io>
Tue, 23 Jun 2026 01:44:11 +0000 (09:44 +0800)
KCSAN report a race in break_stripe_batch_list() vs. raid5_make_request()
on sh->dev[i].flags (plain word write vs. atomic bit op)..

and .. one possible scenario is:

CPU1                            CPU2
break_stripe_batch_list(sh1)
-> handle sh2
-> lock(sh2)
-> sh2->batch_head = NULL
-> unlock(sh2)
-> test_and_clear_bit(R5_Overlap, sh2->dev[i].flags)
-> wake_up_bit(sh2->dev[i].flags)
                                raid5_make_request()
                                -> add_all_stripe_bios(sh2)
                                -> lock(sh2)
                                -> stripe_bio_overlaps(sh2) returns true
   batch_head is NULL, so new bio overlap
   exist bio on sh2 -> true
                                -> set_bit(R5_Overlap, sh2->dev[i].flags)
                                -> unlock(sh2)
                                -> wait_on_bit(sh2->dev[i].flags)
-> sh2->dev[i].flags = sh1->dev[i].flags & ~R5_Overlap

No wait_up_bit(), CPU2 could be wait_on_bit() forever...

Fix by :
- Expand the protect zone.
- Use batch_head's device flag's snaphot when no held head_sh->stripe_lock.
- Move sh/head_sh->batch_head = NULL to the end of protected zone , and ,
  any concurrent add_all_stripe_bios() grabs sh->stripe_lock now either:
- see batch_head != null, and , is rejected by stripe_bio_overlaps()
  under the lock (no R5_Overlap wait ) , or ,
- sees batch_head == NULL, only after dev[i].flags has already been
  set and the prior R5_Overlap waiters worken.

KCSAN report:
================================================
  BUG: KCSAN: data-race in break_stripe_batch_list / raid5_make_request

  write (marked) to 0xffff8e89c8117548 of 8 bytes by task 4042 on cpu 0:
    raid5_make_request+0xea0/0x2930
    md_handle_request+0x4a2/0xa40
    md_submit_bio+0x109/0x1a0
    __submit_bio+0x2ec/0x390
    submit_bio_noacct_nocheck+0x457/0x710
    submit_bio_noacct+0x2a7/0xc20
    submit_bio+0x56/0x250
    blkdev_direct_IO+0x54c/0xda0
    blkdev_write_iter+0x38f/0x570
    aio_write+0x22b/0x490
    io_submit_one+0xa51/0xf70
    __x64_sys_io_submit+0xf7/0x220
    x64_sys_call+0x1907/0x1c60
    do_syscall_64+0x130/0x570
    entry_SYSCALL_64_after_hwframe+0x76/0x7e

  read to 0xffff8e89c8117548 of 8 bytes by task 4010 on cpu 5:
    break_stripe_batch_list+0x249/0x480
    handle_stripe_clean_event+0x720/0x9b0
    handle_stripe+0x32fb/0x4500
    handle_active_stripes.isra.0+0x6e0/0xa50
    raid5d+0x7e0/0xba0
    md_thread+0x15a/0x2d0
    kthread+0x1e3/0x220
    ret_from_fork+0x37a/0x410
    ret_from_fork_asm+0x1a/0x30

  value changed: 0x0000000000000019 -> 0x0000000000000099 --> R5_Overlap

Fixes: fb642b92c267 ("md/raid5: duplicate some more handle_stripe_clean_event code in break_stripe_batch_list")
Signed-off-by: Chen Cheng <chencheng@fnnas.com>
Link: https://patch.msgid.link/20260619041013.1207148-1-chencheng@fnnas.com
Signed-off-by: Yu Kuai <yukuai@fygo.io>
drivers/md/raid5.c

index 6d982c54f2d170c4c6bb545829d653387bd05a1d..0c5c9fb0606eee74e8c2919e5f1698f31333b4a7 100644 (file)
@@ -4885,14 +4885,14 @@ static void break_stripe_batch_list(struct stripe_head *head_sh,
                sh->check_state = head_sh->check_state;
                sh->reconstruct_state = head_sh->reconstruct_state;
                spin_lock_irq(&sh->stripe_lock);
-               sh->batch_head = NULL;
-               spin_unlock_irq(&sh->stripe_lock);
                for (i = 0; i < sh->disks; i++) {
                        if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
                                wake_up_bit(&sh->dev[i].flags, R5_Overlap);
-                       sh->dev[i].flags = head_sh->dev[i].flags &
+                       sh->dev[i].flags = READ_ONCE(head_sh->dev[i].flags) &
                                (~((1 << R5_WriteError) | (1 << R5_Overlap)));
                }
+               sh->batch_head = NULL;
+               spin_unlock_irq(&sh->stripe_lock);
 
                state = READ_ONCE(sh->state);
                if (handle_flags == 0 || (state & handle_flags))
@@ -4900,11 +4900,11 @@ static void break_stripe_batch_list(struct stripe_head *head_sh,
                raid5_release_stripe(sh);
        }
        spin_lock_irq(&head_sh->stripe_lock);
-       head_sh->batch_head = NULL;
-       spin_unlock_irq(&head_sh->stripe_lock);
        for (i = 0; i < head_sh->disks; i++)
                if (test_and_clear_bit(R5_Overlap, &head_sh->dev[i].flags))
                        wake_up_bit(&head_sh->dev[i].flags, R5_Overlap);
+       head_sh->batch_head = NULL;
+       spin_unlock_irq(&head_sh->stripe_lock);
 
        state = READ_ONCE(head_sh->state);
        if (state & handle_flags)