md/raid5: use mempool to allocate stripe_request_ctx
author    Yu Kuai <yukuai@fnnas.com>
          Wed, 14 Jan 2026 17:12:33 +0000 (01:12 +0800)
committer Yu Kuai <yukuai@fnnas.com>
          Mon, 26 Jan 2026 05:11:29 +0000 (13:11 +0800)
On the one hand, stripe_request_ctx is 72 bytes, which is rather large
for a stack variable.

On the other hand, the bitmap sectors_to_do has a fixed size, so
max_hw_sectors_kb of a raid5 array is capped at 256 * 4k = 1MiB, making
full-stripe IO impossible for arrays where chunk_size * data_disks is
bigger. Allocating the ctx at runtime makes it possible to get rid of
this limit.

Link: https://lore.kernel.org/linux-raid/20260114171241.3043364-6-yukuai@fnnas.com
Signed-off-by: Yu Kuai <yukuai@fnnas.com>
Reviewed-by: Li Nan <linan122@huawei.com>
drivers/md/md.h
drivers/md/raid1-10.c
drivers/md/raid5.c
drivers/md/raid5.h
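
Taken together, the raid5.c hunks below follow the standard mempool
lifecycle; a condensed sketch of the flow (names as in the patch):

	#include <linux/mempool.h>

	/* raid5_run(): pre-reserve NR_RAID_BIOS (256) ctx objects so
	 * allocation can make forward progress even under memory pressure. */
	conf->ctx_pool = mempool_create_kmalloc_pool(NR_RAID_BIOS,
					sizeof(struct stripe_request_ctx));

	/* raid5_make_request(): mempool_alloc() with GFP_NOIO sleeps until
	 * an element is available instead of failing, hence no NULL check;
	 * kmalloc pools return uninitialized memory, hence the memset(). */
	ctx = mempool_alloc(conf->ctx_pool, GFP_NOIO);
	memset(ctx, 0, sizeof(*ctx));
	/* ... issue stripe requests, then on every exit path ... */
	mempool_free(ctx, conf->ctx_pool);

	/* free_conf(): mempool_destroy(NULL) is a no-op, so this is safe
	 * even if raid5_run() failed before creating the pool. */
	mempool_destroy(conf->ctx_pool);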

diff --git a/drivers/md/md.h b/drivers/md/md.h
index 6ee18045f41c3d53c4b5e11dc238460074f1174f..b8c5dec12b62f0b9b969e033aa6142b398c036dc 100644
 #include <trace/events/block.h>
 
 #define MaxSector (~(sector_t)0)
+/*
+ * Number of guaranteed raid bios in case of extreme VM load:
+ */
+#define        NR_RAID_BIOS 256
 
 enum md_submodule_type {
        MD_PERSONALITY = 0,
diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c
index 521625756128a369b3de57dcdd244d22c447a2b1..c33099925f230e6178017c40c31278b7d97c6743 100644
@@ -3,11 +3,6 @@
 #define RESYNC_BLOCK_SIZE (64*1024)
 #define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
 
-/*
- * Number of guaranteed raid bios in case of extreme VM load:
- */
-#define        NR_RAID_BIOS 256
-
 /* when we get a read error on a read-only array, we redirect to another
  * device without failing the first device, or trying to over-write to
  * correct the read error.  To keep track of bad blocks on a per-bio
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 6d44609f62f301b402a776b9df62c1c5db7e1062..2fa63bd2431aa5b0c6718b4dc52cce8f202f5c3d 100644
@@ -6084,13 +6084,13 @@ static sector_t raid5_bio_lowest_chunk_sector(struct r5conf *conf,
 static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
 {
        DEFINE_WAIT_FUNC(wait, woken_wake_function);
-       bool on_wq;
        struct r5conf *conf = mddev->private;
-       sector_t logical_sector;
-       struct stripe_request_ctx ctx = {};
        const int rw = bio_data_dir(bi);
+       struct stripe_request_ctx *ctx;
+       sector_t logical_sector;
        enum stripe_result res;
        int s, stripe_cnt;
+       bool on_wq;
 
        if (unlikely(bi->bi_opf & REQ_PREFLUSH)) {
                int ret = log_handle_flush_request(conf, bi);
@@ -6102,11 +6102,6 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
                                return true;
                }
                /* ret == -EAGAIN, fallback */
-               /*
-                * if r5l_handle_flush_request() didn't clear REQ_PREFLUSH,
-                * we need to flush journal device
-                */
-               ctx.do_flush = bi->bi_opf & REQ_PREFLUSH;
        }
 
        md_write_start(mddev, bi);
@@ -6129,16 +6124,25 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
        }
 
        logical_sector = bi->bi_iter.bi_sector & ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1);
-       ctx.first_sector = logical_sector;
-       ctx.last_sector = bio_end_sector(bi);
        bi->bi_next = NULL;
 
-       stripe_cnt = DIV_ROUND_UP_SECTOR_T(ctx.last_sector - logical_sector,
+       ctx = mempool_alloc(conf->ctx_pool, GFP_NOIO);
+       memset(ctx, 0, sizeof(*ctx));
+       ctx->first_sector = logical_sector;
+       ctx->last_sector = bio_end_sector(bi);
+       /*
+        * if r5l_handle_flush_request() didn't clear REQ_PREFLUSH,
+        * we need to flush journal device
+        */
+       if (unlikely(bi->bi_opf & REQ_PREFLUSH))
+               ctx->do_flush = true;
+
+       stripe_cnt = DIV_ROUND_UP_SECTOR_T(ctx->last_sector - logical_sector,
                                           RAID5_STRIPE_SECTORS(conf));
-       bitmap_set(ctx.sectors_to_do, 0, stripe_cnt);
+       bitmap_set(ctx->sectors_to_do, 0, stripe_cnt);
 
        pr_debug("raid456: %s, logical %llu to %llu\n", __func__,
-                bi->bi_iter.bi_sector, ctx.last_sector);
+                bi->bi_iter.bi_sector, ctx->last_sector);
 
        /* Bail out if conflicts with reshape and REQ_NOWAIT is set */
        if ((bi->bi_opf & REQ_NOWAIT) &&
@@ -6146,6 +6150,7 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
                bio_wouldblock_error(bi);
                if (rw == WRITE)
                        md_write_end(mddev);
+               mempool_free(ctx, conf->ctx_pool);
                return true;
        }
        md_account_bio(mddev, &bi);
@@ -6164,10 +6169,10 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
                add_wait_queue(&conf->wait_for_reshape, &wait);
                on_wq = true;
        }
-       s = (logical_sector - ctx.first_sector) >> RAID5_STRIPE_SHIFT(conf);
+       s = (logical_sector - ctx->first_sector) >> RAID5_STRIPE_SHIFT(conf);
 
        while (1) {
-               res = make_stripe_request(mddev, conf, &ctx, logical_sector,
+               res = make_stripe_request(mddev, conf, ctx, logical_sector,
                                          bi);
                if (res == STRIPE_FAIL || res == STRIPE_WAIT_RESHAPE)
                        break;
@@ -6184,9 +6189,9 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
                         * raid5_activate_delayed() from making progress
                         * and thus deadlocking.
                         */
-                       if (ctx.batch_last) {
-                               raid5_release_stripe(ctx.batch_last);
-                               ctx.batch_last = NULL;
+                       if (ctx->batch_last) {
+                               raid5_release_stripe(ctx->batch_last);
+                               ctx->batch_last = NULL;
                        }
 
                        wait_woken(&wait, TASK_UNINTERRUPTIBLE,
@@ -6194,21 +6199,23 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
                        continue;
                }
 
-               s = find_next_bit_wrap(ctx.sectors_to_do, stripe_cnt, s);
+               s = find_next_bit_wrap(ctx->sectors_to_do, stripe_cnt, s);
                if (s == stripe_cnt)
                        break;
 
-               logical_sector = ctx.first_sector +
+               logical_sector = ctx->first_sector +
                        (s << RAID5_STRIPE_SHIFT(conf));
        }
        if (unlikely(on_wq))
                remove_wait_queue(&conf->wait_for_reshape, &wait);
 
-       if (ctx.batch_last)
-               raid5_release_stripe(ctx.batch_last);
+       if (ctx->batch_last)
+               raid5_release_stripe(ctx->batch_last);
 
        if (rw == WRITE)
                md_write_end(mddev);
+
+       mempool_free(ctx, conf->ctx_pool);
        if (res == STRIPE_WAIT_RESHAPE) {
                md_free_cloned_bio(bi);
                return false;
@@ -7378,6 +7385,9 @@ static void free_conf(struct r5conf *conf)
        bioset_exit(&conf->bio_split);
        kfree(conf->stripe_hashtbl);
        kfree(conf->pending_data);
+
+       mempool_destroy(conf->ctx_pool);
+
        kfree(conf);
 }
 
@@ -8061,6 +8071,13 @@ static int raid5_run(struct mddev *mddev)
                        goto abort;
        }
 
+       conf->ctx_pool = mempool_create_kmalloc_pool(NR_RAID_BIOS,
+                                       sizeof(struct stripe_request_ctx));
+       if (!conf->ctx_pool) {
+               ret = -ENOMEM;
+               goto abort;
+       }
+
        ret = log_init(conf, journal_dev, raid5_has_ppl(conf));
        if (ret)
                goto abort;
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index eafc6e9ed6ee1cbd2e7daa36a8d0177188843989..6e3f07119fa4a7a1fc6147d35403696616cbd66a 100644
@@ -690,6 +690,8 @@ struct r5conf {
        struct list_head        pending_list;
        int                     pending_data_cnt;
        struct r5pending_data   *next_pending_data;
+
+       mempool_t               *ctx_pool;
 };
 
 #if PAGE_SIZE == DEFAULT_STRIPE_SIZE