git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
workqueue: Process rescuer work items one-by-one using a cursor
author: Lai Jiangshan <jiangshan.ljs@antgroup.com>
Mon, 8 Dec 2025 13:25:18 +0000 (21:25 +0800)
committer: Tejun Heo <tj@kernel.org>
Mon, 8 Dec 2025 19:17:49 +0000 (09:17 -1000)
Previously, the rescuer scanned for all matching work items at once and
processed them within a single rescuer thread, which could cause one
blocking work item to stall all others.

Make the rescuer process work items one-by-one instead of slurping all
matches in a single pass.

Break the rescuer loop after finding and processing the first matching
work item, then restart the search to pick up the next. This gives
normal worker threads a chance to process the remaining items instead
of leaving them waiting on the rescuer's queue, and prevents a blocking
work item from stalling the rest once memory pressure is relieved.

Introduce a dummy cursor work item to avoid potentially O(N^2)
rescans of the work list: without it, every restart would scan the
list again from the head.  The cursor records the resume position for
the next scan, eliminating redundant traversals.

Also introduce RESCUER_BATCH to control the maximum number of work items
the rescuer processes in each turn, and move on to other PWQs when the
limit is reached.

Cc: ying chen <yc1082463@gmail.com>
Reported-by: ying chen <yc1082463@gmail.com>
Fixes: e22bee782b3b ("workqueue: implement concurrency managed dynamic worker pool")
Signed-off-by: Lai Jiangshan <jiangshan.ljs@antgroup.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
kernel/workqueue.c

index f8371aa54dcadc485db6182788f4f2fde20b4a0a..7f9225936cd9848569b8cd8435647a6dffb63211 100644 (file)
@@ -117,6 +117,8 @@ enum wq_internal_consts {
        MAYDAY_INTERVAL         = HZ / 10,      /* and then every 100ms */
        CREATE_COOLDOWN         = HZ,           /* time to breath after fail */
 
+       RESCUER_BATCH           = 16,           /* process items per turn */
+
        /*
         * Rescue workers are used only on emergencies and shared by
         * all cpus.  Give MIN_NICE.
@@ -286,6 +288,7 @@ struct pool_workqueue {
        struct list_head        pending_node;   /* LN: node on wq_node_nr_active->pending_pwqs */
        struct list_head        pwqs_node;      /* WR: node on wq->pwqs */
        struct list_head        mayday_node;    /* MD: node on wq->maydays */
+       struct work_struct      mayday_cursor;  /* L: cursor on pool->worklist */
 
        u64                     stats[PWQ_NR_STATS];
 
@@ -1120,6 +1123,12 @@ static struct worker *find_worker_executing_work(struct worker_pool *pool,
        return NULL;
 }
 
+static void mayday_cursor_func(struct work_struct *work)
+{
+       /* should not be processed, only for marking position */
+       BUG();
+}
+
 /**
  * move_linked_works - move linked works to a list
  * @work: start of series of works to be scheduled
@@ -1182,6 +1191,16 @@ static bool assign_work(struct work_struct *work, struct worker *worker,
 
        lockdep_assert_held(&pool->lock);
 
+       /* The cursor work should not be processed */
+       if (unlikely(work->func == mayday_cursor_func)) {
+               /* only worker_thread() can possibly take this branch */
+               WARN_ON_ONCE(worker->rescue_wq);
+               if (nextp)
+                       *nextp = list_next_entry(work, entry);
+               list_del_init(&work->entry);
+               return false;
+       }
+
        /*
         * A single work shouldn't be executed concurrently by multiple workers.
         * __queue_work() ensures that @work doesn't jump to a different pool
@@ -3439,22 +3458,30 @@ sleep:
 static bool assign_rescuer_work(struct pool_workqueue *pwq, struct worker *rescuer)
 {
        struct worker_pool *pool = pwq->pool;
+       struct work_struct *cursor = &pwq->mayday_cursor;
        struct work_struct *work, *n;
 
        /* need rescue? */
        if (!pwq->nr_active || !need_to_create_worker(pool))
                return false;
 
-       /*
-        * Slurp in all works issued via this workqueue and
-        * process'em.
-        */
-       list_for_each_entry_safe(work, n, &pool->worklist, entry) {
-               if (get_work_pwq(work) == pwq && assign_work(work, rescuer, &n))
+       /* search from the start or cursor if available */
+       if (list_empty(&cursor->entry))
+               work = list_first_entry(&pool->worklist, struct work_struct, entry);
+       else
+               work = list_next_entry(cursor, entry);
+
+       /* find the next work item to rescue */
+       list_for_each_entry_safe_from(work, n, &pool->worklist, entry) {
+               if (get_work_pwq(work) == pwq && assign_work(work, rescuer, &n)) {
                        pwq->stats[PWQ_STAT_RESCUED]++;
+                       /* put the cursor for next search */
+                       list_move_tail(&cursor->entry, &n->entry);
+                       return true;
+               }
        }
 
-       return !list_empty(&rescuer->scheduled);
+       return false;
 }
 
 /**
@@ -3511,6 +3538,7 @@ repeat:
                struct pool_workqueue *pwq = list_first_entry(&wq->maydays,
                                        struct pool_workqueue, mayday_node);
                struct worker_pool *pool = pwq->pool;
+               unsigned int count = 0;
 
                __set_current_state(TASK_RUNNING);
                list_del_init(&pwq->mayday_node);
@@ -3523,25 +3551,27 @@ repeat:
 
                WARN_ON_ONCE(!list_empty(&rescuer->scheduled));
 
-               if (assign_rescuer_work(pwq, rescuer)) {
+               while (assign_rescuer_work(pwq, rescuer)) {
                        process_scheduled_works(rescuer);
 
                        /*
-                        * The above execution of rescued work items could
-                        * have created more to rescue through
-                        * pwq_activate_first_inactive() or chained
-                        * queueing.  Let's put @pwq back on mayday list so
-                        * that such back-to-back work items, which may be
-                        * being used to relieve memory pressure, don't
-                        * incur MAYDAY_INTERVAL delay inbetween.
+                        * If the per-turn work item limit is reached and other
+                        * PWQs are in mayday, requeue mayday for this PWQ and
+                        * let the rescuer handle the other PWQs first.
                         */
-                       if (pwq->nr_active && need_to_create_worker(pool)) {
+                       if (++count > RESCUER_BATCH && !list_empty(&pwq->wq->maydays) &&
+                           pwq->nr_active && need_to_create_worker(pool)) {
                                raw_spin_lock(&wq_mayday_lock);
                                send_mayday(pwq);
                                raw_spin_unlock(&wq_mayday_lock);
+                               break;
                        }
                }
 
+               /* The cursor can not be left behind without the rescuer watching it. */
+               if (!list_empty(&pwq->mayday_cursor.entry) && list_empty(&pwq->mayday_node))
+                       list_del_init(&pwq->mayday_cursor.entry);
+
                /*
                 * Leave this pool. Notify regular workers; otherwise, we end up
                 * with 0 concurrency and stalling the execution.
@@ -5160,6 +5190,19 @@ static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq,
        INIT_LIST_HEAD(&pwq->pwqs_node);
        INIT_LIST_HEAD(&pwq->mayday_node);
        kthread_init_work(&pwq->release_work, pwq_release_workfn);
+
+       /*
+        * Set the dummy cursor work with valid function and get_work_pwq().
+        *
+        * The cursor work should only be in the pwq->pool->worklist, and
+        * should not be treated as a processable work item.
+        *
+        * WORK_STRUCT_PENDING and WORK_STRUCT_INACTIVE just make it less
+        * surprising for kernel debugging tools and reviewers.
+        */
+       INIT_WORK(&pwq->mayday_cursor, mayday_cursor_func);
+       atomic_long_set(&pwq->mayday_cursor.data, (unsigned long)pwq |
+                       WORK_STRUCT_PENDING | WORK_STRUCT_PWQ | WORK_STRUCT_INACTIVE);
 }
 
 /* sync @pwq with the current state of its associated wq and link it */