1 From a46197fa531d3f2cf00b43a84babd3bc6f14d656 Mon Sep 17 00:00:00 2001
2 From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
3 Date: Wed, 3 Apr 2024 16:36:17 +0200
4 Subject: Revert "workqueue: Implement system-wide nr_active enforcement for unbound workqueues"
6 From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
8 This reverts commit 5a70baec2294e8a7d0fcc4558741c23e752dad5c which is commit 5797b1c18919 upstream.
10 Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
12 include/linux/workqueue.h | 35 ----
13 kernel/workqueue.c | 341 ++++------------------------------------------
14 2 files changed, 35 insertions(+), 341 deletions(-)
16 --- a/include/linux/workqueue.h
17 +++ b/include/linux/workqueue.h
18 @@ -405,13 +405,6 @@ enum {
19 WQ_MAX_ACTIVE = 512, /* I like 512, better ideas? */
20 WQ_UNBOUND_MAX_ACTIVE = WQ_MAX_ACTIVE,
21 WQ_DFL_ACTIVE = WQ_MAX_ACTIVE / 2,
24 - * Per-node default cap on min_active. Unless explicitly set, min_active
25 - * is set to min(max_active, WQ_DFL_MIN_ACTIVE). For more details, see
26 - * workqueue_struct->min_active definition.
28 - WQ_DFL_MIN_ACTIVE = 8,
32 @@ -454,33 +447,11 @@ extern struct workqueue_struct *system_f
33 * alloc_workqueue - allocate a workqueue
34 * @fmt: printf format for the name of the workqueue
36 - * @max_active: max in-flight work items, 0 for default
37 + * @max_active: max in-flight work items per CPU, 0 for default
38 * remaining args: args for @fmt
40 - * For a per-cpu workqueue, @max_active limits the number of in-flight work
41 - * items for each CPU. e.g. @max_active of 1 indicates that each CPU can be
42 - * executing at most one work item for the workqueue.
44 - * For unbound workqueues, @max_active limits the number of in-flight work items
45 - * for the whole system. e.g. @max_active of 16 indicates that that there can be
46 - * at most 16 work items executing for the workqueue in the whole system.
48 - * As sharing the same active counter for an unbound workqueue across multiple
49 - * NUMA nodes can be expensive, @max_active is distributed to each NUMA node
50 - * according to the proportion of the number of online CPUs and enforced
53 - * Depending on online CPU distribution, a node may end up with per-node
54 - * max_active which is significantly lower than @max_active, which can lead to
55 - * deadlocks if the per-node concurrency limit is lower than the maximum number
56 - * of interdependent work items for the workqueue.
58 - * To guarantee forward progress regardless of online CPU distribution, the
59 - * concurrency limit on every node is guaranteed to be equal to or greater than
60 - * min_active which is set to min(@max_active, %WQ_DFL_MIN_ACTIVE). This means
61 - * that the sum of per-node max_active's may be larger than @max_active.
63 - * For detailed information on %WQ_* flags, please refer to
64 + * Allocate a workqueue with the specified parameters. For detailed
65 + * information on WQ_* flags, please refer to
66 * Documentation/core-api/workqueue.rst.
69 --- a/kernel/workqueue.c
70 +++ b/kernel/workqueue.c
71 @@ -122,9 +122,6 @@ enum {
73 * L: pool->lock protected. Access with pool->lock held.
75 - * LN: pool->lock and wq_node_nr_active->lock protected for writes. Either for
78 * K: Only modified by worker while holding pool->lock. Can be safely read by
79 * self, while holding pool->lock or from IRQ context if %current is the
81 @@ -246,18 +243,17 @@ struct pool_workqueue {
82 * pwq->inactive_works instead of pool->worklist and marked with
83 * WORK_STRUCT_INACTIVE.
85 - * All work items marked with WORK_STRUCT_INACTIVE do not participate in
86 - * nr_active and all work items in pwq->inactive_works are marked with
87 - * WORK_STRUCT_INACTIVE. But not all WORK_STRUCT_INACTIVE work items are
88 - * in pwq->inactive_works. Some of them are ready to run in
89 - * pool->worklist or worker->scheduled. Those work itmes are only struct
90 - * wq_barrier which is used for flush_work() and should not participate
91 - * in nr_active. For non-barrier work item, it is marked with
92 - * WORK_STRUCT_INACTIVE iff it is in pwq->inactive_works.
93 + * All work items marked with WORK_STRUCT_INACTIVE do not participate
94 + * in pwq->nr_active and all work items in pwq->inactive_works are
95 + * marked with WORK_STRUCT_INACTIVE. But not all WORK_STRUCT_INACTIVE
96 + * work items are in pwq->inactive_works. Some of them are ready to
97 + * run in pool->worklist or worker->scheduled. Those work itmes are
98 + * only struct wq_barrier which is used for flush_work() and should
99 + * not participate in pwq->nr_active. For non-barrier work item, it
100 + * is marked with WORK_STRUCT_INACTIVE iff it is in pwq->inactive_works.
102 int nr_active; /* L: nr of active works */
103 struct list_head inactive_works; /* L: inactive works */
104 - struct list_head pending_node; /* LN: node on wq_node_nr_active->pending_pwqs */
105 struct list_head pwqs_node; /* WR: node on wq->pwqs */
106 struct list_head mayday_node; /* MD: node on wq->maydays */
108 @@ -289,19 +285,9 @@ struct wq_device;
109 * on each CPU, in an unbound workqueue, max_active applies to the whole system.
110 * As sharing a single nr_active across multiple sockets can be very expensive,
111 * the counting and enforcement is per NUMA node.
113 - * The following struct is used to enforce per-node max_active. When a pwq wants
114 - * to start executing a work item, it should increment ->nr using
115 - * tryinc_node_nr_active(). If acquisition fails due to ->nr already being over
116 - * ->max, the pwq is queued on ->pending_pwqs. As in-flight work items finish
117 - * and decrement ->nr, node_activate_pending_pwq() activates the pending pwqs in
118 - * round-robin order.
120 struct wq_node_nr_active {
121 - int max; /* per-node max_active */
122 - atomic_t nr; /* per-node nr_active */
123 - raw_spinlock_t lock; /* nests inside pool locks */
124 - struct list_head pending_pwqs; /* LN: pwqs with inactive works */
125 + atomic_t nr; /* per-node nr_active count */
129 @@ -324,12 +310,8 @@ struct workqueue_struct {
130 struct worker *rescuer; /* MD: rescue worker */
132 int nr_drainers; /* WQ: drain in progress */
134 - /* See alloc_workqueue() function comment for info on min/max_active */
135 int max_active; /* WO: max active works */
136 - int min_active; /* WO: min active works */
137 int saved_max_active; /* WQ: saved max_active */
138 - int saved_min_active; /* WQ: saved min_active */
140 struct workqueue_attrs *unbound_attrs; /* PW: only for unbound wqs */
141 struct pool_workqueue __rcu *dfl_pwq; /* PW: only for unbound wqs */
142 @@ -675,19 +657,6 @@ static struct pool_workqueue *unbound_pw
143 lockdep_is_held(&wq->mutex));
147 - * unbound_effective_cpumask - effective cpumask of an unbound workqueue
148 - * @wq: workqueue of interest
150 - * @wq->unbound_attrs->cpumask contains the cpumask requested by the user which
151 - * is masked with wq_unbound_cpumask to determine the effective cpumask. The
152 - * default pwq is always mapped to the pool with the current effective cpumask.
154 -static struct cpumask *unbound_effective_cpumask(struct workqueue_struct *wq)
156 - return unbound_pwq(wq, -1)->pool->attrs->__pod_cpumask;
159 static unsigned int work_color_to_flags(int color)
161 return color << WORK_STRUCT_COLOR_SHIFT;
162 @@ -1483,46 +1452,6 @@ static struct wq_node_nr_active *wq_node
166 - * wq_update_node_max_active - Update per-node max_actives to use
167 - * @wq: workqueue to update
168 - * @off_cpu: CPU that's going down, -1 if a CPU is not going down
170 - * Update @wq->node_nr_active[]->max. @wq must be unbound. max_active is
171 - * distributed among nodes according to the proportions of numbers of online
172 - * cpus. The result is always between @wq->min_active and max_active.
174 -static void wq_update_node_max_active(struct workqueue_struct *wq, int off_cpu)
176 - struct cpumask *effective = unbound_effective_cpumask(wq);
177 - int min_active = READ_ONCE(wq->min_active);
178 - int max_active = READ_ONCE(wq->max_active);
179 - int total_cpus, node;
181 - lockdep_assert_held(&wq->mutex);
183 - if (!cpumask_test_cpu(off_cpu, effective))
186 - total_cpus = cpumask_weight_and(effective, cpu_online_mask);
190 - for_each_node(node) {
193 - node_cpus = cpumask_weight_and(effective, cpumask_of_node(node));
194 - if (off_cpu >= 0 && cpu_to_node(off_cpu) == node)
197 - wq_node_nr_active(wq, node)->max =
198 - clamp(DIV_ROUND_UP(max_active * node_cpus, total_cpus),
199 - min_active, max_active);
202 - wq_node_nr_active(wq, NUMA_NO_NODE)->max = min_active;
206 * get_pwq - get an extra reference on the specified pool_workqueue
207 * @pwq: pool_workqueue to get
209 @@ -1619,98 +1548,35 @@ static bool pwq_activate_work(struct poo
213 -static bool tryinc_node_nr_active(struct wq_node_nr_active *nna)
215 - int max = READ_ONCE(nna->max);
220 - old = atomic_read(&nna->nr);
223 - tmp = atomic_cmpxchg_relaxed(&nna->nr, old, old + 1);
230 * pwq_tryinc_nr_active - Try to increment nr_active for a pwq
231 * @pwq: pool_workqueue of interest
232 - * @fill: max_active may have increased, try to increase concurrency level
234 * Try to increment nr_active for @pwq. Returns %true if an nr_active count is
235 * successfully obtained. %false otherwise.
237 -static bool pwq_tryinc_nr_active(struct pool_workqueue *pwq, bool fill)
238 +static bool pwq_tryinc_nr_active(struct pool_workqueue *pwq)
240 struct workqueue_struct *wq = pwq->wq;
241 struct worker_pool *pool = pwq->pool;
242 struct wq_node_nr_active *nna = wq_node_nr_active(wq, pool->node);
243 - bool obtained = false;
246 lockdep_assert_held(&pool->lock);
249 - /* per-cpu workqueue, pwq->nr_active is sufficient */
250 - obtained = pwq->nr_active < READ_ONCE(wq->max_active);
255 - * Unbound workqueue uses per-node shared nr_active $nna. If @pwq is
256 - * already waiting on $nna, pwq_dec_nr_active() will maintain the
257 - * concurrency level. Don't jump the line.
259 - * We need to ignore the pending test after max_active has increased as
260 - * pwq_dec_nr_active() can only maintain the concurrency level but not
261 - * increase it. This is indicated by @fill.
263 - if (!list_empty(&pwq->pending_node) && likely(!fill))
266 - obtained = tryinc_node_nr_active(nna);
271 - * Lockless acquisition failed. Lock, add ourself to $nna->pending_pwqs
272 - * and try again. The smp_mb() is paired with the implied memory barrier
273 - * of atomic_dec_return() in pwq_dec_nr_active() to ensure that either
274 - * we see the decremented $nna->nr or they see non-empty
275 - * $nna->pending_pwqs.
277 - raw_spin_lock(&nna->lock);
279 - if (list_empty(&pwq->pending_node))
280 - list_add_tail(&pwq->pending_node, &nna->pending_pwqs);
281 - else if (likely(!fill))
286 - obtained = tryinc_node_nr_active(nna);
289 - * If @fill, @pwq might have already been pending. Being spuriously
290 - * pending in cold paths doesn't affect anything. Let's leave it be.
292 - if (obtained && likely(!fill))
293 - list_del_init(&pwq->pending_node);
294 + obtained = pwq->nr_active < READ_ONCE(wq->max_active);
297 - raw_spin_unlock(&nna->lock);
303 + atomic_inc(&nna->nr);
309 * pwq_activate_first_inactive - Activate the first inactive work item on a pwq
310 * @pwq: pool_workqueue of interest
311 - * @fill: max_active may have increased, try to increase concurrency level
313 * Activate the first inactive work item of @pwq if available and allowed by
315 @@ -1718,13 +1584,13 @@ out:
316 * Returns %true if an inactive work item has been activated. %false if no
317 * inactive work item is found or max_active limit is reached.
319 -static bool pwq_activate_first_inactive(struct pool_workqueue *pwq, bool fill)
320 +static bool pwq_activate_first_inactive(struct pool_workqueue *pwq)
322 struct work_struct *work =
323 list_first_entry_or_null(&pwq->inactive_works,
324 struct work_struct, entry);
326 - if (work && pwq_tryinc_nr_active(pwq, fill)) {
327 + if (work && pwq_tryinc_nr_active(pwq)) {
328 __pwq_activate_work(pwq, work);
331 @@ -1733,92 +1599,10 @@ static bool pwq_activate_first_inactive(
335 - * node_activate_pending_pwq - Activate a pending pwq on a wq_node_nr_active
336 - * @nna: wq_node_nr_active to activate a pending pwq for
337 - * @caller_pool: worker_pool the caller is locking
339 - * Activate a pwq in @nna->pending_pwqs. Called with @caller_pool locked.
340 - * @caller_pool may be unlocked and relocked to lock other worker_pools.
342 -static void node_activate_pending_pwq(struct wq_node_nr_active *nna,
343 - struct worker_pool *caller_pool)
345 - struct worker_pool *locked_pool = caller_pool;
346 - struct pool_workqueue *pwq;
347 - struct work_struct *work;
349 - lockdep_assert_held(&caller_pool->lock);
351 - raw_spin_lock(&nna->lock);
353 - pwq = list_first_entry_or_null(&nna->pending_pwqs,
354 - struct pool_workqueue, pending_node);
359 - * If @pwq is for a different pool than @locked_pool, we need to lock
360 - * @pwq->pool->lock. Let's trylock first. If unsuccessful, do the unlock
361 - * / lock dance. For that, we also need to release @nna->lock as it's
362 - * nested inside pool locks.
364 - if (pwq->pool != locked_pool) {
365 - raw_spin_unlock(&locked_pool->lock);
366 - locked_pool = pwq->pool;
367 - if (!raw_spin_trylock(&locked_pool->lock)) {
368 - raw_spin_unlock(&nna->lock);
369 - raw_spin_lock(&locked_pool->lock);
370 - raw_spin_lock(&nna->lock);
376 - * $pwq may not have any inactive work items due to e.g. cancellations.
377 - * Drop it from pending_pwqs and see if there's another one.
379 - work = list_first_entry_or_null(&pwq->inactive_works,
380 - struct work_struct, entry);
382 - list_del_init(&pwq->pending_node);
387 - * Acquire an nr_active count and activate the inactive work item. If
388 - * $pwq still has inactive work items, rotate it to the end of the
389 - * pending_pwqs so that we round-robin through them. This means that
390 - * inactive work items are not activated in queueing order which is fine
391 - * given that there has never been any ordering across different pwqs.
393 - if (likely(tryinc_node_nr_active(nna))) {
395 - __pwq_activate_work(pwq, work);
397 - if (list_empty(&pwq->inactive_works))
398 - list_del_init(&pwq->pending_node);
400 - list_move_tail(&pwq->pending_node, &nna->pending_pwqs);
402 - /* if activating a foreign pool, make sure it's running */
403 - if (pwq->pool != caller_pool)
404 - kick_pool(pwq->pool);
408 - raw_spin_unlock(&nna->lock);
409 - if (locked_pool != caller_pool) {
410 - raw_spin_unlock(&locked_pool->lock);
411 - raw_spin_lock(&caller_pool->lock);
416 * pwq_dec_nr_active - Retire an active count
417 * @pwq: pool_workqueue of interest
419 * Decrement @pwq's nr_active and try to activate the first inactive work item.
420 - * For unbound workqueues, this function may temporarily drop @pwq->pool->lock.
422 static void pwq_dec_nr_active(struct pool_workqueue *pwq)
424 @@ -1838,29 +1622,12 @@ static void pwq_dec_nr_active(struct poo
425 * inactive work item on @pwq itself.
428 - pwq_activate_first_inactive(pwq, false);
429 + pwq_activate_first_inactive(pwq);
434 - * If @pwq is for an unbound workqueue, it's more complicated because
435 - * multiple pwqs and pools may be sharing the nr_active count. When a
436 - * pwq needs to wait for an nr_active count, it puts itself on
437 - * $nna->pending_pwqs. The following atomic_dec_return()'s implied
438 - * memory barrier is paired with smp_mb() in pwq_tryinc_nr_active() to
439 - * guarantee that either we see non-empty pending_pwqs or they see
440 - * decremented $nna->nr.
442 - * $nna->max may change as CPUs come online/offline and @pwq->wq's
443 - * max_active gets updated. However, it is guaranteed to be equal to or
444 - * larger than @pwq->wq->min_active which is above zero unless freezing.
445 - * This maintains the forward progress guarantee.
447 - if (atomic_dec_return(&nna->nr) >= READ_ONCE(nna->max))
450 - if (!list_empty(&nna->pending_pwqs))
451 - node_activate_pending_pwq(nna, pool);
452 + atomic_dec(&nna->nr);
453 + pwq_activate_first_inactive(pwq);
457 @@ -2181,7 +1948,7 @@ retry:
458 * @work must also queue behind existing inactive work items to maintain
459 * ordering when max_active changes. See wq_adjust_max_active().
461 - if (list_empty(&pwq->inactive_works) && pwq_tryinc_nr_active(pwq, false)) {
462 + if (list_empty(&pwq->inactive_works) && pwq_tryinc_nr_active(pwq)) {
463 if (list_empty(&pool->worklist))
464 pool->watchdog_ts = jiffies;
466 @@ -3414,7 +3181,7 @@ static void insert_wq_barrier(struct poo
468 barr->task = current;
470 - /* The barrier work item does not participate in nr_active. */
471 + /* The barrier work item does not participate in pwq->nr_active. */
472 work_flags |= WORK_STRUCT_INACTIVE;
475 @@ -4330,8 +4097,6 @@ static void free_node_nr_active(struct w
476 static void init_node_nr_active(struct wq_node_nr_active *nna)
478 atomic_set(&nna->nr, 0);
479 - raw_spin_lock_init(&nna->lock);
480 - INIT_LIST_HEAD(&nna->pending_pwqs);
484 @@ -4571,15 +4336,6 @@ static void pwq_release_workfn(struct kt
485 mutex_unlock(&wq_pool_mutex);
488 - if (!list_empty(&pwq->pending_node)) {
489 - struct wq_node_nr_active *nna =
490 - wq_node_nr_active(pwq->wq, pwq->pool->node);
492 - raw_spin_lock_irq(&nna->lock);
493 - list_del_init(&pwq->pending_node);
494 - raw_spin_unlock_irq(&nna->lock);
497 call_rcu(&pwq->rcu, rcu_free_pwq);
500 @@ -4605,7 +4361,6 @@ static void init_pwq(struct pool_workque
501 pwq->flush_color = -1;
503 INIT_LIST_HEAD(&pwq->inactive_works);
504 - INIT_LIST_HEAD(&pwq->pending_node);
505 INIT_LIST_HEAD(&pwq->pwqs_node);
506 INIT_LIST_HEAD(&pwq->mayday_node);
507 kthread_init_work(&pwq->release_work, pwq_release_workfn);
508 @@ -4813,9 +4568,6 @@ static void apply_wqattrs_commit(struct
510 ctx->dfl_pwq = install_unbound_pwq(ctx->wq, -1, ctx->dfl_pwq);
512 - /* update node_nr_active->max */
513 - wq_update_node_max_active(ctx->wq, -1);
515 mutex_unlock(&ctx->wq->mutex);
518 @@ -5089,35 +4841,24 @@ static int init_rescuer(struct workqueue
519 static void wq_adjust_max_active(struct workqueue_struct *wq)
522 - int new_max, new_min;
524 lockdep_assert_held(&wq->mutex);
526 if ((wq->flags & WQ_FREEZABLE) && workqueue_freezing) {
530 - new_max = wq->saved_max_active;
531 - new_min = wq->saved_min_active;
532 + WRITE_ONCE(wq->max_active, 0);
536 - if (wq->max_active == new_max && wq->min_active == new_min)
537 + if (wq->max_active == wq->saved_max_active)
541 - * Update @wq->max/min_active and then kick inactive work items if more
542 + * Update @wq->max_active and then kick inactive work items if more
543 * active work items are allowed. This doesn't break work item ordering
544 * because new work items are always queued behind existing inactive
545 * work items if there are any.
547 - WRITE_ONCE(wq->max_active, new_max);
548 - WRITE_ONCE(wq->min_active, new_min);
550 - if (wq->flags & WQ_UNBOUND)
551 - wq_update_node_max_active(wq, -1);
555 + WRITE_ONCE(wq->max_active, wq->saved_max_active);
558 * Round-robin through pwq's activating the first inactive work item
559 @@ -5132,7 +4873,7 @@ static void wq_adjust_max_active(struct
561 /* can be called during early boot w/ irq disabled */
562 raw_spin_lock_irqsave(&pwq->pool->lock, flags);
563 - if (pwq_activate_first_inactive(pwq, true)) {
564 + if (pwq_activate_first_inactive(pwq)) {
566 kick_pool(pwq->pool);
568 @@ -5194,9 +4935,7 @@ struct workqueue_struct *alloc_workqueue
571 wq->max_active = max_active;
572 - wq->min_active = min(max_active, WQ_DFL_MIN_ACTIVE);
573 - wq->saved_max_active = wq->max_active;
574 - wq->saved_min_active = wq->min_active;
575 + wq->saved_max_active = max_active;
576 mutex_init(&wq->mutex);
577 atomic_set(&wq->nr_pwqs_to_flush, 0);
578 INIT_LIST_HEAD(&wq->pwqs);
579 @@ -5362,8 +5101,7 @@ EXPORT_SYMBOL_GPL(destroy_workqueue);
580 * @wq: target workqueue
581 * @max_active: new max_active value.
583 - * Set max_active of @wq to @max_active. See the alloc_workqueue() function
585 + * Set max_active of @wq to @max_active.
588 * Don't call from IRQ context.
589 @@ -5380,9 +5118,6 @@ void workqueue_set_max_active(struct wor
591 wq->flags &= ~__WQ_ORDERED;
592 wq->saved_max_active = max_active;
593 - if (wq->flags & WQ_UNBOUND)
594 - wq->saved_min_active = min(wq->saved_min_active, max_active);
596 wq_adjust_max_active(wq);
598 mutex_unlock(&wq->mutex);
599 @@ -6064,10 +5799,6 @@ int workqueue_online_cpu(unsigned int cp
601 for_each_cpu(tcpu, pt->pod_cpus[pt->cpu_pod[cpu]])
602 wq_update_pod(wq, tcpu, cpu, true);
604 - mutex_lock(&wq->mutex);
605 - wq_update_node_max_active(wq, -1);
606 - mutex_unlock(&wq->mutex);
610 @@ -6096,10 +5827,6 @@ int workqueue_offline_cpu(unsigned int c
612 for_each_cpu(tcpu, pt->pod_cpus[pt->cpu_pod[cpu]])
613 wq_update_pod(wq, tcpu, cpu, false);
615 - mutex_lock(&wq->mutex);
616 - wq_update_node_max_active(wq, cpu);
617 - mutex_unlock(&wq->mutex);
620 mutex_unlock(&wq_pool_mutex);
621 @@ -7296,12 +7023,8 @@ void __init workqueue_init_topology(void
622 * combinations to apply per-pod sharing.
624 list_for_each_entry(wq, &workqueues, list) {
625 - for_each_online_cpu(cpu)
626 + for_each_online_cpu(cpu) {
627 wq_update_pod(wq, cpu, cpu, true);
628 - if (wq->flags & WQ_UNBOUND) {
629 - mutex_lock(&wq->mutex);
630 - wq_update_node_max_active(wq, -1);
631 - mutex_unlock(&wq->mutex);