sched/wait: Drop WQ_FLAG_EXCLUSIVE from add_wait_queue_priority()
author    Sean Christopherson <seanjc@google.com>
          Thu, 22 May 2025 23:52:16 +0000 (16:52 -0700)
committer Sean Christopherson <seanjc@google.com>
          Mon, 23 Jun 2025 16:50:57 +0000 (09:50 -0700)
Drop the setting of WQ_FLAG_EXCLUSIVE from add_wait_queue_priority() and
instead have callers manually add the flag prior to adding their structure
to the queue.  Blindly setting WQ_FLAG_EXCLUSIVE is flawed, as the nature
of exclusive, priority waiters means that only the first waiter added will
ever receive notifications.
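
To see why, consider the wake walk in __wake_up_common() (an abridged
sketch, with bookkeeping elided): priority waiters are added at the head
of the list, and the walk stops at the first exclusive waiter whose
callback consumes the event.

	list_for_each_entry_safe(curr, next, &wq_head->head, entry) {
		unsigned flags = curr->flags;
		int ret;

		/* A priority waiter sits at the head, so it runs first. */
		ret = curr->func(curr, mode, wake_flags, key);
		if (ret < 0)
			break;
		/*
		 * If that first waiter is also exclusive and handled the
		 * event (non-zero return), the walk terminates here and no
		 * other waiter on the queue, exclusive or not, is notified.
		 */
		if (ret && (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
			break;
	}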

Pushing the flawed behavior to callers will allow fixing the problem one
hypervisor at a time (KVM added the flawed API, and then KVM's code was
copy+pasted nearly verbatim by Xen and Hyper-V), and will also allow for
adding an API that provides true exclusivity, i.e. that guarantees at most
one priority waiter is in the queue.
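
One plausible shape for such an API (a hypothetical sketch, not part of
this patch; the name and the -EBUSY semantics are assumptions): enforce
exclusivity at registration time by refusing to add a waiter to a
non-empty queue, so a second binding fails with a hard error instead of
silently never being notified.

	/* Hypothetical sketch: guarantee at most one priority waiter. */
	int add_wait_queue_priority_exclusive(struct wait_queue_head *wq_head,
					      struct wait_queue_entry *wq_entry)
	{
		unsigned long flags;
		int ret = 0;

		spin_lock_irqsave(&wq_head->lock, flags);
		if (list_empty(&wq_head->head)) {
			wq_entry->flags |= WQ_FLAG_EXCLUSIVE | WQ_FLAG_PRIORITY;
			__add_wait_queue(wq_head, wq_entry);
		} else {
			/* Another waiter exists; exclusivity is impossible. */
			ret = -EBUSY;
		}
		spin_unlock_irqrestore(&wq_head->lock, flags);

		return ret;
	}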

Opportunistically add a comment in Hyper-V to call out the mess.  Xen
privcmd's irqfd_wakeup() doesn't actually operate in exclusive mode, i.e.
can be "fixed" simply by dropping WQ_FLAG_EXCLUSIVE.  And KVM is primed to
switch to the aforementioned fully exclusive API, i.e. won't be carrying
the flawed code for long.

No functional change intended.

Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20250522235223.3178519-7-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
drivers/hv/mshv_eventfd.c
drivers/xen/privcmd.c
kernel/sched/wait.c
virt/kvm/eventfd.c

diff --git a/drivers/hv/mshv_eventfd.c b/drivers/hv/mshv_eventfd.c
index 8dd22be2ca0b4c6c8e78df28429d33d9893285de..b348928871c2f43bbab2ce8785ae5b320e1fb90c 100644
--- a/drivers/hv/mshv_eventfd.c
+++ b/drivers/hv/mshv_eventfd.c
@@ -368,6 +368,14 @@ static void mshv_irqfd_queue_proc(struct file *file, wait_queue_head_t *wqh,
                        container_of(polltbl, struct mshv_irqfd, irqfd_polltbl);
 
        irqfd->irqfd_wqh = wqh;
+
+       /*
+        * TODO: Ensure there isn't already an exclusive, priority waiter, e.g.
+        * that the irqfd isn't already bound to another partition.  Only the
+        * first exclusive waiter encountered will be notified, and
+        * add_wait_queue_priority() doesn't enforce exclusivity.
+        */
+       irqfd->irqfd_wait.flags |= WQ_FLAG_EXCLUSIVE;
        add_wait_queue_priority(wqh, &irqfd->irqfd_wait);
 }
 
diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c
index 13a10f3294a80d8daa231ac556a998b2e494378a..c08ec8a7d27c1d37bcf84cb7131993c16785e7bb 100644
--- a/drivers/xen/privcmd.c
+++ b/drivers/xen/privcmd.c
@@ -957,6 +957,7 @@ irqfd_poll_func(struct file *file, wait_queue_head_t *wqh, poll_table *pt)
        struct privcmd_kernel_irqfd *kirqfd =
                container_of(pt, struct privcmd_kernel_irqfd, pt);
 
+       kirqfd->wait.flags |= WQ_FLAG_EXCLUSIVE;
        add_wait_queue_priority(wqh, &kirqfd->wait);
 }
 
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 51e38f5f47018c953e31c834dc6385c182359359..4ab3ab195277a413cc3eae15d50d51b66ab15fb5 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -40,7 +40,7 @@ void add_wait_queue_priority(struct wait_queue_head *wq_head, struct wait_queue_
 {
        unsigned long flags;
 
-       wq_entry->flags |= WQ_FLAG_EXCLUSIVE | WQ_FLAG_PRIORITY;
+       wq_entry->flags |= WQ_FLAG_PRIORITY;
        spin_lock_irqsave(&wq_head->lock, flags);
        __add_wait_queue(wq_head, wq_entry);
        spin_unlock_irqrestore(&wq_head->lock, flags);
@@ -64,7 +64,7 @@ EXPORT_SYMBOL(remove_wait_queue);
  * the non-exclusive tasks. Normally, exclusive tasks will be at the end of
  * the list and any non-exclusive tasks will be woken first. A priority task
  * may be at the head of the list, and can consume the event without any other
- * tasks being woken.
+ * tasks being woken if it's also an exclusive task.
  *
  * There are circumstances in which we can try to wake a task which has already
  * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index a9a13f919de84e0732353812a0083662c3f784e2..f8c2486f95d5936471fd98dca69be1f749a3c399 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -316,6 +316,7 @@ static void kvm_irqfd_register(struct file *file, wait_queue_head_t *wqh,
        init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup);
 
        spin_release(&kvm->irqfds.lock.dep_map, _RET_IP_);
+       irqfd->wait.flags |= WQ_FLAG_EXCLUSIVE;
        add_wait_queue_priority(wqh, &irqfd->wait);
        spin_acquire(&kvm->irqfds.lock.dep_map, 0, 0, _RET_IP_);