// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/poll.h>
#include <linux/hashtable.h>
#include <linux/io_uring.h>

#include <trace/events/io_uring.h>

#include <uapi/linux/io_uring.h>
struct io_poll_update {
	struct file			*file;
	u64				old_user_data;
	u64				new_user_data;
	__poll_t			events;
	bool				update_events;
	bool				update_user_data;
};
struct io_poll_table {
	struct poll_table_struct pt;
	struct io_kiocb *req;
	int nr_entries;
	int error;
	bool owning;
	/* output value, set only if arm poll returns >0 */
	__poll_t result_mask;
};
#define IO_POLL_CANCEL_FLAG	BIT(31)
#define IO_POLL_RETRY_FLAG	BIT(30)
#define IO_POLL_REF_MASK	GENMASK(29, 0)

/*
 * We usually have 1-2 refs taken, 128 is more than enough and we want to
 * maximise the margin between this amount and the moment when it overflows.
 */
#define IO_POLL_REF_BIAS	128

#define IO_WQE_F_DOUBLE		1
static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
			void *key);
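/*
 * A wait_queue_entry's ->private field carries the owning io_kiocb. The
 * request pointer is pointer-aligned, so the lowest bit is borrowed as
 * IO_WQE_F_DOUBLE to tell the second (double) poll entry apart from the first.
 */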
static inline struct io_kiocb *wqe_to_req(struct wait_queue_entry *wqe)
{
	unsigned long priv = (unsigned long)wqe->private;

	return (struct io_kiocb *)(priv & ~IO_WQE_F_DOUBLE);
}
static inline bool wqe_is_double(struct wait_queue_entry *wqe)
{
	unsigned long priv = (unsigned long)wqe->private;

	return priv & IO_WQE_F_DOUBLE;
}
static bool io_poll_get_ownership_slowpath(struct io_kiocb *req)
{
	int v;

	/*
	 * poll_refs are already elevated and we don't have much hope for
	 * grabbing the ownership. Instead of incrementing, set a retry flag
	 * to notify the loop that there might have been some change.
	 */
	v = atomic_fetch_or(IO_POLL_RETRY_FLAG, &req->poll_refs);
	if (v & IO_POLL_REF_MASK)
		return false;
	return !(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK);
}
/*
 * If the refs part of ->poll_refs (see IO_POLL_REF_MASK) is 0, it's free. We
 * can bump it and acquire ownership. It's disallowed to modify requests while
 * not owning them, which prevents races when enqueueing task_work and between
 * arming poll and wakeups.
 */
static inline bool io_poll_get_ownership(struct io_kiocb *req)
{
	if (unlikely(atomic_read(&req->poll_refs) >= IO_POLL_REF_BIAS))
		return io_poll_get_ownership_slowpath(req);
	return !(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK);
}
static void io_poll_mark_cancelled(struct io_kiocb *req)
{
	atomic_or(IO_POLL_CANCEL_FLAG, &req->poll_refs);
}
static struct io_poll *io_poll_get_double(struct io_kiocb *req)
{
	/* pure poll stashes this in ->async_data, poll driven retry elsewhere */
	if (req->opcode == IORING_OP_POLL_ADD)
		return req->async_data;
	return req->apoll->double_poll;
}
static struct io_poll *io_poll_get_single(struct io_kiocb *req)
{
	if (req->opcode == IORING_OP_POLL_ADD)
		return io_kiocb_to_cmd(req, struct io_poll);
	return &req->apoll->poll;
}
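/*
 * Armed requests are hashed by cqe.user_data so that cancellation can find
 * them later. ->cancel_table uses per bucket spinlocks, while
 * ->cancel_table_locked (see io_poll_req_insert_locked()) is protected by
 * ->uring_lock instead.
 */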
static void io_poll_req_insert(struct io_kiocb *req)
{
	struct io_hash_table *table = &req->ctx->cancel_table;
	u32 index = hash_long(req->cqe.user_data, table->hash_bits);
	struct io_hash_bucket *hb = &table->hbs[index];

	spin_lock(&hb->lock);
	hlist_add_head(&req->hash_node, &hb->list);
	spin_unlock(&hb->lock);
}
static void io_poll_req_delete(struct io_kiocb *req, struct io_ring_ctx *ctx)
{
	struct io_hash_table *table = &req->ctx->cancel_table;
	u32 index = hash_long(req->cqe.user_data, table->hash_bits);
	spinlock_t *lock = &table->hbs[index].lock;

	spin_lock(lock);
	hash_del(&req->hash_node);
	spin_unlock(lock);
}
static void io_poll_req_insert_locked(struct io_kiocb *req)
{
	struct io_hash_table *table = &req->ctx->cancel_table_locked;
	u32 index = hash_long(req->cqe.user_data, table->hash_bits);

	lockdep_assert_held(&req->ctx->uring_lock);

	hlist_add_head(&req->hash_node, &table->hbs[index].list);
}
static void io_poll_tw_hash_eject(struct io_kiocb *req, struct io_tw_state *ts)
{
	struct io_ring_ctx *ctx = req->ctx;

	if (req->flags & REQ_F_HASH_LOCKED) {
		/*
		 * ->cancel_table_locked is protected by ->uring_lock in
		 * contrast to per bucket spinlocks. Likely, tctx_task_work()
		 * already grabbed the mutex for us, but there is a chance it
		 * didn't.
		 */
		io_tw_lock(ctx, ts);
		hash_del(&req->hash_node);
		req->flags &= ~REQ_F_HASH_LOCKED;
	} else {
		io_poll_req_delete(req, ctx);
	}
}
static void io_init_poll_iocb(struct io_poll *poll, __poll_t events)
{
	poll->head = NULL;
#define IO_POLL_UNMASK	(EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP)
	/* mask in events that we always want/need */
	poll->events = events | IO_POLL_UNMASK;
	INIT_LIST_HEAD(&poll->wait.entry);
	init_waitqueue_func_entry(&poll->wait, io_poll_wake);
}
static inline void io_poll_remove_entry(struct io_poll *poll)
{
	struct wait_queue_head *head = smp_load_acquire(&poll->head);

	if (head) {
		spin_lock_irq(&head->lock);
		list_del_init(&poll->wait.entry);
		poll->head = NULL;
		spin_unlock_irq(&head->lock);
	}
}
static void io_poll_remove_entries(struct io_kiocb *req)
{
	/*
	 * Nothing to do if neither of those flags are set. Avoid dipping
	 * into the poll/apoll/double cachelines if we can.
	 */
	if (!(req->flags & (REQ_F_SINGLE_POLL | REQ_F_DOUBLE_POLL)))
		return;

	/*
	 * While we hold the waitqueue lock and the waitqueue is nonempty,
	 * wake_up_pollfree() will wait for us. However, taking the waitqueue
	 * lock in the first place can race with the waitqueue being freed.
	 *
	 * We solve this as eventpoll does: by taking advantage of the fact that
	 * all users of wake_up_pollfree() will RCU-delay the actual free. If
	 * we enter rcu_read_lock() and see that the pointer to the queue is
	 * non-NULL, we can then lock it without the memory being freed out from
	 * under us.
	 *
	 * Keep holding rcu_read_lock() as long as we hold the queue lock, in
	 * case the caller deletes the entry from the queue, leaving it empty.
	 * In that case, only RCU prevents the queue memory from being freed.
	 */
	rcu_read_lock();
	if (req->flags & REQ_F_SINGLE_POLL)
		io_poll_remove_entry(io_poll_get_single(req));
	if (req->flags & REQ_F_DOUBLE_POLL)
		io_poll_remove_entry(io_poll_get_double(req));
	rcu_read_unlock();
}
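/*
 * Outcomes of io_poll_check_events(), consumed by io_poll_task_func(): the
 * request is either finished (DONE), needs nothing further (NO_ACTION), must
 * be completed with the result already stored in req->cqe
 * (REMOVE_POLL_USE_RES), should be re-issued (REISSUE), or its task_work
 * should be queued up again (REQUEUE).
 */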
enum {
	IOU_POLL_DONE = 0,
	IOU_POLL_NO_ACTION = 1,
	IOU_POLL_REMOVE_POLL_USE_RES = 2,
	IOU_POLL_REISSUE = 3,
	IOU_POLL_REQUEUE = 4,
};
static void __io_poll_execute(struct io_kiocb *req, int mask)
{
	unsigned flags = 0;

	io_req_set_res(req, mask, 0);
	req->io_task_work.func = io_poll_task_func;

	trace_io_uring_task_add(req, mask);

	if (!(req->flags & REQ_F_POLL_NO_LAZY))
		flags = IOU_F_TWQ_LAZY_WAKE;
	__io_req_task_work_add(req, flags);
}
static inline void io_poll_execute(struct io_kiocb *req, int res)
{
	if (io_poll_get_ownership(req))
		__io_poll_execute(req, res);
}
/*
 * All poll tw should go through this. Checks for poll events, manages
 * references, does rewait, etc.
 *
 * Returns a negative error on failure. IOU_POLL_NO_ACTION when no action
 * is required, which means either a spurious wakeup or a multishot CQE was
 * served. IOU_POLL_DONE when it's done with the request, then the mask is
 * stored in req->cqe.res. IOU_POLL_REMOVE_POLL_USE_RES indicates to remove
 * multishot poll and that the result is stored in req->cqe.
 */
static int io_poll_check_events(struct io_kiocb *req, struct io_tw_state *ts)
{
	int v;

	/* req->task == current here, checking PF_EXITING is safe */
	if (unlikely(req->task->flags & PF_EXITING))
		return -ECANCELED;

	do {
		v = atomic_read(&req->poll_refs);

		if (unlikely(v != 1)) {
			/* tw should be the owner and so have some refs */
			if (WARN_ON_ONCE(!(v & IO_POLL_REF_MASK)))
				return IOU_POLL_NO_ACTION;
			if (v & IO_POLL_CANCEL_FLAG)
				return -ECANCELED;
			/*
			 * cqe.res contains only events of the first wake up
			 * and all others are to be lost. Redo vfs_poll() to get
			 * up to date state.
			 */
			if ((v & IO_POLL_REF_MASK) != 1)
				req->cqe.res = 0;
			if (v & IO_POLL_RETRY_FLAG) {
				req->cqe.res = 0;
				/*
				 * We won't find new events that came in between
				 * vfs_poll and the ref put unless we clear the
				 * flag in advance.
				 */
				atomic_andnot(IO_POLL_RETRY_FLAG, &req->poll_refs);
				v &= ~IO_POLL_RETRY_FLAG;
			}
		}

		/* the mask was stashed in __io_poll_execute */
		if (!req->cqe.res) {
			struct poll_table_struct pt = { ._key = req->apoll_events };
			req->cqe.res = vfs_poll(req->file, &pt) & req->apoll_events;
			/*
			 * We got woken with a mask, but someone else got to
			 * it first. The above vfs_poll() doesn't add us back
			 * to the waitqueue, so if we get nothing back, we
			 * should be safe and attempt a reissue.
			 */
			if (unlikely(!req->cqe.res)) {
				/* Multishot armed need not reissue */
				if (!(req->apoll_events & EPOLLONESHOT))
					continue;
				return IOU_POLL_REISSUE;
			}
		}
		if (req->apoll_events & EPOLLONESHOT)
			return IOU_POLL_DONE;

		/* multishot, just fill a CQE and proceed */
		if (!(req->flags & REQ_F_APOLL_MULTISHOT)) {
			__poll_t mask = mangle_poll(req->cqe.res &
						    req->apoll_events);

			if (!io_fill_cqe_req_aux(req, ts->locked, mask,
						 IORING_CQE_F_MORE)) {
				io_req_set_res(req, mask, 0);
				return IOU_POLL_REMOVE_POLL_USE_RES;
			}
		} else {
			int ret = io_poll_issue(req, ts);
			if (ret == IOU_STOP_MULTISHOT)
				return IOU_POLL_REMOVE_POLL_USE_RES;
			else if (ret == IOU_REQUEUE)
				return IOU_POLL_REQUEUE;
			if (ret < 0)
				return ret;
		}

		/* force the next iteration to vfs_poll() */
		req->cqe.res = 0;

		/*
		 * Release all references, retry if someone tried to restart
		 * task_work while we were executing it.
		 */
	} while (atomic_sub_return(v & IO_POLL_REF_MASK, &req->poll_refs) &
					IO_POLL_REF_MASK);

	return IOU_POLL_NO_ACTION;
}
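/*
 * Common task_work completion path for both IORING_OP_POLL_ADD and apoll
 * driven retries: evaluate events, drop the hash entries, and then complete,
 * resubmit or fail the request depending on what io_poll_check_events()
 * returned.
 */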
void io_poll_task_func(struct io_kiocb *req, struct io_tw_state *ts)
{
	int ret;

	ret = io_poll_check_events(req, ts);
	if (ret == IOU_POLL_NO_ACTION) {
		return;
	} else if (ret == IOU_POLL_REQUEUE) {
		__io_poll_execute(req, 0);
		return;
	}
	io_poll_remove_entries(req);
	io_poll_tw_hash_eject(req, ts);

	if (req->opcode == IORING_OP_POLL_ADD) {
		if (ret == IOU_POLL_DONE) {
			struct io_poll *poll;

			poll = io_kiocb_to_cmd(req, struct io_poll);
			req->cqe.res = mangle_poll(req->cqe.res & poll->events);
		} else if (ret == IOU_POLL_REISSUE) {
			io_req_task_submit(req, ts);
			return;
		} else if (ret != IOU_POLL_REMOVE_POLL_USE_RES) {
			req->cqe.res = ret;
			req_set_fail(req);
		}

		io_req_set_res(req, req->cqe.res, 0);
		io_req_task_complete(req, ts);
	} else {
		io_tw_lock(req->ctx, ts);

		if (ret == IOU_POLL_REMOVE_POLL_USE_RES)
			io_req_task_complete(req, ts);
		else if (ret == IOU_POLL_DONE || ret == IOU_POLL_REISSUE)
			io_req_task_submit(req, ts);
		else
			io_req_defer_failed(req, ret);
	}
}
static void io_poll_cancel_req(struct io_kiocb *req)
{
	io_poll_mark_cancelled(req);
	/* kick tw, which should complete the request */
	io_poll_execute(req, 0);
}
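/*
 * Behaviour/priority bits that may be present in poll->events but do not
 * count as requested events when io_poll_wake() checks a wakeup for a match.
 */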
#define IO_ASYNC_POLL_COMMON	(EPOLLONESHOT | EPOLLPRI)
static __cold int io_pollfree_wake(struct io_kiocb *req, struct io_poll *poll)
{
	io_poll_mark_cancelled(req);
	/* we have to kick tw in case it's not already */
	io_poll_execute(req, 0);

	/*
	 * If the waitqueue is being freed early but someone already holds
	 * ownership over it, we have to tear down the request as best we can.
	 * That means immediately removing the request from its waitqueue and
	 * preventing all further accesses to the waitqueue via the request.
	 */
	list_del_init(&poll->wait.entry);

	/*
	 * Careful: this *must* be the last step, since as soon
	 * as req->head is NULL'ed out, the request can be
	 * completed and freed, since aio_poll_complete_work()
	 * will no longer need to take the waitqueue lock.
	 */
	smp_store_release(&poll->head, NULL);
	return 1;
}
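/*
 * Waitqueue wakeup callback; it can be invoked from hard irq context with the
 * waitqueue lock held, so it only filters the event, tries to take ownership
 * of ->poll_refs and kicks task_work instead of completing the request here.
 */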
static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
			void *key)
{
	struct io_kiocb *req = wqe_to_req(wait);
	struct io_poll *poll = container_of(wait, struct io_poll, wait);
	__poll_t mask = key_to_poll(key);

	if (unlikely(mask & POLLFREE))
		return io_pollfree_wake(req, poll);

	/* for instances that support it check for an event match first */
	if (mask && !(mask & (poll->events & ~IO_ASYNC_POLL_COMMON)))
		return 0;

	if (io_poll_get_ownership(req)) {
		/*
		 * If we trigger a multishot poll off our own wakeup path,
		 * disable multishot as there is a circular dependency between
		 * CQ posting and triggering the event.
		 */
		if (mask & EPOLL_URING_WAKE)
			poll->events |= EPOLLONESHOT;

		/* optional, saves extra locking for removal in tw handler */
		if (mask && poll->events & EPOLLONESHOT) {
			list_del_init(&poll->wait.entry);
			poll->head = NULL;
			if (wqe_is_double(wait))
				req->flags &= ~REQ_F_DOUBLE_POLL;
			else
				req->flags &= ~REQ_F_SINGLE_POLL;
		}
		__io_poll_execute(req, mask);
	}
	return 1;
}
/* fails only when polling is already being completed by the first entry */
static bool io_poll_double_prepare(struct io_kiocb *req)
{
	struct wait_queue_head *head;
	struct io_poll *poll = io_poll_get_single(req);

	/* head is RCU protected, see io_poll_remove_entries() comments */
	rcu_read_lock();
	head = smp_load_acquire(&poll->head);
	/*
	 * poll arm might not hold ownership and so race for req->flags with
	 * io_poll_wake(). There is only one poll entry queued, serialise with
	 * it by taking its head lock. As we're still arming, the tw handler
	 * is not going to be run, so there are no races with it.
	 */
	if (head) {
		spin_lock_irq(&head->lock);
		req->flags |= REQ_F_DOUBLE_POLL;
		if (req->opcode == IORING_OP_POLL_ADD)
			req->flags |= REQ_F_ASYNC_DATA;
		spin_unlock_irq(&head->lock);
	}
	rcu_read_unlock();

	return !!head;
}
static void __io_queue_proc(struct io_poll *poll, struct io_poll_table *pt,
			    struct wait_queue_head *head,
			    struct io_poll **poll_ptr)
{
	struct io_kiocb *req = pt->req;
	unsigned long wqe_private = (unsigned long) req;

	/*
	 * The file being polled uses multiple waitqueues for poll handling
	 * (e.g. one for read, one for write). Set up a separate io_poll
	 * if this happens.
	 */
	if (unlikely(pt->nr_entries)) {
		struct io_poll *first = poll;

		/* double add on the same waitqueue head, ignore */
		if (first->head == head)
			return;
		/* already have a 2nd entry, fail a third attempt */
		if (*poll_ptr) {
			if ((*poll_ptr)->head == head)
				return;
			pt->error = -EINVAL;
			return;
		}

		poll = kmalloc(sizeof(*poll), GFP_ATOMIC);
		if (!poll) {
			pt->error = -ENOMEM;
			return;
		}

		/* mark as double wq entry */
		wqe_private |= IO_WQE_F_DOUBLE;
		io_init_poll_iocb(poll, first->events);
		if (!io_poll_double_prepare(req)) {
			/* the request is completing, just back off */
			kfree(poll);
			return;
		}
		*poll_ptr = poll;
	} else {
		/* fine to modify, there is no poll queued to race with us */
		req->flags |= REQ_F_SINGLE_POLL;
	}

	pt->nr_entries++;
	poll->head = head;
	poll->wait.private = (void *) wqe_private;

	if (poll->events & EPOLLEXCLUSIVE) {
		/*
		 * Exclusive waits may only wake a limited amount of entries
		 * rather than all of them, this may interfere with lazy
		 * wake if someone does wait(events > 1). Ensure we don't do
		 * lazy wake for those, as we need to process each one as they
		 * come in.
		 */
		req->flags |= REQ_F_POLL_NO_LAZY;
		add_wait_queue_exclusive(head, &poll->wait);
	} else {
		add_wait_queue(head, &poll->wait);
	}
}
static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
			       struct poll_table_struct *p)
{
	struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
	struct io_poll *poll = io_kiocb_to_cmd(pt->req, struct io_poll);

	__io_queue_proc(poll, pt, head,
			(struct io_poll **) &pt->req->async_data);
}
static bool io_poll_can_finish_inline(struct io_kiocb *req,
				      struct io_poll_table *pt)
{
	return pt->owning || io_poll_get_ownership(req);
}
static void io_poll_add_hash(struct io_kiocb *req)
{
	if (req->flags & REQ_F_HASH_LOCKED)
		io_poll_req_insert_locked(req);
	else
		io_poll_req_insert(req);
}
/*
 * Returns 0 when it's handed over for polling. The caller owns the request if
 * it returns non-zero, but otherwise should not touch it. Negative values
 * contain an error code. When the result is >0, the polling has completed
 * inline and ipt.result_mask is set to the mask.
 */
static int __io_arm_poll_handler(struct io_kiocb *req,
				 struct io_poll *poll,
				 struct io_poll_table *ipt, __poll_t mask,
				 unsigned issue_flags)
{
	INIT_HLIST_NODE(&req->hash_node);
	io_init_poll_iocb(poll, mask);
	poll->file = req->file;
	req->apoll_events = poll->events;

	ipt->pt._key = mask;
	ipt->req = req;
	ipt->error = 0;
	ipt->nr_entries = 0;
	/*
	 * Polling is either completed here or via task_work, so if we're in the
	 * task context we're naturally serialised with tw by merit of running
	 * the same task. When it's io-wq, take the ownership to prevent tw
	 * from running. However, when we're in the task context, skip taking
	 * it as an optimisation.
	 *
	 * Note: even though the request won't be completed/freed, without
	 * ownership we still can race with io_poll_wake().
	 * io_poll_can_finish_inline() tries to deal with that.
	 */
	ipt->owning = issue_flags & IO_URING_F_UNLOCKED;
	atomic_set(&req->poll_refs, (int)ipt->owning);

	/* io-wq doesn't hold uring_lock */
	if (issue_flags & IO_URING_F_UNLOCKED)
		req->flags &= ~REQ_F_HASH_LOCKED;

	mask = vfs_poll(req->file, &ipt->pt) & poll->events;

	if (unlikely(ipt->error || !ipt->nr_entries)) {
		io_poll_remove_entries(req);

		if (!io_poll_can_finish_inline(req, ipt)) {
			io_poll_mark_cancelled(req);
			return 0;
		} else if (mask && (poll->events & EPOLLET)) {
			ipt->result_mask = mask;
			return 1;
		}
		return ipt->error ?: -EINVAL;
	}

	if (mask &&
	   ((poll->events & (EPOLLET|EPOLLONESHOT)) == (EPOLLET|EPOLLONESHOT))) {
		if (!io_poll_can_finish_inline(req, ipt)) {
			io_poll_add_hash(req);
			return 0;
		}
		io_poll_remove_entries(req);
		ipt->result_mask = mask;
		/* no one else has access to the req, forget about the ref */
		return 1;
	}

	io_poll_add_hash(req);

	if (mask && (poll->events & EPOLLET) &&
	    io_poll_can_finish_inline(req, ipt)) {
		__io_poll_execute(req, mask);
		return 0;
	}

	if (ipt->owning) {
		/*
		 * Try to release ownership. If we see a change of state, e.g.
		 * poll was woken up, queue up a tw, it'll deal with it.
		 */
		if (atomic_cmpxchg(&req->poll_refs, 1, 0) != 1)
			__io_poll_execute(req, 0);
	}
	return 0;
}
static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
				struct poll_table_struct *p)
{
	struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
	struct async_poll *apoll = pt->req->apoll;

	__io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll);
}
/*
 * We can't reliably detect loops where a poll trigger is repeatedly followed
 * by the issue subsequently failing. But rather than fail these immediately,
 * allow a certain amount of retries before we give up. Given that this
 * condition should _rarely_ trigger even once, we should be fine with a
 * larger value.
 */
#define APOLL_MAX_RETRY		128
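/*
 * Pick an async_poll: reuse req->apoll if the request was polled before,
 * otherwise try the per-ctx cache (only when ->uring_lock is held), falling
 * back to a GFP_ATOMIC allocation. Every arming consumes one retry.
 */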
static struct async_poll *io_req_alloc_apoll(struct io_kiocb *req,
					     unsigned issue_flags)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_cache_entry *entry;
	struct async_poll *apoll;

	if (req->flags & REQ_F_POLLED) {
		apoll = req->apoll;
		kfree(apoll->double_poll);
	} else if (!(issue_flags & IO_URING_F_UNLOCKED)) {
		entry = io_alloc_cache_get(&ctx->apoll_cache);
		if (entry == NULL)
			goto alloc_apoll;
		apoll = container_of(entry, struct async_poll, cache);
		apoll->poll.retries = APOLL_MAX_RETRY;
	} else {
alloc_apoll:
		apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
		if (unlikely(!apoll))
			return NULL;
		apoll->poll.retries = APOLL_MAX_RETRY;
	}
	apoll->double_poll = NULL;
	req->apoll = apoll;
	if (unlikely(!--apoll->poll.retries))
		return NULL;
	return apoll;
}
int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags)
{
	const struct io_issue_def *def = &io_issue_defs[req->opcode];
	struct async_poll *apoll;
	struct io_poll_table ipt;
	__poll_t mask = POLLPRI | POLLERR | EPOLLET;
	int ret;

	/*
	 * apoll requests already grab the mutex to complete in the tw handler,
	 * so removal from the mutex-backed hash is free, use it by default.
	 */
	req->flags |= REQ_F_HASH_LOCKED;

	if (!def->pollin && !def->pollout)
		return IO_APOLL_ABORTED;
	if (!io_file_can_poll(req))
		return IO_APOLL_ABORTED;
	if (!(req->flags & REQ_F_APOLL_MULTISHOT))
		mask |= EPOLLONESHOT;

	if (def->pollin) {
		mask |= EPOLLIN | EPOLLRDNORM;

		/* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */
		if (req->flags & REQ_F_CLEAR_POLLIN)
			mask &= ~EPOLLIN;
	} else {
		mask |= EPOLLOUT | EPOLLWRNORM;
	}
	if (def->poll_exclusive)
		mask |= EPOLLEXCLUSIVE;

	apoll = io_req_alloc_apoll(req, issue_flags);
	if (!apoll)
		return IO_APOLL_ABORTED;
	req->flags &= ~(REQ_F_SINGLE_POLL | REQ_F_DOUBLE_POLL);
	req->flags |= REQ_F_POLLED;
	ipt.pt._qproc = io_async_queue_proc;

	io_kbuf_recycle(req, issue_flags);

	ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask, issue_flags);
	if (ret)
		return ret > 0 ? IO_APOLL_READY : IO_APOLL_ABORTED;
	trace_io_uring_poll_arm(req, mask, apoll->poll.events);
	return IO_APOLL_OK;
}
static __cold bool io_poll_remove_all_table(struct task_struct *tsk,
					    struct io_hash_table *table,
					    bool cancel_all)
{
	unsigned nr_buckets = 1U << table->hash_bits;
	struct hlist_node *tmp;
	struct io_kiocb *req;
	bool found = false;
	int i;

	for (i = 0; i < nr_buckets; i++) {
		struct io_hash_bucket *hb = &table->hbs[i];

		spin_lock(&hb->lock);
		hlist_for_each_entry_safe(req, tmp, &hb->list, hash_node) {
			if (io_match_task_safe(req, tsk, cancel_all)) {
				hlist_del_init(&req->hash_node);
				io_poll_cancel_req(req);
				found = true;
			}
		}
		spin_unlock(&hb->lock);
	}
	return found;
}
/*
 * Returns true if we found and killed one or more poll requests
 */
__cold bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk,
			       bool cancel_all)
	__must_hold(&ctx->uring_lock)
{
	bool ret;

	ret = io_poll_remove_all_table(tsk, &ctx->cancel_table, cancel_all);
	ret |= io_poll_remove_all_table(tsk, &ctx->cancel_table_locked, cancel_all);
	return ret;
}
static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only,
				     struct io_cancel_data *cd,
				     struct io_hash_table *table,
				     struct io_hash_bucket **out_bucket)
{
	struct io_kiocb *req;
	u32 index = hash_long(cd->data, table->hash_bits);
	struct io_hash_bucket *hb = &table->hbs[index];

	*out_bucket = NULL;

	spin_lock(&hb->lock);
	hlist_for_each_entry(req, &hb->list, hash_node) {
		if (cd->data != req->cqe.user_data)
			continue;
		if (poll_only && req->opcode != IORING_OP_POLL_ADD)
			continue;
		if (cd->flags & IORING_ASYNC_CANCEL_ALL) {
			if (io_cancel_match_sequence(req, cd->seq))
				continue;
		}
		*out_bucket = hb;
		return req;
	}
	spin_unlock(&hb->lock);
	return NULL;
}
static struct io_kiocb *io_poll_file_find(struct io_ring_ctx *ctx,
					  struct io_cancel_data *cd,
					  struct io_hash_table *table,
					  struct io_hash_bucket **out_bucket)
{
	unsigned nr_buckets = 1U << table->hash_bits;
	struct io_kiocb *req;
	int i;

	*out_bucket = NULL;

	for (i = 0; i < nr_buckets; i++) {
		struct io_hash_bucket *hb = &table->hbs[i];

		spin_lock(&hb->lock);
		hlist_for_each_entry(req, &hb->list, hash_node) {
			if (io_cancel_req_match(req, cd)) {
				*out_bucket = hb;
				return req;
			}
		}
		spin_unlock(&hb->lock);
	}
	return NULL;
}
static int io_poll_disarm(struct io_kiocb *req)
{
	if (!req)
		return -ENOENT;
	if (!io_poll_get_ownership(req))
		return -EALREADY;
	io_poll_remove_entries(req);
	hash_del(&req->hash_node);
	return 0;
}
static int __io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd,
			    struct io_hash_table *table)
{
	struct io_hash_bucket *bucket;
	struct io_kiocb *req;

	if (cd->flags & (IORING_ASYNC_CANCEL_FD | IORING_ASYNC_CANCEL_OP |
			 IORING_ASYNC_CANCEL_ANY))
		req = io_poll_file_find(ctx, cd, table, &bucket);
	else
		req = io_poll_find(ctx, false, cd, table, &bucket);

	if (req)
		io_poll_cancel_req(req);
	if (bucket)
		spin_unlock(&bucket->lock);
	return req ? 0 : -ENOENT;
}
int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd,
		   unsigned issue_flags)
{
	int ret;

	ret = __io_poll_cancel(ctx, cd, &ctx->cancel_table);
	if (ret != -ENOENT)
		return ret;

	io_ring_submit_lock(ctx, issue_flags);
	ret = __io_poll_cancel(ctx, cd, &ctx->cancel_table_locked);
	io_ring_submit_unlock(ctx, issue_flags);
	return ret;
}
static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe,
				     unsigned int flags)
{
	u32 events;

	events = READ_ONCE(sqe->poll32_events);
#ifdef __BIG_ENDIAN
	events = swahw32(events);
#endif
	if (!(flags & IORING_POLL_ADD_MULTI))
		events |= EPOLLONESHOT;
	if (!(flags & IORING_POLL_ADD_LEVEL))
		events |= EPOLLET;
	return demangle_poll(events) |
		(events & (EPOLLEXCLUSIVE|EPOLLONESHOT|EPOLLET));
}
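/*
 * The prep helpers below accept epoll-style bits in sqe->poll32_events and
 * run them through io_poll_parse_events() above: demangle_poll() converts
 * them to kernel __poll_t while EPOLLEXCLUSIVE, EPOLLONESHOT and EPOLLET are
 * carried through as behaviour flags.
 */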
int io_poll_remove_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_poll_update *upd = io_kiocb_to_cmd(req, struct io_poll_update);
	u32 flags;

	if (sqe->buf_index || sqe->splice_fd_in)
		return -EINVAL;
	flags = READ_ONCE(sqe->len);
	if (flags & ~(IORING_POLL_UPDATE_EVENTS | IORING_POLL_UPDATE_USER_DATA |
		      IORING_POLL_ADD_MULTI))
		return -EINVAL;
	/* meaningless without update */
	if (flags == IORING_POLL_ADD_MULTI)
		return -EINVAL;

	upd->old_user_data = READ_ONCE(sqe->addr);
	upd->update_events = flags & IORING_POLL_UPDATE_EVENTS;
	upd->update_user_data = flags & IORING_POLL_UPDATE_USER_DATA;

	upd->new_user_data = READ_ONCE(sqe->off);
	if (!upd->update_user_data && upd->new_user_data)
		return -EINVAL;
	if (upd->update_events)
		upd->events = io_poll_parse_events(sqe, flags);
	else if (sqe->poll32_events)
		return -EINVAL;

	return 0;
}
int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_poll *poll = io_kiocb_to_cmd(req, struct io_poll);
	u32 flags;

	if (sqe->buf_index || sqe->off || sqe->addr)
		return -EINVAL;
	flags = READ_ONCE(sqe->len);
	if (flags & ~IORING_POLL_ADD_MULTI)
		return -EINVAL;
	if ((flags & IORING_POLL_ADD_MULTI) && (req->flags & REQ_F_CQE_SKIP))
		return -EINVAL;

	poll->events = io_poll_parse_events(sqe, flags);
	return 0;
}
int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_poll *poll = io_kiocb_to_cmd(req, struct io_poll);
	struct io_poll_table ipt;
	int ret;

	ipt.pt._qproc = io_poll_queue_proc;

	/*
	 * If sqpoll or single issuer, there is no contention for ->uring_lock
	 * and we'll end up holding it in tw handlers anyway.
	 */
	if (req->ctx->flags & (IORING_SETUP_SQPOLL|IORING_SETUP_SINGLE_ISSUER))
		req->flags |= REQ_F_HASH_LOCKED;

	ret = __io_arm_poll_handler(req, poll, &ipt, poll->events, issue_flags);
	if (ret > 0) {
		io_req_set_res(req, ipt.result_mask, 0);
		return IOU_OK;
	}
	return ret ?: IOU_ISSUE_SKIP_COMPLETE;
}
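/*
 * IORING_OP_POLL_REMOVE: look the original poll request up by its user_data,
 * disarm it, and then either cancel it or re-arm it with updated events
 * and/or user_data.
 */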
int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_poll_update *poll_update = io_kiocb_to_cmd(req, struct io_poll_update);
	struct io_ring_ctx *ctx = req->ctx;
	struct io_cancel_data cd = { .ctx = ctx, .data = poll_update->old_user_data, };
	struct io_hash_bucket *bucket;
	struct io_kiocb *preq;
	int ret2, ret = 0;
	struct io_tw_state ts = { .locked = true };

	io_ring_submit_lock(ctx, issue_flags);
	preq = io_poll_find(ctx, true, &cd, &ctx->cancel_table, &bucket);
	ret2 = io_poll_disarm(preq);
	if (bucket)
		spin_unlock(&bucket->lock);
	if (!ret2)
		goto found;
	if (ret2 != -ENOENT) {
		ret = ret2;
		goto out;
	}

	preq = io_poll_find(ctx, true, &cd, &ctx->cancel_table_locked, &bucket);
	ret2 = io_poll_disarm(preq);
	if (bucket)
		spin_unlock(&bucket->lock);
	if (ret2) {
		ret = ret2;
		goto out;
	}

found:
	if (WARN_ON_ONCE(preq->opcode != IORING_OP_POLL_ADD)) {
		ret = -EFAULT;
		goto out;
	}

	if (poll_update->update_events || poll_update->update_user_data) {
		/* only mask one event flags, keep behavior flags */
		if (poll_update->update_events) {
			struct io_poll *poll = io_kiocb_to_cmd(preq, struct io_poll);

			poll->events &= ~0xffff;
			poll->events |= poll_update->events & 0xffff;
			poll->events |= IO_POLL_UNMASK;
		}
		if (poll_update->update_user_data)
			preq->cqe.user_data = poll_update->new_user_data;

		ret2 = io_poll_add(preq, issue_flags & ~IO_URING_F_UNLOCKED);
		/* successfully updated, don't complete poll request */
		if (!ret2 || ret2 == -EIOCBQUEUED)
			goto out;
	}

	req_set_fail(preq);
	io_req_set_res(preq, -ECANCELED, 0);
	io_req_task_complete(preq, &ts);
out:
	io_ring_submit_unlock(ctx, issue_flags);
	if (ret < 0) {
		req_set_fail(req);
		return ret;
	}
	/* complete update request, we're done with it */
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}
void io_apoll_cache_free(struct io_cache_entry *entry)
{
	kfree(container_of(entry, struct async_poll, cache));
}