]>
Commit | Line | Data |
---|---|---|
17437f31 JA |
1 | // SPDX-License-Identifier: GPL-2.0 |
2 | /* | |
3 | * Contains the core associated with submission side polling of the SQ | |
4 | * ring, offloading submissions from the application to a kernel thread. | |
5 | */ | |
6 | #include <linux/kernel.h> | |
7 | #include <linux/errno.h> | |
8 | #include <linux/file.h> | |
9 | #include <linux/mm.h> | |
10 | #include <linux/slab.h> | |
11 | #include <linux/audit.h> | |
12 | #include <linux/security.h> | |
13 | #include <linux/io_uring.h> | |
14 | ||
15 | #include <uapi/linux/io_uring.h> | |
16 | ||
17437f31 | 17 | #include "io_uring.h" |
ff183d42 | 18 | #include "napi.h" |
17437f31 JA |
19 | #include "sqpoll.h" |
20 | ||
/* Per-ring cap on SQEs submitted in one pass when several rings share a thread */
#define IORING_SQPOLL_CAP_ENTRIES_VALUE	8
/* Cap on task_work entries run per iteration of the SQPOLL thread loop */
#define IORING_TW_CAP_ENTRIES_VALUE	8

/* Bit numbers used with set_bit()/test_bit() on io_sq_data->state */
enum {
	IO_SQ_THREAD_SHOULD_STOP	= 0,
	IO_SQ_THREAD_SHOULD_PARK	= 1,
};
28 | ||
29 | void io_sq_thread_unpark(struct io_sq_data *sqd) | |
30 | __releases(&sqd->lock) | |
31 | { | |
32 | WARN_ON_ONCE(sqd->thread == current); | |
33 | ||
34 | /* | |
35 | * Do the dance but not conditional clear_bit() because it'd race with | |
36 | * other threads incrementing park_pending and setting the bit. | |
37 | */ | |
38 | clear_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); | |
39 | if (atomic_dec_return(&sqd->park_pending)) | |
40 | set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); | |
41 | mutex_unlock(&sqd->lock); | |
42 | } | |
43 | ||
44 | void io_sq_thread_park(struct io_sq_data *sqd) | |
45 | __acquires(&sqd->lock) | |
46 | { | |
47 | WARN_ON_ONCE(sqd->thread == current); | |
48 | ||
49 | atomic_inc(&sqd->park_pending); | |
50 | set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); | |
51 | mutex_lock(&sqd->lock); | |
52 | if (sqd->thread) | |
53 | wake_up_process(sqd->thread); | |
54 | } | |
55 | ||
56 | void io_sq_thread_stop(struct io_sq_data *sqd) | |
57 | { | |
58 | WARN_ON_ONCE(sqd->thread == current); | |
59 | WARN_ON_ONCE(test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state)); | |
60 | ||
61 | set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); | |
62 | mutex_lock(&sqd->lock); | |
63 | if (sqd->thread) | |
64 | wake_up_process(sqd->thread); | |
65 | mutex_unlock(&sqd->lock); | |
66 | wait_for_completion(&sqd->exited); | |
67 | } | |
68 | ||
69 | void io_put_sq_data(struct io_sq_data *sqd) | |
70 | { | |
71 | if (refcount_dec_and_test(&sqd->refs)) { | |
72 | WARN_ON_ONCE(atomic_read(&sqd->park_pending)); | |
73 | ||
74 | io_sq_thread_stop(sqd); | |
75 | kfree(sqd); | |
76 | } | |
77 | } | |
78 | ||
79 | static __cold void io_sqd_update_thread_idle(struct io_sq_data *sqd) | |
80 | { | |
81 | struct io_ring_ctx *ctx; | |
82 | unsigned sq_thread_idle = 0; | |
83 | ||
84 | list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) | |
85 | sq_thread_idle = max(sq_thread_idle, ctx->sq_thread_idle); | |
86 | sqd->sq_thread_idle = sq_thread_idle; | |
87 | } | |
88 | ||
89 | void io_sq_thread_finish(struct io_ring_ctx *ctx) | |
90 | { | |
91 | struct io_sq_data *sqd = ctx->sq_data; | |
92 | ||
93 | if (sqd) { | |
94 | io_sq_thread_park(sqd); | |
95 | list_del_init(&ctx->sqd_list); | |
96 | io_sqd_update_thread_idle(sqd); | |
97 | io_sq_thread_unpark(sqd); | |
98 | ||
99 | io_put_sq_data(sqd); | |
100 | ctx->sq_data = NULL; | |
101 | } | |
102 | } | |
103 | ||
104 | static struct io_sq_data *io_attach_sq_data(struct io_uring_params *p) | |
105 | { | |
106 | struct io_ring_ctx *ctx_attach; | |
107 | struct io_sq_data *sqd; | |
108 | struct fd f; | |
109 | ||
110 | f = fdget(p->wq_fd); | |
111 | if (!f.file) | |
112 | return ERR_PTR(-ENXIO); | |
113 | if (!io_is_uring_fops(f.file)) { | |
114 | fdput(f); | |
115 | return ERR_PTR(-EINVAL); | |
116 | } | |
117 | ||
118 | ctx_attach = f.file->private_data; | |
119 | sqd = ctx_attach->sq_data; | |
120 | if (!sqd) { | |
121 | fdput(f); | |
122 | return ERR_PTR(-EINVAL); | |
123 | } | |
124 | if (sqd->task_tgid != current->tgid) { | |
125 | fdput(f); | |
126 | return ERR_PTR(-EPERM); | |
127 | } | |
128 | ||
129 | refcount_inc(&sqd->refs); | |
130 | fdput(f); | |
131 | return sqd; | |
132 | } | |
133 | ||
134 | static struct io_sq_data *io_get_sq_data(struct io_uring_params *p, | |
135 | bool *attached) | |
136 | { | |
137 | struct io_sq_data *sqd; | |
138 | ||
139 | *attached = false; | |
140 | if (p->flags & IORING_SETUP_ATTACH_WQ) { | |
141 | sqd = io_attach_sq_data(p); | |
142 | if (!IS_ERR(sqd)) { | |
143 | *attached = true; | |
144 | return sqd; | |
145 | } | |
146 | /* fall through for EPERM case, setup new sqd/task */ | |
147 | if (PTR_ERR(sqd) != -EPERM) | |
148 | return sqd; | |
149 | } | |
150 | ||
151 | sqd = kzalloc(sizeof(*sqd), GFP_KERNEL); | |
152 | if (!sqd) | |
153 | return ERR_PTR(-ENOMEM); | |
154 | ||
155 | atomic_set(&sqd->park_pending, 0); | |
156 | refcount_set(&sqd->refs, 1); | |
157 | INIT_LIST_HEAD(&sqd->ctx_list); | |
158 | mutex_init(&sqd->lock); | |
159 | init_waitqueue_head(&sqd->wait); | |
160 | init_completion(&sqd->exited); | |
161 | return sqd; | |
162 | } | |
163 | ||
164 | static inline bool io_sqd_events_pending(struct io_sq_data *sqd) | |
165 | { | |
166 | return READ_ONCE(sqd->state); | |
167 | } | |
168 | ||
169 | static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries) | |
170 | { | |
171 | unsigned int to_submit; | |
172 | int ret = 0; | |
173 | ||
174 | to_submit = io_sqring_entries(ctx); | |
175 | /* if we're handling multiple rings, cap submit size for fairness */ | |
176 | if (cap_entries && to_submit > IORING_SQPOLL_CAP_ENTRIES_VALUE) | |
177 | to_submit = IORING_SQPOLL_CAP_ENTRIES_VALUE; | |
178 | ||
179 | if (!wq_list_empty(&ctx->iopoll_list) || to_submit) { | |
180 | const struct cred *creds = NULL; | |
181 | ||
182 | if (ctx->sq_creds != current_cred()) | |
183 | creds = override_creds(ctx->sq_creds); | |
184 | ||
185 | mutex_lock(&ctx->uring_lock); | |
186 | if (!wq_list_empty(&ctx->iopoll_list)) | |
187 | io_do_iopoll(ctx, true); | |
188 | ||
189 | /* | |
190 | * Don't submit if refs are dying, good for io_uring_register(), | |
191 | * but also it is relied upon by io_ring_exit_work() | |
192 | */ | |
193 | if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs)) && | |
194 | !(ctx->flags & IORING_SETUP_R_DISABLED)) | |
195 | ret = io_submit_sqes(ctx, to_submit); | |
196 | mutex_unlock(&ctx->uring_lock); | |
197 | ||
ff183d42 SR |
198 | if (io_napi(ctx)) |
199 | ret += io_napi_sqpoll_busy_poll(ctx); | |
200 | ||
17437f31 JA |
201 | if (to_submit && wq_has_sleeper(&ctx->sqo_sq_wait)) |
202 | wake_up(&ctx->sqo_sq_wait); | |
203 | if (creds) | |
204 | revert_creds(creds); | |
205 | } | |
206 | ||
207 | return ret; | |
208 | } | |
209 | ||
210 | static bool io_sqd_handle_event(struct io_sq_data *sqd) | |
211 | { | |
212 | bool did_sig = false; | |
213 | struct ksignal ksig; | |
214 | ||
215 | if (test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state) || | |
216 | signal_pending(current)) { | |
217 | mutex_unlock(&sqd->lock); | |
218 | if (signal_pending(current)) | |
219 | did_sig = get_signal(&ksig); | |
220 | cond_resched(); | |
221 | mutex_lock(&sqd->lock); | |
a0d45c3f | 222 | sqd->sq_cpu = raw_smp_processor_id(); |
17437f31 JA |
223 | } |
224 | return did_sig || test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); | |
225 | } | |
226 | ||
af5d68f8 JA |
227 | /* |
228 | * Run task_work, processing the retry_list first. The retry_list holds | |
229 | * entries that we passed on in the previous run, if we had more task_work | |
230 | * than we were asked to process. Newly queued task_work isn't run until the | |
231 | * retry list has been fully processed. | |
232 | */ | |
233 | static unsigned int io_sq_tw(struct llist_node **retry_list, int max_entries) | |
234 | { | |
235 | struct io_uring_task *tctx = current->io_uring; | |
236 | unsigned int count = 0; | |
237 | ||
238 | if (*retry_list) { | |
239 | *retry_list = io_handle_tw_list(*retry_list, &count, max_entries); | |
240 | if (count >= max_entries) | |
241 | return count; | |
242 | max_entries -= count; | |
243 | } | |
244 | ||
245 | *retry_list = tctx_task_work_run(tctx, max_entries, &count); | |
246 | return count; | |
247 | } | |
248 | ||
c8d8fc3b JA |
249 | static bool io_sq_tw_pending(struct llist_node *retry_list) |
250 | { | |
251 | struct io_uring_task *tctx = current->io_uring; | |
252 | ||
253 | return retry_list || !llist_empty(&tctx->task_list); | |
254 | } | |
255 | ||
3fcb9d17 XL |
256 | static void io_sq_update_worktime(struct io_sq_data *sqd, struct rusage *start) |
257 | { | |
258 | struct rusage end; | |
259 | ||
260 | getrusage(current, RUSAGE_SELF, &end); | |
261 | end.ru_stime.tv_sec -= start->ru_stime.tv_sec; | |
262 | end.ru_stime.tv_usec -= start->ru_stime.tv_usec; | |
263 | ||
264 | sqd->work_time += end.ru_stime.tv_usec + end.ru_stime.tv_sec * 1000000; | |
265 | } | |
266 | ||
17437f31 JA |
267 | static int io_sq_thread(void *data) |
268 | { | |
af5d68f8 | 269 | struct llist_node *retry_list = NULL; |
17437f31 JA |
270 | struct io_sq_data *sqd = data; |
271 | struct io_ring_ctx *ctx; | |
3fcb9d17 | 272 | struct rusage start; |
17437f31 JA |
273 | unsigned long timeout = 0; |
274 | char buf[TASK_COMM_LEN]; | |
275 | DEFINE_WAIT(wait); | |
276 | ||
1251d202 JA |
277 | /* offload context creation failed, just exit */ |
278 | if (!current->io_uring) | |
279 | goto err_out; | |
280 | ||
17437f31 JA |
281 | snprintf(buf, sizeof(buf), "iou-sqp-%d", sqd->task_pid); |
282 | set_task_comm(current, buf); | |
283 | ||
a0d45c3f JA |
284 | /* reset to our pid after we've set task_comm, for fdinfo */ |
285 | sqd->task_pid = current->pid; | |
286 | ||
287 | if (sqd->sq_cpu != -1) { | |
17437f31 | 288 | set_cpus_allowed_ptr(current, cpumask_of(sqd->sq_cpu)); |
a0d45c3f | 289 | } else { |
17437f31 | 290 | set_cpus_allowed_ptr(current, cpu_online_mask); |
a0d45c3f JA |
291 | sqd->sq_cpu = raw_smp_processor_id(); |
292 | } | |
17437f31 | 293 | |
17437f31 JA |
294 | mutex_lock(&sqd->lock); |
295 | while (1) { | |
296 | bool cap_entries, sqt_spin = false; | |
297 | ||
298 | if (io_sqd_events_pending(sqd) || signal_pending(current)) { | |
299 | if (io_sqd_handle_event(sqd)) | |
300 | break; | |
301 | timeout = jiffies + sqd->sq_thread_idle; | |
302 | } | |
303 | ||
304 | cap_entries = !list_is_singular(&sqd->ctx_list); | |
3fcb9d17 | 305 | getrusage(current, RUSAGE_SELF, &start); |
17437f31 JA |
306 | list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { |
307 | int ret = __io_sq_thread(ctx, cap_entries); | |
308 | ||
309 | if (!sqt_spin && (ret > 0 || !wq_list_empty(&ctx->iopoll_list))) | |
310 | sqt_spin = true; | |
311 | } | |
af5d68f8 | 312 | if (io_sq_tw(&retry_list, IORING_TW_CAP_ENTRIES_VALUE)) |
17437f31 JA |
313 | sqt_spin = true; |
314 | ||
315 | if (sqt_spin || !time_after(jiffies, timeout)) { | |
3fcb9d17 XL |
316 | if (sqt_spin) { |
317 | io_sq_update_worktime(sqd, &start); | |
17437f31 | 318 | timeout = jiffies + sqd->sq_thread_idle; |
3fcb9d17 | 319 | } |
533ab73f WC |
320 | if (unlikely(need_resched())) { |
321 | mutex_unlock(&sqd->lock); | |
322 | cond_resched(); | |
323 | mutex_lock(&sqd->lock); | |
a0d45c3f | 324 | sqd->sq_cpu = raw_smp_processor_id(); |
533ab73f | 325 | } |
17437f31 JA |
326 | continue; |
327 | } | |
328 | ||
329 | prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE); | |
c8d8fc3b | 330 | if (!io_sqd_events_pending(sqd) && !io_sq_tw_pending(retry_list)) { |
17437f31 JA |
331 | bool needs_sched = true; |
332 | ||
333 | list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { | |
334 | atomic_or(IORING_SQ_NEED_WAKEUP, | |
335 | &ctx->rings->sq_flags); | |
336 | if ((ctx->flags & IORING_SETUP_IOPOLL) && | |
337 | !wq_list_empty(&ctx->iopoll_list)) { | |
338 | needs_sched = false; | |
339 | break; | |
340 | } | |
341 | ||
342 | /* | |
343 | * Ensure the store of the wakeup flag is not | |
344 | * reordered with the load of the SQ tail | |
345 | */ | |
346 | smp_mb__after_atomic(); | |
347 | ||
348 | if (io_sqring_entries(ctx)) { | |
349 | needs_sched = false; | |
350 | break; | |
351 | } | |
352 | } | |
353 | ||
354 | if (needs_sched) { | |
355 | mutex_unlock(&sqd->lock); | |
356 | schedule(); | |
357 | mutex_lock(&sqd->lock); | |
a0d45c3f | 358 | sqd->sq_cpu = raw_smp_processor_id(); |
17437f31 JA |
359 | } |
360 | list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) | |
361 | atomic_andnot(IORING_SQ_NEED_WAKEUP, | |
362 | &ctx->rings->sq_flags); | |
363 | } | |
364 | ||
365 | finish_wait(&sqd->wait, &wait); | |
366 | timeout = jiffies + sqd->sq_thread_idle; | |
367 | } | |
368 | ||
af5d68f8 JA |
369 | if (retry_list) |
370 | io_sq_tw(&retry_list, UINT_MAX); | |
371 | ||
17437f31 JA |
372 | io_uring_cancel_generic(true, sqd); |
373 | sqd->thread = NULL; | |
374 | list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) | |
375 | atomic_or(IORING_SQ_NEED_WAKEUP, &ctx->rings->sq_flags); | |
376 | io_run_task_work(); | |
377 | mutex_unlock(&sqd->lock); | |
1251d202 | 378 | err_out: |
17437f31 JA |
379 | complete(&sqd->exited); |
380 | do_exit(0); | |
381 | } | |
382 | ||
88b80534 | 383 | void io_sqpoll_wait_sq(struct io_ring_ctx *ctx) |
17437f31 JA |
384 | { |
385 | DEFINE_WAIT(wait); | |
386 | ||
387 | do { | |
388 | if (!io_sqring_full(ctx)) | |
389 | break; | |
390 | prepare_to_wait(&ctx->sqo_sq_wait, &wait, TASK_INTERRUPTIBLE); | |
391 | ||
392 | if (!io_sqring_full(ctx)) | |
393 | break; | |
394 | schedule(); | |
395 | } while (!signal_pending(current)); | |
396 | ||
397 | finish_wait(&ctx->sqo_sq_wait, &wait); | |
17437f31 JA |
398 | } |
399 | ||
400 | __cold int io_sq_offload_create(struct io_ring_ctx *ctx, | |
401 | struct io_uring_params *p) | |
402 | { | |
403 | int ret; | |
404 | ||
405 | /* Retain compatibility with failing for an invalid attach attempt */ | |
406 | if ((ctx->flags & (IORING_SETUP_ATTACH_WQ | IORING_SETUP_SQPOLL)) == | |
407 | IORING_SETUP_ATTACH_WQ) { | |
408 | struct fd f; | |
409 | ||
410 | f = fdget(p->wq_fd); | |
411 | if (!f.file) | |
412 | return -ENXIO; | |
413 | if (!io_is_uring_fops(f.file)) { | |
414 | fdput(f); | |
415 | return -EINVAL; | |
416 | } | |
417 | fdput(f); | |
418 | } | |
419 | if (ctx->flags & IORING_SETUP_SQPOLL) { | |
420 | struct task_struct *tsk; | |
421 | struct io_sq_data *sqd; | |
422 | bool attached; | |
423 | ||
424 | ret = security_uring_sqpoll(); | |
425 | if (ret) | |
426 | return ret; | |
427 | ||
428 | sqd = io_get_sq_data(p, &attached); | |
429 | if (IS_ERR(sqd)) { | |
430 | ret = PTR_ERR(sqd); | |
431 | goto err; | |
432 | } | |
433 | ||
434 | ctx->sq_creds = get_current_cred(); | |
435 | ctx->sq_data = sqd; | |
436 | ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle); | |
437 | if (!ctx->sq_thread_idle) | |
438 | ctx->sq_thread_idle = HZ; | |
439 | ||
440 | io_sq_thread_park(sqd); | |
441 | list_add(&ctx->sqd_list, &sqd->ctx_list); | |
442 | io_sqd_update_thread_idle(sqd); | |
443 | /* don't attach to a dying SQPOLL thread, would be racy */ | |
444 | ret = (attached && !sqd->thread) ? -ENXIO : 0; | |
445 | io_sq_thread_unpark(sqd); | |
446 | ||
447 | if (ret < 0) | |
448 | goto err; | |
449 | if (attached) | |
450 | return 0; | |
451 | ||
452 | if (p->flags & IORING_SETUP_SQ_AFF) { | |
453 | int cpu = p->sq_thread_cpu; | |
454 | ||
455 | ret = -EINVAL; | |
456 | if (cpu >= nr_cpu_ids || !cpu_online(cpu)) | |
457 | goto err_sqpoll; | |
458 | sqd->sq_cpu = cpu; | |
459 | } else { | |
460 | sqd->sq_cpu = -1; | |
461 | } | |
462 | ||
463 | sqd->task_pid = current->pid; | |
464 | sqd->task_tgid = current->tgid; | |
465 | tsk = create_io_thread(io_sq_thread, sqd, NUMA_NO_NODE); | |
466 | if (IS_ERR(tsk)) { | |
467 | ret = PTR_ERR(tsk); | |
468 | goto err_sqpoll; | |
469 | } | |
470 | ||
471 | sqd->thread = tsk; | |
472 | ret = io_uring_alloc_task_context(tsk, ctx); | |
473 | wake_up_new_task(tsk); | |
474 | if (ret) | |
475 | goto err; | |
476 | } else if (p->flags & IORING_SETUP_SQ_AFF) { | |
477 | /* Can't have SQ_AFF without SQPOLL */ | |
478 | ret = -EINVAL; | |
479 | goto err; | |
480 | } | |
481 | ||
482 | return 0; | |
483 | err_sqpoll: | |
484 | complete(&ctx->sq_data->exited); | |
485 | err: | |
486 | io_sq_thread_finish(ctx); | |
487 | return ret; | |
488 | } | |
ebdfefc0 JA |
489 | |
490 | __cold int io_sqpoll_wq_cpu_affinity(struct io_ring_ctx *ctx, | |
491 | cpumask_var_t mask) | |
492 | { | |
493 | struct io_sq_data *sqd = ctx->sq_data; | |
494 | int ret = -EINVAL; | |
495 | ||
496 | if (sqd) { | |
497 | io_sq_thread_park(sqd); | |
bd6fc5da GKB |
498 | /* Don't set affinity for a dying thread */ |
499 | if (sqd->thread) | |
500 | ret = io_wq_cpu_affinity(sqd->thread->io_uring, mask); | |
ebdfefc0 JA |
501 | io_sq_thread_unpark(sqd); |
502 | } | |
503 | ||
504 | return ret; | |
505 | } |