1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Shared application/kernel submission and completion ring pairs, for
4 * supporting fast/efficient IO.
5 *
6 * A note on the read/write ordering memory barriers that are matched between
7 * the application and kernel side.
8 *
9 * After the application reads the CQ ring tail, it must use an
10 * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
11 * before writing the tail (using smp_load_acquire to read the tail will
12 * do). It also needs a smp_mb() before updating CQ head (ordering the
13 * entry load(s) with the head store), pairing with an implicit barrier
14 * through a control-dependency in io_get_cqring (smp_store_release to
15 * store head will do). Failure to do so could lead to reading invalid
16 * CQ entries.
17 *
18 * Likewise, the application must use an appropriate smp_wmb() before
19 * writing the SQ tail (ordering SQ entry stores with the tail store),
20 * which pairs with smp_load_acquire in io_get_sqring (smp_store_release
21 * to store the tail will do). And it needs a barrier ordering the SQ
22 * head load before writing new SQ entries (smp_load_acquire to read
23 * head will do).
24 *
25 * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
26 * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
27 * updating the SQ tail; a full memory barrier smp_mb() is needed
28 * between.
29 *
30 * Also see the examples in the liburing library:
31 *
32 * git://git.kernel.dk/liburing
33 *
34 * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
35  * from data shared between the kernel and application. This is done both
36  * for ordering purposes and to ensure that once a value is loaded from
37 * data that the application could potentially modify, it remains stable.
38 *
39 * Copyright (C) 2018-2019 Jens Axboe
40 * Copyright (c) 2018-2019 Christoph Hellwig
41 */
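/*
 * Illustrative sketch only (not part of this file): one way the application
 * side might honor the SQ ordering rules above. The names sq_tail, sq_array,
 * sq_mask, sq_flags and setup_flags are assumed userspace handles into the
 * mmap'ed SQ ring, not kernel identifiers; liburing wraps all of this for
 * real applications.
 *
 *	unsigned tail = *sq_tail;
 *	sq_array[tail & *sq_mask] = sqe_index;		// publish the SQE index
 *	smp_store_release(sq_tail, tail + 1);		// pairs with the kernel's
 *							// smp_load_acquire() of sq.tail
 *	if (setup_flags & IORING_SETUP_SQPOLL) {
 *		smp_mb();				// order tail store vs flags load
 *		if (READ_ONCE(*sq_flags) & IORING_SQ_NEED_WAKEUP)
 *			io_uring_enter(ring_fd, 0, 0, IORING_ENTER_SQ_WAKEUP, NULL);
 *	}
 */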
42 #include <linux/kernel.h>
43 #include <linux/init.h>
44 #include <linux/errno.h>
45 #include <linux/syscalls.h>
46 #include <linux/compat.h>
47 #include <linux/refcount.h>
48 #include <linux/uio.h>
49
50 #include <linux/sched/signal.h>
51 #include <linux/fs.h>
52 #include <linux/file.h>
53 #include <linux/fdtable.h>
54 #include <linux/mm.h>
55 #include <linux/mman.h>
56 #include <linux/mmu_context.h>
57 #include <linux/percpu.h>
58 #include <linux/slab.h>
59 #include <linux/workqueue.h>
60 #include <linux/kthread.h>
61 #include <linux/blkdev.h>
62 #include <linux/bvec.h>
63 #include <linux/net.h>
64 #include <net/sock.h>
65 #include <net/af_unix.h>
66 #include <net/scm.h>
67 #include <linux/anon_inodes.h>
68 #include <linux/sched/mm.h>
69 #include <linux/uaccess.h>
70 #include <linux/nospec.h>
71 #include <linux/sizes.h>
72 #include <linux/hugetlb.h>
73 #include <linux/highmem.h>
74 #include <linux/fs_struct.h>
75
76 #include <uapi/linux/io_uring.h>
77
78 #include "internal.h"
79
80 #define IORING_MAX_ENTRIES 32768
81 #define IORING_MAX_FIXED_FILES 1024
82
83 struct io_uring {
84 u32 head ____cacheline_aligned_in_smp;
85 u32 tail ____cacheline_aligned_in_smp;
86 };
87
88 /*
89 * This data is shared with the application through the mmap at offsets
90 * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
91 *
92 * The offsets to the member fields are published through struct
93 * io_sqring_offsets when calling io_uring_setup.
94 */
95 struct io_rings {
96 /*
97 * Head and tail offsets into the ring; the offsets need to be
98 * masked to get valid indices.
99 *
100	 * The kernel controls the head of the sq ring and the tail of the cq ring,
101	 * and the application controls the tail of the sq ring and the head of the
102	 * cq ring.
103 */
104 struct io_uring sq, cq;
105 /*
106 * Bitmasks to apply to head and tail offsets (constant, equals
107 * ring_entries - 1)
108 */
109 u32 sq_ring_mask, cq_ring_mask;
110 /* Ring sizes (constant, power of 2) */
111 u32 sq_ring_entries, cq_ring_entries;
112 /*
113	 * Number of invalid entries dropped by the kernel due to an
114	 * invalid index stored in the array.
115 *
116 * Written by the kernel, shouldn't be modified by the
117 * application (i.e. get number of "new events" by comparing to
118 * cached value).
119 *
120	 * After a new SQ head value has been read by the application, this
121	 * counter includes all submissions that were dropped reaching
122	 * the new SQ head (and possibly more).
123 */
124 u32 sq_dropped;
125 /*
126 * Runtime flags
127 *
128 * Written by the kernel, shouldn't be modified by the
129 * application.
130 *
131 * The application needs a full memory barrier before checking
132 * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
133 */
134 u32 sq_flags;
135 /*
136 * Number of completion events lost because the queue was full;
137 * this should be avoided by the application by making sure
138	 * there are not more requests pending than there is space in
139 * the completion queue.
140 *
141 * Written by the kernel, shouldn't be modified by the
142 * application (i.e. get number of "new events" by comparing to
143 * cached value).
144 *
145 * As completion events come in out of order this counter is not
146 * ordered with any other data.
147 */
148 u32 cq_overflow;
149 /*
150 * Ring buffer of completion events.
151 *
152 * The kernel writes completion events fresh every time they are
153 * produced, so the application is allowed to modify pending
154 * entries.
155 */
156 struct io_uring_cqe cqes[] ____cacheline_aligned_in_smp;
157 };
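/*
 * Illustrative sketch only (assumed userspace pointers into the CQ ring mmap,
 * mirroring the fields above): head and tail are free-running u32 counters
 * and are only masked when indexing cqes[], so the ring never needs an
 * explicit wrap.
 *
 *	unsigned head = *cq_head;
 *	while (head != smp_load_acquire(cq_tail)) {	// pairs with the cq.tail store
 *		struct io_uring_cqe *cqe = &cqes[head & *cq_ring_mask];
 *		// ... consume cqe->user_data / cqe->res ...
 *		head++;
 *	}
 *	smp_store_release(cq_head, head);		// lets the kernel reuse the slots
 */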
158
159 struct io_mapped_ubuf {
160 u64 ubuf;
161 size_t len;
162 struct bio_vec *bvec;
163 unsigned int nr_bvecs;
164 };
165
166 struct async_list {
167 spinlock_t lock;
168 atomic_t cnt;
169 struct list_head list;
170
171 struct file *file;
172 off_t io_start;
173 size_t io_len;
174 };
175
176 struct io_ring_ctx {
177 struct {
178 struct percpu_ref refs;
179 } ____cacheline_aligned_in_smp;
180
181 struct {
182 unsigned int flags;
183 bool compat;
184 bool account_mem;
185
186 /*
187 * Ring buffer of indices into array of io_uring_sqe, which is
188 * mmapped by the application using the IORING_OFF_SQES offset.
189 *
190 * This indirection could e.g. be used to assign fixed
191 * io_uring_sqe entries to operations and only submit them to
192 * the queue when needed.
193 *
194 * The kernel modifies neither the indices array nor the entries
195 * array.
196 */
197 u32 *sq_array;
198 unsigned cached_sq_head;
199 unsigned sq_entries;
200 unsigned sq_mask;
201 unsigned sq_thread_idle;
202 unsigned cached_sq_dropped;
203 struct io_uring_sqe *sq_sqes;
204
205 struct list_head defer_list;
206 struct list_head timeout_list;
207 } ____cacheline_aligned_in_smp;
208
209 /* IO offload */
210 struct workqueue_struct *sqo_wq[2];
211 struct task_struct *sqo_thread; /* if using sq thread polling */
212 struct mm_struct *sqo_mm;
213 wait_queue_head_t sqo_wait;
214 struct completion sqo_thread_started;
215
216 struct {
217 unsigned cached_cq_tail;
218 atomic_t cached_cq_overflow;
219 unsigned cq_entries;
220 unsigned cq_mask;
221 struct wait_queue_head cq_wait;
222 struct fasync_struct *cq_fasync;
223 struct eventfd_ctx *cq_ev_fd;
224 atomic_t cq_timeouts;
225 } ____cacheline_aligned_in_smp;
226
227 struct io_rings *rings;
228
229 /*
230 * If used, fixed file set. Writers must ensure that ->refs is dead,
231 * readers must ensure that ->refs is alive as long as the file* is
232 * used. Only updated through io_uring_register(2).
233 */
234 struct file **user_files;
235 unsigned nr_user_files;
236
237 /* if used, fixed mapped user buffers */
238 unsigned nr_user_bufs;
239 struct io_mapped_ubuf *user_bufs;
240
241 struct user_struct *user;
242
243 const struct cred *creds;
244
245 struct completion ctx_done;
246
247 struct {
248 struct mutex uring_lock;
249 wait_queue_head_t wait;
250 } ____cacheline_aligned_in_smp;
251
252 struct {
253 spinlock_t completion_lock;
254 bool poll_multi_file;
255 /*
256 * ->poll_list is protected by the ctx->uring_lock for
257 * io_uring instances that don't use IORING_SETUP_SQPOLL.
258 * For SQPOLL, only the single threaded io_sq_thread() will
259 * manipulate the list, hence no extra locking is needed there.
260 */
261 struct list_head poll_list;
262 struct list_head cancel_list;
263 } ____cacheline_aligned_in_smp;
264
265 struct async_list pending_async[2];
266
267 #if defined(CONFIG_UNIX)
268 struct socket *ring_sock;
269 #endif
270 };
271
272 struct sqe_submit {
273 const struct io_uring_sqe *sqe;
274 unsigned short index;
275 u32 sequence;
276 bool has_user;
277 bool needs_lock;
278 bool needs_fixed_file;
279 };
280
281 /*
282 * First field must be the file pointer in all the
283 * iocb unions! See also 'struct kiocb' in <linux/fs.h>
284 */
285 struct io_poll_iocb {
286 struct file *file;
287 struct wait_queue_head *head;
288 __poll_t events;
289 bool done;
290 bool canceled;
291 struct wait_queue_entry wait;
292 };
293
294 struct io_timeout {
295 struct file *file;
296 struct hrtimer timer;
297 };
298
299 /*
300 * NOTE! Each of the iocb union members has the file pointer
301 * as the first entry in their struct definition. So you can
302 * access the file pointer through any of the sub-structs,
303 * or directly as just 'ki_filp' in this struct.
304 */
305 struct io_kiocb {
306 union {
307 struct file *file;
308 struct kiocb rw;
309 struct io_poll_iocb poll;
310 struct io_timeout timeout;
311 };
312
313 struct sqe_submit submit;
314
315 struct io_ring_ctx *ctx;
316 struct list_head list;
317 struct list_head link_list;
318 unsigned int flags;
319 refcount_t refs;
320 #define REQ_F_NOWAIT 1 /* must not punt to workers */
321 #define REQ_F_IOPOLL_COMPLETED 2 /* polled IO has completed */
322 #define REQ_F_FIXED_FILE 4 /* ctx owns file */
323 #define REQ_F_SEQ_PREV 8 /* sequential with previous */
324 #define REQ_F_IO_DRAIN 16 /* drain existing IO first */
325 #define REQ_F_IO_DRAINED 32 /* drain done */
326 #define REQ_F_LINK 64 /* linked sqes */
327 #define REQ_F_LINK_DONE 128 /* linked sqes done */
328 #define REQ_F_FAIL_LINK 256 /* fail rest of links */
329 #define REQ_F_SHADOW_DRAIN 512 /* link-drain shadow req */
330 #define REQ_F_TIMEOUT 1024 /* timeout request */
331 #define REQ_F_ISREG 2048 /* regular file */
332 #define REQ_F_MUST_PUNT 4096 /* must be punted even for NONBLOCK */
333 #define REQ_F_TIMEOUT_NOSEQ 8192 /* no timeout sequence */
334 unsigned long fsize;
335 u64 user_data;
336 u32 result;
337 u32 sequence;
338
339 struct fs_struct *fs;
340
341 struct work_struct work;
342 };
343
344 #define IO_PLUG_THRESHOLD 2
345 #define IO_IOPOLL_BATCH 8
346
347 struct io_submit_state {
348 struct blk_plug plug;
349
350 /*
351 * io_kiocb alloc cache
352 */
353 void *reqs[IO_IOPOLL_BATCH];
354 unsigned int free_reqs;
355 unsigned int cur_req;
356
357 /*
358 * File reference cache
359 */
360 struct file *file;
361 unsigned int fd;
362 unsigned int has_refs;
363 unsigned int used_refs;
364 unsigned int ios_left;
365 };
366
367 static void io_sq_wq_submit_work(struct work_struct *work);
368 static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data,
369 long res);
370 static void __io_free_req(struct io_kiocb *req);
371
372 static struct kmem_cache *req_cachep;
373
374 static const struct file_operations io_uring_fops;
375
376 struct sock *io_uring_get_socket(struct file *file)
377 {
378 #if defined(CONFIG_UNIX)
379 if (file->f_op == &io_uring_fops) {
380 struct io_ring_ctx *ctx = file->private_data;
381
382 return ctx->ring_sock->sk;
383 }
384 #endif
385 return NULL;
386 }
387 EXPORT_SYMBOL(io_uring_get_socket);
388
389 static void io_ring_ctx_ref_free(struct percpu_ref *ref)
390 {
391 struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
392
393 complete(&ctx->ctx_done);
394 }
395
396 static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
397 {
398 struct io_ring_ctx *ctx;
399 int i;
400
401 ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
402 if (!ctx)
403 return NULL;
404
405 if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
406 PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
407 kfree(ctx);
408 return NULL;
409 }
410
411 ctx->flags = p->flags;
412 init_waitqueue_head(&ctx->sqo_wait);
413 init_waitqueue_head(&ctx->cq_wait);
414 init_completion(&ctx->ctx_done);
415 init_completion(&ctx->sqo_thread_started);
416 mutex_init(&ctx->uring_lock);
417 init_waitqueue_head(&ctx->wait);
418 for (i = 0; i < ARRAY_SIZE(ctx->pending_async); i++) {
419 spin_lock_init(&ctx->pending_async[i].lock);
420 INIT_LIST_HEAD(&ctx->pending_async[i].list);
421 atomic_set(&ctx->pending_async[i].cnt, 0);
422 }
423 spin_lock_init(&ctx->completion_lock);
424 INIT_LIST_HEAD(&ctx->poll_list);
425 INIT_LIST_HEAD(&ctx->cancel_list);
426 INIT_LIST_HEAD(&ctx->defer_list);
427 INIT_LIST_HEAD(&ctx->timeout_list);
428 return ctx;
429 }
430
431 static inline bool __io_sequence_defer(struct io_ring_ctx *ctx,
432 struct io_kiocb *req)
433 {
434 return req->sequence != ctx->cached_cq_tail + ctx->cached_sq_dropped
435 + atomic_read(&ctx->cached_cq_overflow);
436 }
437
438 static inline bool io_sequence_defer(struct io_ring_ctx *ctx,
439 struct io_kiocb *req)
440 {
441 if ((req->flags & (REQ_F_IO_DRAIN|REQ_F_IO_DRAINED)) != REQ_F_IO_DRAIN)
442 return false;
443
444 return __io_sequence_defer(ctx, req);
445 }
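/*
 * Worked example (illustrative): a request marked REQ_F_IO_DRAIN that was
 * assigned sequence 10 at submission stays deferred until
 * cached_cq_tail + cached_sq_dropped + cached_cq_overflow reaches 10, e.g.
 * 7 normal completions plus 2 dropped SQEs plus 1 overflowed CQE. At that
 * point every earlier submission has been accounted for and the drained
 * request can finally be issued.
 */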
446
447 static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx)
448 {
449 struct io_kiocb *req;
450
451 req = list_first_entry_or_null(&ctx->defer_list, struct io_kiocb, list);
452 if (req && !io_sequence_defer(ctx, req)) {
453 list_del_init(&req->list);
454 return req;
455 }
456
457 return NULL;
458 }
459
460 static struct io_kiocb *io_get_timeout_req(struct io_ring_ctx *ctx)
461 {
462 struct io_kiocb *req;
463
464 req = list_first_entry_or_null(&ctx->timeout_list, struct io_kiocb, list);
465 if (req) {
466 if (req->flags & REQ_F_TIMEOUT_NOSEQ)
467 return NULL;
468 if (!__io_sequence_defer(ctx, req)) {
469 list_del_init(&req->list);
470 return req;
471 }
472 }
473
474 return NULL;
475 }
476
477 static void __io_commit_cqring(struct io_ring_ctx *ctx)
478 {
479 struct io_rings *rings = ctx->rings;
480
481 if (ctx->cached_cq_tail != READ_ONCE(rings->cq.tail)) {
482 /* order cqe stores with ring update */
483 smp_store_release(&rings->cq.tail, ctx->cached_cq_tail);
484
485 if (wq_has_sleeper(&ctx->cq_wait)) {
486 wake_up_interruptible(&ctx->cq_wait);
487 kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
488 }
489 }
490 }
491
492 static inline void io_queue_async_work(struct io_ring_ctx *ctx,
493 struct io_kiocb *req)
494 {
495 int rw = 0;
496
497 if (req->submit.sqe) {
498 switch (req->submit.sqe->opcode) {
499 case IORING_OP_WRITEV:
500 case IORING_OP_WRITE_FIXED:
501 rw = !(req->rw.ki_flags & IOCB_DIRECT);
502 break;
503 }
504 }
505
506 queue_work(ctx->sqo_wq[rw], &req->work);
507 }
508
509 static void io_kill_timeout(struct io_kiocb *req)
510 {
511 int ret;
512
513 ret = hrtimer_try_to_cancel(&req->timeout.timer);
514 if (ret != -1) {
515 atomic_inc(&req->ctx->cq_timeouts);
516 list_del(&req->list);
517 io_cqring_fill_event(req->ctx, req->user_data, 0);
518 __io_free_req(req);
519 }
520 }
521
522 static void io_kill_timeouts(struct io_ring_ctx *ctx)
523 {
524 struct io_kiocb *req, *tmp;
525
526 spin_lock_irq(&ctx->completion_lock);
527 list_for_each_entry_safe(req, tmp, &ctx->timeout_list, list)
528 io_kill_timeout(req);
529 spin_unlock_irq(&ctx->completion_lock);
530 }
531
532 static void io_commit_cqring(struct io_ring_ctx *ctx)
533 {
534 struct io_kiocb *req;
535
536 while ((req = io_get_timeout_req(ctx)) != NULL)
537 io_kill_timeout(req);
538
539 __io_commit_cqring(ctx);
540
541 while ((req = io_get_deferred_req(ctx)) != NULL) {
542 if (req->flags & REQ_F_SHADOW_DRAIN) {
543 /* Just for drain, free it. */
544 __io_free_req(req);
545 continue;
546 }
547 req->flags |= REQ_F_IO_DRAINED;
548 io_queue_async_work(ctx, req);
549 }
550 }
551
552 static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
553 {
554 struct io_rings *rings = ctx->rings;
555 unsigned tail;
556
557 tail = ctx->cached_cq_tail;
558 /*
559 * writes to the cq entry need to come after reading head; the
560 * control dependency is enough as we're using WRITE_ONCE to
561 * fill the cq entry
562 */
563 if (tail - READ_ONCE(rings->cq.head) == rings->cq_ring_entries)
564 return NULL;
565
566 ctx->cached_cq_tail++;
567 return &rings->cqes[tail & ctx->cq_mask];
568 }
569
570 static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data,
571 long res)
572 {
573 struct io_uring_cqe *cqe;
574
575 /*
576 * If we can't get a cq entry, userspace overflowed the
577 * submission (by quite a lot). Increment the overflow count in
578 * the ring.
579 */
580 cqe = io_get_cqring(ctx);
581 if (cqe) {
582 WRITE_ONCE(cqe->user_data, ki_user_data);
583 WRITE_ONCE(cqe->res, res);
584 WRITE_ONCE(cqe->flags, 0);
585 } else {
586 WRITE_ONCE(ctx->rings->cq_overflow,
587 atomic_inc_return(&ctx->cached_cq_overflow));
588 }
589 }
590
591 static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
592 {
593 if (waitqueue_active(&ctx->wait))
594 wake_up(&ctx->wait);
595 if (waitqueue_active(&ctx->sqo_wait))
596 wake_up(&ctx->sqo_wait);
597 if (ctx->cq_ev_fd)
598 eventfd_signal(ctx->cq_ev_fd, 1);
599 }
600
601 static void io_cqring_add_event(struct io_ring_ctx *ctx, u64 user_data,
602 long res)
603 {
604 unsigned long flags;
605
606 spin_lock_irqsave(&ctx->completion_lock, flags);
607 io_cqring_fill_event(ctx, user_data, res);
608 io_commit_cqring(ctx);
609 spin_unlock_irqrestore(&ctx->completion_lock, flags);
610
611 io_cqring_ev_posted(ctx);
612 }
613
614 static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
615 struct io_submit_state *state)
616 {
617 gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
618 struct io_kiocb *req;
619
620 if (!percpu_ref_tryget(&ctx->refs))
621 return NULL;
622
623 if (!state) {
624 req = kmem_cache_alloc(req_cachep, gfp);
625 if (unlikely(!req))
626 goto out;
627 } else if (!state->free_reqs) {
628 size_t sz;
629 int ret;
630
631 sz = min_t(size_t, state->ios_left, ARRAY_SIZE(state->reqs));
632 ret = kmem_cache_alloc_bulk(req_cachep, gfp, sz, state->reqs);
633
634 /*
635 * Bulk alloc is all-or-nothing. If we fail to get a batch,
636 * retry single alloc to be on the safe side.
637 */
638 if (unlikely(ret <= 0)) {
639 state->reqs[0] = kmem_cache_alloc(req_cachep, gfp);
640 if (!state->reqs[0])
641 goto out;
642 ret = 1;
643 }
644 state->free_reqs = ret - 1;
645 state->cur_req = 1;
646 req = state->reqs[0];
647 } else {
648 req = state->reqs[state->cur_req];
649 state->free_reqs--;
650 state->cur_req++;
651 }
652
653 req->file = NULL;
654 req->ctx = ctx;
655 req->flags = 0;
656 /* one is dropped after submission, the other at completion */
657 refcount_set(&req->refs, 2);
658 req->result = 0;
659 req->fs = NULL;
660 return req;
661 out:
662 percpu_ref_put(&ctx->refs);
663 return NULL;
664 }
665
666 static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr)
667 {
668 if (*nr) {
669 kmem_cache_free_bulk(req_cachep, *nr, reqs);
670 percpu_ref_put_many(&ctx->refs, *nr);
671 *nr = 0;
672 }
673 }
674
675 static void __io_free_req(struct io_kiocb *req)
676 {
677 if (req->file && !(req->flags & REQ_F_FIXED_FILE))
678 fput(req->file);
679 percpu_ref_put(&req->ctx->refs);
680 kmem_cache_free(req_cachep, req);
681 }
682
683 static void io_req_link_next(struct io_kiocb *req)
684 {
685 struct io_kiocb *nxt;
686
687 /*
688	 * The list should never be empty when we are called here, but it could
689	 * potentially happen if the chain is messed up; check to be on the
690	 * safe side.
691 */
692 nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, list);
693 if (nxt) {
694 list_del(&nxt->list);
695 if (!list_empty(&req->link_list)) {
696 INIT_LIST_HEAD(&nxt->link_list);
697 list_splice(&req->link_list, &nxt->link_list);
698 nxt->flags |= REQ_F_LINK;
699 }
700
701 nxt->flags |= REQ_F_LINK_DONE;
702 INIT_WORK(&nxt->work, io_sq_wq_submit_work);
703 io_queue_async_work(req->ctx, nxt);
704 }
705 }
706
707 /*
708 * Called if REQ_F_LINK is set, and we fail the head request
709 */
710 static void io_fail_links(struct io_kiocb *req)
711 {
712 struct io_kiocb *link;
713
714 while (!list_empty(&req->link_list)) {
715 link = list_first_entry(&req->link_list, struct io_kiocb, list);
716 list_del(&link->list);
717
718 io_cqring_add_event(req->ctx, link->user_data, -ECANCELED);
719 __io_free_req(link);
720 }
721 }
722
723 static void io_free_req(struct io_kiocb *req)
724 {
725 /*
726 * If LINK is set, we have dependent requests in this chain. If we
727 * didn't fail this request, queue the first one up, moving any other
728 * dependencies to the next request. In case of failure, fail the rest
729 * of the chain.
730 */
731 if (req->flags & REQ_F_LINK) {
732 if (req->flags & REQ_F_FAIL_LINK)
733 io_fail_links(req);
734 else
735 io_req_link_next(req);
736 }
737
738 __io_free_req(req);
739 }
740
741 static void io_put_req(struct io_kiocb *req)
742 {
743 if (refcount_dec_and_test(&req->refs))
744 io_free_req(req);
745 }
746
747 static unsigned io_cqring_events(struct io_rings *rings)
748 {
749 /* See comment at the top of this file */
750 smp_rmb();
751 return READ_ONCE(rings->cq.tail) - READ_ONCE(rings->cq.head);
752 }
753
754 static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
755 {
756 struct io_rings *rings = ctx->rings;
757
758 /* make sure SQ entry isn't read before tail */
759 return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
760 }
761
762 /*
763 * Find and free completed poll iocbs
764 */
765 static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
766 struct list_head *done)
767 {
768 void *reqs[IO_IOPOLL_BATCH];
769 struct io_kiocb *req;
770 int to_free;
771
772 to_free = 0;
773 while (!list_empty(done)) {
774 req = list_first_entry(done, struct io_kiocb, list);
775 list_del(&req->list);
776
777 io_cqring_fill_event(ctx, req->user_data, req->result);
778 (*nr_events)++;
779
780 if (refcount_dec_and_test(&req->refs)) {
781 /* If we're not using fixed files, we have to pair the
782 * completion part with the file put. Use regular
783 * completions for those, only batch free for fixed
784 * file and non-linked commands.
785 */
786 if ((req->flags & (REQ_F_FIXED_FILE|REQ_F_LINK)) ==
787 REQ_F_FIXED_FILE) {
788 reqs[to_free++] = req;
789 if (to_free == ARRAY_SIZE(reqs))
790 io_free_req_many(ctx, reqs, &to_free);
791 } else {
792 io_free_req(req);
793 }
794 }
795 }
796
797 io_commit_cqring(ctx);
798 io_free_req_many(ctx, reqs, &to_free);
799 }
800
801 static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
802 long min)
803 {
804 struct io_kiocb *req, *tmp;
805 LIST_HEAD(done);
806 bool spin;
807 int ret;
808
809 /*
810 * Only spin for completions if we don't have multiple devices hanging
811 * off our complete list, and we're under the requested amount.
812 */
813 spin = !ctx->poll_multi_file && *nr_events < min;
814
815 ret = 0;
816 list_for_each_entry_safe(req, tmp, &ctx->poll_list, list) {
817 struct kiocb *kiocb = &req->rw;
818
819 /*
820 * Move completed entries to our local list. If we find a
821 * request that requires polling, break out and complete
822 * the done list first, if we have entries there.
823 */
824 if (req->flags & REQ_F_IOPOLL_COMPLETED) {
825 list_move_tail(&req->list, &done);
826 continue;
827 }
828 if (!list_empty(&done))
829 break;
830
831 ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin);
832 if (ret < 0)
833 break;
834
835 if (ret && spin)
836 spin = false;
837 ret = 0;
838 }
839
840 if (!list_empty(&done))
841 io_iopoll_complete(ctx, nr_events, &done);
842
843 return ret;
844 }
845
846 /*
847  * Poll for a minimum of 'min' events. Note that if min == 0 we consider that a
848 * non-spinning poll check - we'll still enter the driver poll loop, but only
849 * as a non-spinning completion check.
850 */
851 static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events,
852 long min)
853 {
854 while (!list_empty(&ctx->poll_list) && !need_resched()) {
855 int ret;
856
857 ret = io_do_iopoll(ctx, nr_events, min);
858 if (ret < 0)
859 return ret;
860 if (!min || *nr_events >= min)
861 return 0;
862 }
863
864 return 1;
865 }
866
867 /*
868 * We can't just wait for polled events to come to us, we have to actively
869 * find and complete them.
870 */
871 static void io_iopoll_reap_events(struct io_ring_ctx *ctx)
872 {
873 if (!(ctx->flags & IORING_SETUP_IOPOLL))
874 return;
875
876 mutex_lock(&ctx->uring_lock);
877 while (!list_empty(&ctx->poll_list)) {
878 unsigned int nr_events = 0;
879
880 io_iopoll_getevents(ctx, &nr_events, 1);
881
882 /*
883		 * Ensure we allow local-to-the-cpu processing to take place;
884		 * in this case we need to ensure that we reap all events.
885 */
886 cond_resched();
887 }
888 mutex_unlock(&ctx->uring_lock);
889 }
890
891 static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
892 long min)
893 {
894 int iters = 0, ret = 0;
895
896 /*
897 * We disallow the app entering submit/complete with polling, but we
898 * still need to lock the ring to prevent racing with polled issue
899 * that got punted to a workqueue.
900 */
901 mutex_lock(&ctx->uring_lock);
902 do {
903 int tmin = 0;
904
905 /*
906 * Don't enter poll loop if we already have events pending.
907 * If we do, we can potentially be spinning for commands that
908 * already triggered a CQE (eg in error).
909 */
910 if (io_cqring_events(ctx->rings))
911 break;
912
913 /*
914 * If a submit got punted to a workqueue, we can have the
915 * application entering polling for a command before it gets
916 * issued. That app will hold the uring_lock for the duration
917 * of the poll right here, so we need to take a breather every
918 * now and then to ensure that the issue has a chance to add
919 * the poll to the issued list. Otherwise we can spin here
920 * forever, while the workqueue is stuck trying to acquire the
921 * very same mutex.
922 */
923 if (!(++iters & 7)) {
924 mutex_unlock(&ctx->uring_lock);
925 mutex_lock(&ctx->uring_lock);
926 }
927
928 if (*nr_events < min)
929 tmin = min - *nr_events;
930
931 ret = io_iopoll_getevents(ctx, nr_events, tmin);
932 if (ret <= 0)
933 break;
934 ret = 0;
935 } while (min && !*nr_events && !need_resched());
936
937 mutex_unlock(&ctx->uring_lock);
938 return ret;
939 }
940
941 static void kiocb_end_write(struct io_kiocb *req)
942 {
943 /*
944	 * Tell lockdep we inherited freeze protection from the submission
945 * thread.
946 */
947 if (req->flags & REQ_F_ISREG) {
948 struct inode *inode = file_inode(req->file);
949
950 __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
951 }
952 file_end_write(req->file);
953 }
954
955 static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
956 {
957 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
958
959 if (kiocb->ki_flags & IOCB_WRITE)
960 kiocb_end_write(req);
961
962 if ((req->flags & REQ_F_LINK) && res != req->result)
963 req->flags |= REQ_F_FAIL_LINK;
964 io_cqring_add_event(req->ctx, req->user_data, res);
965 io_put_req(req);
966 }
967
968 static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
969 {
970 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
971
972 if (kiocb->ki_flags & IOCB_WRITE)
973 kiocb_end_write(req);
974
975 if ((req->flags & REQ_F_LINK) && res != req->result)
976 req->flags |= REQ_F_FAIL_LINK;
977 req->result = res;
978 if (res != -EAGAIN)
979 req->flags |= REQ_F_IOPOLL_COMPLETED;
980 }
981
982 /*
983  * After the iocb has been issued, it's safe for it to be found on the poll
984  * list. Adding the kiocb to the list AFTER submission ensures that we don't
985  * find it from an io_iopoll_getevents() thread before the issuer is done
986 * accessing the kiocb cookie.
987 */
988 static void io_iopoll_req_issued(struct io_kiocb *req)
989 {
990 struct io_ring_ctx *ctx = req->ctx;
991
992 /*
993 * Track whether we have multiple files in our lists. This will impact
994 * how we do polling eventually, not spinning if we're on potentially
995 * different devices.
996 */
997 if (list_empty(&ctx->poll_list)) {
998 ctx->poll_multi_file = false;
999 } else if (!ctx->poll_multi_file) {
1000 struct io_kiocb *list_req;
1001
1002 list_req = list_first_entry(&ctx->poll_list, struct io_kiocb,
1003 list);
1004 if (list_req->rw.ki_filp != req->rw.ki_filp)
1005 ctx->poll_multi_file = true;
1006 }
1007
1008 /*
1009 * For fast devices, IO may have already completed. If it has, add
1010 * it to the front so we find it first.
1011 */
1012 if (req->flags & REQ_F_IOPOLL_COMPLETED)
1013 list_add(&req->list, &ctx->poll_list);
1014 else
1015 list_add_tail(&req->list, &ctx->poll_list);
1016 }
1017
1018 static void io_file_put(struct io_submit_state *state)
1019 {
1020 if (state->file) {
1021 int diff = state->has_refs - state->used_refs;
1022
1023 if (diff)
1024 fput_many(state->file, diff);
1025 state->file = NULL;
1026 }
1027 }
1028
1029 /*
1030 * Get as many references to a file as we have IOs left in this submission,
1031 * assuming most submissions are for one file, or at least that each file
1032 * has more than one submission.
1033 */
1034 static struct file *io_file_get(struct io_submit_state *state, int fd)
1035 {
1036 if (!state)
1037 return fget(fd);
1038
1039 if (state->file) {
1040 if (state->fd == fd) {
1041 state->used_refs++;
1042 state->ios_left--;
1043 return state->file;
1044 }
1045 io_file_put(state);
1046 }
1047 state->file = fget_many(fd, state->ios_left);
1048 if (!state->file)
1049 return NULL;
1050
1051 state->fd = fd;
1052 state->has_refs = state->ios_left;
1053 state->used_refs = 1;
1054 state->ios_left--;
1055 return state->file;
1056 }
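/*
 * Worked example (illustrative): with 8 SQEs left in a submission batch that
 * all target the same fd, the first io_file_get() grabs 8 references via
 * fget_many() and records has_refs = 8, used_refs = 1. The next seven calls
 * just bump used_refs, and io_file_put() at the end of the batch drops the
 * has_refs - used_refs references that were never handed out.
 */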
1057
1058 /*
1059 * If we tracked the file through the SCM inflight mechanism, we could support
1060 * any file. For now, just ensure that anything potentially problematic is done
1061 * inline.
1062 */
1063 static bool io_file_supports_async(struct file *file)
1064 {
1065 umode_t mode = file_inode(file)->i_mode;
1066
1067 if (S_ISBLK(mode) || S_ISCHR(mode))
1068 return true;
1069 if (S_ISREG(mode) && file->f_op != &io_uring_fops)
1070 return true;
1071
1072 return false;
1073 }
1074
1075 static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s,
1076 bool force_nonblock)
1077 {
1078 const struct io_uring_sqe *sqe = s->sqe;
1079 struct io_ring_ctx *ctx = req->ctx;
1080 struct kiocb *kiocb = &req->rw;
1081 unsigned ioprio;
1082 int ret;
1083
1084 if (!req->file)
1085 return -EBADF;
1086
1087 if (S_ISREG(file_inode(req->file)->i_mode))
1088 req->flags |= REQ_F_ISREG;
1089
1090 if (force_nonblock)
1091 req->fsize = rlimit(RLIMIT_FSIZE);
1092
1093 /*
1094 * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so
1095 * we know to async punt it even if it was opened O_NONBLOCK
1096 */
1097 if (force_nonblock && !io_file_supports_async(req->file)) {
1098 req->flags |= REQ_F_MUST_PUNT;
1099 return -EAGAIN;
1100 }
1101
1102 kiocb->ki_pos = READ_ONCE(sqe->off);
1103 kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
1104 kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
1105
1106 ioprio = READ_ONCE(sqe->ioprio);
1107 if (ioprio) {
1108 ret = ioprio_check_cap(ioprio);
1109 if (ret)
1110 return ret;
1111
1112 kiocb->ki_ioprio = ioprio;
1113 } else
1114 kiocb->ki_ioprio = get_current_ioprio();
1115
1116 ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
1117 if (unlikely(ret))
1118 return ret;
1119
1120 /* don't allow async punt if RWF_NOWAIT was requested */
1121 if ((kiocb->ki_flags & IOCB_NOWAIT) ||
1122 (req->file->f_flags & O_NONBLOCK))
1123 req->flags |= REQ_F_NOWAIT;
1124
1125 if (force_nonblock)
1126 kiocb->ki_flags |= IOCB_NOWAIT;
1127
1128 if (ctx->flags & IORING_SETUP_IOPOLL) {
1129 if (!(kiocb->ki_flags & IOCB_DIRECT) ||
1130 !kiocb->ki_filp->f_op->iopoll)
1131 return -EOPNOTSUPP;
1132
1133 kiocb->ki_flags |= IOCB_HIPRI;
1134 kiocb->ki_complete = io_complete_rw_iopoll;
1135 req->result = 0;
1136 } else {
1137 if (kiocb->ki_flags & IOCB_HIPRI)
1138 return -EINVAL;
1139 kiocb->ki_complete = io_complete_rw;
1140 }
1141 return 0;
1142 }
1143
1144 static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
1145 {
1146 switch (ret) {
1147 case -EIOCBQUEUED:
1148 break;
1149 case -ERESTARTSYS:
1150 case -ERESTARTNOINTR:
1151 case -ERESTARTNOHAND:
1152 case -ERESTART_RESTARTBLOCK:
1153 /*
1154 * We can't just restart the syscall, since previously
1155 * submitted sqes may already be in progress. Just fail this
1156 * IO with EINTR.
1157 */
1158 ret = -EINTR;
1159 /* fall through */
1160 default:
1161 kiocb->ki_complete(kiocb, ret, 0);
1162 }
1163 }
1164
1165 static int io_import_fixed(struct io_ring_ctx *ctx, int rw,
1166 const struct io_uring_sqe *sqe,
1167 struct iov_iter *iter)
1168 {
1169 size_t len = READ_ONCE(sqe->len);
1170 struct io_mapped_ubuf *imu;
1171 unsigned index, buf_index;
1172 size_t offset;
1173 u64 buf_addr;
1174
1175 /* attempt to use fixed buffers without having provided iovecs */
1176 if (unlikely(!ctx->user_bufs))
1177 return -EFAULT;
1178
1179 buf_index = READ_ONCE(sqe->buf_index);
1180 if (unlikely(buf_index >= ctx->nr_user_bufs))
1181 return -EFAULT;
1182
1183 index = array_index_nospec(buf_index, ctx->nr_user_bufs);
1184 imu = &ctx->user_bufs[index];
1185 buf_addr = READ_ONCE(sqe->addr);
1186
1187 /* overflow */
1188 if (buf_addr + len < buf_addr)
1189 return -EFAULT;
1190 /* not inside the mapped region */
1191 if (buf_addr < imu->ubuf || buf_addr + len > imu->ubuf + imu->len)
1192 return -EFAULT;
1193
1194 /*
1195	 * This may not be the start of the buffer; set the size appropriately
1196	 * and advance us to the beginning.
1197 */
1198 offset = buf_addr - imu->ubuf;
1199 iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
1200
1201 if (offset) {
1202 /*
1203 * Don't use iov_iter_advance() here, as it's really slow for
1204 * using the latter parts of a big fixed buffer - it iterates
1205 * over each segment manually. We can cheat a bit here, because
1206 * we know that:
1207 *
1208 * 1) it's a BVEC iter, we set it up
1209 * 2) all bvecs are PAGE_SIZE in size, except potentially the
1210 * first and last bvec
1211 *
1212 * So just find our index, and adjust the iterator afterwards.
1213 * If the offset is within the first bvec (or the whole first
1214 * bvec, just use iov_iter_advance(). This makes it easier
1215 * since we can just skip the first segment, which may not
1216 * be PAGE_SIZE aligned.
1217 */
1218 const struct bio_vec *bvec = imu->bvec;
1219
1220 if (offset <= bvec->bv_len) {
1221 iov_iter_advance(iter, offset);
1222 } else {
1223 unsigned long seg_skip;
1224
1225 /* skip first vec */
1226 offset -= bvec->bv_len;
1227 seg_skip = 1 + (offset >> PAGE_SHIFT);
1228
1229 iter->bvec = bvec + seg_skip;
1230 iter->nr_segs -= seg_skip;
1231 iter->count -= bvec->bv_len + offset;
1232 iter->iov_offset = offset & ~PAGE_MASK;
1233 }
1234 }
1235
1236 return len;
1237 }
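/*
 * Worked example for the skip math above (illustrative, assumes 4K pages):
 * a fixed buffer backed by bvecs of 1K, 4K and 4K, with a request starting
 * 6K into it. offset (6K) is larger than bvec[0].bv_len (1K), so after
 * "offset -= bvec->bv_len" offset is 5K and seg_skip = 1 + (5K >> PAGE_SHIFT)
 * = 2. The iterator is pointed at bvec[2] with iov_offset = 5K & ~PAGE_MASK
 * = 1K, i.e. 1K into the third segment, which matches byte 6K of the buffer.
 */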
1238
1239 static ssize_t io_import_iovec(struct io_ring_ctx *ctx, int rw,
1240 const struct sqe_submit *s, struct iovec **iovec,
1241 struct iov_iter *iter)
1242 {
1243 const struct io_uring_sqe *sqe = s->sqe;
1244 void __user *buf = u64_to_user_ptr(READ_ONCE(sqe->addr));
1245 size_t sqe_len = READ_ONCE(sqe->len);
1246 u8 opcode;
1247
1248 /*
1249 * We're reading ->opcode for the second time, but the first read
1250 * doesn't care whether it's _FIXED or not, so it doesn't matter
1251 * whether ->opcode changes concurrently. The first read does care
1252 * about whether it is a READ or a WRITE, so we don't trust this read
1253 * for that purpose and instead let the caller pass in the read/write
1254 * flag.
1255 */
1256 opcode = READ_ONCE(sqe->opcode);
1257 if (opcode == IORING_OP_READ_FIXED ||
1258 opcode == IORING_OP_WRITE_FIXED) {
1259 ssize_t ret = io_import_fixed(ctx, rw, sqe, iter);
1260 *iovec = NULL;
1261 return ret;
1262 }
1263
1264 if (!s->has_user)
1265 return -EFAULT;
1266
1267 #ifdef CONFIG_COMPAT
1268 if (ctx->compat)
1269 return compat_import_iovec(rw, buf, sqe_len, UIO_FASTIOV,
1270 iovec, iter);
1271 #endif
1272
1273 return import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter);
1274 }
1275
1276 static inline bool io_should_merge(struct async_list *al, struct kiocb *kiocb)
1277 {
1278 if (al->file == kiocb->ki_filp) {
1279 off_t start, end;
1280
1281 /*
1282 * Allow merging if we're anywhere in the range of the same
1283 * page. Generally this happens for sub-page reads or writes,
1284 * and it's beneficial to allow the first worker to bring the
1285		 * page in and the piggybacked work can then work on the
1286 * cached page.
1287 */
1288 start = al->io_start & PAGE_MASK;
1289 end = (al->io_start + al->io_len + PAGE_SIZE - 1) & PAGE_MASK;
1290 if (kiocb->ki_pos >= start && kiocb->ki_pos <= end)
1291 return true;
1292 }
1293
1294 al->file = NULL;
1295 return false;
1296 }
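/*
 * Worked example (illustrative, 4K pages): if the previous punted request had
 * io_start = 5000 and io_len = 200, then start = 4096 and
 * end = (5200 + 4095) & PAGE_MASK = 8192, so any new request with ki_pos in
 * [4096, 8192] on the same file is considered mergeable with it.
 */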
1297
1298 /*
1299 * Make a note of the last file/offset/direction we punted to async
1300  * context. We'll use this information to see if we can piggyback a
1301  * sequential request onto the previous one, if it still hasn't been
1302 * completed by the async worker.
1303 */
1304 static void io_async_list_note(int rw, struct io_kiocb *req, size_t len)
1305 {
1306 struct async_list *async_list = &req->ctx->pending_async[rw];
1307 struct kiocb *kiocb = &req->rw;
1308 struct file *filp = kiocb->ki_filp;
1309
1310 if (io_should_merge(async_list, kiocb)) {
1311 unsigned long max_bytes;
1312
1313 /* Use 8x RA size as a decent limiter for both reads/writes */
1314 max_bytes = filp->f_ra.ra_pages << (PAGE_SHIFT + 3);
1315 if (!max_bytes)
1316 max_bytes = VM_READAHEAD_PAGES << (PAGE_SHIFT + 3);
1317
1318		/* If the max len is exceeded, reset the state */
1319 if (async_list->io_len + len <= max_bytes) {
1320 req->flags |= REQ_F_SEQ_PREV;
1321 async_list->io_len += len;
1322 } else {
1323 async_list->file = NULL;
1324 }
1325 }
1326
1327 /* New file? Reset state. */
1328 if (async_list->file != filp) {
1329 async_list->io_start = kiocb->ki_pos;
1330 async_list->io_len = len;
1331 async_list->file = filp;
1332 }
1333 }
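/*
 * Worked example (illustrative, 4K pages): with a typical 128K readahead
 * window, f_ra.ra_pages is 32 and max_bytes = 32 << (PAGE_SHIFT + 3) = 1M,
 * so up to about 1M of sequential IO gets chained onto one async worker
 * before the state is reset and a new chain starts.
 */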
1334
1335 /*
1336 * For files that don't have ->read_iter() and ->write_iter(), handle them
1337 * by looping over ->read() or ->write() manually.
1338 */
1339 static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb,
1340 struct iov_iter *iter)
1341 {
1342 ssize_t ret = 0;
1343
1344 /*
1345 * Don't support polled IO through this interface, and we can't
1346 * support non-blocking either. For the latter, this just causes
1347 * the kiocb to be handled from an async context.
1348 */
1349 if (kiocb->ki_flags & IOCB_HIPRI)
1350 return -EOPNOTSUPP;
1351 if (kiocb->ki_flags & IOCB_NOWAIT)
1352 return -EAGAIN;
1353
1354 while (iov_iter_count(iter)) {
1355 struct iovec iovec;
1356 ssize_t nr;
1357
1358 if (!iov_iter_is_bvec(iter)) {
1359 iovec = iov_iter_iovec(iter);
1360 } else {
1361 /* fixed buffers import bvec */
1362 iovec.iov_base = kmap(iter->bvec->bv_page)
1363 + iter->iov_offset;
1364 iovec.iov_len = min(iter->count,
1365 iter->bvec->bv_len - iter->iov_offset);
1366 }
1367
1368 if (rw == READ) {
1369 nr = file->f_op->read(file, iovec.iov_base,
1370 iovec.iov_len, &kiocb->ki_pos);
1371 } else {
1372 nr = file->f_op->write(file, iovec.iov_base,
1373 iovec.iov_len, &kiocb->ki_pos);
1374 }
1375
1376 if (iov_iter_is_bvec(iter))
1377 kunmap(iter->bvec->bv_page);
1378
1379 if (nr < 0) {
1380 if (!ret)
1381 ret = nr;
1382 break;
1383 }
1384 ret += nr;
1385 if (nr != iovec.iov_len)
1386 break;
1387 iov_iter_advance(iter, nr);
1388 }
1389
1390 return ret;
1391 }
1392
1393 static int io_read(struct io_kiocb *req, const struct sqe_submit *s,
1394 bool force_nonblock)
1395 {
1396 struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
1397 struct kiocb *kiocb = &req->rw;
1398 struct iov_iter iter;
1399 struct file *file;
1400 size_t iov_count;
1401 ssize_t read_size, ret;
1402
1403 ret = io_prep_rw(req, s, force_nonblock);
1404 if (ret)
1405 return ret;
1406 file = kiocb->ki_filp;
1407
1408 if (unlikely(!(file->f_mode & FMODE_READ)))
1409 return -EBADF;
1410
1411 ret = io_import_iovec(req->ctx, READ, s, &iovec, &iter);
1412 if (ret < 0)
1413 return ret;
1414
1415 read_size = ret;
1416 if (req->flags & REQ_F_LINK)
1417 req->result = read_size;
1418
1419 iov_count = iov_iter_count(&iter);
1420 ret = rw_verify_area(READ, file, &kiocb->ki_pos, iov_count);
1421 if (!ret) {
1422 ssize_t ret2;
1423
1424 if (file->f_op->read_iter)
1425 ret2 = call_read_iter(file, kiocb, &iter);
1426 else
1427 ret2 = loop_rw_iter(READ, file, kiocb, &iter);
1428
1429 /*
1430 * In case of a short read, punt to async. This can happen
1431 * if we have data partially cached. Alternatively we can
1432 * return the short read, in which case the application will
1433 * need to issue another SQE and wait for it. That SQE will
1434 * need async punt anyway, so it's more efficient to do it
1435 * here.
1436 */
1437 if (force_nonblock && !(req->flags & REQ_F_NOWAIT) &&
1438 (req->flags & REQ_F_ISREG) &&
1439 ret2 > 0 && ret2 < read_size)
1440 ret2 = -EAGAIN;
1441 /* Catch -EAGAIN return for forced non-blocking submission */
1442 if (!force_nonblock || ret2 != -EAGAIN) {
1443 io_rw_done(kiocb, ret2);
1444 } else {
1445 /*
1446 * If ->needs_lock is true, we're already in async
1447 * context.
1448 */
1449 if (!s->needs_lock)
1450 io_async_list_note(READ, req, iov_count);
1451 ret = -EAGAIN;
1452 }
1453 }
1454 kfree(iovec);
1455 return ret;
1456 }
1457
1458 static int io_write(struct io_kiocb *req, const struct sqe_submit *s,
1459 bool force_nonblock)
1460 {
1461 struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
1462 struct kiocb *kiocb = &req->rw;
1463 struct iov_iter iter;
1464 struct file *file;
1465 size_t iov_count;
1466 ssize_t ret;
1467
1468 ret = io_prep_rw(req, s, force_nonblock);
1469 if (ret)
1470 return ret;
1471
1472 file = kiocb->ki_filp;
1473 if (unlikely(!(file->f_mode & FMODE_WRITE)))
1474 return -EBADF;
1475
1476 ret = io_import_iovec(req->ctx, WRITE, s, &iovec, &iter);
1477 if (ret < 0)
1478 return ret;
1479
1480 if (req->flags & REQ_F_LINK)
1481 req->result = ret;
1482
1483 iov_count = iov_iter_count(&iter);
1484
1485 ret = -EAGAIN;
1486 if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT)) {
1487 /* If ->needs_lock is true, we're already in async context. */
1488 if (!s->needs_lock)
1489 io_async_list_note(WRITE, req, iov_count);
1490 goto out_free;
1491 }
1492
1493 ret = rw_verify_area(WRITE, file, &kiocb->ki_pos, iov_count);
1494 if (!ret) {
1495 ssize_t ret2;
1496
1497 /*
1498 * Open-code file_start_write here to grab freeze protection,
1499 * which will be released by another thread in
1500 * io_complete_rw(). Fool lockdep by telling it the lock got
1501 * released so that it doesn't complain about the held lock when
1502 * we return to userspace.
1503 */
1504 if (req->flags & REQ_F_ISREG) {
1505 __sb_start_write(file_inode(file)->i_sb,
1506 SB_FREEZE_WRITE, true);
1507 __sb_writers_release(file_inode(file)->i_sb,
1508 SB_FREEZE_WRITE);
1509 }
1510 kiocb->ki_flags |= IOCB_WRITE;
1511
1512 if (!force_nonblock)
1513 current->signal->rlim[RLIMIT_FSIZE].rlim_cur = req->fsize;
1514
1515 if (file->f_op->write_iter)
1516 ret2 = call_write_iter(file, kiocb, &iter);
1517 else
1518 ret2 = loop_rw_iter(WRITE, file, kiocb, &iter);
1519
1520 if (!force_nonblock)
1521 current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
1522
1523 if (!force_nonblock || ret2 != -EAGAIN) {
1524 io_rw_done(kiocb, ret2);
1525 } else {
1526 /*
1527 * If ->needs_lock is true, we're already in async
1528 * context.
1529 */
1530 if (!s->needs_lock)
1531 io_async_list_note(WRITE, req, iov_count);
1532 ret = -EAGAIN;
1533 }
1534 }
1535 out_free:
1536 kfree(iovec);
1537 return ret;
1538 }
1539
1540 /*
1541 * IORING_OP_NOP just posts a completion event, nothing else.
1542 */
1543 static int io_nop(struct io_kiocb *req, u64 user_data)
1544 {
1545 struct io_ring_ctx *ctx = req->ctx;
1546 long err = 0;
1547
1548 if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
1549 return -EINVAL;
1550
1551 io_cqring_add_event(ctx, user_data, err);
1552 io_put_req(req);
1553 return 0;
1554 }
1555
1556 static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1557 {
1558 struct io_ring_ctx *ctx = req->ctx;
1559
1560 if (!req->file)
1561 return -EBADF;
1562
1563 if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
1564 return -EINVAL;
1565 if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
1566 return -EINVAL;
1567
1568 return 0;
1569 }
1570
1571 static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe,
1572 bool force_nonblock)
1573 {
1574 loff_t sqe_off = READ_ONCE(sqe->off);
1575 loff_t sqe_len = READ_ONCE(sqe->len);
1576 loff_t end = sqe_off + sqe_len;
1577 unsigned fsync_flags;
1578 int ret;
1579
1580 fsync_flags = READ_ONCE(sqe->fsync_flags);
1581 if (unlikely(fsync_flags & ~IORING_FSYNC_DATASYNC))
1582 return -EINVAL;
1583
1584 ret = io_prep_fsync(req, sqe);
1585 if (ret)
1586 return ret;
1587
1588 /* fsync always requires a blocking context */
1589 if (force_nonblock)
1590 return -EAGAIN;
1591
1592 ret = vfs_fsync_range(req->rw.ki_filp, sqe_off,
1593 end > 0 ? end : LLONG_MAX,
1594 fsync_flags & IORING_FSYNC_DATASYNC);
1595
1596 if (ret < 0 && (req->flags & REQ_F_LINK))
1597 req->flags |= REQ_F_FAIL_LINK;
1598 io_cqring_add_event(req->ctx, sqe->user_data, ret);
1599 io_put_req(req);
1600 return 0;
1601 }
1602
1603 static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1604 {
1605 struct io_ring_ctx *ctx = req->ctx;
1606 int ret = 0;
1607
1608 if (!req->file)
1609 return -EBADF;
1610
1611 if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
1612 return -EINVAL;
1613 if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
1614 return -EINVAL;
1615
1616 return ret;
1617 }
1618
1619 static int io_sync_file_range(struct io_kiocb *req,
1620 const struct io_uring_sqe *sqe,
1621 bool force_nonblock)
1622 {
1623 loff_t sqe_off;
1624 loff_t sqe_len;
1625 unsigned flags;
1626 int ret;
1627
1628 ret = io_prep_sfr(req, sqe);
1629 if (ret)
1630 return ret;
1631
1632 /* sync_file_range always requires a blocking context */
1633 if (force_nonblock)
1634 return -EAGAIN;
1635
1636 sqe_off = READ_ONCE(sqe->off);
1637 sqe_len = READ_ONCE(sqe->len);
1638 flags = READ_ONCE(sqe->sync_range_flags);
1639
1640 ret = sync_file_range(req->rw.ki_filp, sqe_off, sqe_len, flags);
1641
1642 if (ret < 0 && (req->flags & REQ_F_LINK))
1643 req->flags |= REQ_F_FAIL_LINK;
1644 io_cqring_add_event(req->ctx, sqe->user_data, ret);
1645 io_put_req(req);
1646 return 0;
1647 }
1648
1649 #if defined(CONFIG_NET)
1650 static int io_send_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
1651 bool force_nonblock,
1652 long (*fn)(struct socket *, struct user_msghdr __user *,
1653 unsigned int))
1654 {
1655 struct socket *sock;
1656 int ret;
1657
1658 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
1659 return -EINVAL;
1660
1661 sock = sock_from_file(req->file, &ret);
1662 if (sock) {
1663 struct user_msghdr __user *msg;
1664 unsigned flags;
1665
1666 flags = READ_ONCE(sqe->msg_flags);
1667 if (flags & MSG_DONTWAIT)
1668 req->flags |= REQ_F_NOWAIT;
1669 else if (force_nonblock)
1670 flags |= MSG_DONTWAIT;
1671
1672 #ifdef CONFIG_COMPAT
1673 if (req->ctx->compat)
1674 flags |= MSG_CMSG_COMPAT;
1675 #endif
1676
1677 msg = (struct user_msghdr __user *) (unsigned long)
1678 READ_ONCE(sqe->addr);
1679
1680 ret = fn(sock, msg, flags);
1681 if (force_nonblock && ret == -EAGAIN)
1682 return ret;
1683 if (ret == -ERESTARTSYS)
1684 ret = -EINTR;
1685 }
1686
1687 if (req->fs) {
1688 struct fs_struct *fs = req->fs;
1689
1690 spin_lock(&req->fs->lock);
1691 if (--fs->users)
1692 fs = NULL;
1693 spin_unlock(&req->fs->lock);
1694 if (fs)
1695 free_fs_struct(fs);
1696 }
1697 io_cqring_add_event(req->ctx, sqe->user_data, ret);
1698 io_put_req(req);
1699 return 0;
1700 }
1701 #endif
1702
1703 static int io_sendmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
1704 bool force_nonblock)
1705 {
1706 #if defined(CONFIG_NET)
1707 return io_send_recvmsg(req, sqe, force_nonblock, __sys_sendmsg_sock);
1708 #else
1709 return -EOPNOTSUPP;
1710 #endif
1711 }
1712
1713 static int io_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
1714 bool force_nonblock)
1715 {
1716 #if defined(CONFIG_NET)
1717 return io_send_recvmsg(req, sqe, force_nonblock, __sys_recvmsg_sock);
1718 #else
1719 return -EOPNOTSUPP;
1720 #endif
1721 }
1722
1723 static void io_poll_remove_one(struct io_kiocb *req)
1724 {
1725 struct io_poll_iocb *poll = &req->poll;
1726
1727 spin_lock(&poll->head->lock);
1728 WRITE_ONCE(poll->canceled, true);
1729 if (!list_empty(&poll->wait.entry)) {
1730 list_del_init(&poll->wait.entry);
1731 io_queue_async_work(req->ctx, req);
1732 }
1733 spin_unlock(&poll->head->lock);
1734
1735 list_del_init(&req->list);
1736 }
1737
1738 static void io_poll_remove_all(struct io_ring_ctx *ctx)
1739 {
1740 struct io_kiocb *req;
1741
1742 spin_lock_irq(&ctx->completion_lock);
1743 while (!list_empty(&ctx->cancel_list)) {
1744		req = list_first_entry(&ctx->cancel_list, struct io_kiocb, list);
1745 io_poll_remove_one(req);
1746 }
1747 spin_unlock_irq(&ctx->completion_lock);
1748 }
1749
1750 /*
1751 * Find a running poll command that matches one specified in sqe->addr,
1752 * and remove it if found.
1753 */
1754 static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1755 {
1756 struct io_ring_ctx *ctx = req->ctx;
1757 struct io_kiocb *poll_req, *next;
1758 int ret = -ENOENT;
1759
1760 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
1761 return -EINVAL;
1762 if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index ||
1763 sqe->poll_events)
1764 return -EINVAL;
1765
1766 spin_lock_irq(&ctx->completion_lock);
1767 list_for_each_entry_safe(poll_req, next, &ctx->cancel_list, list) {
1768 if (READ_ONCE(sqe->addr) == poll_req->user_data) {
1769 io_poll_remove_one(poll_req);
1770 ret = 0;
1771 break;
1772 }
1773 }
1774 spin_unlock_irq(&ctx->completion_lock);
1775
1776 io_cqring_add_event(req->ctx, sqe->user_data, ret);
1777 io_put_req(req);
1778 return 0;
1779 }
1780
1781 static void io_poll_complete(struct io_ring_ctx *ctx, struct io_kiocb *req,
1782 __poll_t mask)
1783 {
1784 req->poll.done = true;
1785 io_cqring_fill_event(ctx, req->user_data, mangle_poll(mask));
1786 io_commit_cqring(ctx);
1787 }
1788
1789 static void io_poll_complete_work(struct work_struct *work)
1790 {
1791 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
1792 struct io_poll_iocb *poll = &req->poll;
1793 struct poll_table_struct pt = { ._key = poll->events };
1794 struct io_ring_ctx *ctx = req->ctx;
1795 const struct cred *old_cred;
1796 __poll_t mask = 0;
1797
1798 old_cred = override_creds(ctx->creds);
1799
1800 if (!READ_ONCE(poll->canceled))
1801 mask = vfs_poll(poll->file, &pt) & poll->events;
1802
1803 /*
1804 * Note that ->ki_cancel callers also delete iocb from active_reqs after
1805 * calling ->ki_cancel. We need the ctx_lock roundtrip here to
1806 * synchronize with them. In the cancellation case the list_del_init
1807 * itself is not actually needed, but harmless so we keep it in to
1808 * avoid further branches in the fast path.
1809 */
1810 spin_lock_irq(&ctx->completion_lock);
1811 if (!mask && !READ_ONCE(poll->canceled)) {
1812 add_wait_queue(poll->head, &poll->wait);
1813 spin_unlock_irq(&ctx->completion_lock);
1814 goto out;
1815 }
1816 list_del_init(&req->list);
1817 io_poll_complete(ctx, req, mask);
1818 spin_unlock_irq(&ctx->completion_lock);
1819
1820 io_cqring_ev_posted(ctx);
1821 io_put_req(req);
1822 out:
1823 revert_creds(old_cred);
1824 }
1825
1826 static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
1827 void *key)
1828 {
1829 struct io_poll_iocb *poll = container_of(wait, struct io_poll_iocb,
1830 wait);
1831 struct io_kiocb *req = container_of(poll, struct io_kiocb, poll);
1832 struct io_ring_ctx *ctx = req->ctx;
1833 __poll_t mask = key_to_poll(key);
1834 unsigned long flags;
1835
1836	/* for instances that support it, check for an event match first: */
1837 if (mask && !(mask & poll->events))
1838 return 0;
1839
1840 list_del_init(&poll->wait.entry);
1841
1842 if (mask && spin_trylock_irqsave(&ctx->completion_lock, flags)) {
1843 list_del(&req->list);
1844 io_poll_complete(ctx, req, mask);
1845 spin_unlock_irqrestore(&ctx->completion_lock, flags);
1846
1847 io_cqring_ev_posted(ctx);
1848 io_put_req(req);
1849 } else {
1850 io_queue_async_work(ctx, req);
1851 }
1852
1853 return 1;
1854 }
1855
1856 struct io_poll_table {
1857 struct poll_table_struct pt;
1858 struct io_kiocb *req;
1859 int error;
1860 };
1861
1862 static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
1863 struct poll_table_struct *p)
1864 {
1865 struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
1866
1867 if (unlikely(pt->req->poll.head)) {
1868 pt->error = -EINVAL;
1869 return;
1870 }
1871
1872 pt->error = 0;
1873 pt->req->poll.head = head;
1874 add_wait_queue(head, &pt->req->poll.wait);
1875 }
1876
1877 static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1878 {
1879 struct io_poll_iocb *poll = &req->poll;
1880 struct io_ring_ctx *ctx = req->ctx;
1881 struct io_poll_table ipt;
1882 bool cancel = false;
1883 __poll_t mask;
1884 u16 events;
1885
1886 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
1887 return -EINVAL;
1888 if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index)
1889 return -EINVAL;
1890 if (!poll->file)
1891 return -EBADF;
1892
1893 req->submit.sqe = NULL;
1894 INIT_WORK(&req->work, io_poll_complete_work);
1895 events = READ_ONCE(sqe->poll_events);
1896 poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
1897
1898 poll->head = NULL;
1899 poll->done = false;
1900 poll->canceled = false;
1901
1902 ipt.pt._qproc = io_poll_queue_proc;
1903 ipt.pt._key = poll->events;
1904 ipt.req = req;
1905 ipt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */
1906
1907	/* initialize the list so that we can do list_empty checks */
1908 INIT_LIST_HEAD(&poll->wait.entry);
1909 init_waitqueue_func_entry(&poll->wait, io_poll_wake);
1910
1911 INIT_LIST_HEAD(&req->list);
1912
1913 mask = vfs_poll(poll->file, &ipt.pt) & poll->events;
1914
1915 spin_lock_irq(&ctx->completion_lock);
1916 if (likely(poll->head)) {
1917 spin_lock(&poll->head->lock);
1918 if (unlikely(list_empty(&poll->wait.entry))) {
1919 if (ipt.error)
1920 cancel = true;
1921 ipt.error = 0;
1922 mask = 0;
1923 }
1924 if (mask || ipt.error)
1925 list_del_init(&poll->wait.entry);
1926 else if (cancel)
1927 WRITE_ONCE(poll->canceled, true);
1928 else if (!poll->done) /* actually waiting for an event */
1929 list_add_tail(&req->list, &ctx->cancel_list);
1930 spin_unlock(&poll->head->lock);
1931 }
1932 if (mask) { /* no async, we'd stolen it */
1933 ipt.error = 0;
1934 io_poll_complete(ctx, req, mask);
1935 }
1936 spin_unlock_irq(&ctx->completion_lock);
1937
1938 if (mask) {
1939 io_cqring_ev_posted(ctx);
1940 io_put_req(req);
1941 }
1942 return ipt.error;
1943 }
1944
1945 static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
1946 {
1947 struct io_ring_ctx *ctx;
1948 struct io_kiocb *req, *prev;
1949 unsigned long flags;
1950
1951 req = container_of(timer, struct io_kiocb, timeout.timer);
1952 ctx = req->ctx;
1953 atomic_inc(&ctx->cq_timeouts);
1954
1955 spin_lock_irqsave(&ctx->completion_lock, flags);
1956 /*
1957	 * Adjust the sequence of reqs before the current one, because this
1958	 * req will consume a slot in the cq_ring and the cq_tail pointer
1959	 * will be increased; otherwise other timeout reqs may return in
1960	 * advance without waiting for enough wait_nr.
1961 */
1962 prev = req;
1963 list_for_each_entry_continue_reverse(prev, &ctx->timeout_list, list)
1964 prev->sequence++;
1965 list_del(&req->list);
1966
1967 io_cqring_fill_event(ctx, req->user_data, -ETIME);
1968 io_commit_cqring(ctx);
1969 spin_unlock_irqrestore(&ctx->completion_lock, flags);
1970
1971 io_cqring_ev_posted(ctx);
1972
1973 io_put_req(req);
1974 return HRTIMER_NORESTART;
1975 }
1976
1977 static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1978 {
1979 unsigned count;
1980 struct io_ring_ctx *ctx = req->ctx;
1981 struct list_head *entry;
1982 struct timespec64 ts;
1983 unsigned span = 0;
1984
1985 if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
1986 return -EINVAL;
1987 if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->timeout_flags ||
1988 sqe->len != 1)
1989 return -EINVAL;
1990
1991 if (get_timespec64(&ts, u64_to_user_ptr(sqe->addr)))
1992 return -EFAULT;
1993
1994 req->flags |= REQ_F_TIMEOUT;
1995
1996 /*
1997	 * sqe->off holds how many events need to occur for this
1998	 * timeout event to be satisfied. If it isn't set, then this is
1999	 * a pure timeout request and the sequence isn't used.
2000 */
2001 count = READ_ONCE(sqe->off);
2002 if (!count) {
2003 req->flags |= REQ_F_TIMEOUT_NOSEQ;
2004 spin_lock_irq(&ctx->completion_lock);
2005 entry = ctx->timeout_list.prev;
2006 goto add;
2007 }
2008
2009 req->sequence = ctx->cached_sq_head + count - 1;
2010 /* reuse it to store the count */
2011 req->submit.sequence = count;
2012
2013 /*
2014 * Insertion sort, ensuring the first entry in the list is always
2015 * the one we need first.
2016 */
2017 spin_lock_irq(&ctx->completion_lock);
2018 list_for_each_prev(entry, &ctx->timeout_list) {
2019 struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, list);
2020 unsigned nxt_sq_head;
2021 long long tmp, tmp_nxt;
2022
2023 if (nxt->flags & REQ_F_TIMEOUT_NOSEQ)
2024 continue;
2025
2026 /*
2027 * Since cached_sq_head + count - 1 can overflow, use type long
2028 * long to store it.
2029 */
2030 tmp = (long long)ctx->cached_sq_head + count - 1;
2031 nxt_sq_head = nxt->sequence - nxt->submit.sequence + 1;
2032 tmp_nxt = (long long)nxt_sq_head + nxt->submit.sequence - 1;
2033
2034 /*
2035 * cached_sq_head may overflow, and it will never overflow twice
2036 * while there is still some valid timeout req pending.
2037 */
2038 if (ctx->cached_sq_head < nxt_sq_head)
2039 tmp += UINT_MAX;
2040
2041 if (tmp > tmp_nxt)
2042 break;
2043
2044 /*
2045 * The sequences of the reqs after the inserted one, and of the inserted
2046 * req itself, should be adjusted because each timeout req consumes a slot.
2047 */
2048 span++;
2049 nxt->sequence++;
2050 }
2051 req->sequence -= span;
2052 add:
2053 list_add(&req->list, entry);
2054 spin_unlock_irq(&ctx->completion_lock);
2055
2056 hrtimer_init(&req->timeout.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2057 req->timeout.timer.function = io_timeout_fn;
2058 hrtimer_start(&req->timeout.timer, timespec64_to_ktime(ts),
2059 HRTIMER_MODE_REL);
2060 return 0;
2061 }
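
/*
 * Illustrative userspace sketch (not part of this kernel source) of what
 * io_timeout() consumes: sqe->addr points at a struct __kernel_timespec,
 * sqe->len must be 1, and sqe->off is the completion count to wait for
 * (0 means a pure relative timer). get_free_sqe() is an assumed
 * application helper that returns the next zeroed sqe slot.
 *
 *	struct __kernel_timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
 *	struct io_uring_sqe *sqe = get_free_sqe();
 *
 *	sqe->opcode = IORING_OP_TIMEOUT;
 *	sqe->addr = (unsigned long) &ts;	// timespec to wait for
 *	sqe->len = 1;				// exactly one timespec
 *	sqe->off = 8;				// or 0 for a pure timer
 *	sqe->user_data = 42;
 *	// the matching cqe carries -ETIME in res when the timer expires
 */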
2062
2063 static int io_req_defer(struct io_ring_ctx *ctx, struct io_kiocb *req,
2064 struct sqe_submit *s)
2065 {
2066 struct io_uring_sqe *sqe_copy;
2067
2068 if (!io_sequence_defer(ctx, req) && list_empty(&ctx->defer_list))
2069 return 0;
2070
2071 sqe_copy = kmalloc(sizeof(*sqe_copy), GFP_KERNEL);
2072 if (!sqe_copy)
2073 return -EAGAIN;
2074
2075 spin_lock_irq(&ctx->completion_lock);
2076 if (!io_sequence_defer(ctx, req) && list_empty(&ctx->defer_list)) {
2077 spin_unlock_irq(&ctx->completion_lock);
2078 kfree(sqe_copy);
2079 return 0;
2080 }
2081
2082 memcpy(&req->submit, s, sizeof(*s));
2083 memcpy(sqe_copy, s->sqe, sizeof(*sqe_copy));
2084 req->submit.sqe = sqe_copy;
2085
2086 INIT_WORK(&req->work, io_sq_wq_submit_work);
2087 list_add_tail(&req->list, &ctx->defer_list);
2088 spin_unlock_irq(&ctx->completion_lock);
2089 return -EIOCBQUEUED;
2090 }
2091
2092 static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
2093 const struct sqe_submit *s, bool force_nonblock)
2094 {
2095 int ret, opcode;
2096
2097 req->user_data = READ_ONCE(s->sqe->user_data);
2098
2099 if (unlikely(s->index >= ctx->sq_entries))
2100 return -EINVAL;
2101
2102 opcode = READ_ONCE(s->sqe->opcode);
2103 switch (opcode) {
2104 case IORING_OP_NOP:
2105 ret = io_nop(req, req->user_data);
2106 break;
2107 case IORING_OP_READV:
2108 if (unlikely(s->sqe->buf_index))
2109 return -EINVAL;
2110 ret = io_read(req, s, force_nonblock);
2111 break;
2112 case IORING_OP_WRITEV:
2113 if (unlikely(s->sqe->buf_index))
2114 return -EINVAL;
2115 ret = io_write(req, s, force_nonblock);
2116 break;
2117 case IORING_OP_READ_FIXED:
2118 ret = io_read(req, s, force_nonblock);
2119 break;
2120 case IORING_OP_WRITE_FIXED:
2121 ret = io_write(req, s, force_nonblock);
2122 break;
2123 case IORING_OP_FSYNC:
2124 ret = io_fsync(req, s->sqe, force_nonblock);
2125 break;
2126 case IORING_OP_POLL_ADD:
2127 ret = io_poll_add(req, s->sqe);
2128 break;
2129 case IORING_OP_POLL_REMOVE:
2130 ret = io_poll_remove(req, s->sqe);
2131 break;
2132 case IORING_OP_SYNC_FILE_RANGE:
2133 ret = io_sync_file_range(req, s->sqe, force_nonblock);
2134 break;
2135 case IORING_OP_SENDMSG:
2136 ret = io_sendmsg(req, s->sqe, force_nonblock);
2137 break;
2138 case IORING_OP_RECVMSG:
2139 ret = io_recvmsg(req, s->sqe, force_nonblock);
2140 break;
2141 case IORING_OP_TIMEOUT:
2142 ret = io_timeout(req, s->sqe);
2143 break;
2144 default:
2145 ret = -EINVAL;
2146 break;
2147 }
2148
2149 if (ret)
2150 return ret;
2151
2152 if (ctx->flags & IORING_SETUP_IOPOLL) {
2153 if (req->result == -EAGAIN)
2154 return -EAGAIN;
2155
2156 /* workqueue context doesn't hold uring_lock, grab it now */
2157 if (s->needs_lock)
2158 mutex_lock(&ctx->uring_lock);
2159 io_iopoll_req_issued(req);
2160 if (s->needs_lock)
2161 mutex_unlock(&ctx->uring_lock);
2162 }
2163
2164 return 0;
2165 }
2166
2167 static struct async_list *io_async_list_from_sqe(struct io_ring_ctx *ctx,
2168 const struct io_uring_sqe *sqe)
2169 {
2170 switch (sqe->opcode) {
2171 case IORING_OP_READV:
2172 case IORING_OP_READ_FIXED:
2173 return &ctx->pending_async[READ];
2174 case IORING_OP_WRITEV:
2175 case IORING_OP_WRITE_FIXED:
2176 return &ctx->pending_async[WRITE];
2177 default:
2178 return NULL;
2179 }
2180 }
2181
2182 static inline bool io_sqe_needs_user(const struct io_uring_sqe *sqe)
2183 {
2184 u8 opcode = READ_ONCE(sqe->opcode);
2185
2186 return !(opcode == IORING_OP_READ_FIXED ||
2187 opcode == IORING_OP_WRITE_FIXED);
2188 }
2189
2190 static void io_sq_wq_submit_work(struct work_struct *work)
2191 {
2192 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
2193 struct fs_struct *old_fs_struct = current->fs;
2194 struct io_ring_ctx *ctx = req->ctx;
2195 struct mm_struct *cur_mm = NULL;
2196 struct async_list *async_list;
2197 const struct cred *old_cred;
2198 LIST_HEAD(req_list);
2199 mm_segment_t old_fs;
2200 int ret;
2201
2202 old_cred = override_creds(ctx->creds);
2203 async_list = io_async_list_from_sqe(ctx, req->submit.sqe);
2204 restart:
2205 do {
2206 struct sqe_submit *s = &req->submit;
2207 const struct io_uring_sqe *sqe = s->sqe;
2208 unsigned int flags = req->flags;
2209
2210 /* Ensure we clear previously set non-block flag */
2211 req->rw.ki_flags &= ~IOCB_NOWAIT;
2212
2213 if (req->fs != current->fs && current->fs != old_fs_struct) {
2214 task_lock(current);
2215 if (req->fs)
2216 current->fs = req->fs;
2217 else
2218 current->fs = old_fs_struct;
2219 task_unlock(current);
2220 }
2221
2222 ret = 0;
2223 if (io_sqe_needs_user(sqe) && !cur_mm) {
2224 if (!mmget_not_zero(ctx->sqo_mm)) {
2225 ret = -EFAULT;
2226 } else {
2227 cur_mm = ctx->sqo_mm;
2228 use_mm(cur_mm);
2229 old_fs = get_fs();
2230 set_fs(USER_DS);
2231 }
2232 }
2233
2234 if (!ret) {
2235 s->has_user = cur_mm != NULL;
2236 s->needs_lock = true;
2237 do {
2238 ret = __io_submit_sqe(ctx, req, s, false);
2239 /*
2240 * We can get EAGAIN for polled IO even though
2241 * we're forcing a sync submission from here,
2242 * since we can't wait for request slots on the
2243 * block side.
2244 */
2245 if (ret != -EAGAIN)
2246 break;
2247 cond_resched();
2248 } while (1);
2249 }
2250
2251 /* drop submission reference */
2252 io_put_req(req);
2253
2254 if (ret) {
2255 io_cqring_add_event(ctx, sqe->user_data, ret);
2256 io_put_req(req);
2257 }
2258
2259 /* async context always uses a copy of the sqe */
2260 kfree(sqe);
2261
2262 /* reqs from the defer and link lists needn't decrease the async cnt */
2263 if (flags & (REQ_F_IO_DRAINED | REQ_F_LINK_DONE))
2264 goto out;
2265
2266 if (!async_list)
2267 break;
2268 if (!list_empty(&req_list)) {
2269 req = list_first_entry(&req_list, struct io_kiocb,
2270 list);
2271 list_del(&req->list);
2272 continue;
2273 }
2274 if (list_empty(&async_list->list))
2275 break;
2276
2277 req = NULL;
2278 spin_lock(&async_list->lock);
2279 if (list_empty(&async_list->list)) {
2280 spin_unlock(&async_list->lock);
2281 break;
2282 }
2283 list_splice_init(&async_list->list, &req_list);
2284 spin_unlock(&async_list->lock);
2285
2286 req = list_first_entry(&req_list, struct io_kiocb, list);
2287 list_del(&req->list);
2288 } while (req);
2289
2290 /*
2291 * Rare case of racing with a submitter. If we find the count has
2292 * dropped to zero AND we have pending work items, then restart
2293 * the processing. This is a tiny race window.
2294 */
2295 if (async_list) {
2296 ret = atomic_dec_return(&async_list->cnt);
2297 while (!ret && !list_empty(&async_list->list)) {
2298 spin_lock(&async_list->lock);
2299 atomic_inc(&async_list->cnt);
2300 list_splice_init(&async_list->list, &req_list);
2301 spin_unlock(&async_list->lock);
2302
2303 if (!list_empty(&req_list)) {
2304 req = list_first_entry(&req_list,
2305 struct io_kiocb, list);
2306 list_del(&req->list);
2307 goto restart;
2308 }
2309 ret = atomic_dec_return(&async_list->cnt);
2310 }
2311 }
2312
2313 out:
2314 if (cur_mm) {
2315 set_fs(old_fs);
2316 unuse_mm(cur_mm);
2317 mmput(cur_mm);
2318 }
2319 revert_creds(old_cred);
2320 if (old_fs_struct) {
2321 task_lock(current);
2322 current->fs = old_fs_struct;
2323 task_unlock(current);
2324 }
2325 }
2326
2327 /*
2328 * See if we can piggyback onto previously submitted work that is still
2329 * running. We currently only allow this if the new request is sequential
2330 * to the previous one we punted.
2331 */
2332 static bool io_add_to_prev_work(struct async_list *list, struct io_kiocb *req)
2333 {
2334 bool ret;
2335
2336 if (!list)
2337 return false;
2338 if (!(req->flags & REQ_F_SEQ_PREV))
2339 return false;
2340 if (!atomic_read(&list->cnt))
2341 return false;
2342
2343 ret = true;
2344 spin_lock(&list->lock);
2345 list_add_tail(&req->list, &list->list);
2346 /*
2347 * Ensure we see a simultaneous modification from io_sq_wq_submit_work()
2348 */
2349 smp_mb();
2350 if (!atomic_read(&list->cnt)) {
2351 list_del_init(&req->list);
2352 ret = false;
2353 }
2354 spin_unlock(&list->lock);
2355 return ret;
2356 }
2357
2358 static bool io_op_needs_file(const struct io_uring_sqe *sqe)
2359 {
2360 int op = READ_ONCE(sqe->opcode);
2361
2362 switch (op) {
2363 case IORING_OP_NOP:
2364 case IORING_OP_POLL_REMOVE:
2365 case IORING_OP_TIMEOUT:
2366 return false;
2367 default:
2368 return true;
2369 }
2370 }
2371
2372 static int io_req_set_file(struct io_ring_ctx *ctx, const struct sqe_submit *s,
2373 struct io_submit_state *state, struct io_kiocb *req)
2374 {
2375 unsigned flags;
2376 int fd;
2377
2378 flags = READ_ONCE(s->sqe->flags);
2379 fd = READ_ONCE(s->sqe->fd);
2380
2381 if (flags & IOSQE_IO_DRAIN)
2382 req->flags |= REQ_F_IO_DRAIN;
2383 /*
2384 * All IO needs to record the previous position; for LINK vs DRAIN,
2385 * it can be used to mark the position of the first IO in the
2386 * link list.
2387 */
2388 req->sequence = s->sequence;
2389
2390 if (!io_op_needs_file(s->sqe))
2391 return 0;
2392
2393 if (flags & IOSQE_FIXED_FILE) {
2394 if (unlikely(!ctx->user_files ||
2395 (unsigned) fd >= ctx->nr_user_files))
2396 return -EBADF;
2397 req->file = ctx->user_files[fd];
2398 req->flags |= REQ_F_FIXED_FILE;
2399 } else {
2400 if (s->needs_fixed_file)
2401 return -EBADF;
2402 req->file = io_file_get(state, fd);
2403 if (unlikely(!req->file))
2404 return -EBADF;
2405 }
2406
2407 return 0;
2408 }
2409
2410 static int __io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
2411 struct sqe_submit *s)
2412 {
2413 int ret;
2414
2415 ret = __io_submit_sqe(ctx, req, s, true);
2416
2417 /*
2418 * We async punt it if the file wasn't marked NOWAIT, or if the file
2419 * doesn't support non-blocking read/write attempts
2420 */
2421 if (ret == -EAGAIN && (!(req->flags & REQ_F_NOWAIT) ||
2422 (req->flags & REQ_F_MUST_PUNT))) {
2423 struct io_uring_sqe *sqe_copy;
2424
2425 sqe_copy = kmemdup(s->sqe, sizeof(*sqe_copy), GFP_KERNEL);
2426 if (sqe_copy) {
2427 struct async_list *list;
2428
2429 s->sqe = sqe_copy;
2430 memcpy(&req->submit, s, sizeof(*s));
2431 list = io_async_list_from_sqe(ctx, s->sqe);
2432 if (!io_add_to_prev_work(list, req)) {
2433 if (list)
2434 atomic_inc(&list->cnt);
2435 INIT_WORK(&req->work, io_sq_wq_submit_work);
2436 io_queue_async_work(ctx, req);
2437 }
2438
2439 /*
2440 * Queued up for async execution, worker will release
2441 * submit reference when the iocb is actually submitted.
2442 */
2443 return 0;
2444 }
2445 }
2446
2447 /* drop submission reference */
2448 io_put_req(req);
2449
2450 /* and drop final reference, if we failed */
2451 if (ret) {
2452 io_cqring_add_event(ctx, req->user_data, ret);
2453 if (req->flags & REQ_F_LINK)
2454 req->flags |= REQ_F_FAIL_LINK;
2455 io_put_req(req);
2456 }
2457
2458 return ret;
2459 }
2460
2461 static int io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
2462 struct sqe_submit *s)
2463 {
2464 int ret;
2465
2466 ret = io_req_defer(ctx, req, s);
2467 if (ret) {
2468 if (ret != -EIOCBQUEUED) {
2469 io_free_req(req);
2470 io_cqring_add_event(ctx, s->sqe->user_data, ret);
2471 }
2472 return 0;
2473 }
2474
2475 return __io_queue_sqe(ctx, req, s);
2476 }
2477
2478 static int io_queue_link_head(struct io_ring_ctx *ctx, struct io_kiocb *req,
2479 struct sqe_submit *s, struct io_kiocb *shadow)
2480 {
2481 int ret;
2482 int need_submit = false;
2483
2484 if (!shadow)
2485 return io_queue_sqe(ctx, req, s);
2486
2487 /*
2488 * Mark the first IO in the link list as DRAIN so that all the following
2489 * IOs enter the defer list. All IO before the link needs to be completed
2490 * before the link list itself runs.
2491 */
2492 req->flags |= REQ_F_IO_DRAIN;
2493 ret = io_req_defer(ctx, req, s);
2494 if (ret) {
2495 if (ret != -EIOCBQUEUED) {
2496 io_free_req(req);
2497 __io_free_req(shadow);
2498 io_cqring_add_event(ctx, s->sqe->user_data, ret);
2499 return 0;
2500 }
2501 } else {
2502 /*
2503 * ret == 0 means that all IOs in front of the link IO have
2504 * completed, so let's queue the link head.
2505 */
2506 need_submit = true;
2507 }
2508
2509 /* Insert shadow req to defer_list, blocking next IOs */
2510 spin_lock_irq(&ctx->completion_lock);
2511 list_add_tail(&shadow->list, &ctx->defer_list);
2512 spin_unlock_irq(&ctx->completion_lock);
2513
2514 if (need_submit)
2515 return __io_queue_sqe(ctx, req, s);
2516
2517 return 0;
2518 }
2519
2520 #define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK)
2521
2522 static void io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
2523 struct io_submit_state *state, struct io_kiocb **link)
2524 {
2525 struct io_uring_sqe *sqe_copy;
2526 struct io_kiocb *req;
2527 int ret;
2528
2529 /* enforce forwards compatibility on users */
2530 if (unlikely(s->sqe->flags & ~SQE_VALID_FLAGS)) {
2531 ret = -EINVAL;
2532 goto err;
2533 }
2534
2535 req = io_get_req(ctx, state);
2536 if (unlikely(!req)) {
2537 ret = -EAGAIN;
2538 goto err;
2539 }
2540
2541 ret = io_req_set_file(ctx, s, state, req);
2542 if (unlikely(ret)) {
2543 err_req:
2544 io_free_req(req);
2545 err:
2546 io_cqring_add_event(ctx, s->sqe->user_data, ret);
2547 return;
2548 }
2549
2550 req->user_data = s->sqe->user_data;
2551
2552 #if defined(CONFIG_NET)
2553 switch (READ_ONCE(s->sqe->opcode)) {
2554 case IORING_OP_SENDMSG:
2555 case IORING_OP_RECVMSG:
2556 spin_lock(&current->fs->lock);
2557 if (!current->fs->in_exec) {
2558 req->fs = current->fs;
2559 req->fs->users++;
2560 }
2561 spin_unlock(&current->fs->lock);
2562 if (!req->fs) {
2563 ret = -EAGAIN;
2564 goto err_req;
2565 }
2566 }
2567 #endif
2568
2569 /*
2570 * If we already have a head request, queue this one for async
2571 * submittal once the head completes. If we don't have a head but
2572 * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be
2573 * submitted sync once the chain is complete. If none of those
2574 * conditions are true (normal request), then just queue it.
2575 */
2576 if (*link) {
2577 struct io_kiocb *prev = *link;
2578
2579 sqe_copy = kmemdup(s->sqe, sizeof(*sqe_copy), GFP_KERNEL);
2580 if (!sqe_copy) {
2581 ret = -EAGAIN;
2582 goto err_req;
2583 }
2584
2585 s->sqe = sqe_copy;
2586 memcpy(&req->submit, s, sizeof(*s));
2587 list_add_tail(&req->list, &prev->link_list);
2588 } else if (s->sqe->flags & IOSQE_IO_LINK) {
2589 req->flags |= REQ_F_LINK;
2590
2591 memcpy(&req->submit, s, sizeof(*s));
2592 INIT_LIST_HEAD(&req->link_list);
2593 *link = req;
2594 } else {
2595 io_queue_sqe(ctx, req, s);
2596 }
2597 }
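
/*
 * Illustrative userspace sketch (not part of this kernel source) of the
 * chaining handled by io_submit_sqe()/io_queue_link_head() above:
 * IOSQE_IO_LINK chains requests, IOSQE_IO_DRAIN orders against all prior
 * submissions. fill_sqe() is an assumed application helper that grabs
 * and zeroes the next free sqe.
 *
 *	struct io_uring_sqe *sqe;
 *
 *	sqe = fill_sqe();			// 1st request
 *	sqe->opcode = IORING_OP_READV;
 *	sqe->flags |= IOSQE_IO_LINK;		// next sqe runs after this one
 *
 *	sqe = fill_sqe();			// 2nd request, end of the chain
 *	sqe->opcode = IORING_OP_FSYNC;
 *
 *	sqe = fill_sqe();			// waits for everything before it
 *	sqe->opcode = IORING_OP_NOP;
 *	sqe->flags |= IOSQE_IO_DRAIN;
 */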
2598
2599 /*
2600 * Batched submission is done, ensure local IO is flushed out.
2601 */
2602 static void io_submit_state_end(struct io_submit_state *state)
2603 {
2604 blk_finish_plug(&state->plug);
2605 io_file_put(state);
2606 if (state->free_reqs)
2607 kmem_cache_free_bulk(req_cachep, state->free_reqs,
2608 &state->reqs[state->cur_req]);
2609 }
2610
2611 /*
2612 * Start submission side cache.
2613 */
2614 static void io_submit_state_start(struct io_submit_state *state,
2615 struct io_ring_ctx *ctx, unsigned max_ios)
2616 {
2617 blk_start_plug(&state->plug);
2618 state->free_reqs = 0;
2619 state->file = NULL;
2620 state->ios_left = max_ios;
2621 }
2622
2623 static void io_commit_sqring(struct io_ring_ctx *ctx)
2624 {
2625 struct io_rings *rings = ctx->rings;
2626
2627 if (ctx->cached_sq_head != READ_ONCE(rings->sq.head)) {
2628 /*
2629 * Ensure any loads from the SQEs are done at this point,
2630 * since once we write the new head, the application could
2631 * write new data to them.
2632 */
2633 smp_store_release(&rings->sq.head, ctx->cached_sq_head);
2634 }
2635 }
2636
2637 /*
2638 * Fetch an sqe, if one is available. Note that s->sqe will point to memory
2639 * that is mapped by userspace. This means that care needs to be taken to
2640 * ensure that reads are stable, as we cannot rely on userspace always
2641 * being a good citizen. If members of the sqe are validated and then later
2642 * used, it's important that those reads are done through READ_ONCE() to
2643 * prevent a re-load down the line.
2644 */
2645 static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s)
2646 {
2647 struct io_rings *rings = ctx->rings;
2648 u32 *sq_array = ctx->sq_array;
2649 unsigned head;
2650
2651 /*
2652 * The cached sq head (or cq tail) serves two purposes:
2653 *
2654 * 1) allows us to batch the cost of updating the user visible
2655 * head.
2656 * 2) allows the kernel side to track the head on its own, even
2657 * though the application is the one updating it.
2658 */
2659 head = ctx->cached_sq_head;
2660 /* make sure SQ entry isn't read before tail */
2661 if (head == smp_load_acquire(&rings->sq.tail))
2662 return false;
2663
2664 head = READ_ONCE(sq_array[head & ctx->sq_mask]);
2665 if (head < ctx->sq_entries) {
2666 s->index = head;
2667 s->sqe = &ctx->sq_sqes[head];
2668 s->sequence = ctx->cached_sq_head;
2669 ctx->cached_sq_head++;
2670 return true;
2671 }
2672
2673 /* drop invalid entries */
2674 ctx->cached_sq_head++;
2675 ctx->cached_sq_dropped++;
2676 WRITE_ONCE(rings->sq_dropped, ctx->cached_sq_dropped);
2677 return false;
2678 }
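
/*
 * Illustrative userspace sketch (not part of this kernel source) of the
 * application side that io_get_sqring() consumes. sq_tail, sq_ring_mask,
 * sq_array and sqes are assumed to be pointers into the mmap'ed SQ ring
 * and sqe array (see io_uring_mmap() below); the release store pairs
 * with the smp_load_acquire() on the tail above.
 *
 *	unsigned tail = *sq_tail;		// only userspace writes the tail
 *	unsigned index = tail & *sq_ring_mask;
 *
 *	fill_sqe(&sqes[index]);			// assumed helper: prepare the sqe
 *	sq_array[index] = index;		// publish which sqe slot to use
 *	__atomic_store_n(sq_tail, tail + 1, __ATOMIC_RELEASE);
 */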
2679
2680 static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
2681 bool has_user, bool mm_fault)
2682 {
2683 struct io_submit_state state, *statep = NULL;
2684 struct io_kiocb *link = NULL;
2685 struct io_kiocb *shadow_req = NULL;
2686 bool prev_was_link = false;
2687 int i, submitted = 0;
2688
2689 if (nr > IO_PLUG_THRESHOLD) {
2690 io_submit_state_start(&state, ctx, nr);
2691 statep = &state;
2692 }
2693
2694 for (i = 0; i < nr; i++) {
2695 struct sqe_submit s;
2696
2697 if (!io_get_sqring(ctx, &s))
2698 break;
2699
2700 /*
2701 * If the previous sqe wasn't linked and we have a pending link
2702 * chain, that chain ends with it. Submit the previous link.
2703 */
2704 if (!prev_was_link && link) {
2705 io_queue_link_head(ctx, link, &link->submit, shadow_req);
2706 link = NULL;
2707 shadow_req = NULL;
2708 }
2709 prev_was_link = (s.sqe->flags & IOSQE_IO_LINK) != 0;
2710
2711 if (link && (s.sqe->flags & IOSQE_IO_DRAIN)) {
2712 if (!shadow_req) {
2713 shadow_req = io_get_req(ctx, NULL);
2714 if (unlikely(!shadow_req))
2715 goto out;
2716 shadow_req->flags |= (REQ_F_IO_DRAIN | REQ_F_SHADOW_DRAIN);
2717 refcount_dec(&shadow_req->refs);
2718 }
2719 shadow_req->sequence = s.sequence;
2720 }
2721
2722 out:
2723 if (unlikely(mm_fault)) {
2724 io_cqring_add_event(ctx, s.sqe->user_data,
2725 -EFAULT);
2726 } else {
2727 s.has_user = has_user;
2728 s.needs_lock = true;
2729 s.needs_fixed_file = true;
2730 io_submit_sqe(ctx, &s, statep, &link);
2731 submitted++;
2732 }
2733 }
2734
2735 if (link)
2736 io_queue_link_head(ctx, link, &link->submit, shadow_req);
2737 if (statep)
2738 io_submit_state_end(&state);
2739
2740 return submitted;
2741 }
2742
2743 static int io_sq_thread(void *data)
2744 {
2745 struct io_ring_ctx *ctx = data;
2746 struct mm_struct *cur_mm = NULL;
2747 const struct cred *old_cred;
2748 mm_segment_t old_fs;
2749 DEFINE_WAIT(wait);
2750 unsigned inflight;
2751 unsigned long timeout;
2752
2753 complete(&ctx->sqo_thread_started);
2754
2755 old_fs = get_fs();
2756 set_fs(USER_DS);
2757 old_cred = override_creds(ctx->creds);
2758
2759 timeout = inflight = 0;
2760 while (!kthread_should_park()) {
2761 bool mm_fault = false;
2762 unsigned int to_submit;
2763
2764 if (inflight) {
2765 unsigned nr_events = 0;
2766
2767 if (ctx->flags & IORING_SETUP_IOPOLL) {
2768 /*
2769 * inflight is the count of the maximum possible
2770 * entries we submitted, but it can be smaller
2771 * if we dropped some of them. If we don't have
2772 * poll entries available, then we know that we
2773 * have nothing left to poll for. Reset the
2774 * inflight count to zero in that case.
2775 */
2776 mutex_lock(&ctx->uring_lock);
2777 if (!list_empty(&ctx->poll_list))
2778 io_iopoll_getevents(ctx, &nr_events, 0);
2779 else
2780 inflight = 0;
2781 mutex_unlock(&ctx->uring_lock);
2782 } else {
2783 /*
2784 * Normal IO, just pretend everything completed.
2785 * We don't have to poll completions for that.
2786 */
2787 nr_events = inflight;
2788 }
2789
2790 inflight -= nr_events;
2791 if (!inflight)
2792 timeout = jiffies + ctx->sq_thread_idle;
2793 }
2794
2795 to_submit = io_sqring_entries(ctx);
2796 if (!to_submit) {
2797 /*
2798 * Drop cur_mm before scheduling, we can't hold it for
2799 * long periods (or over schedule()). Do this before
2800 * adding ourselves to the waitqueue, as the unuse/drop
2801 * may sleep.
2802 */
2803 if (cur_mm) {
2804 unuse_mm(cur_mm);
2805 mmput(cur_mm);
2806 cur_mm = NULL;
2807 }
2808
2809 /*
2810 * We're polling. If we're within the defined idle
2811 * period, then let us spin without work before going
2812 * to sleep.
2813 */
2814 if (inflight || !time_after(jiffies, timeout)) {
2815 cond_resched();
2816 continue;
2817 }
2818
2819 prepare_to_wait(&ctx->sqo_wait, &wait,
2820 TASK_INTERRUPTIBLE);
2821
2822 /* Tell userspace we may need a wakeup call */
2823 ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP;
2824 /* make sure to read SQ tail after writing flags */
2825 smp_mb();
2826
2827 to_submit = io_sqring_entries(ctx);
2828 if (!to_submit) {
2829 if (kthread_should_park()) {
2830 finish_wait(&ctx->sqo_wait, &wait);
2831 break;
2832 }
2833 if (signal_pending(current))
2834 flush_signals(current);
2835 schedule();
2836 finish_wait(&ctx->sqo_wait, &wait);
2837
2838 ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
2839 continue;
2840 }
2841 finish_wait(&ctx->sqo_wait, &wait);
2842
2843 ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
2844 }
2845
2846 /* Unless all new commands are FIXED regions, grab mm */
2847 if (!cur_mm) {
2848 mm_fault = !mmget_not_zero(ctx->sqo_mm);
2849 if (!mm_fault) {
2850 use_mm(ctx->sqo_mm);
2851 cur_mm = ctx->sqo_mm;
2852 }
2853 }
2854
2855 to_submit = min(to_submit, ctx->sq_entries);
2856 inflight += io_submit_sqes(ctx, to_submit, cur_mm != NULL,
2857 mm_fault);
2858
2859 /* Commit SQ ring head once we've consumed all SQEs */
2860 io_commit_sqring(ctx);
2861 }
2862
2863 set_fs(old_fs);
2864 if (cur_mm) {
2865 unuse_mm(cur_mm);
2866 mmput(cur_mm);
2867 }
2868 revert_creds(old_cred);
2869
2870 kthread_parkme();
2871
2872 return 0;
2873 }
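
/*
 * Illustrative userspace sketch (not part of this kernel source) of the
 * wakeup handshake with the SQPOLL thread above: after updating the SQ
 * tail, a full barrier is needed before reading sq_flags (see the note
 * at the top of this file). ring_fd, sq_tail and sq_flags are assumed to
 * come from io_uring_setup() and the mmap'ed SQ ring.
 *
 *	__atomic_store_n(sq_tail, new_tail, __ATOMIC_RELEASE);
 *	__sync_synchronize();			// full barrier before the flags read
 *	if (*sq_flags & IORING_SQ_NEED_WAKEUP)
 *		syscall(__NR_io_uring_enter, ring_fd, 0, 0,
 *			IORING_ENTER_SQ_WAKEUP, NULL, 0);
 */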
2874
2875 static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit)
2876 {
2877 struct io_submit_state state, *statep = NULL;
2878 struct io_kiocb *link = NULL;
2879 struct io_kiocb *shadow_req = NULL;
2880 bool prev_was_link = false;
2881 int i, submit = 0;
2882
2883 if (to_submit > IO_PLUG_THRESHOLD) {
2884 io_submit_state_start(&state, ctx, to_submit);
2885 statep = &state;
2886 }
2887
2888 for (i = 0; i < to_submit; i++) {
2889 struct sqe_submit s;
2890
2891 if (!io_get_sqring(ctx, &s))
2892 break;
2893
2894 /*
2895 * If the previous sqe wasn't linked and we have a pending link
2896 * chain, that chain ends with it. Submit the previous link.
2897 */
2898 if (!prev_was_link && link) {
2899 io_queue_link_head(ctx, link, &link->submit, shadow_req);
2900 link = NULL;
2901 shadow_req = NULL;
2902 }
2903 prev_was_link = (s.sqe->flags & IOSQE_IO_LINK) != 0;
2904
2905 if (link && (s.sqe->flags & IOSQE_IO_DRAIN)) {
2906 if (!shadow_req) {
2907 shadow_req = io_get_req(ctx, NULL);
2908 if (unlikely(!shadow_req))
2909 goto out;
2910 shadow_req->flags |= (REQ_F_IO_DRAIN | REQ_F_SHADOW_DRAIN);
2911 refcount_dec(&shadow_req->refs);
2912 }
2913 shadow_req->sequence = s.sequence;
2914 }
2915
2916 out:
2917 s.has_user = true;
2918 s.needs_lock = false;
2919 s.needs_fixed_file = false;
2920 submit++;
2921 io_submit_sqe(ctx, &s, statep, &link);
2922 }
2923
2924 if (link)
2925 io_queue_link_head(ctx, link, &link->submit, shadow_req);
2926 if (statep)
2927 io_submit_state_end(statep);
2928
2929 io_commit_sqring(ctx);
2930
2931 return submit;
2932 }
2933
2934 struct io_wait_queue {
2935 struct wait_queue_entry wq;
2936 struct io_ring_ctx *ctx;
2937 unsigned to_wait;
2938 unsigned nr_timeouts;
2939 };
2940
2941 static inline bool io_should_wake(struct io_wait_queue *iowq)
2942 {
2943 struct io_ring_ctx *ctx = iowq->ctx;
2944
2945 /*
2946 * Wake up if we have enough events, or if a timeout occurred since we
2947 * started waiting. For timeouts, we always want to return to userspace,
2948 * regardless of event count.
2949 */
2950 return io_cqring_events(ctx->rings) >= iowq->to_wait ||
2951 atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
2952 }
2953
2954 static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
2955 int wake_flags, void *key)
2956 {
2957 struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
2958 wq);
2959
2960 if (!io_should_wake(iowq))
2961 return -1;
2962
2963 return autoremove_wake_function(curr, mode, wake_flags, key);
2964 }
2965
2966 /*
2967 * Wait until events become available, if we don't already have some. The
2968 * application must reap them itself, as they reside on the shared cq ring.
2969 */
2970 static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
2971 const sigset_t __user *sig, size_t sigsz)
2972 {
2973 struct io_wait_queue iowq = {
2974 .wq = {
2975 .private = current,
2976 .func = io_wake_function,
2977 .entry = LIST_HEAD_INIT(iowq.wq.entry),
2978 },
2979 .ctx = ctx,
2980 .to_wait = min_events,
2981 };
2982 struct io_rings *rings = ctx->rings;
2983 int ret;
2984
2985 if (io_cqring_events(rings) >= min_events)
2986 return 0;
2987
2988 if (sig) {
2989 #ifdef CONFIG_COMPAT
2990 if (in_compat_syscall())
2991 ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
2992 sigsz);
2993 else
2994 #endif
2995 ret = set_user_sigmask(sig, sigsz);
2996
2997 if (ret)
2998 return ret;
2999 }
3000
3001 ret = 0;
3002 iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
3003 do {
3004 prepare_to_wait_exclusive(&ctx->wait, &iowq.wq,
3005 TASK_INTERRUPTIBLE);
3006 if (io_should_wake(&iowq))
3007 break;
3008 schedule();
3009 if (signal_pending(current)) {
3010 ret = -ERESTARTSYS;
3011 break;
3012 }
3013 } while (1);
3014 finish_wait(&ctx->wait, &iowq.wq);
3015
3016 restore_saved_sigmask_unless(ret == -ERESTARTSYS);
3017 if (ret == -ERESTARTSYS)
3018 ret = -EINTR;
3019
3020 return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
3021 }
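
/*
 * Illustrative userspace sketch (not part of this kernel source) of
 * reaping the completions that io_cqring_wait() returns for. cq_head,
 * cq_tail, cq_ring_mask and cqes are assumed to be pointers into the
 * mmap'ed CQ ring; the acquire/release pairing matches the barrier
 * notes at the top of this file.
 *
 *	unsigned head = *cq_head;
 *	unsigned tail = __atomic_load_n(cq_tail, __ATOMIC_ACQUIRE);
 *
 *	while (head != tail) {
 *		struct io_uring_cqe *cqe = &cqes[head & *cq_ring_mask];
 *
 *		handle_completion(cqe->user_data, cqe->res);	// assumed helper
 *		head++;
 *	}
 *	__atomic_store_n(cq_head, head, __ATOMIC_RELEASE);
 */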
3022
3023 static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
3024 {
3025 #if defined(CONFIG_UNIX)
3026 if (ctx->ring_sock) {
3027 struct sock *sock = ctx->ring_sock->sk;
3028 struct sk_buff *skb;
3029
3030 while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
3031 kfree_skb(skb);
3032 }
3033 #else
3034 int i;
3035
3036 for (i = 0; i < ctx->nr_user_files; i++)
3037 fput(ctx->user_files[i]);
3038 #endif
3039 }
3040
3041 static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
3042 {
3043 if (!ctx->user_files)
3044 return -ENXIO;
3045
3046 __io_sqe_files_unregister(ctx);
3047 kfree(ctx->user_files);
3048 ctx->user_files = NULL;
3049 ctx->nr_user_files = 0;
3050 return 0;
3051 }
3052
3053 static void io_sq_thread_stop(struct io_ring_ctx *ctx)
3054 {
3055 if (ctx->sqo_thread) {
3056 wait_for_completion(&ctx->sqo_thread_started);
3057 /*
3058 * The park is a bit of a work-around, without it we get
3059 * warning spews on shutdown with SQPOLL set and affinity
3060 * set to a single CPU.
3061 */
3062 kthread_park(ctx->sqo_thread);
3063 kthread_stop(ctx->sqo_thread);
3064 ctx->sqo_thread = NULL;
3065 }
3066 }
3067
3068 static void io_finish_async(struct io_ring_ctx *ctx)
3069 {
3070 int i;
3071
3072 io_sq_thread_stop(ctx);
3073
3074 for (i = 0; i < ARRAY_SIZE(ctx->sqo_wq); i++) {
3075 if (ctx->sqo_wq[i]) {
3076 destroy_workqueue(ctx->sqo_wq[i]);
3077 ctx->sqo_wq[i] = NULL;
3078 }
3079 }
3080 }
3081
3082 #if defined(CONFIG_UNIX)
3083 static void io_destruct_skb(struct sk_buff *skb)
3084 {
3085 struct io_ring_ctx *ctx = skb->sk->sk_user_data;
3086 int i;
3087
3088 for (i = 0; i < ARRAY_SIZE(ctx->sqo_wq); i++)
3089 if (ctx->sqo_wq[i])
3090 flush_workqueue(ctx->sqo_wq[i]);
3091
3092 unix_destruct_scm(skb);
3093 }
3094
3095 /*
3096 * Ensure the UNIX gc is aware of our file set, so we are certain that
3097 * the io_uring can be safely unregistered on process exit, even if we have
3098 * loops in the file referencing.
3099 */
3100 static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
3101 {
3102 struct sock *sk = ctx->ring_sock->sk;
3103 struct scm_fp_list *fpl;
3104 struct sk_buff *skb;
3105 int i;
3106
3107 fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
3108 if (!fpl)
3109 return -ENOMEM;
3110
3111 skb = alloc_skb(0, GFP_KERNEL);
3112 if (!skb) {
3113 kfree(fpl);
3114 return -ENOMEM;
3115 }
3116
3117 skb->sk = sk;
3118 skb->destructor = io_destruct_skb;
3119
3120 fpl->user = get_uid(ctx->user);
3121 for (i = 0; i < nr; i++) {
3122 fpl->fp[i] = get_file(ctx->user_files[i + offset]);
3123 unix_inflight(fpl->user, fpl->fp[i]);
3124 }
3125
3126 fpl->max = fpl->count = nr;
3127 UNIXCB(skb).fp = fpl;
3128 refcount_add(skb->truesize, &sk->sk_wmem_alloc);
3129 skb_queue_head(&sk->sk_receive_queue, skb);
3130
3131 for (i = 0; i < nr; i++)
3132 fput(fpl->fp[i]);
3133
3134 return 0;
3135 }
3136
3137 /*
3138 * If UNIX sockets are enabled, fd passing can cause a reference cycle which
3139 * causes regular reference counting to break down. We rely on the UNIX
3140 * garbage collection to take care of this problem for us.
3141 */
3142 static int io_sqe_files_scm(struct io_ring_ctx *ctx)
3143 {
3144 unsigned left, total;
3145 int ret = 0;
3146
3147 total = 0;
3148 left = ctx->nr_user_files;
3149 while (left) {
3150 unsigned this_files = min_t(unsigned, left, SCM_MAX_FD);
3151
3152 ret = __io_sqe_files_scm(ctx, this_files, total);
3153 if (ret)
3154 break;
3155 left -= this_files;
3156 total += this_files;
3157 }
3158
3159 if (!ret)
3160 return 0;
3161
3162 while (total < ctx->nr_user_files) {
3163 fput(ctx->user_files[total]);
3164 total++;
3165 }
3166
3167 return ret;
3168 }
3169 #else
3170 static int io_sqe_files_scm(struct io_ring_ctx *ctx)
3171 {
3172 return 0;
3173 }
3174 #endif
3175
3176 static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
3177 unsigned nr_args)
3178 {
3179 __s32 __user *fds = (__s32 __user *) arg;
3180 int fd, ret = 0;
3181 unsigned i;
3182
3183 if (ctx->user_files)
3184 return -EBUSY;
3185 if (!nr_args)
3186 return -EINVAL;
3187 if (nr_args > IORING_MAX_FIXED_FILES)
3188 return -EMFILE;
3189
3190 ctx->user_files = kcalloc(nr_args, sizeof(struct file *), GFP_KERNEL);
3191 if (!ctx->user_files)
3192 return -ENOMEM;
3193
3194 for (i = 0; i < nr_args; i++) {
3195 ret = -EFAULT;
3196 if (copy_from_user(&fd, &fds[i], sizeof(fd)))
3197 break;
3198
3199 ctx->user_files[i] = fget(fd);
3200
3201 ret = -EBADF;
3202 if (!ctx->user_files[i])
3203 break;
3204 /*
3205 * Don't allow io_uring instances to be registered. If UNIX
3206 * isn't enabled, then this causes a reference cycle and this
3207 * instance can never get freed. If UNIX is enabled we'll
3208 * handle it just fine, but there's still no point in allowing
3209 * a ring fd as it doesn't support regular read/write anyway.
3210 */
3211 if (ctx->user_files[i]->f_op == &io_uring_fops) {
3212 fput(ctx->user_files[i]);
3213 break;
3214 }
3215 ctx->nr_user_files++;
3216 ret = 0;
3217 }
3218
3219 if (ret) {
3220 for (i = 0; i < ctx->nr_user_files; i++)
3221 fput(ctx->user_files[i]);
3222
3223 kfree(ctx->user_files);
3224 ctx->user_files = NULL;
3225 ctx->nr_user_files = 0;
3226 return ret;
3227 }
3228
3229 ret = io_sqe_files_scm(ctx);
3230 if (ret)
3231 io_sqe_files_unregister(ctx);
3232
3233 return ret;
3234 }
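
/*
 * Illustrative userspace sketch (not part of this kernel source) of
 * feeding io_sqe_files_register() above and then referring to a fixed
 * file by index. ring_fd is the fd returned by io_uring_setup();
 * infd/outfd are assumed open file descriptors.
 *
 *	int fds[2] = { infd, outfd };
 *
 *	if (syscall(__NR_io_uring_register, ring_fd,
 *		    IORING_REGISTER_FILES, fds, 2) < 0)
 *		perror("IORING_REGISTER_FILES");
 *
 *	sqe->fd = 1;				// index into fds[], not a real fd
 *	sqe->flags |= IOSQE_FIXED_FILE;
 */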
3235
3236 static int io_sq_offload_start(struct io_ring_ctx *ctx,
3237 struct io_uring_params *p)
3238 {
3239 int ret;
3240
3241 mmgrab(current->mm);
3242 ctx->sqo_mm = current->mm;
3243
3244 if (ctx->flags & IORING_SETUP_SQPOLL) {
3245 ret = -EPERM;
3246 if (!capable(CAP_SYS_ADMIN))
3247 goto err;
3248
3249 ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
3250 if (!ctx->sq_thread_idle)
3251 ctx->sq_thread_idle = HZ;
3252
3253 if (p->flags & IORING_SETUP_SQ_AFF) {
3254 int cpu = p->sq_thread_cpu;
3255
3256 ret = -EINVAL;
3257 if (cpu >= nr_cpu_ids)
3258 goto err;
3259 if (!cpu_online(cpu))
3260 goto err;
3261
3262 ctx->sqo_thread = kthread_create_on_cpu(io_sq_thread,
3263 ctx, cpu,
3264 "io_uring-sq");
3265 } else {
3266 ctx->sqo_thread = kthread_create(io_sq_thread, ctx,
3267 "io_uring-sq");
3268 }
3269 if (IS_ERR(ctx->sqo_thread)) {
3270 ret = PTR_ERR(ctx->sqo_thread);
3271 ctx->sqo_thread = NULL;
3272 goto err;
3273 }
3274 wake_up_process(ctx->sqo_thread);
3275 } else if (p->flags & IORING_SETUP_SQ_AFF) {
3276 /* Can't have SQ_AFF without SQPOLL */
3277 ret = -EINVAL;
3278 goto err;
3279 }
3280
3281 /* Do QD, or 2 * CPUs, whichever is smallest */
3282 ctx->sqo_wq[0] = alloc_workqueue("io_ring-wq",
3283 WQ_UNBOUND | WQ_FREEZABLE,
3284 min(ctx->sq_entries - 1, 2 * num_online_cpus()));
3285 if (!ctx->sqo_wq[0]) {
3286 ret = -ENOMEM;
3287 goto err;
3288 }
3289
3290 /*
3291 * This is for buffered writes, where we want to limit the parallelism
3292 * due to file locking in file systems. As "normal" buffered writes
3293 * should parallelize on writeout quite nicely, limit us to having 2
3294 * pending. This avoids massive contention on the inode when doing
3295 * buffered async writes.
3296 */
3297 ctx->sqo_wq[1] = alloc_workqueue("io_ring-write-wq",
3298 WQ_UNBOUND | WQ_FREEZABLE, 2);
3299 if (!ctx->sqo_wq[1]) {
3300 ret = -ENOMEM;
3301 goto err;
3302 }
3303
3304 return 0;
3305 err:
3306 io_finish_async(ctx);
3307 mmdrop(ctx->sqo_mm);
3308 ctx->sqo_mm = NULL;
3309 return ret;
3310 }
3311
3312 static void io_unaccount_mem(struct user_struct *user, unsigned long nr_pages)
3313 {
3314 atomic_long_sub(nr_pages, &user->locked_vm);
3315 }
3316
3317 static int io_account_mem(struct user_struct *user, unsigned long nr_pages)
3318 {
3319 unsigned long page_limit, cur_pages, new_pages;
3320
3321 /* Don't allow more pages than we can safely lock */
3322 page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
3323
3324 do {
3325 cur_pages = atomic_long_read(&user->locked_vm);
3326 new_pages = cur_pages + nr_pages;
3327 if (new_pages > page_limit)
3328 return -ENOMEM;
3329 } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
3330 new_pages) != cur_pages);
3331
3332 return 0;
3333 }
3334
3335 static void io_mem_free(void *ptr)
3336 {
3337 struct page *page;
3338
3339 if (!ptr)
3340 return;
3341
3342 page = virt_to_head_page(ptr);
3343 if (put_page_testzero(page))
3344 free_compound_page(page);
3345 }
3346
3347 static void *io_mem_alloc(size_t size)
3348 {
3349 gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP |
3350 __GFP_NORETRY;
3351
3352 return (void *) __get_free_pages(gfp_flags, get_order(size));
3353 }
3354
3355 static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
3356 size_t *sq_offset)
3357 {
3358 struct io_rings *rings;
3359 size_t off, sq_array_size;
3360
3361 off = struct_size(rings, cqes, cq_entries);
3362 if (off == SIZE_MAX)
3363 return SIZE_MAX;
3364
3365 #ifdef CONFIG_SMP
3366 off = ALIGN(off, SMP_CACHE_BYTES);
3367 if (off == 0)
3368 return SIZE_MAX;
3369 #endif
3370
3371 sq_array_size = array_size(sizeof(u32), sq_entries);
3372 if (sq_array_size == SIZE_MAX)
3373 return SIZE_MAX;
3374
3375 if (check_add_overflow(off, sq_array_size, &off))
3376 return SIZE_MAX;
3377
3378 if (sq_offset)
3379 *sq_offset = off;
3380
3381 return off;
3382 }
3383
3384 static unsigned long ring_pages(unsigned sq_entries, unsigned cq_entries)
3385 {
3386 size_t pages;
3387
3388 pages = (size_t)1 << get_order(
3389 rings_size(sq_entries, cq_entries, NULL));
3390 pages += (size_t)1 << get_order(
3391 array_size(sizeof(struct io_uring_sqe), sq_entries));
3392
3393 return pages;
3394 }
3395
3396 static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx)
3397 {
3398 int i, j;
3399
3400 if (!ctx->user_bufs)
3401 return -ENXIO;
3402
3403 for (i = 0; i < ctx->nr_user_bufs; i++) {
3404 struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
3405
3406 for (j = 0; j < imu->nr_bvecs; j++)
3407 put_user_page(imu->bvec[j].bv_page);
3408
3409 if (ctx->account_mem)
3410 io_unaccount_mem(ctx->user, imu->nr_bvecs);
3411 kvfree(imu->bvec);
3412 imu->nr_bvecs = 0;
3413 }
3414
3415 kfree(ctx->user_bufs);
3416 ctx->user_bufs = NULL;
3417 ctx->nr_user_bufs = 0;
3418 return 0;
3419 }
3420
3421 static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
3422 void __user *arg, unsigned index)
3423 {
3424 struct iovec __user *src;
3425
3426 #ifdef CONFIG_COMPAT
3427 if (ctx->compat) {
3428 struct compat_iovec __user *ciovs;
3429 struct compat_iovec ciov;
3430
3431 ciovs = (struct compat_iovec __user *) arg;
3432 if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
3433 return -EFAULT;
3434
3435 dst->iov_base = (void __user *) (unsigned long) ciov.iov_base;
3436 dst->iov_len = ciov.iov_len;
3437 return 0;
3438 }
3439 #endif
3440 src = (struct iovec __user *) arg;
3441 if (copy_from_user(dst, &src[index], sizeof(*dst)))
3442 return -EFAULT;
3443 return 0;
3444 }
3445
3446 static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
3447 unsigned nr_args)
3448 {
3449 struct vm_area_struct **vmas = NULL;
3450 struct page **pages = NULL;
3451 int i, j, got_pages = 0;
3452 int ret = -EINVAL;
3453
3454 if (ctx->user_bufs)
3455 return -EBUSY;
3456 if (!nr_args || nr_args > UIO_MAXIOV)
3457 return -EINVAL;
3458
3459 ctx->user_bufs = kcalloc(nr_args, sizeof(struct io_mapped_ubuf),
3460 GFP_KERNEL);
3461 if (!ctx->user_bufs)
3462 return -ENOMEM;
3463
3464 for (i = 0; i < nr_args; i++) {
3465 struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
3466 unsigned long off, start, end, ubuf;
3467 int pret, nr_pages;
3468 struct iovec iov;
3469 size_t size;
3470
3471 ret = io_copy_iov(ctx, &iov, arg, i);
3472 if (ret)
3473 goto err;
3474
3475 /*
3476 * Don't impose further limits on the size and buffer
3477 * constraints here; we'll -EINVAL later when IO is
3478 * submitted if they are wrong.
3479 */
3480 ret = -EFAULT;
3481 if (!iov.iov_base || !iov.iov_len)
3482 goto err;
3483
3484 /* arbitrary limit, but we need something */
3485 if (iov.iov_len > SZ_1G)
3486 goto err;
3487
3488 ubuf = (unsigned long) iov.iov_base;
3489 end = (ubuf + iov.iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
3490 start = ubuf >> PAGE_SHIFT;
3491 nr_pages = end - start;
3492
3493 if (ctx->account_mem) {
3494 ret = io_account_mem(ctx->user, nr_pages);
3495 if (ret)
3496 goto err;
3497 }
3498
3499 ret = 0;
3500 if (!pages || nr_pages > got_pages) {
3501 kfree(vmas);
3502 kfree(pages);
3503 pages = kvmalloc_array(nr_pages, sizeof(struct page *),
3504 GFP_KERNEL);
3505 vmas = kvmalloc_array(nr_pages,
3506 sizeof(struct vm_area_struct *),
3507 GFP_KERNEL);
3508 if (!pages || !vmas) {
3509 ret = -ENOMEM;
3510 if (ctx->account_mem)
3511 io_unaccount_mem(ctx->user, nr_pages);
3512 goto err;
3513 }
3514 got_pages = nr_pages;
3515 }
3516
3517 imu->bvec = kvmalloc_array(nr_pages, sizeof(struct bio_vec),
3518 GFP_KERNEL);
3519 ret = -ENOMEM;
3520 if (!imu->bvec) {
3521 if (ctx->account_mem)
3522 io_unaccount_mem(ctx->user, nr_pages);
3523 goto err;
3524 }
3525
3526 ret = 0;
3527 down_read(&current->mm->mmap_sem);
3528 pret = get_user_pages(ubuf, nr_pages,
3529 FOLL_WRITE | FOLL_LONGTERM,
3530 pages, vmas);
3531 if (pret == nr_pages) {
3532 /* don't support file backed memory */
3533 for (j = 0; j < nr_pages; j++) {
3534 struct vm_area_struct *vma = vmas[j];
3535
3536 if (vma->vm_file &&
3537 !is_file_hugepages(vma->vm_file)) {
3538 ret = -EOPNOTSUPP;
3539 break;
3540 }
3541 }
3542 } else {
3543 ret = pret < 0 ? pret : -EFAULT;
3544 }
3545 up_read(&current->mm->mmap_sem);
3546 if (ret) {
3547 /*
3548 * if we did partial map, or found file backed vmas,
3549 * release any pages we did get
3550 */
3551 if (pret > 0)
3552 put_user_pages(pages, pret);
3553 if (ctx->account_mem)
3554 io_unaccount_mem(ctx->user, nr_pages);
3555 kvfree(imu->bvec);
3556 goto err;
3557 }
3558
3559 off = ubuf & ~PAGE_MASK;
3560 size = iov.iov_len;
3561 for (j = 0; j < nr_pages; j++) {
3562 size_t vec_len;
3563
3564 vec_len = min_t(size_t, size, PAGE_SIZE - off);
3565 imu->bvec[j].bv_page = pages[j];
3566 imu->bvec[j].bv_len = vec_len;
3567 imu->bvec[j].bv_offset = off;
3568 off = 0;
3569 size -= vec_len;
3570 }
3571 /* store original address for later verification */
3572 imu->ubuf = ubuf;
3573 imu->len = iov.iov_len;
3574 imu->nr_bvecs = nr_pages;
3575
3576 ctx->nr_user_bufs++;
3577 }
3578 kvfree(pages);
3579 kvfree(vmas);
3580 return 0;
3581 err:
3582 kvfree(pages);
3583 kvfree(vmas);
3584 io_sqe_buffer_unregister(ctx);
3585 return ret;
3586 }
3587
3588 static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
3589 {
3590 __s32 __user *fds = arg;
3591 int fd;
3592
3593 if (ctx->cq_ev_fd)
3594 return -EBUSY;
3595
3596 if (copy_from_user(&fd, fds, sizeof(*fds)))
3597 return -EFAULT;
3598
3599 ctx->cq_ev_fd = eventfd_ctx_fdget(fd);
3600 if (IS_ERR(ctx->cq_ev_fd)) {
3601 int ret = PTR_ERR(ctx->cq_ev_fd);
3602 ctx->cq_ev_fd = NULL;
3603 return ret;
3604 }
3605
3606 return 0;
3607 }
3608
3609 static int io_eventfd_unregister(struct io_ring_ctx *ctx)
3610 {
3611 if (ctx->cq_ev_fd) {
3612 eventfd_ctx_put(ctx->cq_ev_fd);
3613 ctx->cq_ev_fd = NULL;
3614 return 0;
3615 }
3616
3617 return -ENXIO;
3618 }
3619
3620 static void io_ring_ctx_free(struct io_ring_ctx *ctx)
3621 {
3622 io_finish_async(ctx);
3623 if (ctx->sqo_mm)
3624 mmdrop(ctx->sqo_mm);
3625
3626 io_iopoll_reap_events(ctx);
3627 io_sqe_buffer_unregister(ctx);
3628 io_sqe_files_unregister(ctx);
3629 io_eventfd_unregister(ctx);
3630
3631 #if defined(CONFIG_UNIX)
3632 if (ctx->ring_sock) {
3633 ctx->ring_sock->file = NULL; /* so that iput() is called */
3634 sock_release(ctx->ring_sock);
3635 }
3636 #endif
3637
3638 io_mem_free(ctx->rings);
3639 io_mem_free(ctx->sq_sqes);
3640
3641 percpu_ref_exit(&ctx->refs);
3642 if (ctx->account_mem)
3643 io_unaccount_mem(ctx->user,
3644 ring_pages(ctx->sq_entries, ctx->cq_entries));
3645 free_uid(ctx->user);
3646 if (ctx->creds)
3647 put_cred(ctx->creds);
3648 kfree(ctx);
3649 }
3650
3651 static __poll_t io_uring_poll(struct file *file, poll_table *wait)
3652 {
3653 struct io_ring_ctx *ctx = file->private_data;
3654 __poll_t mask = 0;
3655
3656 poll_wait(file, &ctx->cq_wait, wait);
3657 /*
3658 * synchronizes with barrier from wq_has_sleeper call in
3659 * io_commit_cqring
3660 */
3661 smp_rmb();
3662 if (READ_ONCE(ctx->rings->sq.tail) - ctx->cached_sq_head !=
3663 ctx->rings->sq_ring_entries)
3664 mask |= EPOLLOUT | EPOLLWRNORM;
3665 if (READ_ONCE(ctx->rings->cq.head) != ctx->cached_cq_tail)
3666 mask |= EPOLLIN | EPOLLRDNORM;
3667
3668 return mask;
3669 }
3670
3671 static int io_uring_fasync(int fd, struct file *file, int on)
3672 {
3673 struct io_ring_ctx *ctx = file->private_data;
3674
3675 return fasync_helper(fd, file, on, &ctx->cq_fasync);
3676 }
3677
3678 static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
3679 {
3680 mutex_lock(&ctx->uring_lock);
3681 percpu_ref_kill(&ctx->refs);
3682 mutex_unlock(&ctx->uring_lock);
3683
3684 io_kill_timeouts(ctx);
3685 io_poll_remove_all(ctx);
3686 io_iopoll_reap_events(ctx);
3687 wait_for_completion(&ctx->ctx_done);
3688 io_ring_ctx_free(ctx);
3689 }
3690
3691 static int io_uring_release(struct inode *inode, struct file *file)
3692 {
3693 struct io_ring_ctx *ctx = file->private_data;
3694
3695 file->private_data = NULL;
3696 io_ring_ctx_wait_and_kill(ctx);
3697 return 0;
3698 }
3699
3700 static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
3701 {
3702 loff_t offset = (loff_t) vma->vm_pgoff << PAGE_SHIFT;
3703 unsigned long sz = vma->vm_end - vma->vm_start;
3704 struct io_ring_ctx *ctx = file->private_data;
3705 unsigned long pfn;
3706 struct page *page;
3707 void *ptr;
3708
3709 switch (offset) {
3710 case IORING_OFF_SQ_RING:
3711 case IORING_OFF_CQ_RING:
3712 ptr = ctx->rings;
3713 break;
3714 case IORING_OFF_SQES:
3715 ptr = ctx->sq_sqes;
3716 break;
3717 default:
3718 return -EINVAL;
3719 }
3720
3721 page = virt_to_head_page(ptr);
3722 if (sz > page_size(page))
3723 return -EINVAL;
3724
3725 pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
3726 return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
3727 }
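
/*
 * Illustrative userspace sketch (not part of this kernel source) of the
 * three mappings io_uring_mmap() serves. p is the io_uring_params filled
 * in by io_uring_setup(); with IORING_FEAT_SINGLE_MMAP (set below) the
 * SQ and CQ rings may also share a single mapping.
 *
 *	size_t sq_sz = p.sq_off.array + p.sq_entries * sizeof(__u32);
 *	size_t cq_sz = p.cq_off.cqes + p.cq_entries * sizeof(struct io_uring_cqe);
 *	size_t sqe_sz = p.sq_entries * sizeof(struct io_uring_sqe);
 *
 *	void *sq_ring = mmap(NULL, sq_sz, PROT_READ | PROT_WRITE,
 *			     MAP_SHARED | MAP_POPULATE, ring_fd, IORING_OFF_SQ_RING);
 *	void *cq_ring = mmap(NULL, cq_sz, PROT_READ | PROT_WRITE,
 *			     MAP_SHARED | MAP_POPULATE, ring_fd, IORING_OFF_CQ_RING);
 *	struct io_uring_sqe *sqes = mmap(NULL, sqe_sz, PROT_READ | PROT_WRITE,
 *			     MAP_SHARED | MAP_POPULATE, ring_fd, IORING_OFF_SQES);
 *
 *	unsigned *sq_tail = (unsigned *)((char *)sq_ring + p.sq_off.tail);
 *	struct io_uring_cqe *cqes =
 *		(struct io_uring_cqe *)((char *)cq_ring + p.cq_off.cqes);
 */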
3728
3729 SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
3730 u32, min_complete, u32, flags, const sigset_t __user *, sig,
3731 size_t, sigsz)
3732 {
3733 struct io_ring_ctx *ctx;
3734 long ret = -EBADF;
3735 int submitted = 0;
3736 struct fd f;
3737
3738 if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP))
3739 return -EINVAL;
3740
3741 f = fdget(fd);
3742 if (!f.file)
3743 return -EBADF;
3744
3745 ret = -EOPNOTSUPP;
3746 if (f.file->f_op != &io_uring_fops)
3747 goto out_fput;
3748
3749 ret = -ENXIO;
3750 ctx = f.file->private_data;
3751 if (!percpu_ref_tryget(&ctx->refs))
3752 goto out_fput;
3753
3754 /*
3755 * For SQ polling, the thread will do all submissions and completions.
3756 * Just return the requested submit count, and wake the thread if
3757 * we were asked to.
3758 */
3759 ret = 0;
3760 if (ctx->flags & IORING_SETUP_SQPOLL) {
3761 if (flags & IORING_ENTER_SQ_WAKEUP)
3762 wake_up(&ctx->sqo_wait);
3763 submitted = to_submit;
3764 } else if (to_submit) {
3765 to_submit = min(to_submit, ctx->sq_entries);
3766
3767 mutex_lock(&ctx->uring_lock);
3768 submitted = io_ring_submit(ctx, to_submit);
3769 mutex_unlock(&ctx->uring_lock);
3770
3771 if (submitted != to_submit)
3772 goto out;
3773 }
3774 if (flags & IORING_ENTER_GETEVENTS) {
3775 unsigned nr_events = 0;
3776
3777 min_complete = min(min_complete, ctx->cq_entries);
3778
3779 if (ctx->flags & IORING_SETUP_IOPOLL) {
3780 ret = io_iopoll_check(ctx, &nr_events, min_complete);
3781 } else {
3782 ret = io_cqring_wait(ctx, min_complete, sig, sigsz);
3783 }
3784 }
3785
3786 out:
3787 percpu_ref_put(&ctx->refs);
3788 out_fput:
3789 fdput(f);
3790 return submitted ? submitted : ret;
3791 }
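
/*
 * Illustrative userspace sketch (not part of this kernel source) of
 * calling the syscall defined above. Glibc ships no wrapper, so the raw
 * syscall(2) form is used; ring_fd comes from io_uring_setup().
 *
 *	int ret = syscall(__NR_io_uring_enter, ring_fd,
 *			  1,				// to_submit
 *			  1,				// min_complete
 *			  IORING_ENTER_GETEVENTS,	// wait for completions
 *			  NULL, 0);			// no signal mask
 *	if (ret < 0)
 *		perror("io_uring_enter");
 */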
3792
3793 static const struct file_operations io_uring_fops = {
3794 .release = io_uring_release,
3795 .mmap = io_uring_mmap,
3796 .poll = io_uring_poll,
3797 .fasync = io_uring_fasync,
3798 };
3799
3800 static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
3801 struct io_uring_params *p)
3802 {
3803 struct io_rings *rings;
3804 size_t size, sq_array_offset;
3805
3806 size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset);
3807 if (size == SIZE_MAX)
3808 return -EOVERFLOW;
3809
3810 rings = io_mem_alloc(size);
3811 if (!rings)
3812 return -ENOMEM;
3813
3814 ctx->rings = rings;
3815 ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
3816 rings->sq_ring_mask = p->sq_entries - 1;
3817 rings->cq_ring_mask = p->cq_entries - 1;
3818 rings->sq_ring_entries = p->sq_entries;
3819 rings->cq_ring_entries = p->cq_entries;
3820 ctx->sq_mask = rings->sq_ring_mask;
3821 ctx->cq_mask = rings->cq_ring_mask;
3822 ctx->sq_entries = rings->sq_ring_entries;
3823 ctx->cq_entries = rings->cq_ring_entries;
3824
3825 size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
3826 if (size == SIZE_MAX) {
3827 io_mem_free(ctx->rings);
3828 ctx->rings = NULL;
3829 return -EOVERFLOW;
3830 }
3831
3832 ctx->sq_sqes = io_mem_alloc(size);
3833 if (!ctx->sq_sqes) {
3834 io_mem_free(ctx->rings);
3835 ctx->rings = NULL;
3836 return -ENOMEM;
3837 }
3838
3839 return 0;
3840 }
3841
3842 /*
3843 * Allocate an anonymous fd; this is what constitutes the application
3844 * visible backing of an io_uring instance. The application mmaps this
3845 * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
3846 * we have to tie this fd to a socket for file garbage collection purposes.
3847 */
3848 static int io_uring_get_fd(struct io_ring_ctx *ctx)
3849 {
3850 struct file *file;
3851 int ret;
3852
3853 #if defined(CONFIG_UNIX)
3854 ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
3855 &ctx->ring_sock);
3856 if (ret)
3857 return ret;
3858 #endif
3859
3860 ret = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
3861 if (ret < 0)
3862 goto err;
3863
3864 file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx,
3865 O_RDWR | O_CLOEXEC);
3866 if (IS_ERR(file)) {
3867 put_unused_fd(ret);
3868 ret = PTR_ERR(file);
3869 goto err;
3870 }
3871
3872 #if defined(CONFIG_UNIX)
3873 ctx->ring_sock->file = file;
3874 ctx->ring_sock->sk->sk_user_data = ctx;
3875 #endif
3876 fd_install(ret, file);
3877 return ret;
3878 err:
3879 #if defined(CONFIG_UNIX)
3880 sock_release(ctx->ring_sock);
3881 ctx->ring_sock = NULL;
3882 #endif
3883 return ret;
3884 }
3885
3886 static int io_uring_create(unsigned entries, struct io_uring_params *p)
3887 {
3888 struct user_struct *user = NULL;
3889 struct io_ring_ctx *ctx;
3890 bool account_mem;
3891 int ret;
3892
3893 if (!entries || entries > IORING_MAX_ENTRIES)
3894 return -EINVAL;
3895
3896 /*
3897 * Use twice as many entries for the CQ ring. It's possible for the
3898 * application to drive a higher depth than the size of the SQ ring,
3899 * since the sqes are only used at submission time. This allows for
3900 * some flexibility in overcommitting a bit.
3901 */
3902 p->sq_entries = roundup_pow_of_two(entries);
3903 p->cq_entries = 2 * p->sq_entries;
3904
3905 user = get_uid(current_user());
3906 account_mem = !capable(CAP_IPC_LOCK);
3907
3908 if (account_mem) {
3909 ret = io_account_mem(user,
3910 ring_pages(p->sq_entries, p->cq_entries));
3911 if (ret) {
3912 free_uid(user);
3913 return ret;
3914 }
3915 }
3916
3917 ctx = io_ring_ctx_alloc(p);
3918 if (!ctx) {
3919 if (account_mem)
3920 io_unaccount_mem(user, ring_pages(p->sq_entries,
3921 p->cq_entries));
3922 free_uid(user);
3923 return -ENOMEM;
3924 }
3925 ctx->compat = in_compat_syscall();
3926 ctx->account_mem = account_mem;
3927 ctx->user = user;
3928
3929 ctx->creds = get_current_cred();
3930 if (!ctx->creds) {
3931 ret = -ENOMEM;
3932 goto err;
3933 }
3934
3935 ret = io_allocate_scq_urings(ctx, p);
3936 if (ret)
3937 goto err;
3938
3939 ret = io_sq_offload_start(ctx, p);
3940 if (ret)
3941 goto err;
3942
3943 memset(&p->sq_off, 0, sizeof(p->sq_off));
3944 p->sq_off.head = offsetof(struct io_rings, sq.head);
3945 p->sq_off.tail = offsetof(struct io_rings, sq.tail);
3946 p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
3947 p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
3948 p->sq_off.flags = offsetof(struct io_rings, sq_flags);
3949 p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
3950 p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;
3951
3952 memset(&p->cq_off, 0, sizeof(p->cq_off));
3953 p->cq_off.head = offsetof(struct io_rings, cq.head);
3954 p->cq_off.tail = offsetof(struct io_rings, cq.tail);
3955 p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
3956 p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
3957 p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
3958 p->cq_off.cqes = offsetof(struct io_rings, cqes);
3959
3960 /*
3961 * Install ring fd as the very last thing, so we don't risk someone
3962 * having closed it before we finish setup
3963 */
3964 ret = io_uring_get_fd(ctx);
3965 if (ret < 0)
3966 goto err;
3967
3968 p->features = IORING_FEAT_SINGLE_MMAP;
3969 return ret;
3970 err:
3971 io_ring_ctx_wait_and_kill(ctx);
3972 return ret;
3973 }
3974
3975 /*
3976 * Sets up an io_uring context and returns the fd. The application asks for a
3977 * ring size; we return the actual sq/cq ring sizes (among other things) in the
3978 * params structure passed in.
3979 */
3980 static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
3981 {
3982 struct io_uring_params p;
3983 long ret;
3984 int i;
3985
3986 if (copy_from_user(&p, params, sizeof(p)))
3987 return -EFAULT;
3988 for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
3989 if (p.resv[i])
3990 return -EINVAL;
3991 }
3992
3993 if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
3994 IORING_SETUP_SQ_AFF))
3995 return -EINVAL;
3996
3997 ret = io_uring_create(entries, &p);
3998 if (ret < 0)
3999 return ret;
4000
4001 if (copy_to_user(params, &p, sizeof(p)))
4002 return -EFAULT;
4003
4004 return ret;
4005 }
4006
4007 SYSCALL_DEFINE2(io_uring_setup, u32, entries,
4008 struct io_uring_params __user *, params)
4009 {
4010 return io_uring_setup(entries, params);
4011 }
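
/*
 * Illustrative userspace sketch (not part of this kernel source) of
 * calling the syscall defined above; glibc ships no wrapper, so the raw
 * syscall(2) form is used.
 *
 *	struct io_uring_params p;
 *	int ring_fd;
 *
 *	memset(&p, 0, sizeof(p));		// reserved fields must be zero
 *	ring_fd = syscall(__NR_io_uring_setup, 64, &p);
 *	if (ring_fd < 0)
 *		perror("io_uring_setup");
 *	// p.sq_entries/p.cq_entries now hold the rounded-up ring sizes and
 *	// p.sq_off/p.cq_off describe the mmap layout used earlier in this file
 */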
4012
4013 static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
4014 void __user *arg, unsigned nr_args)
4015 __releases(ctx->uring_lock)
4016 __acquires(ctx->uring_lock)
4017 {
4018 int ret;
4019
4020 /*
4021 * We're inside the ring mutex, if the ref is already dying, then
4022 * someone else killed the ctx or is already going through
4023 * io_uring_register().
4024 */
4025 if (percpu_ref_is_dying(&ctx->refs))
4026 return -ENXIO;
4027
4028 percpu_ref_kill(&ctx->refs);
4029
4030 /*
4031 * Drop uring mutex before waiting for references to exit. If another
4032 * thread is currently inside io_uring_enter() it might need to grab
4033 * the uring_lock to make progress. If we hold it here across the drain
4034 * wait, then we can deadlock. It's safe to drop the mutex here, since
4035 * no new references will come in after we've killed the percpu ref.
4036 */
4037 mutex_unlock(&ctx->uring_lock);
4038 wait_for_completion(&ctx->ctx_done);
4039 mutex_lock(&ctx->uring_lock);
4040
4041 switch (opcode) {
4042 case IORING_REGISTER_BUFFERS:
4043 ret = io_sqe_buffer_register(ctx, arg, nr_args);
4044 break;
4045 case IORING_UNREGISTER_BUFFERS:
4046 ret = -EINVAL;
4047 if (arg || nr_args)
4048 break;
4049 ret = io_sqe_buffer_unregister(ctx);
4050 break;
4051 case IORING_REGISTER_FILES:
4052 ret = io_sqe_files_register(ctx, arg, nr_args);
4053 break;
4054 case IORING_UNREGISTER_FILES:
4055 ret = -EINVAL;
4056 if (arg || nr_args)
4057 break;
4058 ret = io_sqe_files_unregister(ctx);
4059 break;
4060 case IORING_REGISTER_EVENTFD:
4061 ret = -EINVAL;
4062 if (nr_args != 1)
4063 break;
4064 ret = io_eventfd_register(ctx, arg);
4065 break;
4066 case IORING_UNREGISTER_EVENTFD:
4067 ret = -EINVAL;
4068 if (arg || nr_args)
4069 break;
4070 ret = io_eventfd_unregister(ctx);
4071 break;
4072 default:
4073 ret = -EINVAL;
4074 break;
4075 }
4076
4077 /* bring the ctx back to life */
4078 reinit_completion(&ctx->ctx_done);
4079 percpu_ref_reinit(&ctx->refs);
4080 return ret;
4081 }
4082
4083 SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
4084 void __user *, arg, unsigned int, nr_args)
4085 {
4086 struct io_ring_ctx *ctx;
4087 long ret = -EBADF;
4088 struct fd f;
4089
4090 f = fdget(fd);
4091 if (!f.file)
4092 return -EBADF;
4093
4094 ret = -EOPNOTSUPP;
4095 if (f.file->f_op != &io_uring_fops)
4096 goto out_fput;
4097
4098 ctx = f.file->private_data;
4099
4100 mutex_lock(&ctx->uring_lock);
4101 ret = __io_uring_register(ctx, opcode, arg, nr_args);
4102 mutex_unlock(&ctx->uring_lock);
4103 out_fput:
4104 fdput(f);
4105 return ret;
4106 }
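
/*
 * Illustrative userspace sketch (not part of this kernel source) of
 * registering fixed buffers through the syscall above, for later use
 * with IORING_OP_READ_FIXED/IORING_OP_WRITE_FIXED and sqe->buf_index.
 * buf and buf_len are assumed to describe an anonymous (not file-backed)
 * mapping.
 *
 *	struct iovec iov = { .iov_base = buf, .iov_len = buf_len };
 *
 *	if (syscall(__NR_io_uring_register, ring_fd,
 *		    IORING_REGISTER_BUFFERS, &iov, 1) < 0)
 *		perror("IORING_REGISTER_BUFFERS");
 *
 *	sqe->opcode = IORING_OP_READ_FIXED;
 *	sqe->addr = (unsigned long) buf;	// must fall inside the registered iov
 *	sqe->buf_index = 0;			// index into the registered iovec array
 */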
4107
4108 static int __init io_uring_init(void)
4109 {
4110 req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
4111 return 0;
4112 };
4113 __initcall(io_uring_init);