fs/io_uring.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Shared application/kernel submission and completion ring pairs, for
   4  * supporting fast/efficient IO.
   5  *
   6  * A note on the read/write ordering memory barriers that are matched between
   7  * the application and kernel side. When the application reads the CQ ring
   8  * tail, it must use an appropriate smp_rmb() to order with the smp_wmb()
   9  * the kernel uses after writing the tail. Failure to do so could cause a
  10  * delay in when the application notices that completion events available.
  11  * This isn't a fatal condition. Likewise, the application must use an
  12  * appropriate smp_wmb() both before writing the SQ tail, and after writing
  13  * the SQ tail. The first one orders the sqe writes with the tail write, and
  14  * the latter is paired with the smp_rmb() the kernel will issue before
  15  * reading the SQ tail on submission.
  16  *
  17  * Also see the examples in the liburing library:
  18  *
  19  *      git://git.kernel.dk/liburing
  20  *
  21  * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
  22  * from data shared between the kernel and application. This is done both
  23  * for ordering purposes, but also to ensure that once a value is loaded from
  24  * data that the application could potentially modify, it remains stable.
  25  *
  26  * Copyright (C) 2018-2019 Jens Axboe
  27  * Copyright (c) 2018-2019 Christoph Hellwig
  28  */
  29 #include <linux/kernel.h>
  30 #include <linux/init.h>
  31 #include <linux/errno.h>
  32 #include <linux/syscalls.h>
  33 #include <linux/compat.h>
  34 #include <linux/refcount.h>
  35 #include <linux/uio.h>
  36
  37 #include <linux/sched/signal.h>
  38 #include <linux/fs.h>
  39 #include <linux/file.h>
  40 #include <linux/fdtable.h>
  41 #include <linux/mm.h>
  42 #include <linux/mman.h>
  43 #include <linux/mmu_context.h>
  44 #include <linux/percpu.h>
  45 #include <linux/slab.h>
  46 #include <linux/workqueue.h>
  47 #include <linux/kthread.h>
  48 #include <linux/blkdev.h>
  49 #include <linux/bvec.h>
  50 #include <linux/net.h>
  51 #include <net/sock.h>
  52 #include <net/af_unix.h>
  53 #include <net/scm.h>
  54 #include <linux/anon_inodes.h>
  55 #include <linux/sched/mm.h>
  56 #include <linux/uaccess.h>
  57 #include <linux/nospec.h>
  58 #include <linux/sizes.h>
  59 #include <linux/hugetlb.h>
  60
  61 #include <uapi/linux/io_uring.h>
  62
  63 #include "internal.h"
  64
  65 #define IORING_MAX_ENTRIES      4096
  66 #define IORING_MAX_FIXED_FILES  1024
  67
  68 struct io_uring {
  69         u32 head ____cacheline_aligned_in_smp;
  70         u32 tail ____cacheline_aligned_in_smp;
  71 };
  72
  73 struct io_sq_ring {
  74         struct io_uring         r;
  75         u32                     ring_mask;
  76         u32                     ring_entries;
  77         u32                     dropped;
  78         u32                     flags;
  79         u32                     array[];
  80 };
  81
  82 struct io_cq_ring {
  83         struct io_uring         r;
  84         u32                     ring_mask;
  85         u32                     ring_entries;
  86         u32                     overflow;
  87         struct io_uring_cqe     cqes[];
  88 };
  89
  90 struct io_mapped_ubuf {
  91         u64             ubuf;
  92         size_t          len;
  93         struct          bio_vec *bvec;
  94         unsigned int    nr_bvecs;
  95 };
  96
  97 struct io_ring_ctx {
  98         struct {
  99                 struct percpu_ref       refs;
 100         } ____cacheline_aligned_in_smp;
 101
 102         struct {
 103                 unsigned int            flags;
 104                 bool                    compat;
 105                 bool                    account_mem;
 106
 107                 /* SQ ring */
 108                 struct io_sq_ring       *sq_ring;
 109                 unsigned                cached_sq_head;
 110                 unsigned                sq_entries;
 111                 unsigned                sq_mask;
 112                 unsigned                sq_thread_idle;
 113                 struct io_uring_sqe     *sq_sqes;
 114         } ____cacheline_aligned_in_smp;
 115
 116         /* IO offload */
 117         struct workqueue_struct *sqo_wq;
 118         struct task_struct      *sqo_thread;    /* if using sq thread polling */
 119         struct mm_struct        *sqo_mm;
 120         wait_queue_head_t       sqo_wait;
 121         unsigned                sqo_stop;
 122
 123         struct {
 124                 /* CQ ring */
 125                 struct io_cq_ring       *cq_ring;
 126                 unsigned                cached_cq_tail;
 127                 unsigned                cq_entries;
 128                 unsigned                cq_mask;
 129                 struct wait_queue_head  cq_wait;
 130                 struct fasync_struct    *cq_fasync;
 131         } ____cacheline_aligned_in_smp;
 132
 133         /*
 134          * If used, fixed file set. Writers must ensure that ->refs is dead,
 135          * readers must ensure that ->refs is alive as long as the file* is
 136          * used. Only updated through io_uring_register(2).
 137          */
 138         struct file             **user_files;
 139         unsigned                nr_user_files;
 140
 141         /* if used, fixed mapped user buffers */
 142         unsigned                nr_user_bufs;
 143         struct io_mapped_ubuf   *user_bufs;
 144
 145         struct user_struct      *user;
 146
 147         struct completion       ctx_done;
 148
 149         struct {
 150                 struct mutex            uring_lock;
 151                 wait_queue_head_t       wait;
 152         } ____cacheline_aligned_in_smp;
 153
 154         struct {
 155                 spinlock_t              completion_lock;
 156                 bool                    poll_multi_file;
 157                 /*
 158                  * ->poll_list is protected by the ctx->uring_lock for
 159                  * io_uring instances that don't use IORING_SETUP_SQPOLL.
 160                  * For SQPOLL, only the single threaded io_sq_thread() will
 161                  * manipulate the list, hence no extra locking is needed there.
 162                  */
 163                 struct list_head        poll_list;
 164         } ____cacheline_aligned_in_smp;
 165
 166 #if defined(CONFIG_UNIX)
 167         struct socket           *ring_sock;
 168 #endif
 169 };
 170
 171 struct sqe_submit {
 172         const struct io_uring_sqe       *sqe;
 173         unsigned short                  index;
 174         bool                            has_user;
 175         bool                            needs_lock;
 176         bool                            needs_fixed_file;
 177 };
 178
 179 struct io_kiocb {
 180         struct kiocb            rw;
 181
 182         struct sqe_submit       submit;
 183
 184         struct io_ring_ctx      *ctx;
 185         struct list_head        list;
 186         unsigned int            flags;
 187         refcount_t              refs;
 188 #define REQ_F_FORCE_NONBLOCK    1       /* inline submission attempt */
 189 #define REQ_F_IOPOLL_COMPLETED  2       /* polled IO has completed */
 190 #define REQ_F_FIXED_FILE        4       /* ctx owns file */
 191         u64                     user_data;
 192         u64                     error;
 193
 194         struct work_struct      work;
 195 };
 196
 197 #define IO_PLUG_THRESHOLD               2
 198 #define IO_IOPOLL_BATCH                 8
 199
 200 struct io_submit_state {
 201         struct blk_plug         plug;
 202
 203         /*
 204          * io_kiocb alloc cache
 205          */
 206         void                    *reqs[IO_IOPOLL_BATCH];
 207         unsigned                int free_reqs;
 208         unsigned                int cur_req;
 209
 210         /*
 211          * File reference cache
 212          */
 213         struct file             *file;
 214         unsigned int            fd;
 215         unsigned int            has_refs;
 216         unsigned int            used_refs;
 217         unsigned int            ios_left;
 218 };
 219
 220 static struct kmem_cache *req_cachep;
 221
 222 static const struct file_operations io_uring_fops;
 223
 224 struct sock *io_uring_get_socket(struct file *file)
 225 {
 226 #if defined(CONFIG_UNIX)
 227         if (file->f_op == &io_uring_fops) {
 228                 struct io_ring_ctx *ctx = file->private_data;
 229
 230                 return ctx->ring_sock->sk;
 231         }
 232 #endif
 233         return NULL;
 234 }
 235 EXPORT_SYMBOL(io_uring_get_socket);
 236
 237 static void io_ring_ctx_ref_free(struct percpu_ref *ref)
 238 {
 239         struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
 240
 241         complete(&ctx->ctx_done);
 242 }
 243
 244 static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 245 {
 246         struct io_ring_ctx *ctx;
 247
 248         ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
 249         if (!ctx)
 250                 return NULL;
 251
 252         if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free, 0, GFP_KERNEL)) {
 253                 kfree(ctx);
 254                 return NULL;
 255         }
 256
 257         ctx->flags = p->flags;
 258         init_waitqueue_head(&ctx->cq_wait);
 259         init_completion(&ctx->ctx_done);
 260         mutex_init(&ctx->uring_lock);
 261         init_waitqueue_head(&ctx->wait);
 262         spin_lock_init(&ctx->completion_lock);
 263         INIT_LIST_HEAD(&ctx->poll_list);
 264         return ctx;
 265 }
 266
 267 static void io_commit_cqring(struct io_ring_ctx *ctx)
 268 {
 269         struct io_cq_ring *ring = ctx->cq_ring;
 270
 271         if (ctx->cached_cq_tail != READ_ONCE(ring->r.tail)) {
 272                 /* order cqe stores with ring update */
 273                 smp_store_release(&ring->r.tail, ctx->cached_cq_tail);
 274
 275                 /*
 276                  * Write sider barrier of tail update, app has read side. See
 277                  * comment at the top of this file.
 278                  */
 279                 smp_wmb();
 280
 281                 if (wq_has_sleeper(&ctx->cq_wait)) {
 282                         wake_up_interruptible(&ctx->cq_wait);
 283                         kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
 284                 }
 285         }
 286 }
 287
 288 static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
 289 {
 290         struct io_cq_ring *ring = ctx->cq_ring;
 291         unsigned tail;
 292
 293         tail = ctx->cached_cq_tail;
 294         /* See comment at the top of the file */
 295         smp_rmb();
 296         if (tail + 1 == READ_ONCE(ring->r.head))
 297                 return NULL;
 298
 299         ctx->cached_cq_tail++;
 300         return &ring->cqes[tail & ctx->cq_mask];
 301 }
 302
 303 static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data,
 304                                  long res, unsigned ev_flags)
 305 {
 306         struct io_uring_cqe *cqe;
 307
 308         /*
 309          * If we can't get a cq entry, userspace overflowed the
 310          * submission (by quite a lot). Increment the overflow count in
 311          * the ring.
 312          */
 313         cqe = io_get_cqring(ctx);
 314         if (cqe) {
 315                 WRITE_ONCE(cqe->user_data, ki_user_data);
 316                 WRITE_ONCE(cqe->res, res);
 317                 WRITE_ONCE(cqe->flags, ev_flags);
 318         } else {
 319                 unsigned overflow = READ_ONCE(ctx->cq_ring->overflow);
 320
 321                 WRITE_ONCE(ctx->cq_ring->overflow, overflow + 1);
 322         }
 323 }
 324
 325 static void io_cqring_add_event(struct io_ring_ctx *ctx, u64 ki_user_data,
 326                                 long res, unsigned ev_flags)
 327 {
 328         unsigned long flags;
 329
 330         spin_lock_irqsave(&ctx->completion_lock, flags);
 331         io_cqring_fill_event(ctx, ki_user_data, res, ev_flags);
 332         io_commit_cqring(ctx);
 333         spin_unlock_irqrestore(&ctx->completion_lock, flags);
 334
 335         if (waitqueue_active(&ctx->wait))
 336                 wake_up(&ctx->wait);
 337         if (waitqueue_active(&ctx->sqo_wait))
 338                 wake_up(&ctx->sqo_wait);
 339 }
 340
 341 static void io_ring_drop_ctx_refs(struct io_ring_ctx *ctx, unsigned refs)
 342 {
 343         percpu_ref_put_many(&ctx->refs, refs);
 344
 345         if (waitqueue_active(&ctx->wait))
 346                 wake_up(&ctx->wait);
 347 }
 348
 349 static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
 350                                    struct io_submit_state *state)
 351 {
 352         struct io_kiocb *req;
 353
 354         if (!percpu_ref_tryget(&ctx->refs))
 355                 return NULL;
 356
 357         if (!state) {
 358                 req = kmem_cache_alloc(req_cachep, __GFP_NOWARN);
 359                 if (unlikely(!req))
 360                         goto out;
 361         } else if (!state->free_reqs) {
 362                 size_t sz;
 363                 int ret;
 364
 365                 sz = min_t(size_t, state->ios_left, ARRAY_SIZE(state->reqs));
 366                 ret = kmem_cache_alloc_bulk(req_cachep, __GFP_NOWARN, sz,
 367                                                 state->reqs);
 368                 if (unlikely(ret <= 0))
 369                         goto out;
 370                 state->free_reqs = ret - 1;
 371                 state->cur_req = 1;
 372                 req = state->reqs[0];
 373         } else {
 374                 req = state->reqs[state->cur_req];
 375                 state->free_reqs--;
 376                 state->cur_req++;
 377         }
 378
 379         req->ctx = ctx;
 380         req->flags = 0;
 381         refcount_set(&req->refs, 0);
 382         return req;
 383 out:
 384         io_ring_drop_ctx_refs(ctx, 1);
 385         return NULL;
 386 }
 387
 388 static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr)
 389 {
 390         if (*nr) {
 391                 kmem_cache_free_bulk(req_cachep, *nr, reqs);
 392                 io_ring_drop_ctx_refs(ctx, *nr);
 393                 *nr = 0;
 394         }
 395 }
 396
 397 static void io_free_req(struct io_kiocb *req)
 398 {
 399         if (!refcount_read(&req->refs) || refcount_dec_and_test(&req->refs)) {
 400                 io_ring_drop_ctx_refs(req->ctx, 1);
 401                 kmem_cache_free(req_cachep, req);
 402         }
 403 }
 404
 405 /*
 406  * Find and free completed poll iocbs
 407  */
 408 static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
 409                                struct list_head *done)
 410 {
 411         void *reqs[IO_IOPOLL_BATCH];
 412         int file_count, to_free;
 413         struct file *file = NULL;
 414         struct io_kiocb *req;
 415
 416         file_count = to_free = 0;
 417         while (!list_empty(done)) {
 418                 req = list_first_entry(done, struct io_kiocb, list);
 419                 list_del(&req->list);
 420
 421                 io_cqring_fill_event(ctx, req->user_data, req->error, 0);
 422
 423                 reqs[to_free++] = req;
 424                 (*nr_events)++;
 425
 426                 /*
 427                  * Batched puts of the same file, to avoid dirtying the
 428                  * file usage count multiple times, if avoidable.
 429                  */
 430                 if (!(req->flags & REQ_F_FIXED_FILE)) {
 431                         if (!file) {
 432                                 file = req->rw.ki_filp;
 433                                 file_count = 1;
 434                         } else if (file == req->rw.ki_filp) {
 435                                 file_count++;
 436                         } else {
 437                                 fput_many(file, file_count);
 438                                 file = req->rw.ki_filp;
 439                                 file_count = 1;
 440                         }
 441                 }
 442
 443                 if (to_free == ARRAY_SIZE(reqs))
 444                         io_free_req_many(ctx, reqs, &to_free);
 445         }
 446         io_commit_cqring(ctx);
 447
 448         if (file)
 449                 fput_many(file, file_count);
 450         io_free_req_many(ctx, reqs, &to_free);
 451 }
 452
 453 static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
 454                         long min)
 455 {
 456         struct io_kiocb *req, *tmp;
 457         LIST_HEAD(done);
 458         bool spin;
 459         int ret;
 460
 461         /*
 462          * Only spin for completions if we don't have multiple devices hanging
 463          * off our complete list, and we're under the requested amount.
 464          */
 465         spin = !ctx->poll_multi_file && *nr_events < min;
 466
 467         ret = 0;
 468         list_for_each_entry_safe(req, tmp, &ctx->poll_list, list) {
 469                 struct kiocb *kiocb = &req->rw;
 470
 471                 /*
 472                  * Move completed entries to our local list. If we find a
 473                  * request that requires polling, break out and complete
 474                  * the done list first, if we have entries there.
 475                  */
 476                 if (req->flags & REQ_F_IOPOLL_COMPLETED) {
 477                         list_move_tail(&req->list, &done);
 478                         continue;
 479                 }
 480                 if (!list_empty(&done))
 481                         break;
 482
 483                 ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin);
 484                 if (ret < 0)
 485                         break;
 486
 487                 if (ret && spin)
 488                         spin = false;
 489                 ret = 0;
 490         }
 491
 492         if (!list_empty(&done))
 493                 io_iopoll_complete(ctx, nr_events, &done);
 494
 495         return ret;
 496 }
 497
 498 /*
 499  * Poll for a mininum of 'min' events. Note that if min == 0 we consider that a
 500  * non-spinning poll check - we'll still enter the driver poll loop, but only
 501  * as a non-spinning completion check.
 502  */
 503 static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events,
 504                                 long min)
 505 {
 506         while (!list_empty(&ctx->poll_list)) {
 507                 int ret;
 508
 509                 ret = io_do_iopoll(ctx, nr_events, min);
 510                 if (ret < 0)
 511                         return ret;
 512                 if (!min || *nr_events >= min)
 513                         return 0;
 514         }
 515
 516         return 1;
 517 }
 518
 519 /*
 520  * We can't just wait for polled events to come to us, we have to actively
 521  * find and complete them.
 522  */
 523 static void io_iopoll_reap_events(struct io_ring_ctx *ctx)
 524 {
 525         if (!(ctx->flags & IORING_SETUP_IOPOLL))
 526                 return;
 527
 528         mutex_lock(&ctx->uring_lock);
 529         while (!list_empty(&ctx->poll_list)) {
 530                 unsigned int nr_events = 0;
 531
 532                 io_iopoll_getevents(ctx, &nr_events, 1);
 533         }
 534         mutex_unlock(&ctx->uring_lock);
 535 }
 536
 537 static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
 538                            long min)
 539 {
 540         int ret = 0;
 541
 542         do {
 543                 int tmin = 0;
 544
 545                 if (*nr_events < min)
 546                         tmin = min - *nr_events;
 547
 548                 ret = io_iopoll_getevents(ctx, nr_events, tmin);
 549                 if (ret <= 0)
 550                         break;
 551                 ret = 0;
 552         } while (min && !*nr_events && !need_resched());
 553
 554         return ret;
 555 }
 556
 557 static void kiocb_end_write(struct kiocb *kiocb)
 558 {
 559         if (kiocb->ki_flags & IOCB_WRITE) {
 560                 struct inode *inode = file_inode(kiocb->ki_filp);
 561
 562                 /*
 563                  * Tell lockdep we inherited freeze protection from submission
 564                  * thread.
 565                  */
 566                 if (S_ISREG(inode->i_mode))
 567                         __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
 568                 file_end_write(kiocb->ki_filp);
 569         }
 570 }
 571
 572 static void io_fput(struct io_kiocb *req)
 573 {
 574         if (!(req->flags & REQ_F_FIXED_FILE))
 575                 fput(req->rw.ki_filp);
 576 }
 577
 578 static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
 579 {
 580         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
 581
 582         kiocb_end_write(kiocb);
 583
 584         io_fput(req);
 585         io_cqring_add_event(req->ctx, req->user_data, res, 0);
 586         io_free_req(req);
 587 }
 588
 589 static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
 590 {
 591         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
 592
 593         kiocb_end_write(kiocb);
 594
 595         req->error = res;
 596         if (res != -EAGAIN)
 597                 req->flags |= REQ_F_IOPOLL_COMPLETED;
 598 }
 599
 600 /*
 601  * After the iocb has been issued, it's safe to be found on the poll list.
 602  * Adding the kiocb to the list AFTER submission ensures that we don't
 603  * find it from a io_iopoll_getevents() thread before the issuer is done
 604  * accessing the kiocb cookie.
 605  */
 606 static void io_iopoll_req_issued(struct io_kiocb *req)
 607 {
 608         struct io_ring_ctx *ctx = req->ctx;
 609
 610         /*
 611          * Track whether we have multiple files in our lists. This will impact
 612          * how we do polling eventually, not spinning if we're on potentially
 613          * different devices.
 614          */
 615         if (list_empty(&ctx->poll_list)) {
 616                 ctx->poll_multi_file = false;
 617         } else if (!ctx->poll_multi_file) {
 618                 struct io_kiocb *list_req;
 619
 620                 list_req = list_first_entry(&ctx->poll_list, struct io_kiocb,
 621                                                 list);
 622                 if (list_req->rw.ki_filp != req->rw.ki_filp)
 623                         ctx->poll_multi_file = true;
 624         }
 625
 626         /*
 627          * For fast devices, IO may have already completed. If it has, add
 628          * it to the front so we find it first.
 629          */
 630         if (req->flags & REQ_F_IOPOLL_COMPLETED)
 631                 list_add(&req->list, &ctx->poll_list);
 632         else
 633                 list_add_tail(&req->list, &ctx->poll_list);
 634 }
 635
 636 static void io_file_put(struct io_submit_state *state, struct file *file)
 637 {
 638         if (!state) {
 639                 fput(file);
 640         } else if (state->file) {
 641                 int diff = state->has_refs - state->used_refs;
 642
 643                 if (diff)
 644                         fput_many(state->file, diff);
 645                 state->file = NULL;
 646         }
 647 }
 648
 649 /*
 650  * Get as many references to a file as we have IOs left in this submission,
 651  * assuming most submissions are for one file, or at least that each file
 652  * has more than one submission.
 653  */
 654 static struct file *io_file_get(struct io_submit_state *state, int fd)
 655 {
 656         if (!state)
 657                 return fget(fd);
 658
 659         if (state->file) {
 660                 if (state->fd == fd) {
 661                         state->used_refs++;
 662                         state->ios_left--;
 663                         return state->file;
 664                 }
 665                 io_file_put(state, NULL);
 666         }
 667         state->file = fget_many(fd, state->ios_left);
 668         if (!state->file)
 669                 return NULL;
 670
 671         state->fd = fd;
 672         state->has_refs = state->ios_left;
 673         state->used_refs = 1;
 674         state->ios_left--;
 675         return state->file;
 676 }
 677
 678 /*
 679  * If we tracked the file through the SCM inflight mechanism, we could support
 680  * any file. For now, just ensure that anything potentially problematic is done
 681  * inline.
 682  */
 683 static bool io_file_supports_async(struct file *file)
 684 {
 685         umode_t mode = file_inode(file)->i_mode;
 686
 687         if (S_ISBLK(mode) || S_ISCHR(mode))
 688                 return true;
 689         if (S_ISREG(mode) && file->f_op != &io_uring_fops)
 690                 return true;
 691
 692         return false;
 693 }
 694
 695 static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s,
 696                       bool force_nonblock, struct io_submit_state *state)
 697 {
 698         const struct io_uring_sqe *sqe = s->sqe;
 699         struct io_ring_ctx *ctx = req->ctx;
 700         struct kiocb *kiocb = &req->rw;
 701         unsigned ioprio, flags;
 702         int fd, ret;
 703
 704         /* For -EAGAIN retry, everything is already prepped */
 705         if (kiocb->ki_filp)
 706                 return 0;
 707
 708         flags = READ_ONCE(sqe->flags);
 709         fd = READ_ONCE(sqe->fd);
 710
 711         if (flags & IOSQE_FIXED_FILE) {
 712                 if (unlikely(!ctx->user_files ||
 713                     (unsigned) fd >= ctx->nr_user_files))
 714                         return -EBADF;
 715                 kiocb->ki_filp = ctx->user_files[fd];
 716                 req->flags |= REQ_F_FIXED_FILE;
 717         } else {
 718                 if (s->needs_fixed_file)
 719                         return -EBADF;
 720                 kiocb->ki_filp = io_file_get(state, fd);
 721                 if (unlikely(!kiocb->ki_filp))
 722                         return -EBADF;
 723                 if (force_nonblock && !io_file_supports_async(kiocb->ki_filp))
 724                         force_nonblock = false;
 725         }
 726         kiocb->ki_pos = READ_ONCE(sqe->off);
 727         kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
 728         kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
 729
 730         ioprio = READ_ONCE(sqe->ioprio);
 731         if (ioprio) {
 732                 ret = ioprio_check_cap(ioprio);
 733                 if (ret)
 734                         goto out_fput;
 735
 736                 kiocb->ki_ioprio = ioprio;
 737         } else
 738                 kiocb->ki_ioprio = get_current_ioprio();
 739
 740         ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
 741         if (unlikely(ret))
 742                 goto out_fput;
 743         if (force_nonblock) {
 744                 kiocb->ki_flags |= IOCB_NOWAIT;
 745                 req->flags |= REQ_F_FORCE_NONBLOCK;
 746         }
 747         if (ctx->flags & IORING_SETUP_IOPOLL) {
 748                 ret = -EOPNOTSUPP;
 749                 if (!(kiocb->ki_flags & IOCB_DIRECT) ||
 750                     !kiocb->ki_filp->f_op->iopoll)
 751                         goto out_fput;
 752
 753                 req->error = 0;
 754                 kiocb->ki_flags |= IOCB_HIPRI;
 755                 kiocb->ki_complete = io_complete_rw_iopoll;
 756         } else {
 757                 if (kiocb->ki_flags & IOCB_HIPRI) {
 758                         ret = -EINVAL;
 759                         goto out_fput;
 760                 }
 761                 kiocb->ki_complete = io_complete_rw;
 762         }
 763         return 0;
 764 out_fput:
 765         if (!(flags & IOSQE_FIXED_FILE)) {
 766                 /*
 767                  * in case of error, we didn't use this file reference. drop it.
 768                  */
 769                 if (state)
 770                         state->used_refs--;
 771                 io_file_put(state, kiocb->ki_filp);
 772         }
 773         return ret;
 774 }
 775
 776 static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
 777 {
 778         switch (ret) {
 779         case -EIOCBQUEUED:
 780                 break;
 781         case -ERESTARTSYS:
 782         case -ERESTARTNOINTR:
 783         case -ERESTARTNOHAND:
 784         case -ERESTART_RESTARTBLOCK:
 785                 /*
 786                  * We can't just restart the syscall, since previously
 787                  * submitted sqes may already be in progress. Just fail this
 788                  * IO with EINTR.
 789                  */
 790                 ret = -EINTR;
 791                 /* fall through */
 792         default:
 793                 kiocb->ki_complete(kiocb, ret, 0);
 794         }
 795 }
 796
 797 static int io_import_fixed(struct io_ring_ctx *ctx, int rw,
 798                            const struct io_uring_sqe *sqe,
 799                            struct iov_iter *iter)
 800 {
 801         size_t len = READ_ONCE(sqe->len);
 802         struct io_mapped_ubuf *imu;
 803         unsigned index, buf_index;
 804         size_t offset;
 805         u64 buf_addr;
 806
 807         /* attempt to use fixed buffers without having provided iovecs */
 808         if (unlikely(!ctx->user_bufs))
 809                 return -EFAULT;
 810
 811         buf_index = READ_ONCE(sqe->buf_index);
 812         if (unlikely(buf_index >= ctx->nr_user_bufs))
 813                 return -EFAULT;
 814
 815         index = array_index_nospec(buf_index, ctx->nr_user_bufs);
 816         imu = &ctx->user_bufs[index];
 817         buf_addr = READ_ONCE(sqe->addr);
 818
 819         /* overflow */
 820         if (buf_addr + len < buf_addr)
 821                 return -EFAULT;
 822         /* not inside the mapped region */
 823         if (buf_addr < imu->ubuf || buf_addr + len > imu->ubuf + imu->len)
 824                 return -EFAULT;
 825
 826         /*
 827          * May not be a start of buffer, set size appropriately
 828          * and advance us to the beginning.
 829          */
 830         offset = buf_addr - imu->ubuf;
 831         iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
 832         if (offset)
 833                 iov_iter_advance(iter, offset);
 834         return 0;
 835 }
 836
 837 static int io_import_iovec(struct io_ring_ctx *ctx, int rw,
 838                            const struct sqe_submit *s, struct iovec **iovec,
 839                            struct iov_iter *iter)
 840 {
 841         const struct io_uring_sqe *sqe = s->sqe;
 842         void __user *buf = u64_to_user_ptr(READ_ONCE(sqe->addr));
 843         size_t sqe_len = READ_ONCE(sqe->len);
 844         u8 opcode;
 845
 846         /*
 847          * We're reading ->opcode for the second time, but the first read
 848          * doesn't care whether it's _FIXED or not, so it doesn't matter
 849          * whether ->opcode changes concurrently. The first read does care
 850          * about whether it is a READ or a WRITE, so we don't trust this read
 851          * for that purpose and instead let the caller pass in the read/write
 852          * flag.
 853          */
 854         opcode = READ_ONCE(sqe->opcode);
 855         if (opcode == IORING_OP_READ_FIXED ||
 856             opcode == IORING_OP_WRITE_FIXED) {
 857                 ssize_t ret = io_import_fixed(ctx, rw, sqe, iter);
 858                 *iovec = NULL;
 859                 return ret;
 860         }
 861
 862         if (!s->has_user)
 863                 return -EFAULT;
 864
 865 #ifdef CONFIG_COMPAT
 866         if (ctx->compat)
 867                 return compat_import_iovec(rw, buf, sqe_len, UIO_FASTIOV,
 868                                                 iovec, iter);
 869 #endif
 870
 871         return import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter);
 872 }
 873
 874 static ssize_t io_read(struct io_kiocb *req, const struct sqe_submit *s,
 875                        bool force_nonblock, struct io_submit_state *state)
 876 {
 877         struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
 878         struct kiocb *kiocb = &req->rw;
 879         struct iov_iter iter;
 880         struct file *file;
 881         ssize_t ret;
 882
 883         ret = io_prep_rw(req, s, force_nonblock, state);
 884         if (ret)
 885                 return ret;
 886         file = kiocb->ki_filp;
 887
 888         ret = -EBADF;
 889         if (unlikely(!(file->f_mode & FMODE_READ)))
 890                 goto out_fput;
 891         ret = -EINVAL;
 892         if (unlikely(!file->f_op->read_iter))
 893                 goto out_fput;
 894
 895         ret = io_import_iovec(req->ctx, READ, s, &iovec, &iter);
 896         if (ret)
 897                 goto out_fput;
 898
 899         ret = rw_verify_area(READ, file, &kiocb->ki_pos, iov_iter_count(&iter));
 900         if (!ret) {
 901                 ssize_t ret2;
 902
 903                 /* Catch -EAGAIN return for forced non-blocking submission */
 904                 ret2 = call_read_iter(file, kiocb, &iter);
 905                 if (!force_nonblock || ret2 != -EAGAIN)
 906                         io_rw_done(kiocb, ret2);
 907                 else
 908                         ret = -EAGAIN;
 909         }
 910         kfree(iovec);
 911 out_fput:
 912         /* Hold on to the file for -EAGAIN */
 913         if (unlikely(ret && ret != -EAGAIN))
 914                 io_fput(req);
 915         return ret;
 916 }
 917
 918 static ssize_t io_write(struct io_kiocb *req, const struct sqe_submit *s,
 919                         bool force_nonblock, struct io_submit_state *state)
 920 {
 921         struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
 922         struct kiocb *kiocb = &req->rw;
 923         struct iov_iter iter;
 924         struct file *file;
 925         ssize_t ret;
 926
 927         ret = io_prep_rw(req, s, force_nonblock, state);
 928         if (ret)
 929                 return ret;
 930         /* Hold on to the file for -EAGAIN */
 931         if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT))
 932                 return -EAGAIN;
 933
 934         ret = -EBADF;
 935         file = kiocb->ki_filp;
 936         if (unlikely(!(file->f_mode & FMODE_WRITE)))
 937                 goto out_fput;
 938         ret = -EINVAL;
 939         if (unlikely(!file->f_op->write_iter))
 940                 goto out_fput;
 941
 942         ret = io_import_iovec(req->ctx, WRITE, s, &iovec, &iter);
 943         if (ret)
 944                 goto out_fput;
 945
 946         ret = rw_verify_area(WRITE, file, &kiocb->ki_pos,
 947                                 iov_iter_count(&iter));
 948         if (!ret) {
 949                 /*
 950                  * Open-code file_start_write here to grab freeze protection,
 951                  * which will be released by another thread in
 952                  * io_complete_rw().  Fool lockdep by telling it the lock got
 953                  * released so that it doesn't complain about the held lock when
 954                  * we return to userspace.
 955                  */
 956                 if (S_ISREG(file_inode(file)->i_mode)) {
 957                         __sb_start_write(file_inode(file)->i_sb,
 958                                                 SB_FREEZE_WRITE, true);
 959                         __sb_writers_release(file_inode(file)->i_sb,
 960                                                 SB_FREEZE_WRITE);
 961                 }
 962                 kiocb->ki_flags |= IOCB_WRITE;
 963                 io_rw_done(kiocb, call_write_iter(file, kiocb, &iter));
 964         }
 965         kfree(iovec);
 966 out_fput:
 967         if (unlikely(ret))
 968                 io_fput(req);
 969         return ret;
 970 }
 971
 972 /*
 973  * IORING_OP_NOP just posts a completion event, nothing else.
 974  */
 975 static int io_nop(struct io_kiocb *req, u64 user_data)
 976 {
 977         struct io_ring_ctx *ctx = req->ctx;
 978         long err = 0;
 979
 980         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
 981                 return -EINVAL;
 982
 983         /*
 984          * Twilight zone - it's possible that someone issued an opcode that
 985          * has a file attached, then got -EAGAIN on submission, and changed
 986          * the sqe before we retried it from async context. Avoid dropping
 987          * a file reference for this malicious case, and flag the error.
 988          */
 989         if (req->rw.ki_filp) {
 990                 err = -EBADF;
 991                 io_fput(req);
 992         }
 993         io_cqring_add_event(ctx, user_data, err, 0);
 994         io_free_req(req);
 995         return 0;
 996 }
 997
 998 static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 999 {
1000         struct io_ring_ctx *ctx = req->ctx;
1001         unsigned flags;
1002         int fd;
1003
1004         /* Prep already done */
1005         if (req->rw.ki_filp)
1006                 return 0;
1007
1008         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
1009                 return -EINVAL;
1010         if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
1011                 return -EINVAL;
1012
1013         fd = READ_ONCE(sqe->fd);
1014         flags = READ_ONCE(sqe->flags);
1015
1016         if (flags & IOSQE_FIXED_FILE) {
1017                 if (unlikely(!ctx->user_files || fd >= ctx->nr_user_files))
1018                         return -EBADF;
1019                 req->rw.ki_filp = ctx->user_files[fd];
1020                 req->flags |= REQ_F_FIXED_FILE;
1021         } else {
1022                 req->rw.ki_filp = fget(fd);
1023                 if (unlikely(!req->rw.ki_filp))
1024                         return -EBADF;
1025         }
1026
1027         return 0;
1028 }
1029
1030 static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe,
1031                     bool force_nonblock)
1032 {
1033         loff_t sqe_off = READ_ONCE(sqe->off);
1034         loff_t sqe_len = READ_ONCE(sqe->len);
1035         loff_t end = sqe_off + sqe_len;
1036         unsigned fsync_flags;
1037         int ret;
1038
1039         fsync_flags = READ_ONCE(sqe->fsync_flags);
1040         if (unlikely(fsync_flags & ~IORING_FSYNC_DATASYNC))
1041                 return -EINVAL;
1042
1043         ret = io_prep_fsync(req, sqe);
1044         if (ret)
1045                 return ret;
1046
1047         /* fsync always requires a blocking context */
1048         if (force_nonblock)
1049                 return -EAGAIN;
1050
1051         ret = vfs_fsync_range(req->rw.ki_filp, sqe_off,
1052                                 end > 0 ? end : LLONG_MAX,
1053                                 fsync_flags & IORING_FSYNC_DATASYNC);
1054
1055         io_fput(req);
1056         io_cqring_add_event(req->ctx, sqe->user_data, ret, 0);
1057         io_free_req(req);
1058         return 0;
1059 }
1060
1061 static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
1062                            const struct sqe_submit *s, bool force_nonblock,
1063                            struct io_submit_state *state)
1064 {
1065         ssize_t ret;
1066         int opcode;
1067
1068         if (unlikely(s->index >= ctx->sq_entries))
1069                 return -EINVAL;
1070         req->user_data = READ_ONCE(s->sqe->user_data);
1071
1072         opcode = READ_ONCE(s->sqe->opcode);
1073         switch (opcode) {
1074         case IORING_OP_NOP:
1075                 ret = io_nop(req, req->user_data);
1076                 break;
1077         case IORING_OP_READV:
1078                 if (unlikely(s->sqe->buf_index))
1079                         return -EINVAL;
1080                 ret = io_read(req, s, force_nonblock, state);
1081                 break;
1082         case IORING_OP_WRITEV:
1083                 if (unlikely(s->sqe->buf_index))
1084                         return -EINVAL;
1085                 ret = io_write(req, s, force_nonblock, state);
1086                 break;
1087         case IORING_OP_READ_FIXED:
1088                 ret = io_read(req, s, force_nonblock, state);
1089                 break;
1090         case IORING_OP_WRITE_FIXED:
1091                 ret = io_write(req, s, force_nonblock, state);
1092                 break;
1093         case IORING_OP_FSYNC:
1094                 ret = io_fsync(req, s->sqe, force_nonblock);
1095                 break;
1096         default:
1097                 ret = -EINVAL;
1098                 break;
1099         }
1100
1101         if (ret)
1102                 return ret;
1103
1104         if (ctx->flags & IORING_SETUP_IOPOLL) {
1105                 if (req->error == -EAGAIN)
1106                         return -EAGAIN;
1107
1108                 /* workqueue context doesn't hold uring_lock, grab it now */
1109                 if (s->needs_lock)
1110                         mutex_lock(&ctx->uring_lock);
1111                 io_iopoll_req_issued(req);
1112                 if (s->needs_lock)
1113                         mutex_unlock(&ctx->uring_lock);
1114         }
1115
1116         return 0;
1117 }
1118
1119 static inline bool io_sqe_needs_user(const struct io_uring_sqe *sqe)
1120 {
1121         u8 opcode = READ_ONCE(sqe->opcode);
1122
1123         return !(opcode == IORING_OP_READ_FIXED ||
1124                  opcode == IORING_OP_WRITE_FIXED);
1125 }
1126
1127 static void io_sq_wq_submit_work(struct work_struct *work)
1128 {
1129         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
1130         struct sqe_submit *s = &req->submit;
1131         const struct io_uring_sqe *sqe = s->sqe;
1132         struct io_ring_ctx *ctx = req->ctx;
1133         mm_segment_t old_fs;
1134         bool needs_user;
1135         int ret;
1136
1137          /* Ensure we clear previously set forced non-block flag */
1138         req->flags &= ~REQ_F_FORCE_NONBLOCK;
1139         req->rw.ki_flags &= ~IOCB_NOWAIT;
1140
1141         s->needs_lock = true;
1142         s->has_user = false;
1143
1144         /*
1145          * If we're doing IO to fixed buffers, we don't need to get/set
1146          * user context
1147          */
1148         needs_user = io_sqe_needs_user(s->sqe);
1149         if (needs_user) {
1150                 if (!mmget_not_zero(ctx->sqo_mm)) {
1151                         ret = -EFAULT;
1152                         goto err;
1153                 }
1154                 use_mm(ctx->sqo_mm);
1155                 old_fs = get_fs();
1156                 set_fs(USER_DS);
1157                 s->has_user = true;
1158         }
1159
1160         do {
1161                 ret = __io_submit_sqe(ctx, req, s, false, NULL);
1162                 /*
1163                  * We can get EAGAIN for polled IO even though we're forcing
1164                  * a sync submission from here, since we can't wait for
1165                  * request slots on the block side.
1166                  */
1167                 if (ret != -EAGAIN)
1168                         break;
1169                 cond_resched();
1170         } while (1);
1171
1172         if (needs_user) {
1173                 set_fs(old_fs);
1174                 unuse_mm(ctx->sqo_mm);
1175                 mmput(ctx->sqo_mm);
1176         }
1177 err:
1178         if (ret) {
1179                 io_cqring_add_event(ctx, sqe->user_data, ret, 0);
1180                 io_free_req(req);
1181         }
1182
1183         /* async context always use a copy of the sqe */
1184         kfree(sqe);
1185 }
1186
1187 static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
1188                          struct io_submit_state *state)
1189 {
1190         struct io_kiocb *req;
1191         ssize_t ret;
1192
1193         /* enforce forwards compatibility on users */
1194         if (unlikely(s->sqe->flags & ~IOSQE_FIXED_FILE))
1195                 return -EINVAL;
1196
1197         req = io_get_req(ctx, state);
1198         if (unlikely(!req))
1199                 return -EAGAIN;
1200
1201         req->rw.ki_filp = NULL;
1202
1203         ret = __io_submit_sqe(ctx, req, s, true, state);
1204         if (ret == -EAGAIN) {
1205                 struct io_uring_sqe *sqe_copy;
1206
1207                 sqe_copy = kmalloc(sizeof(*sqe_copy), GFP_KERNEL);
1208                 if (sqe_copy) {
1209                         memcpy(sqe_copy, s->sqe, sizeof(*sqe_copy));
1210                         s->sqe = sqe_copy;
1211
1212                         memcpy(&req->submit, s, sizeof(*s));
1213                         INIT_WORK(&req->work, io_sq_wq_submit_work);
1214                         queue_work(ctx->sqo_wq, &req->work);
1215                         ret = 0;
1216                 }
1217         }
1218         if (ret)
1219                 io_free_req(req);
1220
1221         return ret;
1222 }
1223
1224 /*
1225  * Batched submission is done, ensure local IO is flushed out.
1226  */
1227 static void io_submit_state_end(struct io_submit_state *state)
1228 {
1229         blk_finish_plug(&state->plug);
1230         io_file_put(state, NULL);
1231         if (state->free_reqs)
1232                 kmem_cache_free_bulk(req_cachep, state->free_reqs,
1233                                         &state->reqs[state->cur_req]);
1234 }
1235
1236 /*
1237  * Start submission side cache.
1238  */
1239 static void io_submit_state_start(struct io_submit_state *state,
1240                                   struct io_ring_ctx *ctx, unsigned max_ios)
1241 {
1242         blk_start_plug(&state->plug);
1243         state->free_reqs = 0;
1244         state->file = NULL;
1245         state->ios_left = max_ios;
1246 }
1247
1248 static void io_commit_sqring(struct io_ring_ctx *ctx)
1249 {
1250         struct io_sq_ring *ring = ctx->sq_ring;
1251
1252         if (ctx->cached_sq_head != READ_ONCE(ring->r.head)) {
1253                 /*
1254                  * Ensure any loads from the SQEs are done at this point,
1255                  * since once we write the new head, the application could
1256                  * write new data to them.
1257                  */
1258                 smp_store_release(&ring->r.head, ctx->cached_sq_head);
1259
1260                 /*
1261                  * write side barrier of head update, app has read side. See
1262                  * comment at the top of this file
1263                  */
1264                 smp_wmb();
1265         }
1266 }
1267
1268 /*
1269  * Undo last io_get_sqring()
1270  */
1271 static void io_drop_sqring(struct io_ring_ctx *ctx)
1272 {
1273         ctx->cached_sq_head--;
1274 }
1275
1276 /*
1277  * Fetch an sqe, if one is available. Note that s->sqe will point to memory
1278  * that is mapped by userspace. This means that care needs to be taken to
1279  * ensure that reads are stable, as we cannot rely on userspace always
1280  * being a good citizen. If members of the sqe are validated and then later
1281  * used, it's important that those reads are done through READ_ONCE() to
1282  * prevent a re-load down the line.
1283  */
1284 static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s)
1285 {
1286         struct io_sq_ring *ring = ctx->sq_ring;
1287         unsigned head;
1288
1289         /*
1290          * The cached sq head (or cq tail) serves two purposes:
1291          *
1292          * 1) allows us to batch the cost of updating the user visible
1293          *    head updates.
1294          * 2) allows the kernel side to track the head on its own, even
1295          *    though the application is the one updating it.
1296          */
1297         head = ctx->cached_sq_head;
1298         /* See comment at the top of this file */
1299         smp_rmb();
1300         if (head == READ_ONCE(ring->r.tail))
1301                 return false;
1302
1303         head = READ_ONCE(ring->array[head & ctx->sq_mask]);
1304         if (head < ctx->sq_entries) {
1305                 s->index = head;
1306                 s->sqe = &ctx->sq_sqes[head];
1307                 ctx->cached_sq_head++;
1308                 return true;
1309         }
1310
1311         /* drop invalid entries */
1312         ctx->cached_sq_head++;
1313         ring->dropped++;
1314         /* See comment at the top of this file */
1315         smp_wmb();
1316         return false;
1317 }
1318
1319 static int io_submit_sqes(struct io_ring_ctx *ctx, struct sqe_submit *sqes,
1320                           unsigned int nr, bool has_user, bool mm_fault)
1321 {
1322         struct io_submit_state state, *statep = NULL;
1323         int ret, i, submitted = 0;
1324
1325         if (nr > IO_PLUG_THRESHOLD) {
1326                 io_submit_state_start(&state, ctx, nr);
1327                 statep = &state;
1328         }
1329
1330         for (i = 0; i < nr; i++) {
1331                 if (unlikely(mm_fault)) {
1332                         ret = -EFAULT;
1333                 } else {
1334                         sqes[i].has_user = has_user;
1335                         sqes[i].needs_lock = true;
1336                         sqes[i].needs_fixed_file = true;
1337                         ret = io_submit_sqe(ctx, &sqes[i], statep);
1338                 }
1339                 if (!ret) {
1340                         submitted++;
1341                         continue;
1342                 }
1343
1344                 io_cqring_add_event(ctx, sqes[i].sqe->user_data, ret, 0);
1345         }
1346
1347         if (statep)
1348                 io_submit_state_end(&state);
1349
1350         return submitted;
1351 }
1352
1353 static int io_sq_thread(void *data)
1354 {
1355         struct sqe_submit sqes[IO_IOPOLL_BATCH];
1356         struct io_ring_ctx *ctx = data;
1357         struct mm_struct *cur_mm = NULL;
1358         mm_segment_t old_fs;
1359         DEFINE_WAIT(wait);
1360         unsigned inflight;
1361         unsigned long timeout;
1362
1363         old_fs = get_fs();
1364         set_fs(USER_DS);
1365
1366         timeout = inflight = 0;
1367         while (!kthread_should_stop() && !ctx->sqo_stop) {
1368                 bool all_fixed, mm_fault = false;
1369                 int i;
1370
1371                 if (inflight) {
1372                         unsigned nr_events = 0;
1373
1374                         if (ctx->flags & IORING_SETUP_IOPOLL) {
1375                                 /*
1376                                  * We disallow the app entering submit/complete
1377                                  * with polling, but we still need to lock the
1378                                  * ring to prevent racing with polled issue
1379                                  * that got punted to a workqueue.
1380                                  */
1381                                 mutex_lock(&ctx->uring_lock);
1382                                 io_iopoll_check(ctx, &nr_events, 0);
1383                                 mutex_unlock(&ctx->uring_lock);
1384                         } else {
1385                                 /*
1386                                  * Normal IO, just pretend everything completed.
1387                                  * We don't have to poll completions for that.
1388                                  */
1389                                 nr_events = inflight;
1390                         }
1391
1392                         inflight -= nr_events;
1393                         if (!inflight)
1394                                 timeout = jiffies + ctx->sq_thread_idle;
1395                 }
1396
1397                 if (!io_get_sqring(ctx, &sqes[0])) {
1398                         /*
1399                          * We're polling. If we're within the defined idle
1400                          * period, then let us spin without work before going
1401                          * to sleep.
1402                          */
1403                         if (inflight || !time_after(jiffies, timeout)) {
1404                                 cpu_relax();
1405                                 continue;
1406                         }
1407
1408                         /*
1409                          * Drop cur_mm before scheduling, we can't hold it for
1410                          * long periods (or over schedule()). Do this before
1411                          * adding ourselves to the waitqueue, as the unuse/drop
1412                          * may sleep.
1413                          */
1414                         if (cur_mm) {
1415                                 unuse_mm(cur_mm);
1416                                 mmput(cur_mm);
1417                                 cur_mm = NULL;
1418                         }
1419
1420                         prepare_to_wait(&ctx->sqo_wait, &wait,
1421                                                 TASK_INTERRUPTIBLE);
1422
1423                         /* Tell userspace we may need a wakeup call */
1424                         ctx->sq_ring->flags |= IORING_SQ_NEED_WAKEUP;
1425                         smp_wmb();
1426
1427                         if (!io_get_sqring(ctx, &sqes[0])) {
1428                                 if (kthread_should_stop()) {
1429                                         finish_wait(&ctx->sqo_wait, &wait);
1430                                         break;
1431                                 }
1432                                 if (signal_pending(current))
1433                                         flush_signals(current);
1434                                 schedule();
1435                                 finish_wait(&ctx->sqo_wait, &wait);
1436
1437                                 ctx->sq_ring->flags &= ~IORING_SQ_NEED_WAKEUP;
1438                                 smp_wmb();
1439                                 continue;
1440                         }
1441                         finish_wait(&ctx->sqo_wait, &wait);
1442
1443                         ctx->sq_ring->flags &= ~IORING_SQ_NEED_WAKEUP;
1444                         smp_wmb();
1445                 }
1446
1447                 i = 0;
1448                 all_fixed = true;
1449                 do {
1450                         if (all_fixed && io_sqe_needs_user(sqes[i].sqe))
1451                                 all_fixed = false;
1452
1453                         i++;
1454                         if (i == ARRAY_SIZE(sqes))
1455                                 break;
1456                 } while (io_get_sqring(ctx, &sqes[i]));
1457
1458                 /* Unless all new commands are FIXED regions, grab mm */
1459                 if (!all_fixed && !cur_mm) {
1460                         mm_fault = !mmget_not_zero(ctx->sqo_mm);
1461                         if (!mm_fault) {
1462                                 use_mm(ctx->sqo_mm);
1463                                 cur_mm = ctx->sqo_mm;
1464                         }
1465                 }
1466
1467                 inflight += io_submit_sqes(ctx, sqes, i, cur_mm != NULL,
1468                                                 mm_fault);
1469
1470                 /* Commit SQ ring head once we've consumed all SQEs */
1471                 io_commit_sqring(ctx);
1472         }
1473
1474         set_fs(old_fs);
1475         if (cur_mm) {
1476                 unuse_mm(cur_mm);
1477                 mmput(cur_mm);
1478         }
1479         return 0;
1480 }
1481
1482 static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit)
1483 {
1484         struct io_submit_state state, *statep = NULL;
1485         int i, ret = 0, submit = 0;
1486
1487         if (to_submit > IO_PLUG_THRESHOLD) {
1488                 io_submit_state_start(&state, ctx, to_submit);
1489                 statep = &state;
1490         }
1491
1492         for (i = 0; i < to_submit; i++) {
1493                 struct sqe_submit s;
1494
1495                 if (!io_get_sqring(ctx, &s))
1496                         break;
1497
1498                 s.has_user = true;
1499                 s.needs_lock = false;
1500                 s.needs_fixed_file = false;
1501
1502                 ret = io_submit_sqe(ctx, &s, statep);
1503                 if (ret) {
1504                         io_drop_sqring(ctx);
1505                         break;
1506                 }
1507
1508                 submit++;
1509         }
1510         io_commit_sqring(ctx);
1511
1512         if (statep)
1513                 io_submit_state_end(statep);
1514
1515         return submit ? submit : ret;
1516 }
1517
1518 static unsigned io_cqring_events(struct io_cq_ring *ring)
1519 {
1520         return READ_ONCE(ring->r.tail) - READ_ONCE(ring->r.head);
1521 }
1522
1523 /*
1524  * Wait until events become available, if we don't already have some. The
1525  * application must reap them itself, as they reside on the shared cq ring.
1526  */
1527 static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
1528                           const sigset_t __user *sig, size_t sigsz)
1529 {
1530         struct io_cq_ring *ring = ctx->cq_ring;
1531         sigset_t ksigmask, sigsaved;
1532         DEFINE_WAIT(wait);
1533         int ret;
1534
1535         /* See comment at the top of this file */
1536         smp_rmb();
1537         if (io_cqring_events(ring) >= min_events)
1538                 return 0;
1539
1540         if (sig) {
1541                 ret = set_user_sigmask(sig, &ksigmask, &sigsaved, sigsz);
1542                 if (ret)
1543                         return ret;
1544         }
1545
1546         do {
1547                 prepare_to_wait(&ctx->wait, &wait, TASK_INTERRUPTIBLE);
1548
1549                 ret = 0;
1550                 /* See comment at the top of this file */
1551                 smp_rmb();
1552                 if (io_cqring_events(ring) >= min_events)
1553                         break;
1554
1555                 schedule();
1556
1557                 ret = -EINTR;
1558                 if (signal_pending(current))
1559                         break;
1560         } while (1);
1561
1562         finish_wait(&ctx->wait, &wait);
1563
1564         if (sig)
1565                 restore_user_sigmask(sig, &sigsaved);
1566
1567         return READ_ONCE(ring->r.head) == READ_ONCE(ring->r.tail) ? ret : 0;
1568 }
1569
1570 static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
1571 {
1572 #if defined(CONFIG_UNIX)
1573         if (ctx->ring_sock) {
1574                 struct sock *sock = ctx->ring_sock->sk;
1575                 struct sk_buff *skb;
1576
1577                 while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
1578                         kfree_skb(skb);
1579         }
1580 #else
1581         int i;
1582
1583         for (i = 0; i < ctx->nr_user_files; i++)
1584                 fput(ctx->user_files[i]);
1585 #endif
1586 }
1587
1588 static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
1589 {
1590         if (!ctx->user_files)
1591                 return -ENXIO;
1592
1593         __io_sqe_files_unregister(ctx);
1594         kfree(ctx->user_files);
1595         ctx->user_files = NULL;
1596         ctx->nr_user_files = 0;
1597         return 0;
1598 }
1599
1600 static void io_sq_thread_stop(struct io_ring_ctx *ctx)
1601 {
1602         if (ctx->sqo_thread) {
1603                 ctx->sqo_stop = 1;
1604                 mb();
1605                 kthread_stop(ctx->sqo_thread);
1606                 ctx->sqo_thread = NULL;
1607         }
1608 }
1609
1610 static void io_finish_async(struct io_ring_ctx *ctx)
1611 {
1612         io_sq_thread_stop(ctx);
1613
1614         if (ctx->sqo_wq) {
1615                 destroy_workqueue(ctx->sqo_wq);
1616                 ctx->sqo_wq = NULL;
1617         }
1618 }
1619
1620 #if defined(CONFIG_UNIX)
1621 static void io_destruct_skb(struct sk_buff *skb)
1622 {
1623         struct io_ring_ctx *ctx = skb->sk->sk_user_data;
1624
1625         io_finish_async(ctx);
1626         unix_destruct_scm(skb);
1627 }
1628
1629 /*
1630  * Ensure the UNIX gc is aware of our file set, so we are certain that
1631  * the io_uring can be safely unregistered on process exit, even if we have
1632  * loops in the file referencing.
1633  */
1634 static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
1635 {
1636         struct sock *sk = ctx->ring_sock->sk;
1637         struct scm_fp_list *fpl;
1638         struct sk_buff *skb;
1639         int i;
1640
1641         if (!capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) {
1642                 unsigned long inflight = ctx->user->unix_inflight + nr;
1643
1644                 if (inflight > task_rlimit(current, RLIMIT_NOFILE))
1645                         return -EMFILE;
1646         }
1647
1648         fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
1649         if (!fpl)
1650                 return -ENOMEM;
1651
1652         skb = alloc_skb(0, GFP_KERNEL);
1653         if (!skb) {
1654                 kfree(fpl);
1655                 return -ENOMEM;
1656         }
1657
1658         skb->sk = sk;
1659         skb->destructor = io_destruct_skb;
1660
1661         fpl->user = get_uid(ctx->user);
1662         for (i = 0; i < nr; i++) {
1663                 fpl->fp[i] = get_file(ctx->user_files[i + offset]);
1664                 unix_inflight(fpl->user, fpl->fp[i]);
1665         }
1666
1667         fpl->max = fpl->count = nr;
1668         UNIXCB(skb).fp = fpl;
1669         refcount_add(skb->truesize, &sk->sk_wmem_alloc);
1670         skb_queue_head(&sk->sk_receive_queue, skb);
1671
1672         for (i = 0; i < nr; i++)
1673                 fput(fpl->fp[i]);
1674
1675         return 0;
1676 }
1677
1678 /*
1679  * If UNIX sockets are enabled, fd passing can cause a reference cycle which
1680  * causes regular reference counting to break down. We rely on the UNIX
1681  * garbage collection to take care of this problem for us.
1682  */
1683 static int io_sqe_files_scm(struct io_ring_ctx *ctx)
1684 {
1685         unsigned left, total;
1686         int ret = 0;
1687
1688         total = 0;
1689         left = ctx->nr_user_files;
1690         while (left) {
1691                 unsigned this_files = min_t(unsigned, left, SCM_MAX_FD);
1692                 int ret;
1693
1694                 ret = __io_sqe_files_scm(ctx, this_files, total);
1695                 if (ret)
1696                         break;
1697                 left -= this_files;
1698                 total += this_files;
1699         }
1700
1701         if (!ret)
1702                 return 0;
1703
1704         while (total < ctx->nr_user_files) {
1705                 fput(ctx->user_files[total]);
1706                 total++;
1707         }
1708
1709         return ret;
1710 }
1711 #else
1712 static int io_sqe_files_scm(struct io_ring_ctx *ctx)
1713 {
1714         return 0;
1715 }
1716 #endif
1717
1718 static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
1719                                  unsigned nr_args)
1720 {
1721         __s32 __user *fds = (__s32 __user *) arg;
1722         int fd, ret = 0;
1723         unsigned i;
1724
1725         if (ctx->user_files)
1726                 return -EBUSY;
1727         if (!nr_args)
1728                 return -EINVAL;
1729         if (nr_args > IORING_MAX_FIXED_FILES)
1730                 return -EMFILE;
1731
1732         ctx->user_files = kcalloc(nr_args, sizeof(struct file *), GFP_KERNEL);
1733         if (!ctx->user_files)
1734                 return -ENOMEM;
1735
1736         for (i = 0; i < nr_args; i++) {
1737                 ret = -EFAULT;
1738                 if (copy_from_user(&fd, &fds[i], sizeof(fd)))
1739                         break;
1740
1741                 ctx->user_files[i] = fget(fd);
1742
1743                 ret = -EBADF;
1744                 if (!ctx->user_files[i])
1745                         break;
1746                 /*
1747                  * Don't allow io_uring instances to be registered. If UNIX
1748                  * isn't enabled, then this causes a reference cycle and this
1749                  * instance can never get freed. If UNIX is enabled we'll
1750                  * handle it just fine, but there's still no point in allowing
1751                  * a ring fd as it doesn't support regular read/write anyway.
1752                  */
1753                 if (ctx->user_files[i]->f_op == &io_uring_fops) {
1754                         fput(ctx->user_files[i]);
1755                         break;
1756                 }
1757                 ctx->nr_user_files++;
1758                 ret = 0;
1759         }
1760
1761         if (ret) {
1762                 for (i = 0; i < ctx->nr_user_files; i++)
1763                         fput(ctx->user_files[i]);
1764
1765                 kfree(ctx->user_files);
1766                 ctx->nr_user_files = 0;
1767                 return ret;
1768         }
1769
1770         ret = io_sqe_files_scm(ctx);
1771         if (ret)
1772                 io_sqe_files_unregister(ctx);
1773
1774         return ret;
1775 }
1776
1777 static int io_sq_offload_start(struct io_ring_ctx *ctx,
1778                                struct io_uring_params *p)
1779 {
1780         int ret;
1781
1782         init_waitqueue_head(&ctx->sqo_wait);
1783         mmgrab(current->mm);
1784         ctx->sqo_mm = current->mm;
1785
1786         ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
1787         if (!ctx->sq_thread_idle)
1788                 ctx->sq_thread_idle = HZ;
1789
1790         ret = -EINVAL;
1791         if (!cpu_possible(p->sq_thread_cpu))
1792                 goto err;
1793
1794         if (ctx->flags & IORING_SETUP_SQPOLL) {
1795                 if (p->flags & IORING_SETUP_SQ_AFF) {
1796                         int cpu;
1797
1798                         cpu = array_index_nospec(p->sq_thread_cpu, NR_CPUS);
1799                         ctx->sqo_thread = kthread_create_on_cpu(io_sq_thread,
1800                                                         ctx, cpu,
1801                                                         "io_uring-sq");
1802                 } else {
1803                         ctx->sqo_thread = kthread_create(io_sq_thread, ctx,
1804                                                         "io_uring-sq");
1805                 }
1806                 if (IS_ERR(ctx->sqo_thread)) {
1807                         ret = PTR_ERR(ctx->sqo_thread);
1808                         ctx->sqo_thread = NULL;
1809                         goto err;
1810                 }
1811                 wake_up_process(ctx->sqo_thread);
1812         } else if (p->flags & IORING_SETUP_SQ_AFF) {
1813                 /* Can't have SQ_AFF without SQPOLL */
1814                 ret = -EINVAL;
1815                 goto err;
1816         }
1817
1818         /* Do QD, or 2 * CPUS, whatever is smallest */
1819         ctx->sqo_wq = alloc_workqueue("io_ring-wq", WQ_UNBOUND | WQ_FREEZABLE,
1820                         min(ctx->sq_entries - 1, 2 * num_online_cpus()));
1821         if (!ctx->sqo_wq) {
1822                 ret = -ENOMEM;
1823                 goto err;
1824         }
1825
1826         return 0;
1827 err:
1828         io_sq_thread_stop(ctx);
1829         mmdrop(ctx->sqo_mm);
1830         ctx->sqo_mm = NULL;
1831         return ret;
1832 }
1833
1834 static void io_unaccount_mem(struct user_struct *user, unsigned long nr_pages)
1835 {
1836         atomic_long_sub(nr_pages, &user->locked_vm);
1837 }
1838
1839 static int io_account_mem(struct user_struct *user, unsigned long nr_pages)
1840 {
1841         unsigned long page_limit, cur_pages, new_pages;
1842
1843         /* Don't allow more pages than we can safely lock */
1844         page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
1845
1846         do {
1847                 cur_pages = atomic_long_read(&user->locked_vm);
1848                 new_pages = cur_pages + nr_pages;
1849                 if (new_pages > page_limit)
1850                         return -ENOMEM;
1851         } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
1852                                         new_pages) != cur_pages);
1853
1854         return 0;
1855 }
1856
1857 static void io_mem_free(void *ptr)
1858 {
1859         struct page *page = virt_to_head_page(ptr);
1860
1861         if (put_page_testzero(page))
1862                 free_compound_page(page);
1863 }
1864
1865 static void *io_mem_alloc(size_t size)
1866 {
1867         gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP |
1868                                 __GFP_NORETRY;
1869
1870         return (void *) __get_free_pages(gfp_flags, get_order(size));
1871 }
1872
1873 static unsigned long ring_pages(unsigned sq_entries, unsigned cq_entries)
1874 {
1875         struct io_sq_ring *sq_ring;
1876         struct io_cq_ring *cq_ring;
1877         size_t bytes;
1878
1879         bytes = struct_size(sq_ring, array, sq_entries);
1880         bytes += array_size(sizeof(struct io_uring_sqe), sq_entries);
1881         bytes += struct_size(cq_ring, cqes, cq_entries);
1882
1883         return (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
1884 }
1885
1886 static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx)
1887 {
1888         int i, j;
1889
1890         if (!ctx->user_bufs)
1891                 return -ENXIO;
1892
1893         for (i = 0; i < ctx->nr_user_bufs; i++) {
1894                 struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
1895
1896                 for (j = 0; j < imu->nr_bvecs; j++)
1897                         put_page(imu->bvec[j].bv_page);
1898
1899                 if (ctx->account_mem)
1900                         io_unaccount_mem(ctx->user, imu->nr_bvecs);
1901                 kfree(imu->bvec);
1902                 imu->nr_bvecs = 0;
1903         }
1904
1905         kfree(ctx->user_bufs);
1906         ctx->user_bufs = NULL;
1907         ctx->nr_user_bufs = 0;
1908         return 0;
1909 }
1910
1911 static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
1912                        void __user *arg, unsigned index)
1913 {
1914         struct iovec __user *src;
1915
1916 #ifdef CONFIG_COMPAT
1917         if (ctx->compat) {
1918                 struct compat_iovec __user *ciovs;
1919                 struct compat_iovec ciov;
1920
1921                 ciovs = (struct compat_iovec __user *) arg;
1922                 if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
1923                         return -EFAULT;
1924
1925                 dst->iov_base = (void __user *) (unsigned long) ciov.iov_base;
1926                 dst->iov_len = ciov.iov_len;
1927                 return 0;
1928         }
1929 #endif
1930         src = (struct iovec __user *) arg;
1931         if (copy_from_user(dst, &src[index], sizeof(*dst)))
1932                 return -EFAULT;
1933         return 0;
1934 }
1935
1936 static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
1937                                   unsigned nr_args)
1938 {
1939         struct vm_area_struct **vmas = NULL;
1940         struct page **pages = NULL;
1941         int i, j, got_pages = 0;
1942         int ret = -EINVAL;
1943
1944         if (ctx->user_bufs)
1945                 return -EBUSY;
1946         if (!nr_args || nr_args > UIO_MAXIOV)
1947                 return -EINVAL;
1948
1949         ctx->user_bufs = kcalloc(nr_args, sizeof(struct io_mapped_ubuf),
1950                                         GFP_KERNEL);
1951         if (!ctx->user_bufs)
1952                 return -ENOMEM;
1953
1954         for (i = 0; i < nr_args; i++) {
1955                 struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
1956                 unsigned long off, start, end, ubuf;
1957                 int pret, nr_pages;
1958                 struct iovec iov;
1959                 size_t size;
1960
1961                 ret = io_copy_iov(ctx, &iov, arg, i);
1962                 if (ret)
1963                         break;
1964
1965                 /*
1966                  * Don't impose further limits on the size and buffer
1967                  * constraints here, we'll -EINVAL later when IO is
1968                  * submitted if they are wrong.
1969                  */
1970                 ret = -EFAULT;
1971                 if (!iov.iov_base || !iov.iov_len)
1972                         goto err;
1973
1974                 /* arbitrary limit, but we need something */
1975                 if (iov.iov_len > SZ_1G)
1976                         goto err;
1977
1978                 ubuf = (unsigned long) iov.iov_base;
1979                 end = (ubuf + iov.iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
1980                 start = ubuf >> PAGE_SHIFT;
1981                 nr_pages = end - start;
1982
1983                 if (ctx->account_mem) {
1984                         ret = io_account_mem(ctx->user, nr_pages);
1985                         if (ret)
1986                                 goto err;
1987                 }
1988
1989                 ret = 0;
1990                 if (!pages || nr_pages > got_pages) {
1991                         kfree(vmas);
1992                         kfree(pages);
1993                         pages = kmalloc_array(nr_pages, sizeof(struct page *),
1994                                                 GFP_KERNEL);
1995                         vmas = kmalloc_array(nr_pages,
1996                                         sizeof(struct vm_area_struct *),
1997                                         GFP_KERNEL);
1998                         if (!pages || !vmas) {
1999                                 ret = -ENOMEM;
2000                                 if (ctx->account_mem)
2001                                         io_unaccount_mem(ctx->user, nr_pages);
2002                                 goto err;
2003                         }
2004                         got_pages = nr_pages;
2005                 }
2006
2007                 imu->bvec = kmalloc_array(nr_pages, sizeof(struct bio_vec),
2008                                                 GFP_KERNEL);
2009                 ret = -ENOMEM;
2010                 if (!imu->bvec) {
2011                         if (ctx->account_mem)
2012                                 io_unaccount_mem(ctx->user, nr_pages);
2013                         goto err;
2014                 }
2015
2016                 ret = 0;
2017                 down_read(&current->mm->mmap_sem);
2018                 pret = get_user_pages_longterm(ubuf, nr_pages, FOLL_WRITE,
2019                                                 pages, vmas);
2020                 if (pret == nr_pages) {
2021                         /* don't support file backed memory */
2022                         for (j = 0; j < nr_pages; j++) {
2023                                 struct vm_area_struct *vma = vmas[j];
2024
2025                                 if (vma->vm_file &&
2026                                     !is_file_hugepages(vma->vm_file)) {
2027                                         ret = -EOPNOTSUPP;
2028                                         break;
2029                                 }
2030                         }
2031                 } else {
2032                         ret = pret < 0 ? pret : -EFAULT;
2033                 }
2034                 up_read(&current->mm->mmap_sem);
2035                 if (ret) {
2036                         /*
2037                          * if we did partial map, or found file backed vmas,
2038                          * release any pages we did get
2039                          */
2040                         if (pret > 0) {
2041                                 for (j = 0; j < pret; j++)
2042                                         put_page(pages[j]);
2043                         }
2044                         if (ctx->account_mem)
2045                                 io_unaccount_mem(ctx->user, nr_pages);
2046                         goto err;
2047                 }
2048
2049                 off = ubuf & ~PAGE_MASK;
2050                 size = iov.iov_len;
2051                 for (j = 0; j < nr_pages; j++) {
2052                         size_t vec_len;
2053
2054                         vec_len = min_t(size_t, size, PAGE_SIZE - off);
2055                         imu->bvec[j].bv_page = pages[j];
2056                         imu->bvec[j].bv_len = vec_len;
2057                         imu->bvec[j].bv_offset = off;
2058                         off = 0;
2059                         size -= vec_len;
2060                 }
2061                 /* store original address for later verification */
2062                 imu->ubuf = ubuf;
2063                 imu->len = iov.iov_len;
2064                 imu->nr_bvecs = nr_pages;
2065
2066                 ctx->nr_user_bufs++;
2067         }
2068         kfree(pages);
2069         kfree(vmas);
2070         return 0;
2071 err:
2072         kfree(pages);
2073         kfree(vmas);
2074         io_sqe_buffer_unregister(ctx);
2075         return ret;
2076 }
2077
2078 static void io_ring_ctx_free(struct io_ring_ctx *ctx)
2079 {
2080         io_finish_async(ctx);
2081         if (ctx->sqo_mm)
2082                 mmdrop(ctx->sqo_mm);
2083
2084         io_iopoll_reap_events(ctx);
2085         io_sqe_buffer_unregister(ctx);
2086         io_sqe_files_unregister(ctx);
2087
2088 #if defined(CONFIG_UNIX)
2089         if (ctx->ring_sock)
2090                 sock_release(ctx->ring_sock);
2091 #endif
2092
2093         io_mem_free(ctx->sq_ring);
2094         io_mem_free(ctx->sq_sqes);
2095         io_mem_free(ctx->cq_ring);
2096
2097         percpu_ref_exit(&ctx->refs);
2098         if (ctx->account_mem)
2099                 io_unaccount_mem(ctx->user,
2100                                 ring_pages(ctx->sq_entries, ctx->cq_entries));
2101         free_uid(ctx->user);
2102         kfree(ctx);
2103 }
2104
2105 static __poll_t io_uring_poll(struct file *file, poll_table *wait)
2106 {
2107         struct io_ring_ctx *ctx = file->private_data;
2108         __poll_t mask = 0;
2109
2110         poll_wait(file, &ctx->cq_wait, wait);
2111         /* See comment at the top of this file */
2112         smp_rmb();
2113         if (READ_ONCE(ctx->sq_ring->r.tail) + 1 != ctx->cached_sq_head)
2114                 mask |= EPOLLOUT | EPOLLWRNORM;
2115         if (READ_ONCE(ctx->cq_ring->r.head) != ctx->cached_cq_tail)
2116                 mask |= EPOLLIN | EPOLLRDNORM;
2117
2118         return mask;
2119 }
2120
2121 static int io_uring_fasync(int fd, struct file *file, int on)
2122 {
2123         struct io_ring_ctx *ctx = file->private_data;
2124
2125         return fasync_helper(fd, file, on, &ctx->cq_fasync);
2126 }
2127
2128 static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
2129 {
2130         mutex_lock(&ctx->uring_lock);
2131         percpu_ref_kill(&ctx->refs);
2132         mutex_unlock(&ctx->uring_lock);
2133
2134         io_iopoll_reap_events(ctx);
2135         wait_for_completion(&ctx->ctx_done);
2136         io_ring_ctx_free(ctx);
2137 }
2138
2139 static int io_uring_release(struct inode *inode, struct file *file)
2140 {
2141         struct io_ring_ctx *ctx = file->private_data;
2142
2143         file->private_data = NULL;
2144         io_ring_ctx_wait_and_kill(ctx);
2145         return 0;
2146 }
2147
2148 static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
2149 {
2150         loff_t offset = (loff_t) vma->vm_pgoff << PAGE_SHIFT;
2151         unsigned long sz = vma->vm_end - vma->vm_start;
2152         struct io_ring_ctx *ctx = file->private_data;
2153         unsigned long pfn;
2154         struct page *page;
2155         void *ptr;
2156
2157         switch (offset) {
2158         case IORING_OFF_SQ_RING:
2159                 ptr = ctx->sq_ring;
2160                 break;
2161         case IORING_OFF_SQES:
2162                 ptr = ctx->sq_sqes;
2163                 break;
2164         case IORING_OFF_CQ_RING:
2165                 ptr = ctx->cq_ring;
2166                 break;
2167         default:
2168                 return -EINVAL;
2169         }
2170
2171         page = virt_to_head_page(ptr);
2172         if (sz > (PAGE_SIZE << compound_order(page)))
2173                 return -EINVAL;
2174
2175         pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
2176         return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
2177 }
2178
2179 SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
2180                 u32, min_complete, u32, flags, const sigset_t __user *, sig,
2181                 size_t, sigsz)
2182 {
2183         struct io_ring_ctx *ctx;
2184         long ret = -EBADF;
2185         int submitted = 0;
2186         struct fd f;
2187
2188         if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP))
2189                 return -EINVAL;
2190
2191         f = fdget(fd);
2192         if (!f.file)
2193                 return -EBADF;
2194
2195         ret = -EOPNOTSUPP;
2196         if (f.file->f_op != &io_uring_fops)
2197                 goto out_fput;
2198
2199         ret = -ENXIO;
2200         ctx = f.file->private_data;
2201         if (!percpu_ref_tryget(&ctx->refs))
2202                 goto out_fput;
2203
2204         /*
2205          * For SQ polling, the thread will do all submissions and completions.
2206          * Just return the requested submit count, and wake the thread if
2207          * we were asked to.
2208          */
2209         if (ctx->flags & IORING_SETUP_SQPOLL) {
2210                 if (flags & IORING_ENTER_SQ_WAKEUP)
2211                         wake_up(&ctx->sqo_wait);
2212                 submitted = to_submit;
2213                 goto out_ctx;
2214         }
2215
2216         ret = 0;
2217         if (to_submit) {
2218                 to_submit = min(to_submit, ctx->sq_entries);
2219
2220                 mutex_lock(&ctx->uring_lock);
2221                 submitted = io_ring_submit(ctx, to_submit);
2222                 mutex_unlock(&ctx->uring_lock);
2223
2224                 if (submitted < 0)
2225                         goto out_ctx;
2226         }
2227         if (flags & IORING_ENTER_GETEVENTS) {
2228                 unsigned nr_events = 0;
2229
2230                 min_complete = min(min_complete, ctx->cq_entries);
2231
2232                 /*
2233                  * The application could have included the 'to_submit' count
2234                  * in how many events it wanted to wait for. If we failed to
2235                  * submit the desired count, we may need to adjust the number
2236                  * of events to poll/wait for.
2237                  */
2238                 if (submitted < to_submit)
2239                         min_complete = min_t(unsigned, submitted, min_complete);
2240
2241                 if (ctx->flags & IORING_SETUP_IOPOLL) {
2242                         mutex_lock(&ctx->uring_lock);
2243                         ret = io_iopoll_check(ctx, &nr_events, min_complete);
2244                         mutex_unlock(&ctx->uring_lock);
2245                 } else {
2246                         ret = io_cqring_wait(ctx, min_complete, sig, sigsz);
2247                 }
2248         }
2249
2250 out_ctx:
2251         io_ring_drop_ctx_refs(ctx, 1);
2252 out_fput:
2253         fdput(f);
2254         return submitted ? submitted : ret;
2255 }
2256
2257 static const struct file_operations io_uring_fops = {
2258         .release        = io_uring_release,
2259         .mmap           = io_uring_mmap,
2260         .poll           = io_uring_poll,
2261         .fasync         = io_uring_fasync,
2262 };
2263
2264 static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
2265                                   struct io_uring_params *p)
2266 {
2267         struct io_sq_ring *sq_ring;
2268         struct io_cq_ring *cq_ring;
2269         size_t size;
2270
2271         sq_ring = io_mem_alloc(struct_size(sq_ring, array, p->sq_entries));
2272         if (!sq_ring)
2273                 return -ENOMEM;
2274
2275         ctx->sq_ring = sq_ring;
2276         sq_ring->ring_mask = p->sq_entries - 1;
2277         sq_ring->ring_entries = p->sq_entries;
2278         ctx->sq_mask = sq_ring->ring_mask;
2279         ctx->sq_entries = sq_ring->ring_entries;
2280
2281         size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
2282         if (size == SIZE_MAX)
2283                 return -EOVERFLOW;
2284
2285         ctx->sq_sqes = io_mem_alloc(size);
2286         if (!ctx->sq_sqes) {
2287                 io_mem_free(ctx->sq_ring);
2288                 return -ENOMEM;
2289         }
2290
2291         cq_ring = io_mem_alloc(struct_size(cq_ring, cqes, p->cq_entries));
2292         if (!cq_ring) {
2293                 io_mem_free(ctx->sq_ring);
2294                 io_mem_free(ctx->sq_sqes);
2295                 return -ENOMEM;
2296         }
2297
2298         ctx->cq_ring = cq_ring;
2299         cq_ring->ring_mask = p->cq_entries - 1;
2300         cq_ring->ring_entries = p->cq_entries;
2301         ctx->cq_mask = cq_ring->ring_mask;
2302         ctx->cq_entries = cq_ring->ring_entries;
2303         return 0;
2304 }
2305
2306 /*
2307  * Allocate an anonymous fd, this is what constitutes the application
2308  * visible backing of an io_uring instance. The application mmaps this
2309  * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
2310  * we have to tie this fd to a socket for file garbage collection purposes.
2311  */
2312 static int io_uring_get_fd(struct io_ring_ctx *ctx)
2313 {
2314         struct file *file;
2315         int ret;
2316
2317 #if defined(CONFIG_UNIX)
2318         ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
2319                                 &ctx->ring_sock);
2320         if (ret)
2321                 return ret;
2322 #endif
2323
2324         ret = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
2325         if (ret < 0)
2326                 goto err;
2327
2328         file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx,
2329                                         O_RDWR | O_CLOEXEC);
2330         if (IS_ERR(file)) {
2331                 put_unused_fd(ret);
2332                 ret = PTR_ERR(file);
2333                 goto err;
2334         }
2335
2336 #if defined(CONFIG_UNIX)
2337         ctx->ring_sock->file = file;
2338         ctx->ring_sock->sk->sk_user_data = ctx;
2339 #endif
2340         fd_install(ret, file);
2341         return ret;
2342 err:
2343 #if defined(CONFIG_UNIX)
2344         sock_release(ctx->ring_sock);
2345         ctx->ring_sock = NULL;
2346 #endif
2347         return ret;
2348 }
2349
2350 static int io_uring_create(unsigned entries, struct io_uring_params *p)
2351 {
2352         struct user_struct *user = NULL;
2353         struct io_ring_ctx *ctx;
2354         bool account_mem;
2355         int ret;
2356
2357         if (!entries || entries > IORING_MAX_ENTRIES)
2358                 return -EINVAL;
2359
2360         /*
2361          * Use twice as many entries for the CQ ring. It's possible for the
2362          * application to drive a higher depth than the size of the SQ ring,
2363          * since the sqes are only used at submission time. This allows for
2364          * some flexibility in overcommitting a bit.
2365          */
2366         p->sq_entries = roundup_pow_of_two(entries);
2367         p->cq_entries = 2 * p->sq_entries;
2368
2369         user = get_uid(current_user());
2370         account_mem = !capable(CAP_IPC_LOCK);
2371
2372         if (account_mem) {
2373                 ret = io_account_mem(user,
2374                                 ring_pages(p->sq_entries, p->cq_entries));
2375                 if (ret) {
2376                         free_uid(user);
2377                         return ret;
2378                 }
2379         }
2380
2381         ctx = io_ring_ctx_alloc(p);
2382         if (!ctx) {
2383                 if (account_mem)
2384                         io_unaccount_mem(user, ring_pages(p->sq_entries,
2385                                                                 p->cq_entries));
2386                 free_uid(user);
2387                 return -ENOMEM;
2388         }
2389         ctx->compat = in_compat_syscall();
2390         ctx->account_mem = account_mem;
2391         ctx->user = user;
2392
2393         ret = io_allocate_scq_urings(ctx, p);
2394         if (ret)
2395                 goto err;
2396
2397         ret = io_sq_offload_start(ctx, p);
2398         if (ret)
2399                 goto err;
2400
2401         ret = io_uring_get_fd(ctx);
2402         if (ret < 0)
2403                 goto err;
2404
2405         memset(&p->sq_off, 0, sizeof(p->sq_off));
2406         p->sq_off.head = offsetof(struct io_sq_ring, r.head);
2407         p->sq_off.tail = offsetof(struct io_sq_ring, r.tail);
2408         p->sq_off.ring_mask = offsetof(struct io_sq_ring, ring_mask);
2409         p->sq_off.ring_entries = offsetof(struct io_sq_ring, ring_entries);
2410         p->sq_off.flags = offsetof(struct io_sq_ring, flags);
2411         p->sq_off.dropped = offsetof(struct io_sq_ring, dropped);
2412         p->sq_off.array = offsetof(struct io_sq_ring, array);
2413
2414         memset(&p->cq_off, 0, sizeof(p->cq_off));
2415         p->cq_off.head = offsetof(struct io_cq_ring, r.head);
2416         p->cq_off.tail = offsetof(struct io_cq_ring, r.tail);
2417         p->cq_off.ring_mask = offsetof(struct io_cq_ring, ring_mask);
2418         p->cq_off.ring_entries = offsetof(struct io_cq_ring, ring_entries);
2419         p->cq_off.overflow = offsetof(struct io_cq_ring, overflow);
2420         p->cq_off.cqes = offsetof(struct io_cq_ring, cqes);
2421         return ret;
2422 err:
2423         io_ring_ctx_wait_and_kill(ctx);
2424         return ret;
2425 }
2426
2427 /*
2428  * Sets up an aio uring context, and returns the fd. Applications asks for a
2429  * ring size, we return the actual sq/cq ring sizes (among other things) in the
2430  * params structure passed in.
2431  */
2432 static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
2433 {
2434         struct io_uring_params p;
2435         long ret;
2436         int i;
2437
2438         if (copy_from_user(&p, params, sizeof(p)))
2439                 return -EFAULT;
2440         for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
2441                 if (p.resv[i])
2442                         return -EINVAL;
2443         }
2444
2445         if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
2446                         IORING_SETUP_SQ_AFF))
2447                 return -EINVAL;
2448
2449         ret = io_uring_create(entries, &p);
2450         if (ret < 0)
2451                 return ret;
2452
2453         if (copy_to_user(params, &p, sizeof(p)))
2454                 return -EFAULT;
2455
2456         return ret;
2457 }
2458
2459 SYSCALL_DEFINE2(io_uring_setup, u32, entries,
2460                 struct io_uring_params __user *, params)
2461 {
2462         return io_uring_setup(entries, params);
2463 }
2464
2465 static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
2466                                void __user *arg, unsigned nr_args)
2467 {
2468         int ret;
2469
2470         percpu_ref_kill(&ctx->refs);
2471         wait_for_completion(&ctx->ctx_done);
2472
2473         switch (opcode) {
2474         case IORING_REGISTER_BUFFERS:
2475                 ret = io_sqe_buffer_register(ctx, arg, nr_args);
2476                 break;
2477         case IORING_UNREGISTER_BUFFERS:
2478                 ret = -EINVAL;
2479                 if (arg || nr_args)
2480                         break;
2481                 ret = io_sqe_buffer_unregister(ctx);
2482                 break;
2483         case IORING_REGISTER_FILES:
2484                 ret = io_sqe_files_register(ctx, arg, nr_args);
2485                 break;
2486         case IORING_UNREGISTER_FILES:
2487                 ret = -EINVAL;
2488                 if (arg || nr_args)
2489                         break;
2490                 ret = io_sqe_files_unregister(ctx);
2491                 break;
2492         default:
2493                 ret = -EINVAL;
2494                 break;
2495         }
2496
2497         /* bring the ctx back to life */
2498         reinit_completion(&ctx->ctx_done);
2499         percpu_ref_reinit(&ctx->refs);
2500         return ret;
2501 }
2502
2503 SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
2504                 void __user *, arg, unsigned int, nr_args)
2505 {
2506         struct io_ring_ctx *ctx;
2507         long ret = -EBADF;
2508         struct fd f;
2509
2510         f = fdget(fd);
2511         if (!f.file)
2512                 return -EBADF;
2513
2514         ret = -EOPNOTSUPP;
2515         if (f.file->f_op != &io_uring_fops)
2516                 goto out_fput;
2517
2518         ctx = f.file->private_data;
2519
2520         mutex_lock(&ctx->uring_lock);
2521         ret = __io_uring_register(ctx, opcode, arg, nr_args);
2522         mutex_unlock(&ctx->uring_lock);
2523 out_fput:
2524         fdput(f);
2525         return ret;
2526 }
2527
2528 static int __init io_uring_init(void)
2529 {
2530         req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
2531         return 0;
2532 };
2533 __initcall(io_uring_init);