1 --- linux-2.6.27.21/fs/aio.c 2009-03-23 22:04:09.000000000 +0000
2 +++ linux-2.6.27.8/fs/aio.c 2009-03-29 15:53:57.000000000 +0000
4 #include <asm/uaccess.h>
5 #include <asm/mmu_context.h>
8 +#include <linux/poll.h>
9 +#include <linux/eventpoll.h>
13 #define dprintk printk
18 INIT_LIST_HEAD(&req->ki_run_list);
19 - req->ki_eventfd = NULL;
20 + req->ki_eventfd = ERR_PTR(-EINVAL);
22 /* Check if the completion queue has enough free space to
23 * accept an event from this io.
26 assert_spin_locked(&ctx->ctx_lock);
28 + if (!IS_ERR(req->ki_eventfd))
29 + fput(req->ki_eventfd);
32 if (req->ki_iovec != &req->ki_inline_vec)
34 list_del(&req->ki_list);
35 spin_unlock_irq(&fput_lock);
37 - /* Complete the fput(s) */
38 - if (req->ki_filp != NULL)
39 - __fput(req->ki_filp);
40 - if (req->ki_eventfd != NULL)
41 - __fput(req->ki_eventfd);
42 + /* Complete the fput */
43 + __fput(req->ki_filp);
45 /* Link the iocb into the context's free list */
46 spin_lock_irq(&ctx->ctx_lock);
49 static int __aio_put_req(struct kioctx *ctx, struct kiocb *req)
51 - int schedule_putreq = 0;
53 dprintk(KERN_DEBUG "aio_put(%p): f_count=%ld\n",
54 req, atomic_long_read(&req->ki_filp->f_count));
56 assert_spin_locked(&ctx->ctx_lock);
60 BUG_ON(req->ki_users < 0);
61 if (likely(req->ki_users))
64 req->ki_cancel = NULL;
68 - * Try to optimize the aio and eventfd file* puts, by avoiding to
69 - * schedule work in case it is not __fput() time. In normal cases,
70 - * we would not be holding the last reference to the file*, so
71 - * this function will be executed w/out any aio kthread wakeup.
72 + /* Must be done under the lock to serialise against cancellation.
73 + * Call this aio_fput as it duplicates fput via the fput_work.
75 - if (unlikely(atomic_long_dec_and_test(&req->ki_filp->f_count)))
78 - req->ki_filp = NULL;
79 - if (req->ki_eventfd != NULL) {
80 - if (unlikely(atomic_long_dec_and_test(&req->ki_eventfd->f_count)))
83 - req->ki_eventfd = NULL;
85 - if (unlikely(schedule_putreq)) {
86 + if (unlikely(atomic_long_dec_and_test(&req->ki_filp->f_count))) {
88 spin_lock(&fput_lock);
89 list_add(&req->ki_list, &fput_head);
91 * eventfd. The eventfd_signal() function is safe to be called
94 - if (iocb->ki_eventfd != NULL)
95 + if (!IS_ERR(iocb->ki_eventfd))
96 eventfd_signal(iocb->ki_eventfd, 1);
99 @@ -1026,6 +1015,11 @@
100 if (waitqueue_active(&ctx->wait))
104 + if (ctx->file && waitqueue_active(&ctx->poll_wait))
105 + wake_up(&ctx->poll_wait);
108 spin_unlock_irqrestore(&ctx->ctx_lock, flags);
111 @@ -1033,6 +1027,8 @@
113 * Pull an event off of the ioctx's event ring. Returns the number of
114 * events fetched (0 or 1 ;-)
115 + * If ent parameter is 0, just returns the number of events that would
117 * FIXME: make this use cmpxchg.
118 * TODO: make the ringbuffer user mmap()able (requires FIXME).
120 @@ -1055,13 +1051,18 @@
122 head = ring->head % info->nr;
123 if (head != ring->tail) {
124 - struct io_event *evp = aio_ring_event(info, head, KM_USER1);
126 - head = (head + 1) % info->nr;
127 - smp_mb(); /* finish reading the event before updatng the head */
130 - put_aio_ring_event(evp, KM_USER1);
131 + if (ent) { /* event requested */
132 + struct io_event *evp =
133 + aio_ring_event(info, head, KM_USER1);
135 + head = (head + 1) % info->nr;
136 + /* finish reading the event before updating the head */
140 + put_aio_ring_event(evp, KM_USER1);
141 + } else /* only need to know availability */
144 spin_unlock(&info->ring_lock);
146 @@ -1251,6 +1252,13 @@
148 aio_cancel_all(ioctx);
149 wait_for_all_aios(ioctx);
151 + /* forget the poll file, but it's up to the user to close it */
153 + ioctx->file->private_data = 0;
159 * Wake up any waiters. The setting of ctx->dead must be seen
160 @@ -1261,6 +1269,67 @@
161 put_ioctx(ioctx); /* once for the lookup */
166 +static int aio_queue_fd_close(struct inode *inode, struct file *file)
168 + struct kioctx *ioctx = file->private_data;
170 + file->private_data = 0;
171 + spin_lock_irq(&ioctx->ctx_lock);
173 + spin_unlock_irq(&ioctx->ctx_lock);
178 +static unsigned int aio_queue_fd_poll(struct file *file, poll_table *wait)
179 +{ unsigned int pollflags = 0;
180 + struct kioctx *ioctx = file->private_data;
184 + spin_lock_irq(&ioctx->ctx_lock);
185 + /* Insert inside our poll wait queue */
186 + poll_wait(file, &ioctx->poll_wait, wait);
188 + /* Check our condition */
189 + if (aio_read_evt(ioctx, 0))
190 + pollflags = POLLIN | POLLRDNORM;
191 + spin_unlock_irq(&ioctx->ctx_lock);
197 +static const struct file_operations aioq_fops = {
198 + .release = aio_queue_fd_close,
199 + .poll = aio_queue_fd_poll
203 + * Create a file descriptor that can be used to poll the event queue.
204 + * Based and piggybacked on the excellent epoll code.
207 +static int make_aio_fd(struct kioctx *ioctx)
210 + struct inode *inode;
213 + error = ep_getfd(&fd, &inode, &file, NULL, &aioq_fops);
217 + /* associate the file with the IO context */
218 + file->private_data = ioctx;
219 + ioctx->file = file;
220 + init_waitqueue_head(&ioctx->poll_wait);
226 * Create an aio_context capable of receiving at least nr_events.
227 * ctxp must not point to an aio_context that already exists, and
228 @@ -1273,18 +1342,30 @@
229 * resources are available. May fail with -EFAULT if an invalid
230 * pointer is passed for ctxp. Will fail with -ENOSYS if not
233 + * To request a selectable fd, the user context has to be initialized
234 + * to 1, instead of 0, and the return value is the fd.
235 + * This keeps the system call compatible, since a non-zero value
236 + * was not allowed so far.
238 -SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp)
239 +asmlinkage long sys_io_setup(unsigned nr_events, aio_context_t __user *ctxp)
241 struct kioctx *ioctx = NULL;
246 ret = get_user(ctx, ctxp);
257 if (unlikely(ctx || nr_events == 0)) {
258 pr_debug("EINVAL: io_setup: ctx %lu nr_events %u\n",
260 @@ -1295,8 +1376,12 @@
261 ret = PTR_ERR(ioctx);
262 if (!IS_ERR(ioctx)) {
263 ret = put_user(ioctx->user_id, ctxp);
267 + if (make_fd && ret >= 0)
268 + ret = make_aio_fd(ioctx);
273 get_ioctx(ioctx); /* io_destroy() expects us to hold a ref */
275 @@ -1312,7 +1397,7 @@
276 * implemented. May fail with -EFAULT if the context pointed to
279 -SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx)
280 +asmlinkage long sys_io_destroy(aio_context_t ctx)
282 struct kioctx *ioctx = lookup_ioctx(ctx);
283 if (likely(NULL != ioctx)) {
284 @@ -1612,7 +1697,6 @@
285 req->ki_eventfd = eventfd_fget((int) iocb->aio_resfd);
286 if (IS_ERR(req->ki_eventfd)) {
287 ret = PTR_ERR(req->ki_eventfd);
288 - req->ki_eventfd = NULL;
292 @@ -1667,8 +1751,8 @@
293 * are available to queue any iocbs. Will return 0 if nr is 0. Will
294 * fail with -ENOSYS if not implemented.
296 -SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
297 - struct iocb __user * __user *, iocbpp)
298 +asmlinkage long sys_io_submit(aio_context_t ctx_id, long nr,
299 + struct iocb __user * __user *iocbpp)
303 @@ -1742,8 +1826,8 @@
304 * invalid. May fail with -EAGAIN if the iocb specified was not
305 * cancelled. Will fail with -ENOSYS if not implemented.
307 -SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
308 - struct io_event __user *, result)
309 +asmlinkage long sys_io_cancel(aio_context_t ctx_id, struct iocb __user *iocb,
310 + struct io_event __user *result)
312 int (*cancel)(struct kiocb *iocb, struct io_event *res);
314 @@ -1804,11 +1888,11 @@
315 * will be updated if not NULL and the operation blocks. Will fail
316 * with -ENOSYS if not implemented.
318 -SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id,
321 - struct io_event __user *, events,
322 - struct timespec __user *, timeout)
323 +asmlinkage long sys_io_getevents(aio_context_t ctx_id,
326 + struct io_event __user *events,
327 + struct timespec __user *timeout)
329 struct kioctx *ioctx = lookup_ioctx(ctx_id);