--- linux-2.6.27.21/fs/aio.c	2009-03-23 22:04:09.000000000 +0000
+++ linux-2.6.27.8/fs/aio.c	2009-03-29 15:53:57.000000000 +0000
@@ -36,6 +36,11 @@
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
 
+#ifdef CONFIG_EPOLL
+#include <linux/poll.h>
+#include <linux/eventpoll.h>
+#endif
+
 #if DEBUG > 1
 #define dprintk		printk
 #else
@@ -428,7 +433,7 @@
 	req->private = NULL;
 	req->ki_iovec = NULL;
 	INIT_LIST_HEAD(&req->ki_run_list);
-	req->ki_eventfd = NULL;
+	req->ki_eventfd = ERR_PTR(-EINVAL);
 
 	/* Check if the completion queue has enough free space to
 	 * accept an event from this io.
@@ -470,6 +475,8 @@
 {
 	assert_spin_locked(&ctx->ctx_lock);
 
+	if (!IS_ERR(req->ki_eventfd))
+		fput(req->ki_eventfd);
 	if (req->ki_dtor)
 		req->ki_dtor(req);
 	if (req->ki_iovec != &req->ki_inline_vec)
@@ -491,11 +498,8 @@
 		list_del(&req->ki_list);
 		spin_unlock_irq(&fput_lock);
 
-		/* Complete the fput(s) */
-		if (req->ki_filp != NULL)
-			__fput(req->ki_filp);
-		if (req->ki_eventfd != NULL)
-			__fput(req->ki_eventfd);
+		/* Complete the fput */
+		__fput(req->ki_filp);
 
 		/* Link the iocb into the context's free list */
 		spin_lock_irq(&ctx->ctx_lock);
@@ -513,14 +517,12 @@
  */
 static int __aio_put_req(struct kioctx *ctx, struct kiocb *req)
 {
-	int schedule_putreq = 0;
-
 	dprintk(KERN_DEBUG "aio_put(%p): f_count=%ld\n",
 		req, atomic_long_read(&req->ki_filp->f_count));
 
 	assert_spin_locked(&ctx->ctx_lock);
 
 	req->ki_users--;
 	BUG_ON(req->ki_users < 0);
 	if (likely(req->ki_users))
 		return 0;
@@ -528,23 +530,10 @@
 	req->ki_cancel = NULL;
 	req->ki_retry = NULL;
 
-	/*
-	 * Try to optimize the aio and eventfd file* puts, by avoiding to
-	 * schedule work in case it is not __fput() time. In normal cases,
-	 * we would not be holding the last reference to the file*, so
-	 * this function will be executed w/out any aio kthread wakeup.
+	/* Must be done under the lock to serialise against cancellation.
+	 * Call this aio_fput as it duplicates fput via the fput_work.
 	 */
-	if (unlikely(atomic_long_dec_and_test(&req->ki_filp->f_count)))
-		schedule_putreq++;
-	else
-		req->ki_filp = NULL;
-	if (req->ki_eventfd != NULL) {
-		if (unlikely(atomic_long_dec_and_test(&req->ki_eventfd->f_count)))
-			schedule_putreq++;
-		else
-			req->ki_eventfd = NULL;
-	}
-	if (unlikely(schedule_putreq)) {
+	if (unlikely(atomic_long_dec_and_test(&req->ki_filp->f_count))) {
 		get_ioctx(ctx);
 		spin_lock(&fput_lock);
 		list_add(&req->ki_list, &fput_head);
@@ -1008,7 +997,7 @@
 	 * eventfd. The eventfd_signal() function is safe to be called
 	 * from IRQ context.
 	 */
-	if (iocb->ki_eventfd != NULL)
+	if (!IS_ERR(iocb->ki_eventfd))
 		eventfd_signal(iocb->ki_eventfd, 1);
 
 put_rq:
@@ -1026,6 +1015,11 @@
 	if (waitqueue_active(&ctx->wait))
 		wake_up(&ctx->wait);
 
+#ifdef CONFIG_EPOLL
+	if (ctx->file && waitqueue_active(&ctx->poll_wait))
+		wake_up(&ctx->poll_wait);
+#endif
+
 	spin_unlock_irqrestore(&ctx->ctx_lock, flags);
 	return ret;
 }
@@ -1033,6 +1027,8 @@
 /* aio_read_evt
  *	Pull an event off of the ioctx's event ring.  Returns the number of
  *	events fetched (0 or 1 ;-)
+ *	If the ent parameter is 0, just returns the number of events that
+ *	would be fetched.
 *	FIXME: make this use cmpxchg.
 *	TODO: make the ringbuffer user mmap()able (requires FIXME).
 */
@@ -1055,13 +1051,18 @@
 
 	head = ring->head % info->nr;
 	if (head != ring->tail) {
-		struct io_event *evp = aio_ring_event(info, head, KM_USER1);
-		*ent = *evp;
-		head = (head + 1) % info->nr;
-		smp_mb(); /* finish reading the event before updatng the head */
-		ring->head = head;
-		ret = 1;
-		put_aio_ring_event(evp, KM_USER1);
+		if (ent) { /* event requested */
+			struct io_event *evp =
+				aio_ring_event(info, head, KM_USER1);
+			*ent = *evp;
+			head = (head + 1) % info->nr;
+			/* finish reading the event before updating the head */
+			smp_mb();
+			ring->head = head;
+			ret = 1;
+			put_aio_ring_event(evp, KM_USER1);
+		} else /* only need to know availability */
+			ret = 1;
 	}
 	spin_unlock(&info->ring_lock);
 
@@ -1251,6 +1252,13 @@
 
 	aio_cancel_all(ioctx);
 	wait_for_all_aios(ioctx);
+#ifdef CONFIG_EPOLL
+	/* forget the poll file, but it's up to the user to close it */
+	if (ioctx->file) {
+		ioctx->file->private_data = NULL;
+		ioctx->file = NULL;
+	}
+#endif
 
 	/*
 	 * Wake up any waiters. The setting of ctx->dead must be seen
@@ -1261,6 +1269,67 @@
 	put_ioctx(ioctx);	/* once for the lookup */
 }
 
+#ifdef CONFIG_EPOLL
+
+static int aio_queue_fd_close(struct inode *inode, struct file *file)
+{
+	struct kioctx *ioctx = file->private_data;
+	if (ioctx) {
+		file->private_data = NULL;
+		spin_lock_irq(&ioctx->ctx_lock);
+		ioctx->file = NULL;
+		spin_unlock_irq(&ioctx->ctx_lock);
+	}
+	return 0;
+}
+
+static unsigned int aio_queue_fd_poll(struct file *file, poll_table *wait)
+{
+	unsigned int pollflags = 0;
+	struct kioctx *ioctx = file->private_data;
+
+	if (ioctx) {
+		spin_lock_irq(&ioctx->ctx_lock);
+		/* Insert inside our poll wait queue */
+		poll_wait(file, &ioctx->poll_wait, wait);
+
+		/* Check our condition */
+		if (aio_read_evt(ioctx, 0))
+			pollflags = POLLIN | POLLRDNORM;
+		spin_unlock_irq(&ioctx->ctx_lock);
+	}
+
+	return pollflags;
+}
+
+static const struct file_operations aioq_fops = {
+	.release	= aio_queue_fd_close,
+	.poll		= aio_queue_fd_poll
+};
+
+/* make_aio_fd:
+ *	Create a file descriptor that can be used to poll the event queue.
+ *	Based and piggybacked on the excellent epoll code.
+ */
+
+static int make_aio_fd(struct kioctx *ioctx)
+{
+	int error, fd;
+	struct inode *inode;
+	struct file *file;
+
+	error = ep_getfd(&fd, &inode, &file, NULL, &aioq_fops);
+	if (error)
+		return error;
+
+	/* associate the file with the IO context */
+	file->private_data = ioctx;
+	ioctx->file = file;
+	init_waitqueue_head(&ioctx->poll_wait);
+	return fd;
+}
+#endif
+
 /* sys_io_setup:
 *	Create an aio_context capable of receiving at least nr_events.
 *	ctxp must not point to an aio_context that already exists, and
@@ -1273,18 +1342,30 @@
 *	resources are available.  May fail with -EFAULT if an invalid
 *	pointer is passed for ctxp.  Will fail with -ENOSYS if not
 *	implemented.
+ *
+ *	To request a selectable fd, the user context has to be initialized
+ *	to 1, instead of 0, and the return value is the fd.
+ *	This keeps the system call compatible, since a non-zero value
+ *	was not allowed so far.
 */
-SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp)
+asmlinkage long sys_io_setup(unsigned nr_events, aio_context_t __user *ctxp)
 {
 	struct kioctx *ioctx = NULL;
 	unsigned long ctx;
 	long ret;
+	int make_fd = 0;
 
 	ret = get_user(ctx, ctxp);
 	if (unlikely(ret))
 		goto out;
 
 	ret = -EINVAL;
+#ifdef CONFIG_EPOLL
+	if (ctx == 1) {
+		make_fd = 1;
+		ctx = 0;
+	}
+#endif
 	if (unlikely(ctx || nr_events == 0)) {
 		pr_debug("EINVAL: io_setup: ctx %lu nr_events %u\n",
 		         ctx, nr_events);
@@ -1295,8 +1376,12 @@
 	ret = PTR_ERR(ioctx);
 	if (!IS_ERR(ioctx)) {
 		ret = put_user(ioctx->user_id, ctxp);
-		if (!ret)
-			return 0;
+#ifdef CONFIG_EPOLL
+		if (make_fd && ret >= 0)
+			ret = make_aio_fd(ioctx);
+#endif
+		if (ret >= 0)
+			return ret;
 
 		get_ioctx(ioctx); /* io_destroy() expects us to hold a ref */
 		io_destroy(ioctx);
@@ -1312,7 +1397,7 @@
 *	implemented.  May fail with -EFAULT if the context pointed to
 *	is invalid.
 */
-SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx)
+asmlinkage long sys_io_destroy(aio_context_t ctx)
 {
 	struct kioctx *ioctx = lookup_ioctx(ctx);
 	if (likely(NULL != ioctx)) {
@@ -1612,7 +1697,6 @@
 		req->ki_eventfd = eventfd_fget((int) iocb->aio_resfd);
 		if (IS_ERR(req->ki_eventfd)) {
 			ret = PTR_ERR(req->ki_eventfd);
-			req->ki_eventfd = NULL;
 			goto out_put_req;
 		}
 	}
@@ -1667,8 +1751,8 @@
 *	are available to queue any iocbs.  Will return 0 if nr is 0.  Will
 *	fail with -ENOSYS if not implemented.
 */
-SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
-		struct iocb __user * __user *, iocbpp)
+asmlinkage long sys_io_submit(aio_context_t ctx_id, long nr,
+			      struct iocb __user * __user *iocbpp)
 {
 	struct kioctx *ctx;
 	long ret = 0;
@@ -1742,8 +1826,8 @@
 *	invalid.  May fail with -EAGAIN if the iocb specified was not
 *	cancelled.  Will fail with -ENOSYS if not implemented.
 */
-SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
-		struct io_event __user *, result)
+asmlinkage long sys_io_cancel(aio_context_t ctx_id, struct iocb __user *iocb,
+			      struct io_event __user *result)
 {
 	int (*cancel)(struct kiocb *iocb, struct io_event *res);
 	struct kioctx *ctx;
@@ -1804,11 +1888,11 @@
 *	will be updated if not NULL and the operation blocks.  Will fail
 *	with -ENOSYS if not implemented.
 */
-SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id,
-		long, min_nr,
-		long, nr,
-		struct io_event __user *, events,
-		struct timespec __user *, timeout)
+asmlinkage long sys_io_getevents(aio_context_t ctx_id,
+				 long min_nr,
+				 long nr,
+				 struct io_event __user *events,
+				 struct timespec __user *timeout)
 {
 	struct kioctx *ioctx = lookup_ioctx(ctx_id);
 	long ret = -EINVAL;
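
Usage note (not part of the diff above): with CONFIG_EPOLL enabled, this patch makes an io_setup() context pollable. As the new sys_io_setup() comment states, userspace requests the selectable fd by initializing the context word to 1 instead of 0; the real context id is still written back through ctxp, and the syscall's return value becomes the fd. Readiness is driven by aio_queue_fd_poll(), which probes the ring via aio_read_evt(ioctx, 0) without consuming an event, and io_destroy() only forgets the file, so the caller still owes a close(). A minimal caller sketch, assuming a kernel carrying this patch; the queue depth of 128 and the raw syscall(2) calls (no libaio) are illustrative:

#include <linux/aio_abi.h>
#include <poll.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <time.h>
#include <unistd.h>

int main(void)
{
	aio_context_t ctx = 1;	/* 1 requests the selectable fd (patched kernels only) */
	int fd = syscall(SYS_io_setup, 128, &ctx);	/* ctx now holds the real context id */
	if (fd < 0) {
		perror("io_setup");
		return 1;
	}

	/* ... queue requests with syscall(SYS_io_submit, ctx, ...) here ... */

	struct pollfd pfd = { .fd = fd, .events = POLLIN };
	if (poll(&pfd, 1, -1) == 1 && (pfd.revents & POLLIN)) {
		struct io_event ev;
		struct timespec ts = { 0, 0 };	/* an event is ready; don't block */
		if (syscall(SYS_io_getevents, ctx, 1, 1, &ev, &ts) == 1)
			printf("iocb completed, res=%lld\n", (long long)ev.res);
	}

	syscall(SYS_io_destroy, ctx);
	close(fd);	/* io_destroy() forgot the file; closing it is on us */
	return 0;
}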