]>
Commit | Line | Data |
---|---|---|
1 | From: Peter Zijlstra <a.p.zijlstra@chello.nl> | |
2 | Subject: nfs: enable swap on NFS | |
3 | Patch-mainline: No | |
4 | References: FATE#303834 | |
5 | ||
6 | Implement all the new swapfile a_ops for NFS. This sets the NFS socket to | |
7 | SOCK_MEMALLOC, runs socket reconnect under PF_MEMALLOC, and sets | |
8 | SOCK_MEMALLOC again before engaging the protocol ->connect() method. | |
9 | ||
10 | PF_MEMALLOC should allow the allocation of struct socket and related objects, | |
11 | and the early (re)setting of SOCK_MEMALLOC should allow us to receive the | |
12 | packets required to establish the TCP connection. | |
13 | ||
14 | (With this, swapping continues across a server reset during heavy network traffic.) | |
15 | ||
16 | Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> | |
17 | Acked-by: Neil Brown <neilb@suse.de> | |
18 | Acked-by: Suresh Jayaraman <sjayaraman@suse.de> | |
19 | ||
20 | --- | |
21 | fs/Kconfig | 17 ++++++++++ | |
22 | fs/nfs/file.c | 18 ++++++++++ | |
23 | fs/nfs/write.c | 22 +++++++++++++ | |
24 | include/linux/nfs_fs.h | 2 + | |
25 | include/linux/sunrpc/xprt.h | 5 ++- | |
26 | net/sunrpc/sched.c | 9 ++++- | |
27 | net/sunrpc/xprtsock.c | 73 ++++++++++++++++++++++++++++++++++++++++++++ | |
28 | 7 files changed, 143 insertions(+), 3 deletions(-) | |
29 | ||
30 | --- a/fs/Kconfig | |
31 | +++ b/fs/Kconfig | |
32 | @@ -1748,6 +1748,18 @@ config ROOT_NFS | |
33 | ||
34 | Most people say N here. | |
35 | ||
36 | +config NFS_SWAP | |
37 | + bool "Provide swap over NFS support" | |
38 | + default n | |
39 | + depends on NFS_FS | |
40 | + select SUNRPC_SWAP | |
41 | + help | |
42 | + This option enables swapon to work on files located on NFS mounts. | |
43 | + | |
44 | + For more details, see Documentation/network-swap.txt | |
45 | + | |
46 | + If unsure, say N. | |
47 | + | |
48 | config NFSD | |
49 | tristate "NFS server support" | |
50 | depends on INET | |
51 | @@ -1869,6 +1881,11 @@ config SUNRPC_XPRT_RDMA | |
52 | ||
53 | If unsure, say N. | |
54 | ||
55 | +config SUNRPC_SWAP | |
56 | + def_bool n | |
57 | + depends on SUNRPC | |
58 | + select NETVM | |
59 | + | |
60 | config RPCSEC_GSS_KRB5 | |
61 | tristate "Secure RPC: Kerberos V mechanism (EXPERIMENTAL)" | |
62 | depends on SUNRPC && EXPERIMENTAL | |
63 | --- a/fs/nfs/file.c | |
64 | +++ b/fs/nfs/file.c | |
65 | @@ -434,6 +434,18 @@ static int nfs_launder_page(struct page | |
66 | return nfs_wb_page(inode, page); | |
67 | } | |
68 | ||
69 | +#ifdef CONFIG_NFS_SWAP | |
70 | +static int nfs_swapon(struct file *file) | |
71 | +{ | |
72 | + return xs_swapper(NFS_CLIENT(file->f_mapping->host)->cl_xprt, 1); | |
73 | +} | |
74 | + | |
75 | +static int nfs_swapoff(struct file *file) | |
76 | +{ | |
77 | + return xs_swapper(NFS_CLIENT(file->f_mapping->host)->cl_xprt, 0); | |
78 | +} | |
79 | +#endif | |
80 | + | |
81 | const struct address_space_operations nfs_file_aops = { | |
82 | .readpage = nfs_readpage, | |
83 | .readpages = nfs_readpages, | |
84 | @@ -446,6 +458,12 @@ const struct address_space_operations nf | |
85 | .releasepage = nfs_release_page, | |
86 | .direct_IO = nfs_direct_IO, | |
87 | .launder_page = nfs_launder_page, | |
88 | +#ifdef CONFIG_NFS_SWAP | |
89 | + .swapon = nfs_swapon, | |
90 | + .swapoff = nfs_swapoff, | |
91 | + .swap_out = nfs_swap_out, | |
92 | + .swap_in = nfs_readpage, | |
93 | +#endif | |
94 | }; | |
95 | ||
96 | static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) | |
97 | --- a/fs/nfs/write.c | |
98 | +++ b/fs/nfs/write.c | |
99 | @@ -336,6 +336,28 @@ int nfs_writepage(struct page *page, str | |
100 | return ret; | |
101 | } | |
102 | ||
103 | +static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page, | |
104 | + unsigned int offset, unsigned int count); | |
105 | + | |
106 | +int nfs_swap_out(struct file *file, struct page *page, | |
107 | + struct writeback_control *wbc) | |
108 | +{ | |
109 | + struct nfs_open_context *ctx = nfs_file_open_context(file); | |
110 | + int status; | |
111 | + | |
112 | + status = nfs_writepage_setup(ctx, page, 0, nfs_page_length(page)); | |
113 | + if (status < 0) { | |
114 | + nfs_set_pageerror(page); | |
115 | + goto out; | |
116 | + } | |
117 | + | |
118 | + status = nfs_writepage_locked(page, wbc); | |
119 | + | |
120 | +out: | |
121 | + unlock_page(page); | |
122 | + return status; | |
123 | +} | |
124 | + | |
125 | static int nfs_writepages_callback(struct page *page, struct writeback_control *wbc, void *data) | |
126 | { | |
127 | int ret; | |
128 | --- a/include/linux/nfs_fs.h | |
129 | +++ b/include/linux/nfs_fs.h | |
130 | @@ -464,6 +464,8 @@ extern int nfs_writepages(struct addres | |
131 | extern int nfs_flush_incompatible(struct file *file, struct page *page); | |
132 | extern int nfs_updatepage(struct file *, struct page *, unsigned int, unsigned int); | |
133 | extern int nfs_writeback_done(struct rpc_task *, struct nfs_write_data *); | |
134 | +extern int nfs_swap_out(struct file *file, struct page *page, | |
135 | + struct writeback_control *wbc); | |
136 | ||
137 | /* | |
138 | * Try to write back everything synchronously (but check the | |
139 | --- a/include/linux/sunrpc/xprt.h | |
140 | +++ b/include/linux/sunrpc/xprt.h | |
141 | @@ -147,7 +147,9 @@ struct rpc_xprt { | |
142 | unsigned int max_reqs; /* total slots */ | |
143 | unsigned long state; /* transport state */ | |
144 | unsigned char shutdown : 1, /* being shut down */ | |
145 | - resvport : 1; /* use a reserved port */ | |
146 | + resvport : 1, /* use a reserved port */ | |
147 | + swapper : 1; /* we're swapping over this | |
148 | + transport */ | |
149 | unsigned int bind_index; /* bind function index */ | |
150 | ||
151 | /* | |
152 | @@ -249,6 +251,7 @@ void xprt_release_rqst_cong(struct rpc | |
153 | void xprt_disconnect_done(struct rpc_xprt *xprt); | |
154 | void xprt_force_disconnect(struct rpc_xprt *xprt); | |
155 | void xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie); | |
156 | +int xs_swapper(struct rpc_xprt *xprt, int enable); | |
157 | ||
158 | /* | |
159 | * Reserved bit positions in xprt->state | |
160 | --- a/net/sunrpc/sched.c | |
161 | +++ b/net/sunrpc/sched.c | |
162 | @@ -729,7 +729,10 @@ struct rpc_buffer { | |
163 | void *rpc_malloc(struct rpc_task *task, size_t size) | |
164 | { | |
165 | struct rpc_buffer *buf; | |
166 | - gfp_t gfp = RPC_IS_SWAPPER(task) ? GFP_ATOMIC : GFP_NOWAIT; | |
167 | + gfp_t gfp = GFP_NOWAIT; | |
168 | + | |
169 | + if (RPC_IS_SWAPPER(task)) | |
170 | + gfp |= __GFP_MEMALLOC; | |
171 | ||
172 | size += sizeof(struct rpc_buffer); | |
173 | if (size <= RPC_BUFFER_MAXSIZE) | |
174 | @@ -800,6 +803,8 @@ static void rpc_init_task(struct rpc_tas | |
175 | kref_get(&task->tk_client->cl_kref); | |
176 | if (task->tk_client->cl_softrtry) | |
177 | task->tk_flags |= RPC_TASK_SOFT; | |
178 | + if (task->tk_client->cl_xprt->swapper) | |
179 | + task->tk_flags |= RPC_TASK_SWAPPER; | |
180 | } | |
181 | ||
182 | if (task->tk_ops->rpc_call_prepare != NULL) | |
183 | @@ -825,7 +830,7 @@ static void rpc_init_task(struct rpc_tas | |
184 | static struct rpc_task * | |
185 | rpc_alloc_task(void) | |
186 | { | |
187 | - return (struct rpc_task *)mempool_alloc(rpc_task_mempool, GFP_NOFS); | |
188 | + return (struct rpc_task *)mempool_alloc(rpc_task_mempool, GFP_NOIO); | |
189 | } | |
190 | ||
191 | /* | |
192 | --- a/net/sunrpc/xprtsock.c | |
193 | +++ b/net/sunrpc/xprtsock.c | |
194 | @@ -1454,6 +1454,55 @@ static inline void xs_reclassify_socket6 | |
195 | } | |
196 | #endif | |
197 | ||
198 | +#ifdef CONFIG_SUNRPC_SWAP | |
199 | +static void xs_set_memalloc(struct rpc_xprt *xprt) | |
200 | +{ | |
201 | + struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); | |
202 | + | |
203 | + if (xprt->swapper) | |
204 | + sk_set_memalloc(transport->inet); | |
205 | +} | |
206 | + | |
207 | +#define RPC_BUF_RESERVE_PAGES \ | |
208 | + kmalloc_estimate_objs(sizeof(struct rpc_rqst), GFP_KERNEL, RPC_MAX_SLOT_TABLE) | |
209 | +#define RPC_RESERVE_PAGES (RPC_BUF_RESERVE_PAGES + TX_RESERVE_PAGES) | |
210 | + | |
211 | +/** | |
212 | + * xs_swapper - Tag or untag this transport as being used for swap. | |
213 | + * @xprt: transport to tag | |
214 | + * @enable: non-zero to enable, zero to disable | |
215 | + * Returns 0, or the sk_adjust_memalloc() error if enabling fails. | |
216 | + */ | |
217 | +int xs_swapper(struct rpc_xprt *xprt, int enable) | |
218 | +{ | |
219 | + struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); | |
220 | + int err = 0; | |
221 | + | |
222 | + if (enable) { | |
223 | + /* | |
224 | + * keep one extra sock reference so the reserve won't dip | |
225 | + * when the socket gets reconnected. | |
226 | + */ | |
227 | + err = sk_adjust_memalloc(1, RPC_RESERVE_PAGES); | |
228 | + if (!err) { | |
229 | + xprt->swapper = 1; | |
230 | + xs_set_memalloc(xprt); | |
231 | + } | |
232 | + } else if (xprt->swapper) { | |
233 | + xprt->swapper = 0; | |
234 | + sk_clear_memalloc(transport->inet); | |
235 | + sk_adjust_memalloc(-1, -RPC_RESERVE_PAGES); | |
236 | + } | |
237 | + | |
238 | + return err; | |
239 | +} | |
240 | +EXPORT_SYMBOL_GPL(xs_swapper); | |
241 | +#else | |
242 | +static void xs_set_memalloc(struct rpc_xprt *xprt) | |
243 | +{ | |
244 | +} | |
245 | +#endif | |
246 | + | |
247 | static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) | |
248 | { | |
249 | struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); | |
250 | @@ -1478,6 +1527,8 @@ static void xs_udp_finish_connecting(str | |
251 | transport->sock = sock; | |
252 | transport->inet = sk; | |
253 | ||
254 | + xs_set_memalloc(xprt); | |
255 | + | |
256 | write_unlock_bh(&sk->sk_callback_lock); | |
257 | } | |
258 | xs_udp_do_set_buffer_size(xprt); | |
259 | @@ -1495,11 +1546,15 @@ static void xs_udp_connect_worker4(struc | |
260 | container_of(work, struct sock_xprt, connect_worker.work); | |
261 | struct rpc_xprt *xprt = &transport->xprt; | |
262 | struct socket *sock = transport->sock; | |
263 | + unsigned long pflags = current->flags; | |
264 | int err, status = -EIO; | |
265 | ||
266 | if (xprt->shutdown || !xprt_bound(xprt)) | |
267 | goto out; | |
268 | ||
269 | + if (xprt->swapper) | |
270 | + current->flags |= PF_MEMALLOC; | |
271 | + | |
272 | /* Start by resetting any existing state */ | |
273 | xs_close(xprt); | |
274 | ||
275 | @@ -1522,6 +1577,7 @@ static void xs_udp_connect_worker4(struc | |
276 | out: | |
277 | xprt_wake_pending_tasks(xprt, status); | |
278 | xprt_clear_connecting(xprt); | |
279 | + tsk_restore_flags(current, pflags, PF_MEMALLOC); | |
280 | } | |
281 | ||
282 | /** | |
283 | @@ -1536,11 +1592,15 @@ static void xs_udp_connect_worker6(struc | |
284 | container_of(work, struct sock_xprt, connect_worker.work); | |
285 | struct rpc_xprt *xprt = &transport->xprt; | |
286 | struct socket *sock = transport->sock; | |
287 | + unsigned long pflags = current->flags; | |
288 | int err, status = -EIO; | |
289 | ||
290 | if (xprt->shutdown || !xprt_bound(xprt)) | |
291 | goto out; | |
292 | ||
293 | + if (xprt->swapper) | |
294 | + current->flags |= PF_MEMALLOC; | |
295 | + | |
296 | /* Start by resetting any existing state */ | |
297 | xs_close(xprt); | |
298 | ||
299 | @@ -1563,6 +1623,7 @@ static void xs_udp_connect_worker6(struc | |
300 | out: | |
301 | xprt_wake_pending_tasks(xprt, status); | |
302 | xprt_clear_connecting(xprt); | |
303 | + tsk_restore_flags(current, pflags, PF_MEMALLOC); | |
304 | } | |
305 | ||
306 | /* | |
307 | @@ -1632,6 +1693,8 @@ static int xs_tcp_finish_connecting(stru | |
308 | write_unlock_bh(&sk->sk_callback_lock); | |
309 | } | |
310 | ||
311 | + xs_set_memalloc(xprt); | |
312 | + | |
313 | /* Tell the socket layer to start connecting... */ | |
314 | xprt->stat.connect_count++; | |
315 | xprt->stat.connect_start = jiffies; | |
316 | @@ -1650,11 +1713,15 @@ static void xs_tcp_connect_worker4(struc | |
317 | container_of(work, struct sock_xprt, connect_worker.work); | |
318 | struct rpc_xprt *xprt = &transport->xprt; | |
319 | struct socket *sock = transport->sock; | |
320 | + unsigned long pflags = current->flags; | |
321 | int err, status = -EIO; | |
322 | ||
323 | if (xprt->shutdown || !xprt_bound(xprt)) | |
324 | goto out; | |
325 | ||
326 | + if (xprt->swapper) | |
327 | + current->flags |= PF_MEMALLOC; | |
328 | + | |
329 | if (!sock) { | |
330 | /* start from scratch */ | |
331 | if ((err = sock_create_kern(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock)) < 0) { | |
332 | @@ -1696,6 +1763,7 @@ out: | |
333 | xprt_wake_pending_tasks(xprt, status); | |
334 | out_clear: | |
335 | xprt_clear_connecting(xprt); | |
336 | + tsk_restore_flags(current, pflags, PF_MEMALLOC); | |
337 | } | |
338 | ||
339 | /** | |
340 | @@ -1710,11 +1778,15 @@ static void xs_tcp_connect_worker6(struc | |
341 | container_of(work, struct sock_xprt, connect_worker.work); | |
342 | struct rpc_xprt *xprt = &transport->xprt; | |
343 | struct socket *sock = transport->sock; | |
344 | + unsigned long pflags = current->flags; | |
345 | int err, status = -EIO; | |
346 | ||
347 | if (xprt->shutdown || !xprt_bound(xprt)) | |
348 | goto out; | |
349 | ||
350 | + if (xprt->swapper) | |
351 | + current->flags |= PF_MEMALLOC; | |
352 | + | |
353 | if (!sock) { | |
354 | /* start from scratch */ | |
355 | if ((err = sock_create_kern(PF_INET6, SOCK_STREAM, IPPROTO_TCP, &sock)) < 0) { | |
356 | @@ -1755,6 +1827,7 @@ out: | |
357 | xprt_wake_pending_tasks(xprt, status); | |
358 | out_clear: | |
359 | xprt_clear_connecting(xprt); | |
360 | + tsk_restore_flags(current, pflags, PF_MEMALLOC); | |
361 | } | |
362 | ||
363 | /** |