1 From: Peter Zijlstra <a.p.zijlstra@chello.nl>
2 Subject: nfs: enable swap on NFS
4 References: FATE#303834
6 Implement all the new swapfile a_ops for NFS. This will set the NFS socket to
7 SOCK_MEMALLOC and run socket reconnect under PF_MEMALLOC, as well as re-set
8 SOCK_MEMALLOC before engaging the protocol ->connect() method.
10 PF_MEMALLOC should allow the allocation of struct socket and related objects
11 and the early (re)setting of SOCK_MEMALLOC should allow us to receive the
12 packets required for the TCP connection buildup.
14 (swapping continues over a server reset during heavy network traffic)
16 Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
17 Acked-by: Neil Brown <neilb@suse.de>
18 Acked-by: Suresh Jayaraman <sjayaraman@suse.de>
21 fs/Kconfig | 17 ++++++++++
22 fs/nfs/file.c | 18 ++++++++++
23 fs/nfs/write.c | 22 +++++++++++++
24 include/linux/nfs_fs.h | 2 +
25 include/linux/sunrpc/xprt.h | 5 ++-
26 net/sunrpc/sched.c | 9 ++++-
27 net/sunrpc/xprtsock.c | 73 ++++++++++++++++++++++++++++++++++++++++++++
28 7 files changed, 143 insertions(+), 3 deletions(-)
32 @@ -1748,6 +1748,18 @@ config ROOT_NFS
34 Most people say N here.
37 + bool "Provide swap over NFS support"
42 + This option enables swapon to work on files located on NFS mounts.
44 + For more details, see Documentation/network-swap.txt
49 tristate "NFS server support"
51 @@ -1869,6 +1881,11 @@ config SUNRPC_XPRT_RDMA
60 config RPCSEC_GSS_KRB5
61 tristate "Secure RPC: Kerberos V mechanism (EXPERIMENTAL)"
62 depends on SUNRPC && EXPERIMENTAL
65 @@ -434,6 +434,18 @@ static int nfs_launder_page(struct page
66 return nfs_wb_page(inode, page);
69 +#ifdef CONFIG_NFS_SWAP
70 +static int nfs_swapon(struct file *file)
72 + return xs_swapper(NFS_CLIENT(file->f_mapping->host)->cl_xprt, 1);
75 +static int nfs_swapoff(struct file *file)
77 + return xs_swapper(NFS_CLIENT(file->f_mapping->host)->cl_xprt, 0);
81 const struct address_space_operations nfs_file_aops = {
82 .readpage = nfs_readpage,
83 .readpages = nfs_readpages,
84 @@ -446,6 +458,12 @@ const struct address_space_operations nf
85 .releasepage = nfs_release_page,
86 .direct_IO = nfs_direct_IO,
87 .launder_page = nfs_launder_page,
88 +#ifdef CONFIG_NFS_SWAP
89 + .swapon = nfs_swapon,
90 + .swapoff = nfs_swapoff,
91 + .swap_out = nfs_swap_out,
92 + .swap_in = nfs_readpage,
96 static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
99 @@ -338,6 +338,28 @@ int nfs_writepage(struct page *page, str
103 +static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page,
104 + unsigned int offset, unsigned int count);
106 +int nfs_swap_out(struct file *file, struct page *page,
107 + struct writeback_control *wbc)
109 + struct nfs_open_context *ctx = nfs_file_open_context(file);
112 + status = nfs_writepage_setup(ctx, page, 0, nfs_page_length(page));
114 + nfs_set_pageerror(page);
118 + status = nfs_writepage_locked(page, wbc);
125 static int nfs_writepages_callback(struct page *page, struct writeback_control *wbc, void *data)
128 --- a/include/linux/nfs_fs.h
129 +++ b/include/linux/nfs_fs.h
130 @@ -465,6 +465,8 @@ extern int nfs_flush_incompatible(struc
131 extern int nfs_updatepage(struct file *, struct page *, unsigned int, unsigned int);
132 extern int nfs_writeback_done(struct rpc_task *, struct nfs_write_data *);
133 extern void nfs_writedata_release(void *);
134 +extern int nfs_swap_out(struct file *file, struct page *page,
135 + struct writeback_control *wbc);
138 * Try to write back everything synchronously (but check the
139 --- a/include/linux/sunrpc/xprt.h
140 +++ b/include/linux/sunrpc/xprt.h
141 @@ -147,7 +147,9 @@ struct rpc_xprt {
142 unsigned int max_reqs; /* total slots */
143 unsigned long state; /* transport state */
144 unsigned char shutdown : 1, /* being shut down */
145 - resvport : 1; /* use a reserved port */
146 + resvport : 1, /* use a reserved port */
147 + swapper : 1; /* we're swapping over this
149 unsigned int bind_index; /* bind function index */
152 @@ -249,6 +251,7 @@ void xprt_release_rqst_cong(struct rpc
153 void xprt_disconnect_done(struct rpc_xprt *xprt);
154 void xprt_force_disconnect(struct rpc_xprt *xprt);
155 void xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie);
156 +int xs_swapper(struct rpc_xprt *xprt, int enable);
159 * Reserved bit positions in xprt->state
160 --- a/net/sunrpc/sched.c
161 +++ b/net/sunrpc/sched.c
162 @@ -729,7 +729,10 @@ struct rpc_buffer {
163 void *rpc_malloc(struct rpc_task *task, size_t size)
165 struct rpc_buffer *buf;
166 - gfp_t gfp = RPC_IS_SWAPPER(task) ? GFP_ATOMIC : GFP_NOWAIT;
167 + gfp_t gfp = GFP_NOWAIT;
169 + if (RPC_IS_SWAPPER(task))
170 + gfp |= __GFP_MEMALLOC;
172 size += sizeof(struct rpc_buffer);
173 if (size <= RPC_BUFFER_MAXSIZE)
174 @@ -800,6 +803,8 @@ static void rpc_init_task(struct rpc_tas
175 kref_get(&task->tk_client->cl_kref);
176 if (task->tk_client->cl_softrtry)
177 task->tk_flags |= RPC_TASK_SOFT;
178 + if (task->tk_client->cl_xprt->swapper)
179 + task->tk_flags |= RPC_TASK_SWAPPER;
182 if (task->tk_ops->rpc_call_prepare != NULL)
183 @@ -825,7 +830,7 @@ static void rpc_init_task(struct rpc_tas
184 static struct rpc_task *
187 - return (struct rpc_task *)mempool_alloc(rpc_task_mempool, GFP_NOFS);
188 + return (struct rpc_task *)mempool_alloc(rpc_task_mempool, GFP_NOIO);
192 --- a/net/sunrpc/xprtsock.c
193 +++ b/net/sunrpc/xprtsock.c
194 @@ -1453,6 +1453,55 @@ static inline void xs_reclassify_socket6
198 +#ifdef CONFIG_SUNRPC_SWAP
199 +static void xs_set_memalloc(struct rpc_xprt *xprt)
201 + struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
204 + sk_set_memalloc(transport->inet);
207 +#define RPC_BUF_RESERVE_PAGES \
208 + kmalloc_estimate_objs(sizeof(struct rpc_rqst), GFP_KERNEL, RPC_MAX_SLOT_TABLE)
209 +#define RPC_RESERVE_PAGES (RPC_BUF_RESERVE_PAGES + TX_RESERVE_PAGES)
212 + * xs_swapper - Tag this transport as being used for swap.
213 + * @xprt: transport to tag
214 + * @enable: enable/disable
217 +int xs_swapper(struct rpc_xprt *xprt, int enable)
219 + struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
224 + * keep one extra sock reference so the reserve won't dip
225 + * when the socket gets reconnected.
227 + err = sk_adjust_memalloc(1, RPC_RESERVE_PAGES);
230 + xs_set_memalloc(xprt);
232 + } else if (xprt->swapper) {
234 + sk_clear_memalloc(transport->inet);
235 + sk_adjust_memalloc(-1, -RPC_RESERVE_PAGES);
240 +EXPORT_SYMBOL_GPL(xs_swapper);
242 +static void xs_set_memalloc(struct rpc_xprt *xprt)
247 static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
249 struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
250 @@ -1477,6 +1526,8 @@ static void xs_udp_finish_connecting(str
251 transport->sock = sock;
252 transport->inet = sk;
254 + xs_set_memalloc(xprt);
256 write_unlock_bh(&sk->sk_callback_lock);
258 xs_udp_do_set_buffer_size(xprt);
259 @@ -1494,11 +1545,15 @@ static void xs_udp_connect_worker4(struc
260 container_of(work, struct sock_xprt, connect_worker.work);
261 struct rpc_xprt *xprt = &transport->xprt;
262 struct socket *sock = transport->sock;
263 + unsigned long pflags = current->flags;
264 int err, status = -EIO;
266 if (xprt->shutdown || !xprt_bound(xprt))
270 + current->flags |= PF_MEMALLOC;
272 /* Start by resetting any existing state */
275 @@ -1521,6 +1576,7 @@ static void xs_udp_connect_worker4(struc
277 xprt_wake_pending_tasks(xprt, status);
278 xprt_clear_connecting(xprt);
279 + tsk_restore_flags(current, pflags, PF_MEMALLOC);
283 @@ -1535,11 +1591,15 @@ static void xs_udp_connect_worker6(struc
284 container_of(work, struct sock_xprt, connect_worker.work);
285 struct rpc_xprt *xprt = &transport->xprt;
286 struct socket *sock = transport->sock;
287 + unsigned long pflags = current->flags;
288 int err, status = -EIO;
290 if (xprt->shutdown || !xprt_bound(xprt))
294 + current->flags |= PF_MEMALLOC;
296 /* Start by resetting any existing state */
299 @@ -1562,6 +1622,7 @@ static void xs_udp_connect_worker6(struc
301 xprt_wake_pending_tasks(xprt, status);
302 xprt_clear_connecting(xprt);
303 + tsk_restore_flags(current, pflags, PF_MEMALLOC);
307 @@ -1621,6 +1682,8 @@ static int xs_tcp_finish_connecting(stru
308 write_unlock_bh(&sk->sk_callback_lock);
311 + xs_set_memalloc(xprt);
313 /* Tell the socket layer to start connecting... */
314 xprt->stat.connect_count++;
315 xprt->stat.connect_start = jiffies;
316 @@ -1639,11 +1702,15 @@ static void xs_tcp_connect_worker4(struc
317 container_of(work, struct sock_xprt, connect_worker.work);
318 struct rpc_xprt *xprt = &transport->xprt;
319 struct socket *sock = transport->sock;
320 + unsigned long pflags = current->flags;
321 int err, status = -EIO;
323 if (xprt->shutdown || !xprt_bound(xprt))
327 + current->flags |= PF_MEMALLOC;
330 /* start from scratch */
331 if ((err = sock_create_kern(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock)) < 0) {
332 @@ -1685,6 +1752,7 @@ out:
333 xprt_wake_pending_tasks(xprt, status);
335 xprt_clear_connecting(xprt);
336 + tsk_restore_flags(current, pflags, PF_MEMALLOC);
340 @@ -1699,11 +1767,15 @@ static void xs_tcp_connect_worker6(struc
341 container_of(work, struct sock_xprt, connect_worker.work);
342 struct rpc_xprt *xprt = &transport->xprt;
343 struct socket *sock = transport->sock;
344 + unsigned long pflags = current->flags;
345 int err, status = -EIO;
347 if (xprt->shutdown || !xprt_bound(xprt))
351 + current->flags |= PF_MEMALLOC;
354 /* start from scratch */
355 if ((err = sock_create_kern(PF_INET6, SOCK_STREAM, IPPROTO_TCP, &sock)) < 0) {
356 @@ -1744,6 +1816,7 @@ out:
357 xprt_wake_pending_tasks(xprt, status);
359 xprt_clear_connecting(xprt);
360 + tsk_restore_flags(current, pflags, PF_MEMALLOC);