1 From: Peter Zijlstra <a.p.zijlstra@chello.nl>
2 Subject: netvm: network reserve infrastructure
4 References: FATE#303834
6 Provide the basic infrastructure to reserve and charge/account network memory.
8 We provide the following reserve tree:
10 1) total network reserve
16 [1] is used to make all the network reserves a single subtree, for easy
19 [2] and [4] are merely for aesthetic reasons.
21 The TX pages reserve [3] is assumed bounded by it being the upper bound of
22 memory that can be used for sending pages (not quite true, but good enough)
24 The SKB reserve [5] is an aggregate reserve, which is used to charge SKB data
25 against in the fallback path.
27 The consumers for these reserves are sockets marked with:
30 Such sockets are to be used to service the VM (iow. to swap over). They
31 must be handled kernel side, exposing such a socket to user-space is a BUG.
33 Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
34 Acked-by: Neil Brown <neilb@suse.de>
35 Acked-by: Suresh Jayaraman <sjayaraman@suse.de>
38 include/net/sock.h | 43 ++++++++++++++++++++-
40 net/core/sock.c | 107 +++++++++++++++++++++++++++++++++++++++++++++++++++++
41 3 files changed, 152 insertions(+), 1 deletion(-)
43 --- a/include/net/sock.h
44 +++ b/include/net/sock.h
46 #include <linux/skbuff.h> /* struct sk_buff */
48 #include <linux/security.h>
49 +#include <linux/reserve.h>
51 #include <linux/filter.h>
53 @@ -413,6 +414,7 @@ enum sock_flags {
54 SOCK_RCVTSTAMPNS, /* %SO_TIMESTAMPNS setting */
55 SOCK_LOCALROUTE, /* route locally only, %SO_DONTROUTE setting */
56 SOCK_QUEUE_SHRUNK, /* write queue has been shrunk recently */
57 + SOCK_MEMALLOC, /* the VM depends on us - make sure we're serviced */
60 static inline void sock_copy_flags(struct sock *nsk, struct sock *osk)
61 @@ -435,9 +437,48 @@ static inline int sock_flag(struct sock
62 return test_bit(flag, &sk->sk_flags);
65 +static inline int sk_has_memalloc(struct sock *sk)
67 + return sock_flag(sk, SOCK_MEMALLOC);
70 +extern struct mem_reserve net_rx_reserve;
71 +extern struct mem_reserve net_skb_reserve;
75 + * Guesstimate the per request queue TX upper bound.
77 + * Max packet size is 64k, and we need to reserve that much since we
78 + * might need to bounce the data. Double it to be on the safe side.
80 +#define TX_RESERVE_PAGES DIV_ROUND_UP(2*65536, PAGE_SIZE)
82 +extern int memalloc_socks;
84 +static inline int sk_memalloc_socks(void)
86 + return memalloc_socks;
89 +extern int sk_adjust_memalloc(int socks, long tx_reserve_pages);
90 +extern int sk_set_memalloc(struct sock *sk);
91 +extern int sk_clear_memalloc(struct sock *sk);
93 +static inline int sk_memalloc_socks(void)
98 +static inline int sk_clear_memalloc(struct sock *sk)
104 static inline gfp_t sk_allocation(struct sock *sk, gfp_t gfp_mask)
107 + return gfp_mask | (sk->sk_allocation & __GFP_MEMALLOC);
110 static inline void sk_acceptq_removed(struct sock *sk)
111 --- a/net/core/sock.c
112 +++ b/net/core/sock.c
114 #include <linux/tcp.h>
115 #include <linux/init.h>
116 #include <linux/highmem.h>
117 +#include <linux/reserve.h>
119 #include <asm/uaccess.h>
120 #include <asm/system.h>
121 @@ -211,6 +212,105 @@ __u32 sysctl_rmem_default __read_mostly
122 /* Maximal space eaten by iovec or ancilliary data plus some space */
123 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
125 +static struct mem_reserve net_reserve;
126 +struct mem_reserve net_rx_reserve;
127 +EXPORT_SYMBOL_GPL(net_rx_reserve); /* modular ipv6 only */
128 +struct mem_reserve net_skb_reserve;
129 +EXPORT_SYMBOL_GPL(net_skb_reserve); /* modular ipv6 only */
130 +static struct mem_reserve net_tx_reserve;
131 +static struct mem_reserve net_tx_pages;
134 +static DEFINE_MUTEX(memalloc_socks_lock);
138 + * sk_adjust_memalloc - adjust the global memalloc reserve for critical RX
139 + * @socks: number of new %SOCK_MEMALLOC sockets
140 + * @tx_reserve_pages: number of pages to (un)reserve for TX
142 + * This function adjusts the memalloc reserve based on system demand.
143 + * The RX reserve is a limit, and only added once, not for each socket.
146 + * @tx_reserve_pages is an upper-bound of memory used for TX hence
147 + * we need not account the pages like we do for RX pages.
149 +int sk_adjust_memalloc(int socks, long tx_reserve_pages)
153 + mutex_lock(&memalloc_socks_lock);
154 + err = mem_reserve_pages_add(&net_tx_pages, tx_reserve_pages);
159 + * either socks is positive and we need to check for 0 -> !0
160 + * transition and connect the reserve tree when we observe it.
162 + if (!memalloc_socks && socks > 0) {
163 + err = mem_reserve_connect(&net_reserve, &mem_reserve_root);
166 + * if we failed to connect the tree, undo the tx
167 + * reserve so that failure has no side effects.
169 + mem_reserve_pages_add(&net_tx_pages, -tx_reserve_pages);
173 + memalloc_socks += socks;
175 + * or socks is negative and we must observe the !0 -> 0 transition
176 + * and disconnect the reserve tree.
178 + if (!memalloc_socks && socks)
179 + mem_reserve_disconnect(&net_reserve);
182 + mutex_unlock(&memalloc_socks_lock);
186 +EXPORT_SYMBOL_GPL(sk_adjust_memalloc);
189 + * sk_set_memalloc - sets %SOCK_MEMALLOC
190 + * @sk: socket to set it on
192 + * Set %SOCK_MEMALLOC on a socket and increase the memalloc reserve
195 +int sk_set_memalloc(struct sock *sk)
197 + int set = sock_flag(sk, SOCK_MEMALLOC);
200 + int err = sk_adjust_memalloc(1, 0);
204 + sock_set_flag(sk, SOCK_MEMALLOC);
205 + sk->sk_allocation |= __GFP_MEMALLOC;
209 +EXPORT_SYMBOL_GPL(sk_set_memalloc);
211 +int sk_clear_memalloc(struct sock *sk)
213 + int set = sock_flag(sk, SOCK_MEMALLOC);
215 + sk_adjust_memalloc(-1, 0);
216 + sock_reset_flag(sk, SOCK_MEMALLOC);
217 + sk->sk_allocation &= ~__GFP_MEMALLOC;
221 +EXPORT_SYMBOL_GPL(sk_clear_memalloc);
224 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
227 @@ -959,6 +1059,7 @@ void sk_free(struct sock *sk)
229 struct sk_filter *filter;
231 + sk_clear_memalloc(sk);
235 @@ -1108,6 +1209,12 @@ void __init sk_init(void)
236 sysctl_wmem_max = 131071;
237 sysctl_rmem_max = 131071;
240 + mem_reserve_init(&net_reserve, "total network reserve", NULL);
241 + mem_reserve_init(&net_rx_reserve, "network RX reserve", &net_reserve);
242 + mem_reserve_init(&net_skb_reserve, "SKB data reserve", &net_rx_reserve);
243 + mem_reserve_init(&net_tx_reserve, "network TX reserve", &net_reserve);
244 + mem_reserve_init(&net_tx_pages, "protocol TX pages", &net_tx_reserve);
250 @@ -249,4 +249,7 @@ endmenu
251 source "net/rfkill/Kconfig"
252 source "net/9p/Kconfig"