]>
Commit | Line | Data |
---|---|---|
00e5a55c BS |
1 | From: Peter Zijlstra <a.p.zijlstra@chello.nl> |
2 | Subject: netvm: network reserve infrastructure | |
3 | Patch-mainline: No | |
4 | References: FATE#303834 | |
5 | ||
6 | Provide the basic infrastructure to reserve and charge/account network memory. | |
7 | ||
8 | We provide the following reserve tree: | |
9 | ||
10 | 1) total network reserve | |
11 | 2) network TX reserve | |
12 | 3) protocol TX pages | |
13 | 4) network RX reserve | |
14 | 5) SKB data reserve | |
15 | ||
16 | [1] is used to make all the network reserves a single subtree, for easy | |
17 | manipulation. | |
18 | ||
19 | [2] and [4] are merely for aesthetic reasons. | |
20 | ||
21 | The TX pages reserve [3] is assumed bounded by it being the upper bound of | |
22 | memory that can be used for sending pages (not quite true, but good enough) | |
23 | ||
24 | The SKB reserve [5] is an aggregate reserve, which is used to charge SKB data | |
25 | against in the fallback path. | |
26 | ||
27 | The consumers for these reserves are sockets marked with: | |
28 | SOCK_MEMALLOC | |
29 | ||
30 | Such sockets are to be used to service the VM (iow. to swap over). They | |
31 | must be handled kernel side, exposing such a socket to user-space is a BUG. | |
32 | ||
33 | Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> | |
34 | Acked-by: Neil Brown <neilb@suse.de> | |
35 | Acked-by: Suresh Jayaraman <sjayaraman@suse.de> | |
36 | ||
37 | --- | |
38 | include/net/sock.h | 43 ++++++++++++++++++++- | |
39 | net/Kconfig | 3 + | |
40 | net/core/sock.c | 107 +++++++++++++++++++++++++++++++++++++++++++++++++++++ | |
41 | 3 files changed, 152 insertions(+), 1 deletion(-) | |
42 | ||
43 | --- a/include/net/sock.h | |
44 | +++ b/include/net/sock.h | |
45 | @@ -50,6 +50,7 @@ | |
46 | #include <linux/skbuff.h> /* struct sk_buff */ | |
47 | #include <linux/mm.h> | |
48 | #include <linux/security.h> | |
49 | +#include <linux/reserve.h> | |
50 | ||
51 | #include <linux/filter.h> | |
52 | ||
53 | @@ -413,6 +414,7 @@ enum sock_flags { | |
54 | SOCK_RCVTSTAMPNS, /* %SO_TIMESTAMPNS setting */ | |
55 | SOCK_LOCALROUTE, /* route locally only, %SO_DONTROUTE setting */ | |
56 | SOCK_QUEUE_SHRUNK, /* write queue has been shrunk recently */ | |
57 | + SOCK_MEMALLOC, /* the VM depends on us - make sure we're serviced */ | |
58 | }; | |
59 | ||
60 | static inline void sock_copy_flags(struct sock *nsk, struct sock *osk) | |
61 | @@ -435,9 +437,48 @@ static inline int sock_flag(struct sock | |
62 | return test_bit(flag, &sk->sk_flags); | |
63 | } | |
64 | ||
65 | +static inline int sk_has_memalloc(struct sock *sk) | |
66 | +{ | |
67 | + return sock_flag(sk, SOCK_MEMALLOC); | |
68 | +} | |
69 | + | |
70 | +extern struct mem_reserve net_rx_reserve; | |
71 | +extern struct mem_reserve net_skb_reserve; | |
72 | + | |
73 | +#ifdef CONFIG_NETVM | |
74 | +/* | |
75 | + * Guestimate the per request queue TX upper bound. | |
76 | + * | |
77 | + * Max packet size is 64k, and we need to reserve that much since we | |
78 | + * might need to bounce the data. Double it to be on the safe side. | |
79 | + */ | |
80 | +#define TX_RESERVE_PAGES DIV_ROUND_UP(2*65536, PAGE_SIZE) | |
81 | + | |
82 | +extern int memalloc_socks; | |
83 | + | |
84 | +static inline int sk_memalloc_socks(void) | |
85 | +{ | |
86 | + return memalloc_socks; | |
87 | +} | |
88 | + | |
89 | +extern int sk_adjust_memalloc(int socks, long tx_reserve_pages); | |
90 | +extern int sk_set_memalloc(struct sock *sk); | |
91 | +extern int sk_clear_memalloc(struct sock *sk); | |
92 | +#else | |
93 | +static inline int sk_memalloc_socks(void) | |
94 | +{ | |
95 | + return 0; | |
96 | +} | |
97 | + | |
98 | +static inline int sk_clear_memalloc(struct sock *sk) | |
99 | +{ | |
100 | + return 0; | |
101 | +} | |
102 | +#endif | |
103 | + | |
104 | static inline gfp_t sk_allocation(struct sock *sk, gfp_t gfp_mask) | |
105 | { | |
106 | - return gfp_mask; | |
107 | + return gfp_mask | (sk->sk_allocation & __GFP_MEMALLOC); | |
108 | } | |
109 | ||
110 | static inline void sk_acceptq_removed(struct sock *sk) | |
111 | --- a/net/core/sock.c | |
112 | +++ b/net/core/sock.c | |
113 | @@ -110,6 +110,7 @@ | |
114 | #include <linux/tcp.h> | |
115 | #include <linux/init.h> | |
116 | #include <linux/highmem.h> | |
117 | +#include <linux/reserve.h> | |
118 | ||
119 | #include <asm/uaccess.h> | |
120 | #include <asm/system.h> | |
121 | @@ -211,6 +212,105 @@ __u32 sysctl_rmem_default __read_mostly | |
122 | /* Maximal space eaten by iovec or ancilliary data plus some space */ | |
123 | int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512); | |
124 | ||
125 | +static struct mem_reserve net_reserve; | |
126 | +struct mem_reserve net_rx_reserve; | |
127 | +EXPORT_SYMBOL_GPL(net_rx_reserve); /* modular ipv6 only */ | |
128 | +struct mem_reserve net_skb_reserve; | |
129 | +EXPORT_SYMBOL_GPL(net_skb_reserve); /* modular ipv6 only */ | |
130 | +static struct mem_reserve net_tx_reserve; | |
131 | +static struct mem_reserve net_tx_pages; | |
132 | + | |
133 | +#ifdef CONFIG_NETVM | |
134 | +static DEFINE_MUTEX(memalloc_socks_lock); | |
135 | +int memalloc_socks; | |
136 | + | |
137 | +/** | |
138 | + * sk_adjust_memalloc - adjust the global memalloc reserve for critical RX | |
139 | + * @socks: number of new %SOCK_MEMALLOC sockets | |
140 | + * @tx_reserve_pages: number of pages to (un)reserve for TX | |
141 | + * | |
142 | + * This function adjusts the memalloc reserve based on system demand. | |
143 | + * The RX reserve is a limit, and only added once, not for each socket. | |
144 | + * | |
145 | + * NOTE: | |
146 | + * @tx_reserve_pages is an upper-bound of memory used for TX hence | |
147 | + * we need not account the pages like we do for RX pages. | |
148 | + */ | |
149 | +int sk_adjust_memalloc(int socks, long tx_reserve_pages) | |
150 | +{ | |
151 | + int err; | |
152 | + | |
153 | + mutex_lock(&memalloc_socks_lock); | |
154 | + err = mem_reserve_pages_add(&net_tx_pages, tx_reserve_pages); | |
155 | + if (err) | |
156 | + goto unlock; | |
157 | + | |
158 | + /* | |
159 | + * either socks is positive and we need to check for 0 -> !0 | |
160 | + * transition and connect the reserve tree when we observe it. | |
161 | + */ | |
162 | + if (!memalloc_socks && socks > 0) { | |
163 | + err = mem_reserve_connect(&net_reserve, &mem_reserve_root); | |
164 | + if (err) { | |
165 | + /* | |
166 | + * if we failed to connect the tree, undo the tx | |
167 | + * reserve so that failure has no side effects. | |
168 | + */ | |
169 | + mem_reserve_pages_add(&net_tx_pages, -tx_reserve_pages); | |
170 | + goto unlock; | |
171 | + } | |
172 | + } | |
173 | + memalloc_socks += socks; | |
174 | + /* | |
175 | + * or socks is negative and we must observe the !0 -> 0 transition | |
176 | + * and disconnect the reserve tree. | |
177 | + */ | |
178 | + if (!memalloc_socks && socks) | |
179 | + mem_reserve_disconnect(&net_reserve); | |
180 | + | |
181 | +unlock: | |
182 | + mutex_unlock(&memalloc_socks_lock); | |
183 | + | |
184 | + return err; | |
185 | +} | |
186 | +EXPORT_SYMBOL_GPL(sk_adjust_memalloc); | |
187 | + | |
188 | +/** | |
189 | + * sk_set_memalloc - sets %SOCK_MEMALLOC | |
190 | + * @sk: socket to set it on | |
191 | + * | |
192 | + * Set %SOCK_MEMALLOC on a socket and increase the memalloc reserve | |
193 | + * accordingly. | |
194 | + */ | |
195 | +int sk_set_memalloc(struct sock *sk) | |
196 | +{ | |
197 | + int set = sock_flag(sk, SOCK_MEMALLOC); | |
198 | + | |
199 | + if (!set) { | |
200 | + int err = sk_adjust_memalloc(1, 0); | |
201 | + if (err) | |
202 | + return err; | |
203 | + | |
204 | + sock_set_flag(sk, SOCK_MEMALLOC); | |
205 | + sk->sk_allocation |= __GFP_MEMALLOC; | |
206 | + } | |
207 | + return !set; | |
208 | +} | |
209 | +EXPORT_SYMBOL_GPL(sk_set_memalloc); | |
210 | + | |
211 | +int sk_clear_memalloc(struct sock *sk) | |
212 | +{ | |
213 | + int set = sock_flag(sk, SOCK_MEMALLOC); | |
214 | + if (set) { | |
215 | + sk_adjust_memalloc(-1, 0); | |
216 | + sock_reset_flag(sk, SOCK_MEMALLOC); | |
217 | + sk->sk_allocation &= ~__GFP_MEMALLOC; | |
218 | + } | |
219 | + return set; | |
220 | +} | |
221 | +EXPORT_SYMBOL_GPL(sk_clear_memalloc); | |
222 | +#endif | |
223 | + | |
224 | static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen) | |
225 | { | |
226 | struct timeval tv; | |
227 | @@ -959,6 +1059,7 @@ void sk_free(struct sock *sk) | |
228 | { | |
229 | struct sk_filter *filter; | |
230 | ||
231 | + sk_clear_memalloc(sk); | |
232 | if (sk->sk_destruct) | |
233 | sk->sk_destruct(sk); | |
234 | ||
235 | @@ -1108,6 +1209,12 @@ void __init sk_init(void) | |
236 | sysctl_wmem_max = 131071; | |
237 | sysctl_rmem_max = 131071; | |
238 | } | |
239 | + | |
240 | + mem_reserve_init(&net_reserve, "total network reserve", NULL); | |
241 | + mem_reserve_init(&net_rx_reserve, "network RX reserve", &net_reserve); | |
242 | + mem_reserve_init(&net_skb_reserve, "SKB data reserve", &net_rx_reserve); | |
243 | + mem_reserve_init(&net_tx_reserve, "network TX reserve", &net_reserve); | |
244 | + mem_reserve_init(&net_tx_pages, "protocol TX pages", &net_tx_reserve); | |
245 | } | |
246 | ||
247 | /* | |
248 | --- a/net/Kconfig | |
249 | +++ b/net/Kconfig | |
250 | @@ -249,4 +249,7 @@ endmenu | |
251 | source "net/rfkill/Kconfig" | |
252 | source "net/9p/Kconfig" | |
253 | ||
254 | +config NETVM | |
255 | + def_bool n | |
256 | + | |
257 | endif # if NET |