From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Subject: netvm: network reserve infrastructure
Patch-mainline: No
References: FATE#303834

Provide the basic infrastructure to reserve and charge/account network memory.

We provide the following reserve tree:

1)  total network reserve
2)   network TX reserve
3)    protocol TX pages
4)   network RX reserve
5)    SKB data reserve

[1] is used to make all the network reserves a single subtree, for easy
manipulation.

[2] and [4] are merely for aesthetic reasons.

The TX pages reserve [3] is assumed bounded: it is sized to the upper bound
of memory that can be in use for sending pages (not quite true, but good
enough).
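
With the usual 4 KiB page size, the per-request-queue guesstimate below works
out to DIV_ROUND_UP(2*65536, 4096) = 32 pages (TX_RESERVE_PAGES in the sock.h
hunk).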

The SKB reserve [5] is an aggregate reserve, which is used to charge SKB data
against in the fallback path.
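
Laid out as a tree (this is exactly the hierarchy sk_init() below creates;
the root connection is only made once the first SOCK_MEMALLOC socket
appears):

    mem_reserve_root
      `- total network reserve         (net_reserve)
           |- network TX reserve       (net_tx_reserve)
           |    `- protocol TX pages   (net_tx_pages)
           `- network RX reserve       (net_rx_reserve)
                `- SKB data reserve    (net_skb_reserve)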

The consumers for these reserves are sockets marked with SOCK_MEMALLOC.

Such sockets are to be used to service the VM (IOW, to swap over). They
must be handled kernel side; exposing such a socket to user-space is a BUG.
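
As an illustration only (the caller below is made up; the real consumers
are the swap-over-network users later in this series), kernel-side usage
would look like:

    /* hypothetical consumer, not part of this patch */
    static int swap_transport_init(struct socket *sock)
    {
            int err;

            /* flag the socket and grow the memalloc reserves */
            err = sk_set_memalloc(sock->sk);
            if (err < 0)
                    return err;

            /*
             * sock may now dip into the reserves via __GFP_MEMALLOC;
             * it must never escape to user-space.
             */
            return 0;
    }

with a matching sk_clear_memalloc() on teardown (sk_free() also issues one
as a backstop, see the sock.c hunk).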

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Neil Brown <neilb@suse.de>
Acked-by: Suresh Jayaraman <sjayaraman@suse.de>

---
 include/net/sock.h |   43 ++++++++++++++++++++-
 net/Kconfig        |    3 +
 net/core/sock.c    |  107 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 152 insertions(+), 1 deletion(-)

--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -50,6 +50,7 @@
 #include <linux/skbuff.h>        /* struct sk_buff */
 #include <linux/mm.h>
 #include <linux/security.h>
+#include <linux/reserve.h>
 
 #include <linux/filter.h>
 
@@ -413,6 +414,7 @@ enum sock_flags {
         SOCK_RCVTSTAMPNS, /* %SO_TIMESTAMPNS setting */
         SOCK_LOCALROUTE, /* route locally only, %SO_DONTROUTE setting */
         SOCK_QUEUE_SHRUNK, /* write queue has been shrunk recently */
+        SOCK_MEMALLOC, /* the VM depends on us - make sure we're serviced */
 };
 
 static inline void sock_copy_flags(struct sock *nsk, struct sock *osk)
@@ -435,9 +437,48 @@ static inline int sock_flag(struct sock
         return test_bit(flag, &sk->sk_flags);
 }
 
+static inline int sk_has_memalloc(struct sock *sk)
+{
+        return sock_flag(sk, SOCK_MEMALLOC);
+}
+
+extern struct mem_reserve net_rx_reserve;
+extern struct mem_reserve net_skb_reserve;
+
+#ifdef CONFIG_NETVM
+/*
+ * Guesstimate the per request queue TX upper bound.
+ *
+ * Max packet size is 64k, and we need to reserve that much since we
+ * might need to bounce the data. Double it to be on the safe side.
+ */
+#define TX_RESERVE_PAGES DIV_ROUND_UP(2*65536, PAGE_SIZE)
+
+extern int memalloc_socks;
+
+static inline int sk_memalloc_socks(void)
+{
+        return memalloc_socks;
+}
+
+extern int sk_adjust_memalloc(int socks, long tx_reserve_pages);
+extern int sk_set_memalloc(struct sock *sk);
+extern int sk_clear_memalloc(struct sock *sk);
+#else
+static inline int sk_memalloc_socks(void)
+{
+        return 0;
+}
+
+static inline int sk_clear_memalloc(struct sock *sk)
+{
+        return 0;
+}
+#endif
+
 static inline gfp_t sk_allocation(struct sock *sk, gfp_t gfp_mask)
 {
-        return gfp_mask;
+        return gfp_mask | (sk->sk_allocation & __GFP_MEMALLOC);
 }
 
 static inline void sk_acceptq_removed(struct sock *sk)
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -110,6 +110,7 @@
 #include <linux/tcp.h>
 #include <linux/init.h>
 #include <linux/highmem.h>
+#include <linux/reserve.h>
 
 #include <asm/uaccess.h>
 #include <asm/system.h>
@@ -211,6 +212,105 @@ __u32 sysctl_rmem_default __read_mostly
 /* Maximal space eaten by iovec or ancilliary data plus some space */
 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
 
+static struct mem_reserve net_reserve;
+struct mem_reserve net_rx_reserve;
+EXPORT_SYMBOL_GPL(net_rx_reserve); /* modular ipv6 only */
+struct mem_reserve net_skb_reserve;
+EXPORT_SYMBOL_GPL(net_skb_reserve); /* modular ipv6 only */
+static struct mem_reserve net_tx_reserve;
+static struct mem_reserve net_tx_pages;
+
+#ifdef CONFIG_NETVM
+static DEFINE_MUTEX(memalloc_socks_lock);
+int memalloc_socks;
+
+/**
+ * sk_adjust_memalloc - adjust the global memalloc reserve for critical RX
+ * @socks: number of new %SOCK_MEMALLOC sockets
+ * @tx_reserve_pages: number of pages to (un)reserve for TX
+ *
+ * This function adjusts the memalloc reserve based on system demand.
+ * The RX reserve is a limit, and only added once, not for each socket.
+ *
+ * NOTE:
+ *    @tx_reserve_pages is an upper bound of memory used for TX, hence
+ *    we need not account the pages like we do for RX pages.
+ */
+int sk_adjust_memalloc(int socks, long tx_reserve_pages)
+{
+        int err;
+
+        mutex_lock(&memalloc_socks_lock);
+        err = mem_reserve_pages_add(&net_tx_pages, tx_reserve_pages);
+        if (err)
+                goto unlock;
+
+        /*
+         * either socks is positive and we need to check for 0 -> !0
+         * transition and connect the reserve tree when we observe it.
+         */
+        if (!memalloc_socks && socks > 0) {
+                err = mem_reserve_connect(&net_reserve, &mem_reserve_root);
+                if (err) {
+                        /*
+                         * if we failed to connect the tree, undo the tx
+                         * reserve so that failure has no side effects.
+                         */
+                        mem_reserve_pages_add(&net_tx_pages, -tx_reserve_pages);
+                        goto unlock;
+                }
+        }
+        memalloc_socks += socks;
+        /*
+         * or socks is negative and we must observe the !0 -> 0 transition
+         * and disconnect the reserve tree.
+         */
+        if (!memalloc_socks && socks)
+                mem_reserve_disconnect(&net_reserve);
+
+unlock:
+        mutex_unlock(&memalloc_socks_lock);
+
+        return err;
+}
+EXPORT_SYMBOL_GPL(sk_adjust_memalloc);
+
+/**
+ * sk_set_memalloc - sets %SOCK_MEMALLOC
+ * @sk: socket to set it on
+ *
+ * Set %SOCK_MEMALLOC on a socket and increase the memalloc reserve
+ * accordingly.
+ */
+int sk_set_memalloc(struct sock *sk)
+{
+        int set = sock_flag(sk, SOCK_MEMALLOC);
+
+        if (!set) {
+                int err = sk_adjust_memalloc(1, 0);
+                if (err)
+                        return err;
+
+                sock_set_flag(sk, SOCK_MEMALLOC);
+                sk->sk_allocation |= __GFP_MEMALLOC;
+        }
+        return !set;
+}
+EXPORT_SYMBOL_GPL(sk_set_memalloc);
+
+int sk_clear_memalloc(struct sock *sk)
+{
+        int set = sock_flag(sk, SOCK_MEMALLOC);
+        if (set) {
+                sk_adjust_memalloc(-1, 0);
+                sock_reset_flag(sk, SOCK_MEMALLOC);
+                sk->sk_allocation &= ~__GFP_MEMALLOC;
+        }
+        return set;
+}
+EXPORT_SYMBOL_GPL(sk_clear_memalloc);
+#endif
+
 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
 {
         struct timeval tv;
@@ -959,6 +1059,7 @@ void sk_free(struct sock *sk)
 {
         struct sk_filter *filter;
 
+        sk_clear_memalloc(sk);
         if (sk->sk_destruct)
                 sk->sk_destruct(sk);
 
@@ -1108,6 +1209,12 @@ void __init sk_init(void)
                 sysctl_wmem_max = 131071;
                 sysctl_rmem_max = 131071;
         }
+
+        mem_reserve_init(&net_reserve, "total network reserve", NULL);
+        mem_reserve_init(&net_rx_reserve, "network RX reserve", &net_reserve);
+        mem_reserve_init(&net_skb_reserve, "SKB data reserve", &net_rx_reserve);
+        mem_reserve_init(&net_tx_reserve, "network TX reserve", &net_reserve);
+        mem_reserve_init(&net_tx_pages, "protocol TX pages", &net_tx_reserve);
 }
 
 /*
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -249,4 +249,7 @@ endmenu
 source "net/rfkill/Kconfig"
 source "net/9p/Kconfig"
 
+config NETVM
+        def_bool n
+
 endif # if NET