1 From: Peter Zijlstra <a.p.zijlstra@chello.nl>
2 Subject: netvm: INET reserves.
4 References: FATE#303834
8 The two big users seem to be the route cache and ip-fragment cache.
10 Reserve the route cache under the generic RX reserve; its usage is bounded by
11 the high reclaim watermark, and thus does not need further accounting.
13 Reserve the ip-fragment caches under the SKB data reserve; these add to the
14 SKB RX limit. By ensuring we can at least receive as much data as fits in
15 the reassembly line we avoid fragment attack deadlocks.
17 Adds to the reserve tree:
29 Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
30 Acked-by: Neil Brown <neilb@suse.de>
31 Acked-by: Suresh Jayaraman <sjayaraman@suse.de>
34 include/net/inet_frag.h | 7 +++
35 include/net/netns/ipv6.h | 4 ++
36 net/ipv4/inet_fragment.c | 3 +
37 net/ipv4/ip_fragment.c | 87 +++++++++++++++++++++++++++++++++++++++++++++--
38 net/ipv4/route.c | 71 +++++++++++++++++++++++++++++++++++++-
39 net/ipv6/reassembly.c | 86 +++++++++++++++++++++++++++++++++++++++++++++-
40 net/ipv6/route.c | 78 +++++++++++++++++++++++++++++++++++++++++-
41 7 files changed, 329 insertions(+), 7 deletions(-)
43 Index: linux-2.6.26/net/ipv4/ip_fragment.c
44 ===================================================================
45 --- linux-2.6.26.orig/net/ipv4/ip_fragment.c
46 +++ linux-2.6.26/net/ipv4/ip_fragment.c
48 #include <linux/udp.h>
49 #include <linux/inet.h>
50 #include <linux/netfilter_ipv4.h>
51 +#include <linux/reserve.h>
52 +#include <linux/nsproxy.h>
54 /* NOTE. Logic of IP defragmentation is parallel to corresponding IPv6
55 * code now. If you change something here, _PLEASE_ update ipv6/reassembly.c
56 @@ -599,6 +601,64 @@ int ip_defrag(struct sk_buff *skb, u32 u
60 +static int proc_dointvec_fragment(struct ctl_table *table, int write,
61 + struct file *filp, void __user *buffer, size_t *lenp,
64 + struct net *net = container_of(table->data, struct net,
65 + ipv4.frags.high_thresh);
66 + ctl_table tmp = *table;
69 + mutex_lock(&net->ipv4.frags.lock);
71 + tmp.data = &new_bytes;
75 + ret = proc_dointvec(table, write, filp, buffer, lenp, ppos);
77 + if (!ret && write) {
78 + ret = mem_reserve_kmalloc_set(&net->ipv4.frags.reserve,
81 + net->ipv4.frags.high_thresh = new_bytes;
83 + mutex_unlock(&net->ipv4.frags.lock);
88 +static int sysctl_intvec_fragment(struct ctl_table *table,
89 + int __user *name, int nlen,
90 + void __user *oldval, size_t __user *oldlenp,
91 + void __user *newval, size_t newlen)
93 + struct net *net = container_of(table->data, struct net,
94 + ipv4.frags.high_thresh);
95 + int write = (newval && newlen);
96 + ctl_table tmp = *table;
99 + mutex_lock(&net->ipv4.frags.lock);
101 + tmp.data = &new_bytes;
105 + ret = sysctl_intvec(table, name, nlen, oldval, oldlenp, newval, newlen);
107 + if (!ret && write) {
108 + ret = mem_reserve_kmalloc_set(&net->ipv4.frags.reserve,
111 + net->ipv4.frags.high_thresh = new_bytes;
113 + mutex_unlock(&net->ipv4.frags.lock);
120 static struct ctl_table ip4_frags_ns_ctl_table[] = {
121 @@ -608,7 +668,8 @@ static struct ctl_table ip4_frags_ns_ctl
122 .data = &init_net.ipv4.frags.high_thresh,
123 .maxlen = sizeof(int),
125 - .proc_handler = &proc_dointvec
126 + .proc_handler = &proc_dointvec_fragment,
127 + .strategy = &sysctl_intvec_fragment,
130 .ctl_name = NET_IPV4_IPFRAG_LOW_THRESH,
131 @@ -711,6 +772,8 @@ static inline void ip4_frags_ctl_registe
133 static int ipv4_frags_init_net(struct net *net)
138 * Fragment cache limits. We will commit 256K at one time. Should we
139 * cross that limit we will prune down to 192K. This should cope with
140 @@ -728,11 +791,31 @@ static int ipv4_frags_init_net(struct ne
142 inet_frags_init_net(&net->ipv4.frags);
144 - return ip4_frags_ns_ctl_register(net);
145 + ret = ip4_frags_ns_ctl_register(net);
149 + mem_reserve_init(&net->ipv4.frags.reserve, "IPv4 fragment cache",
151 + ret = mem_reserve_kmalloc_set(&net->ipv4.frags.reserve,
152 + net->ipv4.frags.high_thresh);
159 + mem_reserve_disconnect(&net->ipv4.frags.reserve);
160 + ip4_frags_ns_ctl_unregister(net);
162 + inet_frags_exit_net(&net->ipv4.frags, &ip4_frags);
167 static void ipv4_frags_exit_net(struct net *net)
169 + mem_reserve_disconnect(&net->ipv4.frags.reserve);
170 ip4_frags_ns_ctl_unregister(net);
171 inet_frags_exit_net(&net->ipv4.frags, &ip4_frags);
173 Index: linux-2.6.26/net/ipv6/reassembly.c
174 ===================================================================
175 --- linux-2.6.26.orig/net/ipv6/reassembly.c
176 +++ linux-2.6.26/net/ipv6/reassembly.c
178 #include <linux/random.h>
179 #include <linux/jhash.h>
180 #include <linux/skbuff.h>
181 +#include <linux/reserve.h>
183 #include <net/sock.h>
184 #include <net/snmp.h>
185 @@ -632,6 +633,64 @@ static struct inet6_protocol frag_protoc
189 +static int proc_dointvec_fragment(struct ctl_table *table, int write,
190 + struct file *filp, void __user *buffer, size_t *lenp,
193 + struct net *net = container_of(table->data, struct net,
194 + ipv6.frags.high_thresh);
195 + ctl_table tmp = *table;
196 + int new_bytes, ret;
198 + mutex_lock(&net->ipv6.frags.lock);
200 + tmp.data = &new_bytes;
204 + ret = proc_dointvec(table, write, filp, buffer, lenp, ppos);
206 + if (!ret && write) {
207 + ret = mem_reserve_kmalloc_set(&net->ipv6.frags.reserve,
210 + net->ipv6.frags.high_thresh = new_bytes;
212 + mutex_unlock(&net->ipv6.frags.lock);
217 +static int sysctl_intvec_fragment(struct ctl_table *table,
218 + int __user *name, int nlen,
219 + void __user *oldval, size_t __user *oldlenp,
220 + void __user *newval, size_t newlen)
222 + struct net *net = container_of(table->data, struct net,
223 + ipv6.frags.high_thresh);
224 + int write = (newval && newlen);
225 + ctl_table tmp = *table;
226 + int new_bytes, ret;
228 + mutex_lock(&net->ipv6.frags.lock);
230 + tmp.data = &new_bytes;
234 + ret = sysctl_intvec(table, name, nlen, oldval, oldlenp, newval, newlen);
236 + if (!ret && write) {
237 + ret = mem_reserve_kmalloc_set(&net->ipv6.frags.reserve,
240 + net->ipv6.frags.high_thresh = new_bytes;
242 + mutex_unlock(&net->ipv6.frags.lock);
247 static struct ctl_table ip6_frags_ns_ctl_table[] = {
249 .ctl_name = NET_IPV6_IP6FRAG_HIGH_THRESH,
250 @@ -639,7 +698,8 @@ static struct ctl_table ip6_frags_ns_ctl
251 .data = &init_net.ipv6.frags.high_thresh,
252 .maxlen = sizeof(int),
254 - .proc_handler = &proc_dointvec
255 + .proc_handler = &proc_dointvec_fragment,
256 + .strategy = &sysctl_intvec_fragment,
259 .ctl_name = NET_IPV6_IP6FRAG_LOW_THRESH,
260 @@ -748,17 +808,39 @@ static inline void ip6_frags_sysctl_unre
262 static int ipv6_frags_init_net(struct net *net)
266 net->ipv6.frags.high_thresh = 256 * 1024;
267 net->ipv6.frags.low_thresh = 192 * 1024;
268 net->ipv6.frags.timeout = IPV6_FRAG_TIMEOUT;
270 inet_frags_init_net(&net->ipv6.frags);
272 - return ip6_frags_ns_sysctl_register(net);
273 + ret = ip6_frags_ns_sysctl_register(net);
277 + mem_reserve_init(&net->ipv6.frags.reserve, "IPv6 fragment cache",
279 + ret = mem_reserve_kmalloc_set(&net->ipv6.frags.reserve,
280 + net->ipv6.frags.high_thresh);
287 + mem_reserve_disconnect(&net->ipv6.frags.reserve);
288 + ip6_frags_ns_sysctl_unregister(net);
290 + inet_frags_exit_net(&net->ipv6.frags, &ip6_frags);
295 static void ipv6_frags_exit_net(struct net *net)
297 + mem_reserve_disconnect(&net->ipv6.frags.reserve);
298 ip6_frags_ns_sysctl_unregister(net);
299 inet_frags_exit_net(&net->ipv6.frags, &ip6_frags);
301 Index: linux-2.6.26/net/ipv4/route.c
302 ===================================================================
303 --- linux-2.6.26.orig/net/ipv4/route.c
304 +++ linux-2.6.26/net/ipv4/route.c
307 #include <linux/sysctl.h>
309 +#include <linux/reserve.h>
311 #define RT_FL_TOS(oldflp) \
312 ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
313 @@ -269,6 +270,8 @@ static inline int rt_genid(struct net *n
314 return atomic_read(&net->ipv4.rt_genid);
317 +static struct mem_reserve ipv4_route_reserve;
319 #ifdef CONFIG_PROC_FS
320 struct rt_cache_iter_state {
321 struct seq_net_private p;
322 @@ -393,6 +396,62 @@ static int rt_cache_seq_show(struct seq_
326 +static struct mutex ipv4_route_lock;
328 +static int proc_dointvec_route(struct ctl_table *table, int write,
329 + struct file *filp, void __user *buffer, size_t *lenp,
332 + ctl_table tmp = *table;
335 + mutex_lock(&ipv4_route_lock);
337 + tmp.data = &new_size;
341 + ret = proc_dointvec(table, write, filp, buffer, lenp, ppos);
343 + if (!ret && write) {
344 + ret = mem_reserve_kmem_cache_set(&ipv4_route_reserve,
345 + ipv4_dst_ops.kmem_cachep, new_size);
347 + ip_rt_max_size = new_size;
349 + mutex_unlock(&ipv4_route_lock);
354 +static int sysctl_intvec_route(struct ctl_table *table,
355 + int __user *name, int nlen,
356 + void __user *oldval, size_t __user *oldlenp,
357 + void __user *newval, size_t newlen)
359 + int write = (newval && newlen);
360 + ctl_table tmp = *table;
363 + mutex_lock(&ipv4_route_lock);
365 + tmp.data = &new_size;
369 + ret = sysctl_intvec(table, name, nlen, oldval, oldlenp, newval, newlen);
371 + if (!ret && write) {
372 + ret = mem_reserve_kmem_cache_set(&ipv4_route_reserve,
373 + ipv4_dst_ops.kmem_cachep, new_size);
375 + ip_rt_max_size = new_size;
377 + mutex_unlock(&ipv4_route_lock);
382 static const struct seq_operations rt_cache_seq_ops = {
383 .start = rt_cache_seq_start,
384 .next = rt_cache_seq_next,
385 @@ -2991,7 +3050,8 @@ static ctl_table ipv4_route_table[] = {
386 .data = &ip_rt_max_size,
387 .maxlen = sizeof(int),
389 - .proc_handler = &proc_dointvec,
390 + .proc_handler = &proc_dointvec_route,
391 + .strategy = &sysctl_intvec_route,
394 /* Deprecated. Use gc_min_interval_ms */
395 @@ -3270,6 +3330,15 @@ int __init ip_rt_init(void)
396 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
397 ip_rt_max_size = (rt_hash_mask + 1) * 16;
399 +#ifdef CONFIG_PROCFS
400 + mutex_init(&ipv4_route_lock);
403 + mem_reserve_init(&ipv4_route_reserve, "IPv4 route cache",
405 + mem_reserve_kmem_cache_set(&ipv4_route_reserve,
406 + ipv4_dst_ops.kmem_cachep, ip_rt_max_size);
411 Index: linux-2.6.26/net/ipv6/route.c
412 ===================================================================
413 --- linux-2.6.26.orig/net/ipv6/route.c
414 +++ linux-2.6.26/net/ipv6/route.c
416 #include <linux/mroute6.h>
417 #include <linux/init.h>
418 #include <linux/if_arp.h>
419 +#include <linux/reserve.h>
420 #include <linux/proc_fs.h>
421 #include <linux/seq_file.h>
422 #include <linux/nsproxy.h>
423 @@ -2473,6 +2474,64 @@ int ipv6_sysctl_rtcache_flush(ctl_table
427 +static int proc_dointvec_route(struct ctl_table *table, int write,
428 + struct file *filp, void __user *buffer, size_t *lenp,
431 + struct net *net = container_of(table->data, struct net,
432 + ipv6.sysctl.ip6_rt_max_size);
433 + ctl_table tmp = *table;
436 + mutex_lock(&net->ipv6.sysctl.ip6_rt_lock);
438 + tmp.data = &new_size;
442 + ret = proc_dointvec(table, write, filp, buffer, lenp, ppos);
444 + if (!ret && write) {
445 + ret = mem_reserve_kmem_cache_set(&net->ipv6.ip6_rt_reserve,
446 + net->ipv6.ip6_dst_ops->kmem_cachep, new_size);
448 + net->ipv6.sysctl.ip6_rt_max_size = new_size;
450 + mutex_unlock(&net->ipv6.sysctl.ip6_rt_lock);
455 +static int sysctl_intvec_route(struct ctl_table *table,
456 + int __user *name, int nlen,
457 + void __user *oldval, size_t __user *oldlenp,
458 + void __user *newval, size_t newlen)
460 + struct net *net = container_of(table->data, struct net,
461 + ipv6.sysctl.ip6_rt_max_size);
462 + int write = (newval && newlen);
463 + ctl_table tmp = *table;
466 + mutex_lock(&net->ipv6.sysctl.ip6_rt_lock);
468 + tmp.data = &new_size;
472 + ret = sysctl_intvec(table, name, nlen, oldval, oldlenp, newval, newlen);
474 + if (!ret && write) {
475 + ret = mem_reserve_kmem_cache_set(&net->ipv6.ip6_rt_reserve,
476 + net->ipv6.ip6_dst_ops->kmem_cachep, new_size);
478 + net->ipv6.sysctl.ip6_rt_max_size = new_size;
480 + mutex_unlock(&net->ipv6.sysctl.ip6_rt_lock);
485 ctl_table ipv6_route_table_template[] = {
488 @@ -2495,7 +2554,8 @@ ctl_table ipv6_route_table_template[] =
489 .data = &init_net.ipv6.sysctl.ip6_rt_max_size,
490 .maxlen = sizeof(int),
492 - .proc_handler = &proc_dointvec,
493 + .proc_handler = &proc_dointvec_route,
494 + .strategy = &sysctl_intvec_route,
497 .ctl_name = NET_IPV6_ROUTE_GC_MIN_INTERVAL,
498 @@ -2583,6 +2643,8 @@ struct ctl_table *ipv6_route_sysctl_init
499 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
502 + mutex_init(&net->ipv6.sysctl.ip6_rt_lock);
507 @@ -2636,6 +2698,14 @@ static int ip6_route_net_init(struct net
508 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
509 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
511 + mem_reserve_init(&net->ipv6.ip6_rt_reserve, "IPv6 route cache",
513 + ret = mem_reserve_kmem_cache_set(&net->ipv6.ip6_rt_reserve,
514 + net->ipv6.ip6_dst_ops->kmem_cachep,
515 + net->ipv6.sysctl.ip6_rt_max_size);
517 + goto out_reserve_fail;
519 #ifdef CONFIG_PROC_FS
520 proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
521 proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
522 @@ -2646,12 +2716,15 @@ static int ip6_route_net_init(struct net
527 + mem_reserve_disconnect(&net->ipv6.ip6_rt_reserve);
528 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
529 + kfree(net->ipv6.ip6_blk_hole_entry);
530 out_ip6_prohibit_entry:
531 kfree(net->ipv6.ip6_prohibit_entry);
533 - kfree(net->ipv6.ip6_null_entry);
535 + kfree(net->ipv6.ip6_null_entry);
537 release_net(net->ipv6.ip6_dst_ops->dst_net);
538 kfree(net->ipv6.ip6_dst_ops);
539 @@ -2664,6 +2737,7 @@ static void ip6_route_net_exit(struct ne
540 proc_net_remove(net, "ipv6_route");
541 proc_net_remove(net, "rt6_stats");
543 + mem_reserve_disconnect(&net->ipv6.ip6_rt_reserve);
544 kfree(net->ipv6.ip6_null_entry);
545 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
546 kfree(net->ipv6.ip6_prohibit_entry);
547 Index: linux-2.6.26/include/net/inet_frag.h
548 ===================================================================
549 --- linux-2.6.26.orig/include/net/inet_frag.h
550 +++ linux-2.6.26/include/net/inet_frag.h
552 #ifndef __NET_FRAG_H__
553 #define __NET_FRAG_H__
555 +#include <linux/reserve.h>
556 +#include <linux/mutex.h>
561 @@ -10,6 +13,10 @@ struct netns_frags {
568 + struct mem_reserve reserve;
571 struct inet_frag_queue {
572 Index: linux-2.6.26/net/ipv4/inet_fragment.c
573 ===================================================================
574 --- linux-2.6.26.orig/net/ipv4/inet_fragment.c
575 +++ linux-2.6.26/net/ipv4/inet_fragment.c
577 #include <linux/random.h>
578 #include <linux/skbuff.h>
579 #include <linux/rtnetlink.h>
580 +#include <linux/reserve.h>
582 #include <net/inet_frag.h>
584 @@ -74,6 +75,8 @@ void inet_frags_init_net(struct netns_fr
586 atomic_set(&nf->mem, 0);
587 INIT_LIST_HEAD(&nf->lru_list);
588 + mutex_init(&nf->lock);
589 + mem_reserve_init(&nf->reserve, "IP fragement cache", NULL);
591 EXPORT_SYMBOL(inet_frags_init_net);
593 Index: linux-2.6.26/include/net/netns/ipv6.h
594 ===================================================================
595 --- linux-2.6.26.orig/include/net/netns/ipv6.h
596 +++ linux-2.6.26/include/net/netns/ipv6.h
597 @@ -24,6 +24,8 @@ struct netns_sysctl_ipv6 {
598 int ip6_rt_mtu_expires;
599 int ip6_rt_min_advmss;
602 + struct mutex ip6_rt_lock;
606 @@ -55,5 +57,7 @@ struct netns_ipv6 {
607 struct sock *ndisc_sk;
609 struct sock *igmp_sk;
611 + struct mem_reserve ip6_rt_reserve;