From d696c7bdaa55e2208e56c6f98e6bc1599f34286d Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Mon, 8 Feb 2010 11:18:07 -0800
Subject: netfilter: nf_conntrack: fix hash resizing with namespaces

From: Patrick McHardy <kaber@trash.net>

commit d696c7bdaa55e2208e56c6f98e6bc1599f34286d upstream.

As noticed by Jon Masters <jonathan@jonmasters.org>, the conntrack hash
size is global and not per namespace, but modifiable at runtime through
/sys/module/nf_conntrack/hashsize. Changing the hash size will, however,
only resize the hash in the current namespace, so other namespaces
will use an invalid hash size. This can cause crashes when enlarging
the hashsize, or false negative lookups when shrinking it.

Move the hash size into the per-namespace data and only use the global
hash size to initialize the per-namespace value when instantiating a
new namespace. Additionally restrict hash resizing to init_net for
now as other namespaces are not handled currently.
21
22Signed-off-by: Patrick McHardy <kaber@trash.net>
23Signed-off-by: David S. Miller <davem@davemloft.net>
24Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
25
---
 include/net/netns/conntrack.h                          |    1 
 include/net/netns/ipv4.h                               |    1 
 net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c         |    2 
 net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c  |    4 -
 net/ipv4/netfilter/nf_nat_core.c                       |   22 +++----
 net/netfilter/nf_conntrack_core.c                      |   54 +++++++++---------
 net/netfilter/nf_conntrack_expect.c                    |    2 
 net/netfilter/nf_conntrack_helper.c                    |    2 
 net/netfilter/nf_conntrack_netlink.c                   |    2 
 net/netfilter/nf_conntrack_standalone.c                |    7 +-
 10 files changed, 50 insertions(+), 47 deletions(-)

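In short, every namespace now carries its own copy of the hash size, seeded
from the global module parameter when the namespace is created, and the
resize path only ever touches init_net. A rough sketch of that init path,
condensed from the nf_conntrack_init_net() and nf_nat_net_init() hunks
below (the function name example_init_net is made up and the error
unwinding is trimmed):

/* Sketch only -- not part of the patch. Condensed from the per-net init
 * functions changed below to show the per-namespace hash sizing. */
static int example_init_net(struct net *net)
{
	/* Seed the per-namespace size from the global/boot-time value;
	 * later writes to /sys/module/nf_conntrack/hashsize are refused
	 * outside init_net and only update init_net.ct.htable_size. */
	net->ct.htable_size = nf_conntrack_htable_size;
	net->ct.hash = nf_ct_alloc_hashtable(&net->ct.htable_size,
					     &net->ct.hash_vmalloc, 1);
	if (!net->ct.hash)
		return -ENOMEM;

	/* NAT reuses the same per-namespace size for its bysource hash. */
	net->ipv4.nat_htable_size = net->ct.htable_size;
	return 0;
}

Lookups then hash against net->ct.htable_size (see hash_conntrack() below)
instead of the global nf_conntrack_htable_size, so a resize in init_net can
no longer leave other namespaces hashing into a table of the wrong size.
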
--- a/include/net/netns/conntrack.h
+++ b/include/net/netns/conntrack.h
@@ -11,6 +11,7 @@ struct nf_conntrack_ecache;
 struct netns_ct {
 	atomic_t count;
 	unsigned int expect_count;
+	unsigned int htable_size;
 	struct kmem_cache *nf_conntrack_cachep;
 	struct hlist_nulls_head *hash;
 	struct hlist_head *expect_hash;
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -40,6 +40,7 @@ struct netns_ipv4 {
 	struct xt_table *iptable_security;
 	struct xt_table *nat_table;
 	struct hlist_head *nat_bysource;
+	unsigned int nat_htable_size;
 	int nat_vmalloced;
 #endif
 
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -213,7 +213,7 @@ static ctl_table ip_ct_sysctl_table[] =
 	{
 		.ctl_name = NET_IPV4_NF_CONNTRACK_BUCKETS,
 		.procname = "ip_conntrack_buckets",
-		.data = &nf_conntrack_htable_size,
+		.data = &init_net.ct.htable_size,
 		.maxlen = sizeof(unsigned int),
 		.mode = 0444,
 		.proc_handler = proc_dointvec,
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
@@ -32,7 +32,7 @@ static struct hlist_nulls_node *ct_get_f
 	struct hlist_nulls_node *n;
 
 	for (st->bucket = 0;
-	     st->bucket < nf_conntrack_htable_size;
+	     st->bucket < net->ct.htable_size;
 	     st->bucket++) {
 		n = rcu_dereference(net->ct.hash[st->bucket].first);
 		if (!is_a_nulls(n))
@@ -50,7 +50,7 @@ static struct hlist_nulls_node *ct_get_n
 	head = rcu_dereference(head->next);
 	while (is_a_nulls(head)) {
 		if (likely(get_nulls_value(head) == st->bucket)) {
-			if (++st->bucket >= nf_conntrack_htable_size)
+			if (++st->bucket >= net->ct.htable_size)
 				return NULL;
 		}
 		head = rcu_dereference(net->ct.hash[st->bucket].first);
--- a/net/ipv4/netfilter/nf_nat_core.c
+++ b/net/ipv4/netfilter/nf_nat_core.c
@@ -35,9 +35,6 @@ static DEFINE_SPINLOCK(nf_nat_lock);
 
 static struct nf_conntrack_l3proto *l3proto __read_mostly;
 
-/* Calculated at init based on memory size */
-static unsigned int nf_nat_htable_size __read_mostly;
-
 #define MAX_IP_NAT_PROTO 256
 static const struct nf_nat_protocol *nf_nat_protos[MAX_IP_NAT_PROTO]
 						__read_mostly;
@@ -72,7 +69,7 @@ EXPORT_SYMBOL_GPL(nf_nat_proto_put);
 
 /* We keep an extra hash for each conntrack, for fast searching. */
 static inline unsigned int
-hash_by_src(const struct nf_conntrack_tuple *tuple)
+hash_by_src(const struct net *net, const struct nf_conntrack_tuple *tuple)
 {
 	unsigned int hash;
 
@@ -80,7 +77,7 @@ hash_by_src(const struct nf_conntrack_tu
 	hash = jhash_3words((__force u32)tuple->src.u3.ip,
 			    (__force u32)tuple->src.u.all,
 			    tuple->dst.protonum, 0);
-	return ((u64)hash * nf_nat_htable_size) >> 32;
+	return ((u64)hash * net->ipv4.nat_htable_size) >> 32;
 }
 
 /* Is this tuple already taken? (not by us) */
@@ -147,7 +144,7 @@ find_appropriate_src(struct net *net,
 			     struct nf_conntrack_tuple *result,
 			     const struct nf_nat_range *range)
 {
-	unsigned int h = hash_by_src(tuple);
+	unsigned int h = hash_by_src(net, tuple);
 	const struct nf_conn_nat *nat;
 	const struct nf_conn *ct;
 	const struct hlist_node *n;
@@ -330,7 +327,7 @@ nf_nat_setup_info(struct nf_conn *ct,
 	if (have_to_hash) {
 		unsigned int srchash;
 
-		srchash = hash_by_src(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+		srchash = hash_by_src(net, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
 		spin_lock_bh(&nf_nat_lock);
 		/* nf_conntrack_alter_reply might re-allocate exntension aera */
 		nat = nfct_nat(ct);
@@ -679,8 +676,10 @@ nfnetlink_parse_nat_setup(struct nf_conn
 
 static int __net_init nf_nat_net_init(struct net *net)
 {
-	net->ipv4.nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size,
-						       &net->ipv4.nat_vmalloced, 0);
+	/* Leave them the same for the moment. */
+	net->ipv4.nat_htable_size = net->ct.htable_size;
+	net->ipv4.nat_bysource = nf_ct_alloc_hashtable(&net->ipv4.nat_htable_size,
+						       &net->ipv4.nat_vmalloced, 0);
 	if (!net->ipv4.nat_bysource)
 		return -ENOMEM;
 	return 0;
@@ -703,7 +702,7 @@ static void __net_exit nf_nat_net_exit(s
 	nf_ct_iterate_cleanup(net, &clean_nat, NULL);
 	synchronize_rcu();
 	nf_ct_free_hashtable(net->ipv4.nat_bysource, net->ipv4.nat_vmalloced,
-			     nf_nat_htable_size);
+			     net->ipv4.nat_htable_size);
 }
 
 static struct pernet_operations nf_nat_net_ops = {
@@ -724,9 +723,6 @@ static int __init nf_nat_init(void)
 		return ret;
 	}
 
-	/* Leave them the same for the moment. */
-	nf_nat_htable_size = nf_conntrack_htable_size;
-
 	ret = register_pernet_subsys(&nf_nat_net_ops);
 	if (ret < 0)
 		goto cleanup_extend;
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -30,6 +30,7 @@
 #include <linux/netdevice.h>
 #include <linux/socket.h>
 #include <linux/mm.h>
+#include <linux/nsproxy.h>
 #include <linux/rculist_nulls.h>
 
 #include <net/netfilter/nf_conntrack.h>
@@ -84,9 +85,10 @@ static u_int32_t __hash_conntrack(const
 	return ((u64)h * size) >> 32;
 }
 
-static inline u_int32_t hash_conntrack(const struct nf_conntrack_tuple *tuple)
+static inline u_int32_t hash_conntrack(const struct net *net,
+				       const struct nf_conntrack_tuple *tuple)
 {
-	return __hash_conntrack(tuple, nf_conntrack_htable_size,
+	return __hash_conntrack(tuple, net->ct.htable_size,
 				nf_conntrack_hash_rnd);
 }
 
@@ -294,7 +296,7 @@ __nf_conntrack_find(struct net *net, con
 {
 	struct nf_conntrack_tuple_hash *h;
 	struct hlist_nulls_node *n;
-	unsigned int hash = hash_conntrack(tuple);
+	unsigned int hash = hash_conntrack(net, tuple);
 
 	/* Disable BHs the entire time since we normally need to disable them
 	 * at least once for the stats anyway.
@@ -364,10 +366,11 @@ static void __nf_conntrack_hash_insert(s
 
 void nf_conntrack_hash_insert(struct nf_conn *ct)
 {
+	struct net *net = nf_ct_net(ct);
 	unsigned int hash, repl_hash;
 
-	hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
-	repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+	hash = hash_conntrack(net, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+	repl_hash = hash_conntrack(net, &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
 
 	__nf_conntrack_hash_insert(ct, hash, repl_hash);
 }
@@ -395,8 +398,8 @@ __nf_conntrack_confirm(struct sk_buff *s
 	if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
 		return NF_ACCEPT;
 
-	hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
-	repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+	hash = hash_conntrack(net, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+	repl_hash = hash_conntrack(net, &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
 
 	/* We're not in hash table, and we refuse to set up related
 	   connections for unconfirmed conns. But packet copies and
@@ -466,7 +469,7 @@ nf_conntrack_tuple_taken(const struct nf
 	struct net *net = nf_ct_net(ignored_conntrack);
 	struct nf_conntrack_tuple_hash *h;
 	struct hlist_nulls_node *n;
-	unsigned int hash = hash_conntrack(tuple);
+	unsigned int hash = hash_conntrack(net, tuple);
 
 	/* Disable BHs the entire time since we need to disable them at
 	 * least once for the stats anyway.
@@ -501,7 +504,7 @@ static noinline int early_drop(struct ne
 	int dropped = 0;
 
 	rcu_read_lock();
-	for (i = 0; i < nf_conntrack_htable_size; i++) {
+	for (i = 0; i < net->ct.htable_size; i++) {
 		hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash],
 					       hnnode) {
 			tmp = nf_ct_tuplehash_to_ctrack(h);
@@ -515,7 +518,8 @@ static noinline int early_drop(struct ne
 		ct = NULL;
 		if (ct || cnt >= NF_CT_EVICTION_RANGE)
 			break;
-		hash = (hash + 1) % nf_conntrack_htable_size;
+
+		hash = (hash + 1) % net->ct.htable_size;
 	}
 	rcu_read_unlock();
 
@@ -549,7 +553,7 @@ struct nf_conn *nf_conntrack_alloc(struc
 
 	if (nf_conntrack_max &&
 	    unlikely(atomic_read(&net->ct.count) > nf_conntrack_max)) {
-		unsigned int hash = hash_conntrack(orig);
+		unsigned int hash = hash_conntrack(net, orig);
 		if (!early_drop(net, hash)) {
 			atomic_dec(&net->ct.count);
 			if (net_ratelimit())
@@ -1006,7 +1010,7 @@ get_next_corpse(struct net *net, int (*i
 	struct hlist_nulls_node *n;
 
 	spin_lock_bh(&nf_conntrack_lock);
-	for (; *bucket < nf_conntrack_htable_size; (*bucket)++) {
+	for (; *bucket < net->ct.htable_size; (*bucket)++) {
 		hlist_nulls_for_each_entry(h, n, &net->ct.hash[*bucket], hnnode) {
 			ct = nf_ct_tuplehash_to_ctrack(h);
 			if (iter(ct, data))
@@ -1124,7 +1128,7 @@ static void nf_conntrack_cleanup_net(str
 	}
 
 	nf_ct_free_hashtable(net->ct.hash, net->ct.hash_vmalloc,
-			     nf_conntrack_htable_size);
+			     net->ct.htable_size);
 	nf_conntrack_ecache_fini(net);
 	nf_conntrack_acct_fini(net);
 	nf_conntrack_expect_fini(net);
@@ -1184,10 +1188,12 @@ int nf_conntrack_set_hashsize(const char
 {
 	int i, bucket, vmalloced, old_vmalloced;
 	unsigned int hashsize, old_size;
-	int rnd;
 	struct hlist_nulls_head *hash, *old_hash;
 	struct nf_conntrack_tuple_hash *h;
 
+	if (current->nsproxy->net_ns != &init_net)
+		return -EOPNOTSUPP;
+
 	/* On boot, we can set this without any fancy locking. */
 	if (!nf_conntrack_htable_size)
 		return param_set_uint(val, kp);
@@ -1200,33 +1206,29 @@ int nf_conntrack_set_hashsize(const char
 	if (!hash)
 		return -ENOMEM;
 
-	/* We have to rehahs for the new table anyway, so we also can
-	 * use a newrandom seed */
-	get_random_bytes(&rnd, sizeof(rnd));
-
 	/* Lookups in the old hash might happen in parallel, which means we
 	 * might get false negatives during connection lookup. New connections
 	 * created because of a false negative won't make it into the hash
 	 * though since that required taking the lock.
 	 */
 	spin_lock_bh(&nf_conntrack_lock);
-	for (i = 0; i < nf_conntrack_htable_size; i++) {
+	for (i = 0; i < init_net.ct.htable_size; i++) {
 		while (!hlist_nulls_empty(&init_net.ct.hash[i])) {
 			h = hlist_nulls_entry(init_net.ct.hash[i].first,
 					struct nf_conntrack_tuple_hash, hnnode);
 			hlist_nulls_del_rcu(&h->hnnode);
-			bucket = __hash_conntrack(&h->tuple, hashsize, rnd);
+			bucket = __hash_conntrack(&h->tuple, hashsize,
+						  nf_conntrack_hash_rnd);
 			hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]);
 		}
 	}
-	old_size = nf_conntrack_htable_size;
+	old_size = init_net.ct.htable_size;
 	old_vmalloced = init_net.ct.hash_vmalloc;
 	old_hash = init_net.ct.hash;
 
-	nf_conntrack_htable_size = hashsize;
+	init_net.ct.htable_size = nf_conntrack_htable_size = hashsize;
 	init_net.ct.hash_vmalloc = vmalloced;
 	init_net.ct.hash = hash;
-	nf_conntrack_hash_rnd = rnd;
 	spin_unlock_bh(&nf_conntrack_lock);
 
 	nf_ct_free_hashtable(old_hash, old_vmalloced, old_size);
@@ -1322,7 +1324,9 @@ static int nf_conntrack_init_net(struct
 		ret = -ENOMEM;
 		goto err_cache;
 	}
-	net->ct.hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size,
+
+	net->ct.htable_size = nf_conntrack_htable_size;
+	net->ct.hash = nf_ct_alloc_hashtable(&net->ct.htable_size,
 					     &net->ct.hash_vmalloc, 1);
 	if (!net->ct.hash) {
 		ret = -ENOMEM;
@@ -1347,7 +1351,7 @@ err_acct:
 	nf_conntrack_expect_fini(net);
 err_expect:
 	nf_ct_free_hashtable(net->ct.hash, net->ct.hash_vmalloc,
-			     nf_conntrack_htable_size);
+			     net->ct.htable_size);
 err_hash:
 	kmem_cache_destroy(net->ct.nf_conntrack_cachep);
 err_cache:
--- a/net/netfilter/nf_conntrack_expect.c
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -577,7 +577,7 @@ int nf_conntrack_expect_init(struct net
 
 	if (net_eq(net, &init_net)) {
 		if (!nf_ct_expect_hsize) {
-			nf_ct_expect_hsize = nf_conntrack_htable_size / 256;
+			nf_ct_expect_hsize = net->ct.htable_size / 256;
 			if (!nf_ct_expect_hsize)
 				nf_ct_expect_hsize = 1;
 		}
--- a/net/netfilter/nf_conntrack_helper.c
+++ b/net/netfilter/nf_conntrack_helper.c
@@ -192,7 +192,7 @@ static void __nf_conntrack_helper_unregi
 	/* Get rid of expecteds, set helpers to NULL. */
 	hlist_nulls_for_each_entry(h, nn, &net->ct.unconfirmed, hnnode)
 		unhelp(h, me);
-	for (i = 0; i < nf_conntrack_htable_size; i++) {
+	for (i = 0; i < net->ct.htable_size; i++) {
 		hlist_nulls_for_each_entry(h, nn, &net->ct.hash[i], hnnode)
 			unhelp(h, me);
 	}
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -594,7 +594,7 @@ ctnetlink_dump_table(struct sk_buff *skb
 
 	rcu_read_lock();
 	last = (struct nf_conn *)cb->args[1];
-	for (; cb->args[0] < nf_conntrack_htable_size; cb->args[0]++) {
+	for (; cb->args[0] < init_net.ct.htable_size; cb->args[0]++) {
 restart:
 		hlist_nulls_for_each_entry_rcu(h, n, &init_net.ct.hash[cb->args[0]],
 					       hnnode) {
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -51,7 +51,7 @@ static struct hlist_nulls_node *ct_get_f
 	struct hlist_nulls_node *n;
 
 	for (st->bucket = 0;
-	     st->bucket < nf_conntrack_htable_size;
+	     st->bucket < net->ct.htable_size;
 	     st->bucket++) {
 		n = rcu_dereference(net->ct.hash[st->bucket].first);
 		if (!is_a_nulls(n))
@@ -69,7 +69,7 @@ static struct hlist_nulls_node *ct_get_n
 	head = rcu_dereference(head->next);
 	while (is_a_nulls(head)) {
 		if (likely(get_nulls_value(head) == st->bucket)) {
-			if (++st->bucket >= nf_conntrack_htable_size)
+			if (++st->bucket >= net->ct.htable_size)
 				return NULL;
 		}
 		head = rcu_dereference(net->ct.hash[st->bucket].first);
@@ -358,7 +358,7 @@ static ctl_table nf_ct_sysctl_table[] =
 	{
 		.ctl_name = NET_NF_CONNTRACK_BUCKETS,
 		.procname = "nf_conntrack_buckets",
-		.data = &nf_conntrack_htable_size,
+		.data = &init_net.ct.htable_size,
 		.maxlen = sizeof(unsigned int),
 		.mode = 0444,
 		.proc_handler = proc_dointvec,
@@ -429,6 +429,7 @@ static int nf_conntrack_standalone_init_
 		goto out_kmemdup;
 
 	table[1].data = &net->ct.count;
+	table[2].data = &net->ct.htable_size;
 	table[3].data = &net->ct.sysctl_checksum;
 	table[4].data = &net->ct.sysctl_log_invalid;
 