/* Connection state tracking for netfilter.  This is separated from,
   but required by, the NAT layer; it can also be used by an iptables
   extension. */

/* (C) 1999-2001 Paul `Rusty' Russell
 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
 * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
 * (C) 2005-2012 Patrick McHardy <kaber@trash.net>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/types.h>
#include <linux/netfilter.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/skbuff.h>
#include <linux/proc_fs.h>
#include <linux/vmalloc.h>
#include <linux/stddef.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/err.h>
#include <linux/percpu.h>
#include <linux/moduleparam.h>
#include <linux/notifier.h>
#include <linux/kernel.h>
#include <linux/netdevice.h>
#include <linux/socket.h>
#include <linux/mm.h>
#include <linux/nsproxy.h>
#include <linux/rculist_nulls.h>

#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_l3proto.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_expect.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_seqadj.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_extend.h>
#include <net/netfilter/nf_conntrack_acct.h>
#include <net/netfilter/nf_conntrack_ecache.h>
#include <net/netfilter/nf_conntrack_zones.h>
#include <net/netfilter/nf_conntrack_timestamp.h>
#include <net/netfilter/nf_conntrack_timeout.h>
#include <net/netfilter/nf_conntrack_labels.h>
#include <net/netfilter/nf_conntrack_synproxy.h>
#include <net/netfilter/nf_nat.h>
#include <net/netfilter/nf_nat_core.h>
#include <net/netfilter/nf_nat_helper.h>
#include <net/netns/hash.h>

#include "nf_internals.h"

#define NF_CONNTRACK_VERSION	"0.5.0"

int (*nfnetlink_parse_nat_setup_hook)(struct nf_conn *ct,
				      enum nf_nat_manip_type manip,
				      const struct nlattr *attr) __read_mostly;
EXPORT_SYMBOL_GPL(nfnetlink_parse_nat_setup_hook);

__cacheline_aligned_in_smp spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS];
EXPORT_SYMBOL_GPL(nf_conntrack_locks);

__cacheline_aligned_in_smp DEFINE_SPINLOCK(nf_conntrack_expect_lock);
EXPORT_SYMBOL_GPL(nf_conntrack_expect_lock);

struct hlist_nulls_head *nf_conntrack_hash __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_hash);

struct conntrack_gc_work {
	struct delayed_work	dwork;
	u32			last_bucket;
	bool			exiting;
	bool			early_drop;
	long			next_gc_run;
};

static __read_mostly struct kmem_cache *nf_conntrack_cachep;
static __read_mostly spinlock_t nf_conntrack_locks_all_lock;
static __read_mostly DEFINE_SPINLOCK(nf_conntrack_locks_all_lock);
static __read_mostly bool nf_conntrack_locks_all;

/* every gc cycle scans at most 1/GC_MAX_BUCKETS_DIV part of table */
#define GC_MAX_BUCKETS_DIV	128u
/* upper bound of full table scan */
#define GC_MAX_SCAN_JIFFIES	(16u * HZ)
/* desired ratio of entries found to be expired */
#define GC_EVICT_RATIO	50u

static struct conntrack_gc_work conntrack_gc_work;

void nf_conntrack_lock(spinlock_t *lock) __acquires(lock)
{
	/* 1) Acquire the lock */
	spin_lock(lock);

	/* 2) read nf_conntrack_locks_all, with ACQUIRE semantics
	 * It pairs with the smp_store_release() in nf_conntrack_all_unlock()
	 */
	if (likely(smp_load_acquire(&nf_conntrack_locks_all) == false))
		return;

	/* fast path failed, unlock */
	spin_unlock(lock);

	/* Slow path 1) get global lock */
	spin_lock(&nf_conntrack_locks_all_lock);

	/* Slow path 2) get the lock we want */
	spin_lock(lock);

	/* Slow path 3) release the global lock */
	spin_unlock(&nf_conntrack_locks_all_lock);
}
EXPORT_SYMBOL_GPL(nf_conntrack_lock);

static void nf_conntrack_double_unlock(unsigned int h1, unsigned int h2)
{
	h1 %= CONNTRACK_LOCKS;
	h2 %= CONNTRACK_LOCKS;
	spin_unlock(&nf_conntrack_locks[h1]);
	if (h1 != h2)
		spin_unlock(&nf_conntrack_locks[h2]);
}

/* return true if we need to recompute hashes (in case hash table was resized) */
static bool nf_conntrack_double_lock(struct net *net, unsigned int h1,
				     unsigned int h2, unsigned int sequence)
{
	h1 %= CONNTRACK_LOCKS;
	h2 %= CONNTRACK_LOCKS;
	if (h1 <= h2) {
		nf_conntrack_lock(&nf_conntrack_locks[h1]);
		if (h1 != h2)
			spin_lock_nested(&nf_conntrack_locks[h2],
					 SINGLE_DEPTH_NESTING);
	} else {
		nf_conntrack_lock(&nf_conntrack_locks[h2]);
		spin_lock_nested(&nf_conntrack_locks[h1],
				 SINGLE_DEPTH_NESTING);
	}
	if (read_seqcount_retry(&nf_conntrack_generation, sequence)) {
		nf_conntrack_double_unlock(h1, h2);
		return true;
	}
	return false;
}

static void nf_conntrack_all_lock(void)
{
	int i;

	spin_lock(&nf_conntrack_locks_all_lock);

	nf_conntrack_locks_all = true;

	for (i = 0; i < CONNTRACK_LOCKS; i++) {
		spin_lock(&nf_conntrack_locks[i]);

		/* This spin_unlock provides the "release" to ensure that
		 * nf_conntrack_locks_all==true is visible to everyone that
		 * acquired spin_lock(&nf_conntrack_locks[]).
		 */
		spin_unlock(&nf_conntrack_locks[i]);
	}
}

static void nf_conntrack_all_unlock(void)
{
	/* All prior stores must be complete before we clear
	 * 'nf_conntrack_locks_all'. Otherwise nf_conntrack_lock()
	 * might observe the false value but not the entire
	 * critical section.
	 * It pairs with the smp_load_acquire() in nf_conntrack_lock()
	 */
	smp_store_release(&nf_conntrack_locks_all, false);
	spin_unlock(&nf_conntrack_locks_all_lock);
}
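
/*
 * Illustrative sketch (hypothetical helper, not taken from this file):
 * a writer that needs exclusive access to every bucket at once, e.g. a
 * table resize, brackets its work with nf_conntrack_all_lock() and
 * nf_conntrack_all_unlock(), while the packet path keeps taking a single
 * bucket lock through nf_conntrack_lock() and re-checks
 * nf_conntrack_locks_all as shown above.
 */
static inline void example_exclusive_table_access(void)
{
	nf_conntrack_all_lock();
	/* ...walk or rehash any bucket of nf_conntrack_hash here... */
	nf_conntrack_all_unlock();
}
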
unsigned int nf_conntrack_htable_size __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);

unsigned int nf_conntrack_max __read_mostly;
seqcount_t nf_conntrack_generation __read_mostly;
static unsigned int nf_conntrack_hash_rnd __read_mostly;

static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple,
			      const struct net *net)
{
	unsigned int n;
	u32 seed;

	get_random_once(&nf_conntrack_hash_rnd, sizeof(nf_conntrack_hash_rnd));

	/* The direction must be ignored, so we hash everything up to the
	 * destination ports (which is a multiple of 4) and treat the last
	 * three bytes manually.
	 */
	seed = nf_conntrack_hash_rnd ^ net_hash_mix(net);
	n = (sizeof(tuple->src) + sizeof(tuple->dst.u3)) / sizeof(u32);
	return jhash2((u32 *)tuple, n, seed ^
		      (((__force __u16)tuple->dst.u.all << 16) |
		      tuple->dst.protonum));
}

static u32 scale_hash(u32 hash)
{
	return reciprocal_scale(hash, nf_conntrack_htable_size);
}

static u32 __hash_conntrack(const struct net *net,
			    const struct nf_conntrack_tuple *tuple,
			    unsigned int size)
{
	return reciprocal_scale(hash_conntrack_raw(tuple, net), size);
}

static u32 hash_conntrack(const struct net *net,
			  const struct nf_conntrack_tuple *tuple)
{
	return scale_hash(hash_conntrack_raw(tuple, net));
}
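
/*
 * Illustrative sketch (hypothetical helper, not taken from this file):
 * the raw jhash value is kept independent of the table size so it can be
 * computed once, cached (see __nf_conntrack_alloc()) and re-mapped onto
 * whatever table is current at lookup/confirm time. reciprocal_scale()
 * maps a 32-bit value onto [0, size) roughly as ((u64)hash * size) >> 32,
 * a multiplicative stand-in for "hash % size".
 */
static inline u32 example_bucket_for(u32 raw_hash, unsigned int htable_size)
{
	return reciprocal_scale(raw_hash, htable_size);
}
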
bool
nf_ct_get_tuple(const struct sk_buff *skb,
		unsigned int nhoff,
		unsigned int dataoff,
		u_int16_t l3num,
		u_int8_t protonum,
		struct net *net,
		struct nf_conntrack_tuple *tuple,
		const struct nf_conntrack_l3proto *l3proto,
		const struct nf_conntrack_l4proto *l4proto)
{
	memset(tuple, 0, sizeof(*tuple));

	tuple->src.l3num = l3num;
	if (l3proto->pkt_to_tuple(skb, nhoff, tuple) == 0)
		return false;

	tuple->dst.protonum = protonum;
	tuple->dst.dir = IP_CT_DIR_ORIGINAL;

	return l4proto->pkt_to_tuple(skb, dataoff, net, tuple);
}
EXPORT_SYMBOL_GPL(nf_ct_get_tuple);

bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff,
		       u_int16_t l3num,
		       struct net *net, struct nf_conntrack_tuple *tuple)
{
	const struct nf_conntrack_l3proto *l3proto;
	const struct nf_conntrack_l4proto *l4proto;
	unsigned int protoff;
	u_int8_t protonum;
	int ret;

	rcu_read_lock();

	l3proto = __nf_ct_l3proto_find(l3num);
	ret = l3proto->get_l4proto(skb, nhoff, &protoff, &protonum);
	if (ret != NF_ACCEPT) {
		rcu_read_unlock();
		return false;
	}

	l4proto = __nf_ct_l4proto_find(l3num, protonum);

	ret = nf_ct_get_tuple(skb, nhoff, protoff, l3num, protonum, net, tuple,
			      l3proto, l4proto);

	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL_GPL(nf_ct_get_tuplepr);

bool
nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
		   const struct nf_conntrack_tuple *orig,
		   const struct nf_conntrack_l3proto *l3proto,
		   const struct nf_conntrack_l4proto *l4proto)
{
	memset(inverse, 0, sizeof(*inverse));

	inverse->src.l3num = orig->src.l3num;
	if (l3proto->invert_tuple(inverse, orig) == 0)
		return false;

	inverse->dst.dir = !orig->dst.dir;

	inverse->dst.protonum = orig->dst.protonum;
	return l4proto->invert_tuple(inverse, orig);
}
EXPORT_SYMBOL_GPL(nf_ct_invert_tuple);

static void
clean_from_lists(struct nf_conn *ct)
{
	pr_debug("clean_from_lists(%p)\n", ct);
	hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
	hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode);

	/* Destroy all pending expectations */
	nf_ct_remove_expectations(ct);
}

/* must be called with local_bh_disable */
static void nf_ct_add_to_dying_list(struct nf_conn *ct)
{
	struct ct_pcpu *pcpu;

	/* add this conntrack to the (per cpu) dying list */
	ct->cpu = smp_processor_id();
	pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu);

	spin_lock(&pcpu->lock);
	hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
			     &pcpu->dying);
	spin_unlock(&pcpu->lock);
}

/* must be called with local_bh_disable */
static void nf_ct_add_to_unconfirmed_list(struct nf_conn *ct)
{
	struct ct_pcpu *pcpu;

	/* add this conntrack to the (per cpu) unconfirmed list */
	ct->cpu = smp_processor_id();
	pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu);

	spin_lock(&pcpu->lock);
	hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
			     &pcpu->unconfirmed);
	spin_unlock(&pcpu->lock);
}

/* must be called with local_bh_disable */
static void nf_ct_del_from_dying_or_unconfirmed_list(struct nf_conn *ct)
{
	struct ct_pcpu *pcpu;

	/* We overload first tuple to link into unconfirmed or dying list.*/
	pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu);

	spin_lock(&pcpu->lock);
	BUG_ON(hlist_nulls_unhashed(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode));
	hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
	spin_unlock(&pcpu->lock);
}

#define NFCT_ALIGN(len)	(((len) + NFCT_INFOMASK) & ~NFCT_INFOMASK)

/* Released via destroy_conntrack() */
struct nf_conn *nf_ct_tmpl_alloc(struct net *net,
				 const struct nf_conntrack_zone *zone,
				 gfp_t flags)
{
	struct nf_conn *tmpl, *p;

	if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK) {
		tmpl = kzalloc(sizeof(*tmpl) + NFCT_INFOMASK, flags);
		if (!tmpl)
			return NULL;

		p = tmpl;
		tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p);
		if (tmpl != p) {
			tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p);
			tmpl->proto.tmpl_padto = (char *)tmpl - (char *)p;
		}
	} else {
		tmpl = kzalloc(sizeof(*tmpl), flags);
		if (!tmpl)
			return NULL;
	}

	tmpl->status = IPS_TEMPLATE;
	write_pnet(&tmpl->ct_net, net);
	nf_ct_zone_add(tmpl, zone);
	atomic_set(&tmpl->ct_general.use, 0);

	return tmpl;
}
EXPORT_SYMBOL_GPL(nf_ct_tmpl_alloc);

void nf_ct_tmpl_free(struct nf_conn *tmpl)
{
	nf_ct_ext_destroy(tmpl);
	nf_ct_ext_free(tmpl);

	if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK)
		kfree((char *)tmpl - tmpl->proto.tmpl_padto);
	else
		kfree(tmpl);
}
EXPORT_SYMBOL_GPL(nf_ct_tmpl_free);

static void
destroy_conntrack(struct nf_conntrack *nfct)
{
	struct nf_conn *ct = (struct nf_conn *)nfct;
	const struct nf_conntrack_l4proto *l4proto;

	pr_debug("destroy_conntrack(%p)\n", ct);
	WARN_ON(atomic_read(&nfct->use) != 0);

	if (unlikely(nf_ct_is_template(ct))) {
		nf_ct_tmpl_free(ct);
		return;
	}
	l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
	if (l4proto->destroy)
		l4proto->destroy(ct);

	local_bh_disable();
	/* Expectations will have been removed in clean_from_lists,
	 * except TFTP can create an expectation on the first packet,
	 * before connection is in the list, so we need to clean here,
	 * too.
	 */
	nf_ct_remove_expectations(ct);

#if defined(CONFIG_NETFILTER_XT_MATCH_LAYER7) || defined(CONFIG_NETFILTER_XT_MATCH_LAYER7_MODULE)
	if(ct->layer7.app_data)
		kfree(ct->layer7.app_data);
#endif

	nf_ct_del_from_dying_or_unconfirmed_list(ct);

	local_bh_enable();

	if (ct->master)
		nf_ct_put(ct->master);

	pr_debug("destroy_conntrack: returning ct=%p to slab\n", ct);
	nf_conntrack_free(ct);
}

static void nf_ct_delete_from_lists(struct nf_conn *ct)
{
	struct net *net = nf_ct_net(ct);
	unsigned int hash, reply_hash;
	unsigned int sequence;

	nf_ct_helper_destroy(ct);

	local_bh_disable();
	do {
		sequence = read_seqcount_begin(&nf_conntrack_generation);
		hash = hash_conntrack(net,
				      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
		reply_hash = hash_conntrack(net,
					    &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
	} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));

	clean_from_lists(ct);
	nf_conntrack_double_unlock(hash, reply_hash);

	nf_ct_add_to_dying_list(ct);

	local_bh_enable();
}

bool nf_ct_delete(struct nf_conn *ct, u32 portid, int report)
{
	struct nf_conn_tstamp *tstamp;

	if (test_and_set_bit(IPS_DYING_BIT, &ct->status))
		return false;

	tstamp = nf_conn_tstamp_find(ct);
	if (tstamp && tstamp->stop == 0)
		tstamp->stop = ktime_get_real_ns();

	if (nf_conntrack_event_report(IPCT_DESTROY, ct,
				      portid, report) < 0) {
		/* destroy event was not delivered. nf_ct_put will
		 * be done by event cache worker on redelivery.
		 */
		nf_ct_delete_from_lists(ct);
		nf_conntrack_ecache_delayed_work(nf_ct_net(ct));
		return false;
	}

	nf_conntrack_ecache_work(nf_ct_net(ct));
	nf_ct_delete_from_lists(ct);
	nf_ct_put(ct);
	return true;
}
EXPORT_SYMBOL_GPL(nf_ct_delete);

static inline bool
nf_ct_key_equal(struct nf_conntrack_tuple_hash *h,
		const struct nf_conntrack_tuple *tuple,
		const struct nf_conntrack_zone *zone,
		const struct net *net)
{
	struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);

	/* A conntrack can be recreated with the equal tuple,
	 * so we need to check that the conntrack is confirmed
	 */
	return nf_ct_tuple_equal(tuple, &h->tuple) &&
	       nf_ct_zone_equal(ct, zone, NF_CT_DIRECTION(h)) &&
	       nf_ct_is_confirmed(ct) &&
	       net_eq(net, nf_ct_net(ct));
}

/* caller must hold rcu readlock and none of the nf_conntrack_locks */
static void nf_ct_gc_expired(struct nf_conn *ct)
{
	if (!atomic_inc_not_zero(&ct->ct_general.use))
		return;

	if (nf_ct_should_gc(ct))
		nf_ct_kill(ct);

	nf_ct_put(ct);
}

/*
 * Warning :
 * - Caller must take a reference on returned object
 *   and recheck nf_ct_tuple_equal(tuple, &h->tuple)
 */
static struct nf_conntrack_tuple_hash *
____nf_conntrack_find(struct net *net, const struct nf_conntrack_zone *zone,
		      const struct nf_conntrack_tuple *tuple, u32 hash)
{
	struct nf_conntrack_tuple_hash *h;
	struct hlist_nulls_head *ct_hash;
	struct hlist_nulls_node *n;
	unsigned int bucket, hsize;

begin:
	nf_conntrack_get_ht(&ct_hash, &hsize);
	bucket = reciprocal_scale(hash, hsize);

	hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[bucket], hnnode) {
		struct nf_conn *ct;

		ct = nf_ct_tuplehash_to_ctrack(h);
		if (nf_ct_is_expired(ct)) {
			nf_ct_gc_expired(ct);
			continue;
		}

		if (nf_ct_is_dying(ct))
			continue;

		if (nf_ct_key_equal(h, tuple, zone, net))
			return h;
	}
	/*
	 * if the nulls value we got at the end of this lookup is
	 * not the expected one, we must restart lookup.
	 * We probably met an item that was moved to another chain.
	 */
	if (get_nulls_value(n) != bucket) {
		NF_CT_STAT_INC_ATOMIC(net, search_restart);
		goto begin;
	}

	return NULL;
}

/* Find a connection corresponding to a tuple. */
static struct nf_conntrack_tuple_hash *
__nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone,
			const struct nf_conntrack_tuple *tuple, u32 hash)
{
	struct nf_conntrack_tuple_hash *h;
	struct nf_conn *ct;

	rcu_read_lock();
begin:
	h = ____nf_conntrack_find(net, zone, tuple, hash);
	if (h) {
		ct = nf_ct_tuplehash_to_ctrack(h);
		if (unlikely(nf_ct_is_dying(ct) ||
			     !atomic_inc_not_zero(&ct->ct_general.use)))
			h = NULL;
		else {
			if (unlikely(!nf_ct_key_equal(h, tuple, zone, net))) {
				nf_ct_put(ct);
				goto begin;
			}
		}
	}
	rcu_read_unlock();

	return h;
}

struct nf_conntrack_tuple_hash *
nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone,
		      const struct nf_conntrack_tuple *tuple)
{
	return __nf_conntrack_find_get(net, zone, tuple,
				       hash_conntrack_raw(tuple, net));
}
EXPORT_SYMBOL_GPL(nf_conntrack_find_get);
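
/*
 * Illustrative sketch (hypothetical caller, not taken from this file):
 * nf_conntrack_find_get() returns the tuple hash with the conntrack
 * refcount already bumped, so every successful lookup must be paired
 * with nf_ct_put() once the caller is done with the entry.
 */
static inline bool example_tuple_is_tracked(struct net *net,
					    const struct nf_conntrack_tuple *tuple)
{
	struct nf_conntrack_tuple_hash *h;

	h = nf_conntrack_find_get(net, &nf_ct_zone_dflt, tuple);
	if (!h)
		return false;

	nf_ct_put(nf_ct_tuplehash_to_ctrack(h));
	return true;
}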

static void __nf_conntrack_hash_insert(struct nf_conn *ct,
				       unsigned int hash,
				       unsigned int reply_hash)
{
	hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
				 &nf_conntrack_hash[hash]);
	hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode,
				 &nf_conntrack_hash[reply_hash]);
}

int
nf_conntrack_hash_check_insert(struct nf_conn *ct)
{
	const struct nf_conntrack_zone *zone;
	struct net *net = nf_ct_net(ct);
	unsigned int hash, reply_hash;
	struct nf_conntrack_tuple_hash *h;
	struct hlist_nulls_node *n;
	unsigned int sequence;

	zone = nf_ct_zone(ct);

	local_bh_disable();
	do {
		sequence = read_seqcount_begin(&nf_conntrack_generation);
		hash = hash_conntrack(net,
				      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
		reply_hash = hash_conntrack(net,
					    &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
	} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));

	/* See if there's one in the list already, including reverse */
	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode)
		if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
				    zone, net))
			goto out;

	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode)
		if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
				    zone, net))
			goto out;

	smp_wmb();
	/* The caller holds a reference to this object */
	atomic_set(&ct->ct_general.use, 2);
	__nf_conntrack_hash_insert(ct, hash, reply_hash);
	nf_conntrack_double_unlock(hash, reply_hash);
	NF_CT_STAT_INC(net, insert);
	local_bh_enable();
	return 0;

out:
	nf_conntrack_double_unlock(hash, reply_hash);
	NF_CT_STAT_INC(net, insert_failed);
	local_bh_enable();
	return -EEXIST;
}
EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert);

static inline void nf_ct_acct_update(struct nf_conn *ct,
				     enum ip_conntrack_info ctinfo,
				     unsigned int len)
{
	struct nf_conn_acct *acct;

	acct = nf_conn_acct_find(ct);
	if (acct) {
		struct nf_conn_counter *counter = acct->counter;

		atomic64_inc(&counter[CTINFO2DIR(ctinfo)].packets);
		atomic64_add(len, &counter[CTINFO2DIR(ctinfo)].bytes);
	}
}

static void nf_ct_acct_merge(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
			     const struct nf_conn *loser_ct)
{
	struct nf_conn_acct *acct;

	acct = nf_conn_acct_find(loser_ct);
	if (acct) {
		struct nf_conn_counter *counter = acct->counter;
		unsigned int bytes;

		/* u32 should be fine since we must have seen one packet. */
		bytes = atomic64_read(&counter[CTINFO2DIR(ctinfo)].bytes);
		nf_ct_acct_update(ct, ctinfo, bytes);
	}
}

/* Resolve race on insertion if this protocol allows this. */
static int nf_ct_resolve_clash(struct net *net, struct sk_buff *skb,
			       enum ip_conntrack_info ctinfo,
			       struct nf_conntrack_tuple_hash *h)
{
	/* This is the conntrack entry already in hashes that won race. */
	struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
	const struct nf_conntrack_l4proto *l4proto;

	l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
	if (l4proto->allow_clash &&
	    ((ct->status & IPS_NAT_DONE_MASK) == 0) &&
	    !nf_ct_is_dying(ct) &&
	    atomic_inc_not_zero(&ct->ct_general.use)) {
		enum ip_conntrack_info oldinfo;
		struct nf_conn *loser_ct = nf_ct_get(skb, &oldinfo);

		nf_ct_acct_merge(ct, ctinfo, loser_ct);
		nf_conntrack_put(&loser_ct->ct_general);
		nf_ct_set(skb, ct, oldinfo);
		return NF_ACCEPT;
	}
	NF_CT_STAT_INC(net, drop);
	return NF_DROP;
}

/* Confirm a connection given skb; places it in hash table */
int
__nf_conntrack_confirm(struct sk_buff *skb)
{
	const struct nf_conntrack_zone *zone;
	unsigned int hash, reply_hash;
	struct nf_conntrack_tuple_hash *h;
	struct nf_conn *ct;
	struct nf_conn_help *help;
	struct nf_conn_tstamp *tstamp;
	struct hlist_nulls_node *n;
	enum ip_conntrack_info ctinfo;
	struct net *net;
	unsigned int sequence;
	int ret = NF_DROP;

	ct = nf_ct_get(skb, &ctinfo);
	net = nf_ct_net(ct);

	/* ipt_REJECT uses nf_conntrack_attach to attach related
	   ICMP/TCP RST packets in other direction. Actual packet
	   which created connection will be IP_CT_NEW or for an
	   expected connection, IP_CT_RELATED. */
	if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
		return NF_ACCEPT;

	zone = nf_ct_zone(ct);
	local_bh_disable();

	do {
		sequence = read_seqcount_begin(&nf_conntrack_generation);
		/* reuse the hash saved before */
		hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev;
		hash = scale_hash(hash);
		reply_hash = hash_conntrack(net,
					    &ct->tuplehash[IP_CT_DIR_REPLY].tuple);

	} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));

	/* We're not in hash table, and we refuse to set up related
	 * connections for unconfirmed conns.  But packet copies and
	 * REJECT will give spurious warnings here.
	 */

	/* No external references means no one else could have
	 * confirmed us.
	 */
	WARN_ON(nf_ct_is_confirmed(ct));
	pr_debug("Confirming conntrack %p\n", ct);
	/* We have to check the DYING flag after unlink to prevent
	 * a race against nf_ct_get_next_corpse() possibly called from
	 * user context, else we insert an already 'dead' hash, blocking
	 * further use of that particular connection -JM.
	 */
	nf_ct_del_from_dying_or_unconfirmed_list(ct);

	if (unlikely(nf_ct_is_dying(ct))) {
		nf_ct_add_to_dying_list(ct);
		goto dying;
	}

	/* See if there's one in the list already, including reverse:
	   NAT could have grabbed it without realizing, since we're
	   not in the hash.  If there is, we lost race. */
	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode)
		if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
				    zone, net))
			goto out;

	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode)
		if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
				    zone, net))
			goto out;

	/* Timer relative to confirmation time, not original
	   setting time, otherwise we'd get timer wrap in
	   weird delay cases. */
	ct->timeout += nfct_time_stamp;
	atomic_inc(&ct->ct_general.use);
	ct->status |= IPS_CONFIRMED;

	/* set conntrack timestamp, if enabled. */
	tstamp = nf_conn_tstamp_find(ct);
	if (tstamp) {
		if (skb->tstamp == 0)
			__net_timestamp(skb);

		tstamp->start = ktime_to_ns(skb->tstamp);
	}
	/* Since the lookup is lockless, hash insertion must be done after
	 * starting the timer and setting the CONFIRMED bit. The RCU barriers
	 * guarantee that no other CPU can find the conntrack before the above
	 * stores are visible.
	 */
	__nf_conntrack_hash_insert(ct, hash, reply_hash);
	nf_conntrack_double_unlock(hash, reply_hash);
	local_bh_enable();

	help = nfct_help(ct);
	if (help && help->helper)
		nf_conntrack_event_cache(IPCT_HELPER, ct);

	nf_conntrack_event_cache(master_ct(ct) ?
				 IPCT_RELATED : IPCT_NEW, ct);
	return NF_ACCEPT;

out:
	nf_ct_add_to_dying_list(ct);
	ret = nf_ct_resolve_clash(net, skb, ctinfo, h);
dying:
	nf_conntrack_double_unlock(hash, reply_hash);
	NF_CT_STAT_INC(net, insert_failed);
	local_bh_enable();
	return ret;
}
EXPORT_SYMBOL_GPL(__nf_conntrack_confirm);

/* Returns true if a connection corresponds to the tuple (required
   for NAT). */
int
nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
			 const struct nf_conn *ignored_conntrack)
{
	struct net *net = nf_ct_net(ignored_conntrack);
	const struct nf_conntrack_zone *zone;
	struct nf_conntrack_tuple_hash *h;
	struct hlist_nulls_head *ct_hash;
	unsigned int hash, hsize;
	struct hlist_nulls_node *n;
	struct nf_conn *ct;

	zone = nf_ct_zone(ignored_conntrack);

	rcu_read_lock();
 begin:
	nf_conntrack_get_ht(&ct_hash, &hsize);
	hash = __hash_conntrack(net, tuple, hsize);

	hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[hash], hnnode) {
		ct = nf_ct_tuplehash_to_ctrack(h);

		if (ct == ignored_conntrack)
			continue;

		if (nf_ct_is_expired(ct)) {
			nf_ct_gc_expired(ct);
			continue;
		}

		if (nf_ct_key_equal(h, tuple, zone, net)) {
			NF_CT_STAT_INC_ATOMIC(net, found);
			rcu_read_unlock();
			return 1;
		}
	}

	if (get_nulls_value(n) != hash) {
		NF_CT_STAT_INC_ATOMIC(net, search_restart);
		goto begin;
	}

	rcu_read_unlock();

	return 0;
}
EXPORT_SYMBOL_GPL(nf_conntrack_tuple_taken);

#define NF_CT_EVICTION_RANGE	8

/* There's a small race here where we may free a just-assured
   connection.  Too bad: we're in trouble anyway. */
static unsigned int early_drop_list(struct net *net,
				    struct hlist_nulls_head *head)
{
	struct nf_conntrack_tuple_hash *h;
	struct hlist_nulls_node *n;
	unsigned int drops = 0;
	struct nf_conn *tmp;

	hlist_nulls_for_each_entry_rcu(h, n, head, hnnode) {
		tmp = nf_ct_tuplehash_to_ctrack(h);

		if (nf_ct_is_expired(tmp)) {
			nf_ct_gc_expired(tmp);
			continue;
		}

		if (test_bit(IPS_ASSURED_BIT, &tmp->status) ||
		    !net_eq(nf_ct_net(tmp), net) ||
		    nf_ct_is_dying(tmp))
			continue;

		if (!atomic_inc_not_zero(&tmp->ct_general.use))
			continue;

		/* kill only if still in same netns -- might have moved due to
		 * SLAB_TYPESAFE_BY_RCU rules.
		 *
		 * We steal the timer reference.  If that fails timer has
		 * already fired or someone else deleted it. Just drop ref
		 * and move to next entry.
		 */
		if (net_eq(nf_ct_net(tmp), net) &&
		    nf_ct_is_confirmed(tmp) &&
		    nf_ct_delete(tmp, 0, 0))
			drops++;

		nf_ct_put(tmp);
	}

	return drops;
}

static noinline int early_drop(struct net *net, unsigned int _hash)
{
	unsigned int i;

	for (i = 0; i < NF_CT_EVICTION_RANGE; i++) {
		struct hlist_nulls_head *ct_hash;
		unsigned int hash, hsize, drops;

		rcu_read_lock();
		nf_conntrack_get_ht(&ct_hash, &hsize);
		hash = reciprocal_scale(_hash++, hsize);

		drops = early_drop_list(net, &ct_hash[hash]);
		rcu_read_unlock();

		if (drops) {
			NF_CT_STAT_ADD_ATOMIC(net, early_drop, drops);
			return true;
		}
	}

	return false;
}

static bool gc_worker_skip_ct(const struct nf_conn *ct)
{
	return !nf_ct_is_confirmed(ct) || nf_ct_is_dying(ct);
}

static bool gc_worker_can_early_drop(const struct nf_conn *ct)
{
	const struct nf_conntrack_l4proto *l4proto;

	if (!test_bit(IPS_ASSURED_BIT, &ct->status))
		return true;

	l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
	if (l4proto->can_early_drop && l4proto->can_early_drop(ct))
		return true;

	return false;
}

static void gc_worker(struct work_struct *work)
{
	unsigned int min_interval = max(HZ / GC_MAX_BUCKETS_DIV, 1u);
	unsigned int i, goal, buckets = 0, expired_count = 0;
	unsigned int nf_conntrack_max95 = 0;
	struct conntrack_gc_work *gc_work;
	unsigned int ratio, scanned = 0;
	unsigned long next_run;

	gc_work = container_of(work, struct conntrack_gc_work, dwork.work);

	goal = nf_conntrack_htable_size / GC_MAX_BUCKETS_DIV;
	i = gc_work->last_bucket;
	if (gc_work->early_drop)
		nf_conntrack_max95 = nf_conntrack_max / 100u * 95u;

	do {
		struct nf_conntrack_tuple_hash *h;
		struct hlist_nulls_head *ct_hash;
		struct hlist_nulls_node *n;
		unsigned int hashsz;
		struct nf_conn *tmp;

		i++;
		rcu_read_lock();

		nf_conntrack_get_ht(&ct_hash, &hashsz);
		if (i >= hashsz)
			i = 0;

		hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[i], hnnode) {
			struct net *net;

			tmp = nf_ct_tuplehash_to_ctrack(h);

			scanned++;
			if (nf_ct_is_expired(tmp)) {
				nf_ct_gc_expired(tmp);
				expired_count++;
				continue;
			}

			if (nf_conntrack_max95 == 0 || gc_worker_skip_ct(tmp))
				continue;

			net = nf_ct_net(tmp);
			if (atomic_read(&net->ct.count) < nf_conntrack_max95)
				continue;

			/* need to take reference to avoid possible races */
			if (!atomic_inc_not_zero(&tmp->ct_general.use))
				continue;

			if (gc_worker_skip_ct(tmp)) {
				nf_ct_put(tmp);
				continue;
			}

			if (gc_worker_can_early_drop(tmp))
				nf_ct_kill(tmp);

			nf_ct_put(tmp);
		}

		/* could check get_nulls_value() here and restart if ct
		 * was moved to another chain.  But given gc is best-effort
		 * we will just continue with next hash slot.
		 */
		rcu_read_unlock();
		cond_resched_rcu_qs();
	} while (++buckets < goal);

	if (gc_work->exiting)
		return;

	/*
	 * Eviction will normally happen from the packet path, and not
	 * from this gc worker.
	 *
	 * This worker is only here to reap expired entries when system went
	 * idle after a busy period.
	 *
	 * The heuristics below are supposed to balance conflicting goals:
	 *
	 * 1. Minimize time until we notice a stale entry
	 * 2. Maximize scan intervals to not waste cycles
	 *
	 * Normally, expire ratio will be close to 0.
	 *
	 * As soon as a sizeable fraction of the entries have expired
	 * increase scan frequency.
	 */
	ratio = scanned ? expired_count * 100 / scanned : 0;
	if (ratio > GC_EVICT_RATIO) {
		gc_work->next_gc_run = min_interval;
	} else {
		unsigned int max = GC_MAX_SCAN_JIFFIES / GC_MAX_BUCKETS_DIV;

		BUILD_BUG_ON((GC_MAX_SCAN_JIFFIES / GC_MAX_BUCKETS_DIV) == 0);

		gc_work->next_gc_run += min_interval;
		if (gc_work->next_gc_run > max)
			gc_work->next_gc_run = max;
	}

	next_run = gc_work->next_gc_run;
	gc_work->last_bucket = i;
	gc_work->early_drop = false;
	queue_delayed_work(system_long_wq, &gc_work->dwork, next_run);
}

static void conntrack_gc_work_init(struct conntrack_gc_work *gc_work)
{
	INIT_DEFERRABLE_WORK(&gc_work->dwork, gc_worker);
	gc_work->next_gc_run = HZ;
	gc_work->exiting = false;
}
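
/*
 * Illustrative sketch (assumption, mirroring what the module init path
 * later in this file does outside this excerpt): the deferrable work
 * declared above is armed once at startup and then re-queues itself
 * from gc_worker() with the adaptive delay computed there.
 */
static inline void example_start_gc_worker(void)
{
	conntrack_gc_work_init(&conntrack_gc_work);
	queue_delayed_work(system_long_wq, &conntrack_gc_work.dwork, HZ);
}
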
99f07e91 1101static struct nf_conn *
308ac914
DB
1102__nf_conntrack_alloc(struct net *net,
1103 const struct nf_conntrack_zone *zone,
99f07e91
CG
1104 const struct nf_conntrack_tuple *orig,
1105 const struct nf_conntrack_tuple *repl,
1106 gfp_t gfp, u32 hash)
9fb9cbb1 1107{
cd7fcbf1 1108 struct nf_conn *ct;
9fb9cbb1 1109
5251e2d2 1110 /* We don't want any race condition at early drop stage */
49ac8713 1111 atomic_inc(&net->ct.count);
5251e2d2 1112
76eb9460 1113 if (nf_conntrack_max &&
49ac8713 1114 unlikely(atomic_read(&net->ct.count) > nf_conntrack_max)) {
93bb0ceb 1115 if (!early_drop(net, hash)) {
c6dd940b
FW
1116 if (!conntrack_gc_work.early_drop)
1117 conntrack_gc_work.early_drop = true;
49ac8713 1118 atomic_dec(&net->ct.count);
e87cc472 1119 net_warn_ratelimited("nf_conntrack: table full, dropping packet\n");
9fb9cbb1
YK
1120 return ERR_PTR(-ENOMEM);
1121 }
1122 }
1123
941297f4
ED
1124 /*
1125 * Do not use kmem_cache_zalloc(), as this cache uses
5f0d5a3a 1126 * SLAB_TYPESAFE_BY_RCU.
941297f4 1127 */
0c5366b3 1128 ct = kmem_cache_alloc(nf_conntrack_cachep, gfp);
5e8018fc
DB
1129 if (ct == NULL)
1130 goto out;
1131
440f0d58 1132 spin_lock_init(&ct->lock);
c88130bc 1133 ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
941297f4 1134 ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.pprev = NULL;
c88130bc 1135 ct->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
99f07e91
CG
1136 /* save hash for reusing when confirming */
1137 *(unsigned long *)(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev) = hash;
c41884ce 1138 ct->status = 0;
c2d9ba9b 1139 write_pnet(&ct->ct_net, net);
c41884ce
FW
1140 memset(&ct->__nfct_init_offset[0], 0,
1141 offsetof(struct nf_conn, proto) -
1142 offsetof(struct nf_conn, __nfct_init_offset[0]));
5e8018fc 1143
6c8dee98 1144 nf_ct_zone_add(ct, zone);
5e8018fc 1145
e53376be
PNA
1146 /* Because we use RCU lookups, we set ct_general.use to zero before
1147 * this is inserted in any list.
941297f4 1148 */
e53376be 1149 atomic_set(&ct->ct_general.use, 0);
c88130bc 1150 return ct;
5e8018fc
DB
1151out:
1152 atomic_dec(&net->ct.count);
5d0aa2cc 1153 return ERR_PTR(-ENOMEM);
9fb9cbb1 1154}
99f07e91 1155
308ac914
DB
1156struct nf_conn *nf_conntrack_alloc(struct net *net,
1157 const struct nf_conntrack_zone *zone,
99f07e91
CG
1158 const struct nf_conntrack_tuple *orig,
1159 const struct nf_conntrack_tuple *repl,
1160 gfp_t gfp)
1161{
1162 return __nf_conntrack_alloc(net, zone, orig, repl, gfp, 0);
1163}
13b18339 1164EXPORT_SYMBOL_GPL(nf_conntrack_alloc);
9fb9cbb1 1165
c88130bc 1166void nf_conntrack_free(struct nf_conn *ct)
76507f69 1167{
1d45209d
ED
1168 struct net *net = nf_ct_net(ct);
1169
e53376be 1170 /* A freed object has refcnt == 0, that's
5f0d5a3a 1171 * the golden rule for SLAB_TYPESAFE_BY_RCU
e53376be 1172 */
44d6e2f2 1173 WARN_ON(atomic_read(&ct->ct_general.use) != 0);
e53376be 1174
ceeff754 1175 nf_ct_ext_destroy(ct);
ea781f19 1176 nf_ct_ext_free(ct);
0c5366b3 1177 kmem_cache_free(nf_conntrack_cachep, ct);
4e857c58 1178 smp_mb__before_atomic();
0c3c6c00 1179 atomic_dec(&net->ct.count);
76507f69 1180}
13b18339 1181EXPORT_SYMBOL_GPL(nf_conntrack_free);
9fb9cbb1 1182
c539f017 1183
9fb9cbb1
YK
1184/* Allocate a new conntrack: we return -ENOMEM if classification
1185 failed due to stress. Otherwise it really is unclassifiable. */
fc09e4a7 1186static noinline struct nf_conntrack_tuple_hash *
b2a15a60 1187init_conntrack(struct net *net, struct nf_conn *tmpl,
5a1fb391 1188 const struct nf_conntrack_tuple *tuple,
2a04aabf
JL
1189 const struct nf_conntrack_l3proto *l3proto,
1190 const struct nf_conntrack_l4proto *l4proto,
9fb9cbb1 1191 struct sk_buff *skb,
60b5f8f7 1192 unsigned int dataoff, u32 hash)
9fb9cbb1 1193{
c88130bc 1194 struct nf_conn *ct;
3c158f7f 1195 struct nf_conn_help *help;
9fb9cbb1 1196 struct nf_conntrack_tuple repl_tuple;
b2a15a60 1197 struct nf_conntrack_ecache *ecache;
ca7433df 1198 struct nf_conntrack_expect *exp = NULL;
308ac914 1199 const struct nf_conntrack_zone *zone;
60b5f8f7 1200 struct nf_conn_timeout *timeout_ext;
5e8018fc 1201 struct nf_conntrack_zone tmp;
60b5f8f7 1202 unsigned int *timeouts;
9fb9cbb1 1203
605dcad6 1204 if (!nf_ct_invert_tuple(&repl_tuple, tuple, l3proto, l4proto)) {
0d53778e 1205 pr_debug("Can't invert tuple.\n");
9fb9cbb1
YK
1206 return NULL;
1207 }
1208
5e8018fc 1209 zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);
99f07e91
CG
1210 ct = __nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC,
1211 hash);
0a9ee813 1212 if (IS_ERR(ct))
c88130bc 1213 return (struct nf_conntrack_tuple_hash *)ct;
9fb9cbb1 1214
4440a2ab
GF
1215 if (!nf_ct_add_synproxy(ct, tmpl)) {
1216 nf_conntrack_free(ct);
1217 return ERR_PTR(-ENOMEM);
48b1de4c
PM
1218 }
1219
60b5f8f7 1220 timeout_ext = tmpl ? nf_ct_timeout_find(tmpl) : NULL;
ae2d708e
PNA
1221 if (timeout_ext) {
1222 timeouts = nf_ct_timeout_data(timeout_ext);
1223 if (unlikely(!timeouts))
1224 timeouts = l4proto->get_timeouts(net);
1225 } else {
60b5f8f7 1226 timeouts = l4proto->get_timeouts(net);
ae2d708e 1227 }
60b5f8f7 1228
2c8503f5 1229 if (!l4proto->new(ct, skb, dataoff, timeouts)) {
c88130bc 1230 nf_conntrack_free(ct);
ccd63c20 1231 pr_debug("can't track with proto module\n");
9fb9cbb1
YK
1232 return NULL;
1233 }
1234
60b5f8f7 1235 if (timeout_ext)
ae2d708e
PNA
1236 nf_ct_timeout_ext_add(ct, rcu_dereference(timeout_ext->timeout),
1237 GFP_ATOMIC);
60b5f8f7 1238
58401572 1239 nf_ct_acct_ext_add(ct, GFP_ATOMIC);
a992ca2a 1240 nf_ct_tstamp_ext_add(ct, GFP_ATOMIC);
c539f017 1241 nf_ct_labels_ext_add(ct);
b2a15a60
PM
1242
1243 ecache = tmpl ? nf_ct_ecache_find(tmpl) : NULL;
1244 nf_ct_ecache_ext_add(ct, ecache ? ecache->ctmask : 0,
1245 ecache ? ecache->expmask : 0,
1246 GFP_ATOMIC);
58401572 1247
ca7433df
JDB
1248 local_bh_disable();
1249 if (net->ct.expect_count) {
1250 spin_lock(&nf_conntrack_expect_lock);
1251 exp = nf_ct_find_expectation(net, zone, tuple);
1252 if (exp) {
ccd63c20 1253 pr_debug("expectation arrives ct=%p exp=%p\n",
ca7433df
JDB
1254 ct, exp);
1255 /* Welcome, Mr. Bond. We've been expecting you... */
1256 __set_bit(IPS_EXPECTED_BIT, &ct->status);
1257 /* exp->master safe, refcnt bumped in nf_ct_find_expectation */
1258 ct->master = exp->master;
1259 if (exp->helper) {
1260 help = nf_ct_helper_ext_add(ct, exp->helper,
1261 GFP_ATOMIC);
1262 if (help)
1263 rcu_assign_pointer(help->helper, exp->helper);
1264 }
ceceae1b 1265
9fb9cbb1 1266#ifdef CONFIG_NF_CONNTRACK_MARK
ca7433df 1267 ct->mark = exp->master->mark;
7c9728c3
JM
1268#endif
1269#ifdef CONFIG_NF_CONNTRACK_SECMARK
ca7433df 1270 ct->secmark = exp->master->secmark;
9fb9cbb1 1271#endif
ca7433df
JDB
1272 NF_CT_STAT_INC(net, expect_new);
1273 }
1274 spin_unlock(&nf_conntrack_expect_lock);
1275 }
8e8118f8 1276 if (!exp)
b2a15a60 1277 __nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC);
9fb9cbb1 1278
e53376be
PNA
1279 /* Now it is inserted into the unconfirmed list, bump refcount */
1280 nf_conntrack_get(&ct->ct_general);
b7779d06 1281 nf_ct_add_to_unconfirmed_list(ct);
9fb9cbb1 1282
ca7433df 1283 local_bh_enable();
9fb9cbb1
YK
1284
1285 if (exp) {
1286 if (exp->expectfn)
c88130bc 1287 exp->expectfn(ct, exp);
6823645d 1288 nf_ct_expect_put(exp);
9fb9cbb1
YK
1289 }
1290
c88130bc 1291 return &ct->tuplehash[IP_CT_DIR_ORIGINAL];
9fb9cbb1
YK
1292}
1293
fc09e4a7
FW
1294/* On success, returns 0, sets skb->_nfct | ctinfo */
1295static int
b2a15a60 1296resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
a702a65f 1297 struct sk_buff *skb,
9fb9cbb1
YK
1298 unsigned int dataoff,
1299 u_int16_t l3num,
1300 u_int8_t protonum,
2a04aabf
JL
1301 const struct nf_conntrack_l3proto *l3proto,
1302 const struct nf_conntrack_l4proto *l4proto)
9fb9cbb1 1303{
308ac914 1304 const struct nf_conntrack_zone *zone;
9fb9cbb1
YK
1305 struct nf_conntrack_tuple tuple;
1306 struct nf_conntrack_tuple_hash *h;
fc09e4a7 1307 enum ip_conntrack_info ctinfo;
5e8018fc 1308 struct nf_conntrack_zone tmp;
9fb9cbb1 1309 struct nf_conn *ct;
99f07e91 1310 u32 hash;
9fb9cbb1 1311
bbe735e4 1312 if (!nf_ct_get_tuple(skb, skb_network_offset(skb),
a31f1adc 1313 dataoff, l3num, protonum, net, &tuple, l3proto,
605dcad6 1314 l4proto)) {
ccd63c20 1315 pr_debug("Can't get tuple\n");
fc09e4a7 1316 return 0;
9fb9cbb1
YK
1317 }
1318
1319 /* look for tuple match */
5e8018fc 1320 zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);
1b8c8a9f 1321 hash = hash_conntrack_raw(&tuple, net);
99f07e91 1322 h = __nf_conntrack_find_get(net, zone, &tuple, hash);
9fb9cbb1 1323 if (!h) {
b2a15a60 1324 h = init_conntrack(net, tmpl, &tuple, l3proto, l4proto,
60b5f8f7 1325 skb, dataoff, hash);
9fb9cbb1 1326 if (!h)
fc09e4a7 1327 return 0;
9fb9cbb1 1328 if (IS_ERR(h))
fc09e4a7 1329 return PTR_ERR(h);
9fb9cbb1
YK
1330 }
1331 ct = nf_ct_tuplehash_to_ctrack(h);
1332
1333 /* It exists; we have (non-exclusive) reference. */
1334 if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) {
fc09e4a7 1335 ctinfo = IP_CT_ESTABLISHED_REPLY;
9fb9cbb1
YK
1336 } else {
1337 /* Once we've had two way comms, always ESTABLISHED. */
1338 if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
ccd63c20 1339 pr_debug("normal packet for %p\n", ct);
fc09e4a7 1340 ctinfo = IP_CT_ESTABLISHED;
9fb9cbb1 1341 } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
ccd63c20 1342 pr_debug("related packet for %p\n", ct);
fc09e4a7 1343 ctinfo = IP_CT_RELATED;
9fb9cbb1 1344 } else {
ccd63c20 1345 pr_debug("new packet for %p\n", ct);
fc09e4a7 1346 ctinfo = IP_CT_NEW;
9fb9cbb1 1347 }
9fb9cbb1 1348 }
fc09e4a7
FW
1349 nf_ct_set(skb, ct, ctinfo);
1350 return 0;
9fb9cbb1
YK
1351}
1352
1353unsigned int
a702a65f
AD
1354nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
1355 struct sk_buff *skb)
9fb9cbb1 1356{
b3480fe0
FW
1357 const struct nf_conntrack_l3proto *l3proto;
1358 const struct nf_conntrack_l4proto *l4proto;
97a6ad13 1359 struct nf_conn *ct, *tmpl;
9fb9cbb1 1360 enum ip_conntrack_info ctinfo;
2c8503f5 1361 unsigned int *timeouts;
9fb9cbb1
YK
1362 unsigned int dataoff;
1363 u_int8_t protonum;
9fb9cbb1
YK
1364 int ret;
1365
97a6ad13 1366 tmpl = nf_ct_get(skb, &ctinfo);
cc41c84b 1367 if (tmpl || ctinfo == IP_CT_UNTRACKED) {
b2a15a60 1368 /* Previously seen (loopback or untracked)? Ignore. */
cc41c84b
FW
1369 if ((tmpl && !nf_ct_is_template(tmpl)) ||
1370 ctinfo == IP_CT_UNTRACKED) {
b2a15a60
PM
1371 NF_CT_STAT_INC_ATOMIC(net, ignore);
1372 return NF_ACCEPT;
1373 }
a9e419dc 1374 skb->_nfct = 0;
9fb9cbb1
YK
1375 }
1376
e2361cb9 1377 /* rcu_read_lock()ed by nf_hook_thresh */
76108cea 1378 l3proto = __nf_ct_l3proto_find(pf);
3db05fea 1379 ret = l3proto->get_l4proto(skb, skb_network_offset(skb),
ffc30690
YK
1380 &dataoff, &protonum);
1381 if (ret <= 0) {
25985edc 1382 pr_debug("not prepared to track yet or error occurred\n");
0d55af87
AD
1383 NF_CT_STAT_INC_ATOMIC(net, error);
1384 NF_CT_STAT_INC_ATOMIC(net, invalid);
b2a15a60
PM
1385 ret = -ret;
1386 goto out;
9fb9cbb1
YK
1387 }
1388
76108cea 1389 l4proto = __nf_ct_l4proto_find(pf, protonum);
9fb9cbb1
YK
1390
1391 /* It may be an special packet, error, unclean...
1392 * inverse of the return code tells to the netfilter
1393 * core what to do with the packet. */
74c51a14 1394 if (l4proto->error != NULL) {
11df4b76 1395 ret = l4proto->error(net, tmpl, skb, dataoff, pf, hooknum);
74c51a14 1396 if (ret <= 0) {
0d55af87
AD
1397 NF_CT_STAT_INC_ATOMIC(net, error);
1398 NF_CT_STAT_INC_ATOMIC(net, invalid);
b2a15a60
PM
1399 ret = -ret;
1400 goto out;
74c51a14 1401 }
88ed01d1 1402 /* ICMP[v6] protocol trackers may assign one conntrack. */
a9e419dc 1403 if (skb->_nfct)
88ed01d1 1404 goto out;
9fb9cbb1 1405 }
08733a0c 1406repeat:
fc09e4a7
FW
1407 ret = resolve_normal_ct(net, tmpl, skb, dataoff, pf, protonum,
1408 l3proto, l4proto);
1409 if (ret < 0) {
9fb9cbb1 1410 /* Too stressed to deal. */
0d55af87 1411 NF_CT_STAT_INC_ATOMIC(net, drop);
b2a15a60
PM
1412 ret = NF_DROP;
1413 goto out;
9fb9cbb1
YK
1414 }
1415
fc09e4a7
FW
1416 ct = nf_ct_get(skb, &ctinfo);
1417 if (!ct) {
1418 /* Not valid part of a connection */
1419 NF_CT_STAT_INC_ATOMIC(net, invalid);
1420 ret = NF_ACCEPT;
1421 goto out;
1422 }
9fb9cbb1 1423
60b5f8f7 1424 /* Decide what timeout policy we want to apply to this flow. */
84b5ee93 1425 timeouts = nf_ct_timeout_lookup(net, ct, l4proto);
60b5f8f7 1426
d1c1e39d 1427 ret = l4proto->packet(ct, skb, dataoff, ctinfo, pf, timeouts);
ec8d5409 1428 if (ret <= 0) {
9fb9cbb1
YK
1429 /* Invalid: inverse of the return code tells
1430 * the netfilter core what to do */
0d53778e 1431 pr_debug("nf_conntrack_in: Can't track with proto module\n");
97a6ad13 1432 nf_conntrack_put(&ct->ct_general);
a9e419dc 1433 skb->_nfct = 0;
0d55af87 1434 NF_CT_STAT_INC_ATOMIC(net, invalid);
7d1e0459
PNA
1435 if (ret == -NF_DROP)
1436 NF_CT_STAT_INC_ATOMIC(net, drop);
56a62e22
AB
1437 /* Special case: TCP tracker reports an attempt to reopen a
1438 * closed/aborted connection. We have to go back and create a
1439 * fresh conntrack.
1440 */
1441 if (ret == -NF_REPEAT)
1442 goto repeat;
b2a15a60
PM
1443 ret = -ret;
1444 goto out;
9fb9cbb1
YK
1445 }
1446
fc09e4a7
FW
1447 if (ctinfo == IP_CT_ESTABLISHED_REPLY &&
1448 !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
858b3133 1449 nf_conntrack_event_cache(IPCT_REPLY, ct);
b2a15a60 1450out:
56a62e22
AB
1451 if (tmpl)
1452 nf_ct_put(tmpl);
9fb9cbb1
YK
1453
1454 return ret;
1455}
13b18339 1456EXPORT_SYMBOL_GPL(nf_conntrack_in);
9fb9cbb1 1457
5f2b4c90
JE
1458bool nf_ct_invert_tuplepr(struct nf_conntrack_tuple *inverse,
1459 const struct nf_conntrack_tuple *orig)
9fb9cbb1 1460{
5f2b4c90 1461 bool ret;
923f4902
PM
1462
1463 rcu_read_lock();
1464 ret = nf_ct_invert_tuple(inverse, orig,
1465 __nf_ct_l3proto_find(orig->src.l3num),
1466 __nf_ct_l4proto_find(orig->src.l3num,
1467 orig->dst.protonum));
1468 rcu_read_unlock();
1469 return ret;
9fb9cbb1 1470}
13b18339 1471EXPORT_SYMBOL_GPL(nf_ct_invert_tuplepr);
9fb9cbb1 1472
5b1158e9
JK
1473/* Alter reply tuple (maybe alter helper). This is for NAT, and is
1474 implicitly racy: see __nf_conntrack_confirm */
1475void nf_conntrack_alter_reply(struct nf_conn *ct,
1476 const struct nf_conntrack_tuple *newreply)
1477{
1478 struct nf_conn_help *help = nfct_help(ct);
1479
5b1158e9 1480 /* Should be unconfirmed, so not in hash table yet */
44d6e2f2 1481 WARN_ON(nf_ct_is_confirmed(ct));
5b1158e9 1482
0d53778e 1483 pr_debug("Altering reply tuple of %p to ", ct);
3c9fba65 1484 nf_ct_dump_tuple(newreply);
5b1158e9
JK
1485
1486 ct->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
ef1a5a50 1487 if (ct->master || (help && !hlist_empty(&help->expectations)))
c52fbb41 1488 return;
ceceae1b 1489
c52fbb41 1490 rcu_read_lock();
b2a15a60 1491 __nf_ct_try_assign_helper(ct, NULL, GFP_ATOMIC);
c52fbb41 1492 rcu_read_unlock();
5b1158e9 1493}
13b18339 1494EXPORT_SYMBOL_GPL(nf_conntrack_alter_reply);
5b1158e9 1495
9fb9cbb1
YK
1496/* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */
1497void __nf_ct_refresh_acct(struct nf_conn *ct,
1498 enum ip_conntrack_info ctinfo,
1499 const struct sk_buff *skb,
1500 unsigned long extra_jiffies,
1501 int do_acct)
1502{
44d6e2f2 1503 WARN_ON(!skb);
9fb9cbb1 1504
997ae831 1505 /* Only update if this is not a fixed timeout */
47d95045
PM
1506 if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status))
1507 goto acct;
997ae831 1508
9fb9cbb1 1509 /* If not in hash table, timer will not be active yet */
f330a7fd
FW
1510 if (nf_ct_is_confirmed(ct))
1511 extra_jiffies += nfct_time_stamp;
9fb9cbb1 1512
f330a7fd 1513 ct->timeout = extra_jiffies;
47d95045 1514acct:
ba76738c
PNA
1515 if (do_acct)
1516 nf_ct_acct_update(ct, ctinfo, skb->len);
9fb9cbb1 1517}
13b18339 1518EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct);
9fb9cbb1 1519
ad66713f
FW
1520bool nf_ct_kill_acct(struct nf_conn *ct,
1521 enum ip_conntrack_info ctinfo,
1522 const struct sk_buff *skb)
51091764 1523{
ad66713f 1524 nf_ct_acct_update(ct, ctinfo, skb->len);
58401572 1525
f330a7fd 1526 return nf_ct_delete(ct, 0, 0);
51091764 1527}
ad66713f 1528EXPORT_SYMBOL_GPL(nf_ct_kill_acct);
51091764 1529
c0cd1156 1530#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
c1d10adb
PNA
1531
1532#include <linux/netfilter/nfnetlink.h>
1533#include <linux/netfilter/nfnetlink_conntrack.h>
57b47a53
IM
1534#include <linux/mutex.h>
1535
c1d10adb
PNA
1536/* Generic function for tcp/udp/sctp/dccp and alike. This needs to be
1537 * in ip_conntrack_core, since we don't want the protocols to autoload
1538 * or depend on ctnetlink */
fdf70832 1539int nf_ct_port_tuple_to_nlattr(struct sk_buff *skb,
c1d10adb
PNA
1540 const struct nf_conntrack_tuple *tuple)
1541{
bae65be8
DM
1542 if (nla_put_be16(skb, CTA_PROTO_SRC_PORT, tuple->src.u.tcp.port) ||
1543 nla_put_be16(skb, CTA_PROTO_DST_PORT, tuple->dst.u.tcp.port))
1544 goto nla_put_failure;
c1d10adb
PNA
1545 return 0;
1546
df6fb868 1547nla_put_failure:
c1d10adb
PNA
1548 return -1;
1549}
fdf70832 1550EXPORT_SYMBOL_GPL(nf_ct_port_tuple_to_nlattr);
c1d10adb 1551
f73e924c
PM
1552const struct nla_policy nf_ct_port_nla_policy[CTA_PROTO_MAX+1] = {
1553 [CTA_PROTO_SRC_PORT] = { .type = NLA_U16 },
1554 [CTA_PROTO_DST_PORT] = { .type = NLA_U16 },
c1d10adb 1555};
f73e924c 1556EXPORT_SYMBOL_GPL(nf_ct_port_nla_policy);
c1d10adb 1557
fdf70832 1558int nf_ct_port_nlattr_to_tuple(struct nlattr *tb[],
c1d10adb
PNA
1559 struct nf_conntrack_tuple *t)
1560{
df6fb868 1561 if (!tb[CTA_PROTO_SRC_PORT] || !tb[CTA_PROTO_DST_PORT])
c1d10adb
PNA
1562 return -EINVAL;
1563
77236b6e
PM
1564 t->src.u.tcp.port = nla_get_be16(tb[CTA_PROTO_SRC_PORT]);
1565 t->dst.u.tcp.port = nla_get_be16(tb[CTA_PROTO_DST_PORT]);
c1d10adb
PNA
1566
1567 return 0;
1568}
fdf70832 1569EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_to_tuple);
5c0de29d
HE
1570
1571int nf_ct_port_nlattr_tuple_size(void)
1572{
1573 return nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1);
1574}
1575EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_tuple_size);
c1d10adb
PNA
1576#endif
1577
9fb9cbb1 1578/* Used by ipt_REJECT and ip6t_REJECT. */
312a0c16 1579static void nf_conntrack_attach(struct sk_buff *nskb, const struct sk_buff *skb)
9fb9cbb1
YK
1580{
1581 struct nf_conn *ct;
1582 enum ip_conntrack_info ctinfo;
1583
1584 /* This ICMP is in reverse direction to the packet which caused it */
1585 ct = nf_ct_get(skb, &ctinfo);
1586 if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
fb048833 1587 ctinfo = IP_CT_RELATED_REPLY;
9fb9cbb1
YK
1588 else
1589 ctinfo = IP_CT_RELATED;
1590
1591 /* Attach to new skbuff, and increment count */
c74454fa 1592 nf_ct_set(nskb, ct, ctinfo);
cb9c6836 1593 nf_conntrack_get(skb_nfct(nskb));
9fb9cbb1
YK
1594}
1595
9fb9cbb1 1596/* Bring out ya dead! */
df0933dc 1597static struct nf_conn *
2843fb69 1598get_next_corpse(int (*iter)(struct nf_conn *i, void *data),
9fb9cbb1
YK
1599 void *data, unsigned int *bucket)
1600{
df0933dc
PM
1601 struct nf_conntrack_tuple_hash *h;
1602 struct nf_conn *ct;
ea781f19 1603 struct hlist_nulls_node *n;
93bb0ceb 1604 spinlock_t *lockp;
9fb9cbb1 1605
56d52d48 1606 for (; *bucket < nf_conntrack_htable_size; (*bucket)++) {
93bb0ceb
JDB
1607 lockp = &nf_conntrack_locks[*bucket % CONNTRACK_LOCKS];
1608 local_bh_disable();
b16c2919 1609 nf_conntrack_lock(lockp);
56d52d48
FW
1610 if (*bucket < nf_conntrack_htable_size) {
1611 hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[*bucket], hnnode) {
93bb0ceb
JDB
1612 if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL)
1613 continue;
1614 ct = nf_ct_tuplehash_to_ctrack(h);
2843fb69 1615 if (iter(ct, data))
93bb0ceb
JDB
1616 goto found;
1617 }
df0933dc 1618 }
93bb0ceb
JDB
1619 spin_unlock(lockp);
1620 local_bh_enable();
d93c6258 1621 cond_resched();
601e68e1 1622 }
b7779d06 1623
b0feacaa
FW
1624 return NULL;
1625found:
1626 atomic_inc(&ct->ct_general.use);
1627 spin_unlock(lockp);
1628 local_bh_enable();
1629 return ct;
1630}
1631
2843fb69
FW
1632static void nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data),
1633 void *data, u32 portid, int report)
1634{
0d02d564 1635 unsigned int bucket = 0, sequence;
2843fb69 1636 struct nf_conn *ct;
2843fb69
FW
1637
1638 might_sleep();
1639
0d02d564
FW
1640 for (;;) {
1641 sequence = read_seqcount_begin(&nf_conntrack_generation);
2843fb69 1642
0d02d564
FW
1643 while ((ct = get_next_corpse(iter, data, &bucket)) != NULL) {
1644 /* Time to push up daisies... */
1645
1646 nf_ct_delete(ct, portid, report);
1647 nf_ct_put(ct);
1648 cond_resched();
1649 }
1650
1651 if (!read_seqcount_retry(&nf_conntrack_generation, sequence))
1652 break;
1653 bucket = 0;
2843fb69
FW
1654 }
1655}
1656
1657struct iter_data {
1658 int (*iter)(struct nf_conn *i, void *data);
1659 void *data;
1660 struct net *net;
1661};
1662
1663static int iter_net_only(struct nf_conn *i, void *data)
1664{
1665 struct iter_data *d = data;
1666
1667 if (!net_eq(d->net, nf_ct_net(i)))
1668 return 0;
1669
1670 return d->iter(i, d->data);
1671}
1672
b0feacaa
FW
1673static void
1674__nf_ct_unconfirmed_destroy(struct net *net)
1675{
1676 int cpu;
1677
b7779d06 1678 for_each_possible_cpu(cpu) {
b0feacaa
FW
1679 struct nf_conntrack_tuple_hash *h;
1680 struct hlist_nulls_node *n;
1681 struct ct_pcpu *pcpu;
1682
1683 pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);
b7779d06
JDB
1684
1685 spin_lock_bh(&pcpu->lock);
1686 hlist_nulls_for_each_entry(h, n, &pcpu->unconfirmed, hnnode) {
b0feacaa
FW
1687 struct nf_conn *ct;
1688
b7779d06 1689 ct = nf_ct_tuplehash_to_ctrack(h);
b0feacaa
FW
1690
1691 /* we cannot call iter() on the unconfirmed list; the
1692 * owning CPU can reallocate ct->ext at any time.
1693 */
1694 set_bit(IPS_DYING_BIT, &ct->status);
b7779d06
JDB
1695 }
1696 spin_unlock_bh(&pcpu->lock);
d93c6258 1697 cond_resched();
b7779d06 1698 }
9fb9cbb1
YK
1699}
1700
84657984
FW
1701void nf_ct_unconfirmed_destroy(struct net *net)
1702{
1703 might_sleep();
1704
1705 if (atomic_read(&net->ct.count) > 0) {
1706 __nf_ct_unconfirmed_destroy(net);
e2a75007 1707 nf_queue_nf_hook_drop(net);
84657984
FW
1708 synchronize_net();
1709 }
1710}
1711EXPORT_SYMBOL_GPL(nf_ct_unconfirmed_destroy);
1712
9fd6452d
FW
1713void nf_ct_iterate_cleanup_net(struct net *net,
1714 int (*iter)(struct nf_conn *i, void *data),
1715 void *data, u32 portid, int report)
9fb9cbb1 1716{
2843fb69 1717 struct iter_data d;
9fb9cbb1 1718
d93c6258
FW
1719 might_sleep();
1720
88b68bc5
FW
1721 if (atomic_read(&net->ct.count) == 0)
1722 return;
1723
2843fb69
FW
1724 d.iter = iter;
1725 d.data = data;
1726 d.net = net;
1727
2843fb69
FW
1728 nf_ct_iterate_cleanup(iter_net_only, &d, portid, report);
1729}
1730EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup_net);
9fb9cbb1 1731
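A sketch of a typical caller, relying only on the signature above: the callback returns non-zero for entries that should be removed, and nf_ct_iterate_cleanup_net() handles the walk, the deletion and the reference drops. The filter below is hypothetical.

	/* hypothetical filter: remove every entry bound to a given helper */
	static int kill_by_helper(struct nf_conn *ct, void *data)
	{
		const struct nf_conn_help *help = nfct_help(ct);

		return help && help->helper == data;	/* non-zero => delete */
	}

	static void flush_helper_entries(struct net *net, struct nf_conntrack_helper *me)
	{
		nf_ct_iterate_cleanup_net(net, kill_by_helper, me, 0, 0);
	}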
2843fb69
FW
1732/**
1733 * nf_ct_iterate_destroy - destroy unconfirmed conntracks and iterate table
1734 * @iter: callback to invoke for each conntrack
1735 * @data: data to pass to @iter
1736 *
1737 * Like nf_ct_iterate_cleanup, but first marks conntracks on the
1738 * unconfirmed list as dying (so they will not be inserted into
1739 * the main table).
7866cc57
FW
1740 *
1741 * Can only be called from the module exit path.
2843fb69
FW
1742 */
1743void
1744nf_ct_iterate_destroy(int (*iter)(struct nf_conn *i, void *data), void *data)
1745{
1746 struct net *net;
1747
1748 rtnl_lock();
1749 for_each_net(net) {
1750 if (atomic_read(&net->ct.count) == 0)
1751 continue;
1752 __nf_ct_unconfirmed_destroy(net);
e2a75007 1753 nf_queue_nf_hook_drop(net);
9fb9cbb1 1754 }
2843fb69
FW
1755 rtnl_unlock();
1756
7866cc57
FW
1757 /* Need to wait for the netns cleanup worker to finish, if it's
1758 * running -- it might have deleted a net namespace from
1759 * the global list, so our __nf_ct_unconfirmed_destroy() might
1760 * not have affected all namespaces.
1761 */
1762 net_ns_barrier();
1763
2843fb69
FW
1764 /* a conntrack could have been unlinked from the unconfirmed list
1765 * before we grabbed the pcpu lock in __nf_ct_unconfirmed_destroy().
1766 * This makes sure it is inserted into the conntrack table.
1767 */
1768 synchronize_net();
1769
1770 nf_ct_iterate_cleanup(iter, data, 0, 0);
9fb9cbb1 1771}
2843fb69 1772EXPORT_SYMBOL_GPL(nf_ct_iterate_destroy);
9fb9cbb1 1773
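Unlike the per-namespace variant above, this is the call a conntrack extension would make from its module_exit path to visit (or drop) entries in every namespace at once; the callback below is hypothetical.

	/* hypothetical module exit: drop every remaining conntrack entry */
	static int expire_all(struct nf_conn *ct, void *data)
	{
		return 1;	/* non-zero => nf_ct_delete() this entry */
	}

	static void __exit example_exit(void)
	{
		nf_ct_iterate_destroy(expire_all, NULL);
	}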
274d383b
PNA
1774static int kill_all(struct nf_conn *i, void *data)
1775{
2843fb69 1776 return net_eq(nf_ct_net(i), data);
274d383b
PNA
1777}
1778
d862a662 1779void nf_ct_free_hashtable(void *hash, unsigned int size)
9fb9cbb1 1780{
d862a662 1781 if (is_vmalloc_addr(hash))
9fb9cbb1
YK
1782 vfree(hash);
1783 else
601e68e1 1784 free_pages((unsigned long)hash,
f205c5e0 1785 get_order(sizeof(struct hlist_head) * size));
9fb9cbb1 1786}
ac565e5f 1787EXPORT_SYMBOL_GPL(nf_ct_free_hashtable);
9fb9cbb1 1788
f94161c1 1789void nf_conntrack_cleanup_start(void)
9fb9cbb1 1790{
b87a2f91 1791 conntrack_gc_work.exiting = true;
f94161c1
G
1792 RCU_INIT_POINTER(ip_ct_attach, NULL);
1793}
1794
1795void nf_conntrack_cleanup_end(void)
1796{
1797 RCU_INIT_POINTER(nf_ct_destroy, NULL);
9edd7ca0 1798
b87a2f91 1799 cancel_delayed_work_sync(&conntrack_gc_work.dwork);
56d52d48
FW
1800 nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_htable_size);
1801
04d87001 1802 nf_conntrack_proto_fini();
41d73ec0 1803 nf_conntrack_seqadj_fini();
5f69b8f5 1804 nf_conntrack_labels_fini();
5e615b22 1805 nf_conntrack_helper_fini();
8684094c 1806 nf_conntrack_timeout_fini();
3fe0f943 1807 nf_conntrack_ecache_fini();
73f4001a 1808 nf_conntrack_tstamp_fini();
b7ff3a1f 1809 nf_conntrack_acct_fini();
83b4dbe1 1810 nf_conntrack_expect_fini();
77571149
FW
1811
1812 kmem_cache_destroy(nf_conntrack_cachep);
08f6547d 1813}
9fb9cbb1 1814
f94161c1
G
1815/*
1816 * Mishearing the voices in his head, our hero wonders how he's
1817 * supposed to kill the mall.
1818 */
1819void nf_conntrack_cleanup_net(struct net *net)
08f6547d 1820{
dece40e8
VD
1821 LIST_HEAD(single);
1822
1823 list_add(&net->exit_list, &single);
1824 nf_conntrack_cleanup_net_list(&single);
1825}
1826
1827void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list)
1828{
1829 int busy;
1830 struct net *net;
1831
f94161c1
G
1832 /*
1833 * This makes sure all current packets have passed through
1834 * the netfilter framework. Roll on, two-stage module
1835 * delete...
1836 */
1837 synchronize_net();
dece40e8
VD
1838i_see_dead_people:
1839 busy = 0;
1840 list_for_each_entry(net, net_exit_list, exit_list) {
2843fb69 1841 nf_ct_iterate_cleanup(kill_all, net, 0, 0);
dece40e8
VD
1842 if (atomic_read(&net->ct.count) != 0)
1843 busy = 1;
1844 }
1845 if (busy) {
9fb9cbb1
YK
1846 schedule();
1847 goto i_see_dead_people;
1848 }
1849
dece40e8 1850 list_for_each_entry(net, net_exit_list, exit_list) {
dece40e8
VD
1851 nf_conntrack_proto_pernet_fini(net);
1852 nf_conntrack_helper_pernet_fini(net);
1853 nf_conntrack_ecache_pernet_fini(net);
1854 nf_conntrack_tstamp_pernet_fini(net);
1855 nf_conntrack_acct_pernet_fini(net);
1856 nf_conntrack_expect_pernet_fini(net);
dece40e8 1857 free_percpu(net->ct.stat);
b7779d06 1858 free_percpu(net->ct.pcpu_lists);
dece40e8 1859 }
08f6547d
AD
1860}
1861
d862a662 1862void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls)
9fb9cbb1 1863{
ea781f19
ED
1864 struct hlist_nulls_head *hash;
1865 unsigned int nr_slots, i;
1866 size_t sz;
9fb9cbb1 1867
9cc1c73a
FW
1868 if (*sizep > (UINT_MAX / sizeof(struct hlist_nulls_head)))
1869 return NULL;
1870
ea781f19
ED
1871 BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head));
1872 nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head));
9cc1c73a
FW
1873
1874 if (nr_slots > (UINT_MAX / sizeof(struct hlist_nulls_head)))
1875 return NULL;
1876
ea781f19
ED
1877 sz = nr_slots * sizeof(struct hlist_nulls_head);
1878 hash = (void *)__get_free_pages(GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO,
1879 get_order(sz));
f0ad4621 1880 if (!hash)
966567b7 1881 hash = vzalloc(sz);
9fb9cbb1 1882
ea781f19
ED
1883 if (hash && nulls)
1884 for (i = 0; i < nr_slots; i++)
1885 INIT_HLIST_NULLS_HEAD(&hash[i], i);
9fb9cbb1
YK
1886
1887 return hash;
1888}
ac565e5f 1889EXPORT_SYMBOL_GPL(nf_ct_alloc_hashtable);
9fb9cbb1 1890
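The allocator rounds the request up to whole pages worth of buckets and writes the final count back through *sizep, so callers must use the updated value for hashing and for freeing. A worked example with assumed x86-64 numbers (4 KiB pages, 8-byte struct hlist_nulls_head, i.e. 512 buckets per page):

	unsigned int want = 1000;
	struct hlist_nulls_head *hash = nf_ct_alloc_hashtable(&want, 1);

	if (hash) {
		/* want is now 1024 = roundup(1000, 4096 / 8); every head was
		 * nulls-initialized with its bucket index */
		nf_ct_free_hashtable(hash, want);
	}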
3183ab89 1891int nf_conntrack_hash_resize(unsigned int hashsize)
9fb9cbb1 1892{
3183ab89
FW
1893 int i, bucket;
1894 unsigned int old_size;
ea781f19 1895 struct hlist_nulls_head *hash, *old_hash;
9fb9cbb1 1896 struct nf_conntrack_tuple_hash *h;
5d0aa2cc 1897 struct nf_conn *ct;
9fb9cbb1 1898
9fb9cbb1
YK
1899 if (!hashsize)
1900 return -EINVAL;
1901
d862a662 1902 hash = nf_ct_alloc_hashtable(&hashsize, 1);
9fb9cbb1
YK
1903 if (!hash)
1904 return -ENOMEM;
1905
3183ab89
FW
1906 old_size = nf_conntrack_htable_size;
1907 if (old_size == hashsize) {
1908 nf_ct_free_hashtable(hash, hashsize);
1909 return 0;
1910 }
1911
93bb0ceb
JDB
1912 local_bh_disable();
1913 nf_conntrack_all_lock();
a3efd812 1914 write_seqcount_begin(&nf_conntrack_generation);
93bb0ceb 1915
76507f69
PM
1916 /* Lookups in the old hash might happen in parallel, which means we
1917 * might get false negatives during connection lookup. New connections
1918 * created because of a false negative won't make it into the hash
93bb0ceb 1919 * though, since that requires taking the locks.
76507f69 1920 */
93bb0ceb 1921
56d52d48
FW
1922 for (i = 0; i < nf_conntrack_htable_size; i++) {
1923 while (!hlist_nulls_empty(&nf_conntrack_hash[i])) {
1924 h = hlist_nulls_entry(nf_conntrack_hash[i].first,
1925 struct nf_conntrack_tuple_hash, hnnode);
5d0aa2cc 1926 ct = nf_ct_tuplehash_to_ctrack(h);
ea781f19 1927 hlist_nulls_del_rcu(&h->hnnode);
1b8c8a9f
FW
1928 bucket = __hash_conntrack(nf_ct_net(ct),
1929 &h->tuple, hashsize);
ea781f19 1930 hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]);
9fb9cbb1
YK
1931 }
1932 }
56d52d48
FW
1933 old_size = nf_conntrack_htable_size;
1934 old_hash = nf_conntrack_hash;
9fb9cbb1 1935
56d52d48
FW
1936 nf_conntrack_hash = hash;
1937 nf_conntrack_htable_size = hashsize;
93bb0ceb 1938
a3efd812 1939 write_seqcount_end(&nf_conntrack_generation);
93bb0ceb
JDB
1940 nf_conntrack_all_unlock();
1941 local_bh_enable();
9fb9cbb1 1942
5e3c61f9 1943 synchronize_net();
d862a662 1944 nf_ct_free_hashtable(old_hash, old_size);
9fb9cbb1
YK
1945 return 0;
1946}
3183ab89
FW
1947
1948int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
1949{
1950 unsigned int hashsize;
1951 int rc;
1952
1953 if (current->nsproxy->net_ns != &init_net)
1954 return -EOPNOTSUPP;
1955
1956 /* On boot, we can set this without any fancy locking. */
1957 if (!nf_conntrack_htable_size)
1958 return param_set_uint(val, kp);
1959
1960 rc = kstrtouint(val, 0, &hashsize);
1961 if (rc)
1962 return rc;
1963
1964 return nf_conntrack_hash_resize(hashsize);
1965}
fae718dd 1966EXPORT_SYMBOL_GPL(nf_conntrack_set_hashsize);
9fb9cbb1 1967
fae718dd 1968module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint,
9fb9cbb1
YK
1969 &nf_conntrack_htable_size, 0600);
1970
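In practice this means the table can be resized at run time by writing a new bucket count to the hashsize parameter (exposed read/write for root in the usual /sys/module/nf_conntrack/parameters/ layout, per the 0600 mode above), or set at boot with the corresponding module/kernel command-line parameter; nf_conntrack_set_hashsize() then funnels the value into nf_conntrack_hash_resize().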
ab71632c 1971static __always_inline unsigned int total_extension_size(void)
b3a5db10
FW
1972{
1973 /* remember to add new extensions below */
1974 BUILD_BUG_ON(NF_CT_EXT_NUM > 9);
1975
1976 return sizeof(struct nf_ct_ext) +
1977 sizeof(struct nf_conn_help)
1978#if IS_ENABLED(CONFIG_NF_NAT)
1979 + sizeof(struct nf_conn_nat)
1980#endif
1981 + sizeof(struct nf_conn_seqadj)
1982 + sizeof(struct nf_conn_acct)
1983#ifdef CONFIG_NF_CONNTRACK_EVENTS
1984 + sizeof(struct nf_conntrack_ecache)
1985#endif
1986#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
1987 + sizeof(struct nf_conn_tstamp)
1988#endif
1989#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
1990 + sizeof(struct nf_conn_timeout)
1991#endif
1992#ifdef CONFIG_NF_CONNTRACK_LABELS
1993 + sizeof(struct nf_conn_labels)
1994#endif
1995#if IS_ENABLED(CONFIG_NETFILTER_SYNPROXY)
1996 + sizeof(struct nf_conn_synproxy)
1997#endif
1998 ;
1999};
2000
f94161c1 2001int nf_conntrack_init_start(void)
9fb9cbb1 2002{
f205c5e0 2003 int max_factor = 8;
0c5366b3 2004 int ret = -ENOMEM;
cc41c84b 2005 int i;
93bb0ceb 2006
b3a5db10
FW
2007 /* struct nf_ct_ext uses u8 to store offsets/size */
2008 BUILD_BUG_ON(total_extension_size() > 255u);
2009
a3efd812
FW
2010 seqcount_init(&nf_conntrack_generation);
2011
d5d20912 2012 for (i = 0; i < CONNTRACK_LOCKS; i++)
93bb0ceb 2013 spin_lock_init(&nf_conntrack_locks[i]);
9fb9cbb1 2014
9fb9cbb1 2015 if (!nf_conntrack_htable_size) {
88eab472
ML
2016 /* Idea from tcp.c: use 1/16384 of memory.
2017 * On i386: 32MB machine has 512 buckets.
2018 * >= 1GB machines have 16384 buckets.
2019 * >= 4GB machines have 65536 buckets.
2020 */
9fb9cbb1 2021 nf_conntrack_htable_size
4481374c 2022 = (((totalram_pages << PAGE_SHIFT) / 16384)
f205c5e0 2023 / sizeof(struct hlist_head));
88eab472
ML
2024 if (totalram_pages > (4 * (1024 * 1024 * 1024 / PAGE_SIZE)))
2025 nf_conntrack_htable_size = 65536;
2026 else if (totalram_pages > (1024 * 1024 * 1024 / PAGE_SIZE))
f205c5e0
PM
2027 nf_conntrack_htable_size = 16384;
2028 if (nf_conntrack_htable_size < 32)
2029 nf_conntrack_htable_size = 32;
2030
2031 /* Use a max. factor of four by default to get the same max as
2032 * with the old struct list_heads. When a table size is given
2033 * we use the old value of 8 to avoid reducing the maximum
2034 * number of entries. */
2035 max_factor = 4;
9fb9cbb1 2036 }
56d52d48
FW
2037
2038 nf_conntrack_hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size, 1);
2039 if (!nf_conntrack_hash)
2040 return -ENOMEM;
2041
f205c5e0 2042 nf_conntrack_max = max_factor * nf_conntrack_htable_size;
8e5105a0 2043
0c5366b3 2044 nf_conntrack_cachep = kmem_cache_create("nf_conntrack",
a9e419dc
FW
2045 sizeof(struct nf_conn),
2046 NFCT_INFOMASK + 1,
5f0d5a3a 2047 SLAB_TYPESAFE_BY_RCU | SLAB_HWCACHE_ALIGN, NULL);
0c5366b3
FW
2048 if (!nf_conntrack_cachep)
2049 goto err_cachep;
2050
654d0fbd 2051 printk(KERN_INFO "nf_conntrack version %s (%u buckets, %d max)\n",
8e5105a0
PM
2052 NF_CONNTRACK_VERSION, nf_conntrack_htable_size,
2053 nf_conntrack_max);
83b4dbe1
G
2054
2055 ret = nf_conntrack_expect_init();
2056 if (ret < 0)
2057 goto err_expect;
2058
b7ff3a1f
G
2059 ret = nf_conntrack_acct_init();
2060 if (ret < 0)
2061 goto err_acct;
2062
73f4001a
G
2063 ret = nf_conntrack_tstamp_init();
2064 if (ret < 0)
2065 goto err_tstamp;
2066
3fe0f943
G
2067 ret = nf_conntrack_ecache_init();
2068 if (ret < 0)
2069 goto err_ecache;
2070
8684094c
G
2071 ret = nf_conntrack_timeout_init();
2072 if (ret < 0)
2073 goto err_timeout;
2074
5e615b22
G
2075 ret = nf_conntrack_helper_init();
2076 if (ret < 0)
2077 goto err_helper;
2078
5f69b8f5
G
2079 ret = nf_conntrack_labels_init();
2080 if (ret < 0)
2081 goto err_labels;
2082
41d73ec0
PM
2083 ret = nf_conntrack_seqadj_init();
2084 if (ret < 0)
2085 goto err_seqadj;
2086
04d87001
G
2087 ret = nf_conntrack_proto_init();
2088 if (ret < 0)
2089 goto err_proto;
2090
b87a2f91 2091 conntrack_gc_work_init(&conntrack_gc_work);
e5072053 2092 queue_delayed_work(system_long_wq, &conntrack_gc_work.dwork, HZ);
b87a2f91 2093
08f6547d
AD
2094 return 0;
2095
04d87001 2096err_proto:
41d73ec0
PM
2097 nf_conntrack_seqadj_fini();
2098err_seqadj:
04d87001 2099 nf_conntrack_labels_fini();
5f69b8f5
G
2100err_labels:
2101 nf_conntrack_helper_fini();
5e615b22
G
2102err_helper:
2103 nf_conntrack_timeout_fini();
8684094c
G
2104err_timeout:
2105 nf_conntrack_ecache_fini();
3fe0f943
G
2106err_ecache:
2107 nf_conntrack_tstamp_fini();
73f4001a
G
2108err_tstamp:
2109 nf_conntrack_acct_fini();
b7ff3a1f
G
2110err_acct:
2111 nf_conntrack_expect_fini();
83b4dbe1 2112err_expect:
0c5366b3
FW
2113 kmem_cache_destroy(nf_conntrack_cachep);
2114err_cachep:
56d52d48 2115 nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_htable_size);
08f6547d
AD
2116 return ret;
2117}
2118
f94161c1
G
2119void nf_conntrack_init_end(void)
2120{
2121 /* For use by REJECT target */
2122 RCU_INIT_POINTER(ip_ct_attach, nf_conntrack_attach);
2123 RCU_INIT_POINTER(nf_ct_destroy, destroy_conntrack);
f94161c1
G
2124}
2125
8cc20198
ED
2126/*
2127 * We need to use special "null" values, not used in hash table
2128 */
2129#define UNCONFIRMED_NULLS_VAL ((1<<30)+0)
2130#define DYING_NULLS_VAL ((1<<30)+1)
252b3e8c 2131#define TEMPLATE_NULLS_VAL ((1<<30)+2)
8cc20198 2132
f94161c1 2133int nf_conntrack_init_net(struct net *net)
08f6547d 2134{
b7779d06
JDB
2135 int ret = -ENOMEM;
2136 int cpu;
ceceae1b 2137
cc41c84b 2138 BUILD_BUG_ON(IP_CT_UNTRACKED == IP_CT_NUMBER);
08f6547d 2139 atomic_set(&net->ct.count, 0);
b7779d06
JDB
2140
2141 net->ct.pcpu_lists = alloc_percpu(struct ct_pcpu);
2142 if (!net->ct.pcpu_lists)
08f6547d 2143 goto err_stat;
b7779d06
JDB
2144
2145 for_each_possible_cpu(cpu) {
2146 struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);
2147
2148 spin_lock_init(&pcpu->lock);
2149 INIT_HLIST_NULLS_HEAD(&pcpu->unconfirmed, UNCONFIRMED_NULLS_VAL);
2150 INIT_HLIST_NULLS_HEAD(&pcpu->dying, DYING_NULLS_VAL);
08f6547d 2151 }
5b3501fa 2152
b7779d06
JDB
2153 net->ct.stat = alloc_percpu(struct ip_conntrack_stat);
2154 if (!net->ct.stat)
2155 goto err_pcpu_lists;
2156
83b4dbe1 2157 ret = nf_conntrack_expect_pernet_init(net);
08f6547d
AD
2158 if (ret < 0)
2159 goto err_expect;
b7ff3a1f 2160 ret = nf_conntrack_acct_pernet_init(net);
58401572 2161 if (ret < 0)
08f6547d 2162 goto err_acct;
73f4001a 2163 ret = nf_conntrack_tstamp_pernet_init(net);
a992ca2a
PNA
2164 if (ret < 0)
2165 goto err_tstamp;
3fe0f943 2166 ret = nf_conntrack_ecache_pernet_init(net);
a0891aa6
PNA
2167 if (ret < 0)
2168 goto err_ecache;
5e615b22 2169 ret = nf_conntrack_helper_pernet_init(net);
a9006892
EL
2170 if (ret < 0)
2171 goto err_helper;
04d87001 2172 ret = nf_conntrack_proto_pernet_init(net);
f94161c1
G
2173 if (ret < 0)
2174 goto err_proto;
08f6547d 2175 return 0;
c539f017 2176
f94161c1 2177err_proto:
5e615b22 2178 nf_conntrack_helper_pernet_fini(net);
a9006892 2179err_helper:
3fe0f943 2180 nf_conntrack_ecache_pernet_fini(net);
a0891aa6 2181err_ecache:
73f4001a 2182 nf_conntrack_tstamp_pernet_fini(net);
a992ca2a 2183err_tstamp:
b7ff3a1f 2184 nf_conntrack_acct_pernet_fini(net);
08f6547d 2185err_acct:
83b4dbe1 2186 nf_conntrack_expect_pernet_fini(net);
08f6547d 2187err_expect:
0d55af87 2188 free_percpu(net->ct.stat);
b7779d06
JDB
2189err_pcpu_lists:
2190 free_percpu(net->ct.pcpu_lists);
0d55af87 2191err_stat:
08f6547d
AD
2192 return ret;
2193}