1 // SPDX-License-Identifier: GPL-2.0-only
/*
 * count the number of connections matching an arbitrary key.
 *
 * (C) 2017 Red Hat GmbH
 * Author: Florian Westphal <fw@strlen.de>
 *
 * split from xt_connlimit.c:
 *   (c) 2000 Gerd Knorr <kraxel@bytesex.org>
 *   Nov 2002: Martin Bene <martin.bene@icomedias.com>:
 *		only ignore TIME_WAIT or gone connections
 *   (C) CC Computer Consultants GmbH, 2007
 */
14 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
16 #include <linux/in6.h>
18 #include <linux/ipv6.h>
19 #include <linux/jhash.h>
20 #include <linux/slab.h>
21 #include <linux/list.h>
22 #include <linux/rbtree.h>
23 #include <linux/module.h>
24 #include <linux/random.h>
25 #include <linux/skbuff.h>
26 #include <linux/spinlock.h>
27 #include <linux/netfilter/nf_conntrack_tcp.h>
28 #include <linux/netfilter/x_tables.h>
29 #include <net/netfilter/nf_conntrack.h>
30 #include <net/netfilter/nf_conntrack_count.h>
31 #include <net/netfilter/nf_conntrack_core.h>
32 #include <net/netfilter/nf_conntrack_tuple.h>
33 #include <net/netfilter/nf_conntrack_zones.h>
35 #define CONNCOUNT_SLOTS 256U
37 #define CONNCOUNT_GC_MAX_NODES 8
40 /* we will save the tuples of all connections we care about */
41 struct nf_conncount_tuple
{
42 struct list_head node
;
43 struct nf_conntrack_tuple tuple
;
44 struct nf_conntrack_zone zone
;
49 struct nf_conncount_rb
{
51 struct nf_conncount_list list
;
53 struct rcu_head rcu_head
;
56 static spinlock_t nf_conncount_locks
[CONNCOUNT_SLOTS
] __cacheline_aligned_in_smp
;
58 struct nf_conncount_data
{
60 struct rb_root root
[CONNCOUNT_SLOTS
];
62 struct work_struct gc_work
;
63 unsigned long pending_trees
[BITS_TO_LONGS(CONNCOUNT_SLOTS
)];
67 static u_int32_t conncount_rnd __read_mostly
;
68 static struct kmem_cache
*conncount_rb_cachep __read_mostly
;
69 static struct kmem_cache
*conncount_conn_cachep __read_mostly
;
71 static inline bool already_closed(const struct nf_conn
*conn
)
73 if (nf_ct_protonum(conn
) == IPPROTO_TCP
)
74 return conn
->proto
.tcp
.state
== TCP_CONNTRACK_TIME_WAIT
||
75 conn
->proto
.tcp
.state
== TCP_CONNTRACK_CLOSE
;
80 static int key_diff(const u32
*a
, const u32
*b
, unsigned int klen
)
82 return memcmp(a
, b
, klen
* sizeof(u32
));
85 static void conn_free(struct nf_conncount_list
*list
,
86 struct nf_conncount_tuple
*conn
)
88 lockdep_assert_held(&list
->list_lock
);
91 list_del(&conn
->node
);
93 kmem_cache_free(conncount_conn_cachep
, conn
);
96 static const struct nf_conntrack_tuple_hash
*
97 find_or_evict(struct net
*net
, struct nf_conncount_list
*list
,
98 struct nf_conncount_tuple
*conn
)
100 const struct nf_conntrack_tuple_hash
*found
;
102 int cpu
= raw_smp_processor_id();
105 found
= nf_conntrack_find_get(net
, &conn
->zone
, &conn
->tuple
);
111 /* conn might have been added just before by another cpu and
112 * might still be unconfirmed. In this case, nf_conntrack_find()
113 * returns no result. Thus only evict if this cpu added the
114 * stale entry or if the entry is older than two jiffies.
117 if (conn
->cpu
== cpu
|| age
>= 2) {
118 conn_free(list
, conn
);
119 return ERR_PTR(-ENOENT
);
122 return ERR_PTR(-EAGAIN
);
125 static int __nf_conncount_add(struct net
*net
,
126 struct nf_conncount_list
*list
,
127 const struct nf_conntrack_tuple
*tuple
,
128 const struct nf_conntrack_zone
*zone
)
130 const struct nf_conntrack_tuple_hash
*found
;
131 struct nf_conncount_tuple
*conn
, *conn_n
;
132 struct nf_conn
*found_ct
;
133 unsigned int collect
= 0;
135 if (time_is_after_eq_jiffies((unsigned long)list
->last_gc
))
138 /* check the saved connections */
139 list_for_each_entry_safe(conn
, conn_n
, &list
->head
, node
) {
140 if (collect
> CONNCOUNT_GC_MAX_NODES
)
143 found
= find_or_evict(net
, list
, conn
);
145 /* Not found, but might be about to be confirmed */
146 if (PTR_ERR(found
) == -EAGAIN
) {
147 if (nf_ct_tuple_equal(&conn
->tuple
, tuple
) &&
148 nf_ct_zone_id(&conn
->zone
, conn
->zone
.dir
) ==
149 nf_ct_zone_id(zone
, zone
->dir
))
150 return 0; /* already exists */
157 found_ct
= nf_ct_tuplehash_to_ctrack(found
);
159 if (nf_ct_tuple_equal(&conn
->tuple
, tuple
) &&
160 nf_ct_zone_equal(found_ct
, zone
, zone
->dir
)) {
162 * We should not see tuples twice unless someone hooks
163 * this into a table without "-p tcp --syn".
165 * Attempt to avoid a re-add in this case.
169 } else if (already_closed(found_ct
)) {
171 * we do not care about connections which are
172 * closed already -> ditch it
175 conn_free(list
, conn
);
184 if (WARN_ON_ONCE(list
->count
> INT_MAX
))
187 conn
= kmem_cache_alloc(conncount_conn_cachep
, GFP_ATOMIC
);
191 conn
->tuple
= *tuple
;
193 conn
->cpu
= raw_smp_processor_id();
194 conn
->jiffies32
= (u32
)jiffies
;
195 list_add_tail(&conn
->node
, &list
->head
);
197 list
->last_gc
= (u32
)jiffies
;
201 int nf_conncount_add(struct net
*net
,
202 struct nf_conncount_list
*list
,
203 const struct nf_conntrack_tuple
*tuple
,
204 const struct nf_conntrack_zone
*zone
)
208 /* check the saved connections */
209 spin_lock_bh(&list
->list_lock
);
210 ret
= __nf_conncount_add(net
, list
, tuple
, zone
);
211 spin_unlock_bh(&list
->list_lock
);
215 EXPORT_SYMBOL_GPL(nf_conncount_add
);
217 void nf_conncount_list_init(struct nf_conncount_list
*list
)
219 spin_lock_init(&list
->list_lock
);
220 INIT_LIST_HEAD(&list
->head
);
222 list
->last_gc
= (u32
)jiffies
;
224 EXPORT_SYMBOL_GPL(nf_conncount_list_init
);
226 /* Return true if the list is empty. Must be called with BH disabled. */
227 bool nf_conncount_gc_list(struct net
*net
,
228 struct nf_conncount_list
*list
)
230 const struct nf_conntrack_tuple_hash
*found
;
231 struct nf_conncount_tuple
*conn
, *conn_n
;
232 struct nf_conn
*found_ct
;
233 unsigned int collected
= 0;
236 /* don't bother if we just did GC */
237 if (time_is_after_eq_jiffies((unsigned long)READ_ONCE(list
->last_gc
)))
240 /* don't bother if other cpu is already doing GC */
241 if (!spin_trylock(&list
->list_lock
))
244 list_for_each_entry_safe(conn
, conn_n
, &list
->head
, node
) {
245 found
= find_or_evict(net
, list
, conn
);
247 if (PTR_ERR(found
) == -ENOENT
)
252 found_ct
= nf_ct_tuplehash_to_ctrack(found
);
253 if (already_closed(found_ct
)) {
255 * we do not care about connections which are
256 * closed already -> ditch it
259 conn_free(list
, conn
);
265 if (collected
> CONNCOUNT_GC_MAX_NODES
)
271 list
->last_gc
= (u32
)jiffies
;
272 spin_unlock(&list
->list_lock
);
276 EXPORT_SYMBOL_GPL(nf_conncount_gc_list
);
278 static void __tree_nodes_free(struct rcu_head
*h
)
280 struct nf_conncount_rb
*rbconn
;
282 rbconn
= container_of(h
, struct nf_conncount_rb
, rcu_head
);
283 kmem_cache_free(conncount_rb_cachep
, rbconn
);
286 /* caller must hold tree nf_conncount_locks[] lock */
287 static void tree_nodes_free(struct rb_root
*root
,
288 struct nf_conncount_rb
*gc_nodes
[],
289 unsigned int gc_count
)
291 struct nf_conncount_rb
*rbconn
;
294 rbconn
= gc_nodes
[--gc_count
];
295 spin_lock(&rbconn
->list
.list_lock
);
296 if (!rbconn
->list
.count
) {
297 rb_erase(&rbconn
->node
, root
);
298 call_rcu(&rbconn
->rcu_head
, __tree_nodes_free
);
300 spin_unlock(&rbconn
->list
.list_lock
);
304 static void schedule_gc_worker(struct nf_conncount_data
*data
, int tree
)
306 set_bit(tree
, data
->pending_trees
);
307 schedule_work(&data
->gc_work
);
311 insert_tree(struct net
*net
,
312 struct nf_conncount_data
*data
,
313 struct rb_root
*root
,
316 const struct nf_conntrack_tuple
*tuple
,
317 const struct nf_conntrack_zone
*zone
)
319 struct nf_conncount_rb
*gc_nodes
[CONNCOUNT_GC_MAX_NODES
];
320 struct rb_node
**rbnode
, *parent
;
321 struct nf_conncount_rb
*rbconn
;
322 struct nf_conncount_tuple
*conn
;
323 unsigned int count
= 0, gc_count
= 0;
324 u8 keylen
= data
->keylen
;
327 spin_lock_bh(&nf_conncount_locks
[hash
]);
330 rbnode
= &(root
->rb_node
);
333 rbconn
= rb_entry(*rbnode
, struct nf_conncount_rb
, node
);
336 diff
= key_diff(key
, rbconn
->key
, keylen
);
338 rbnode
= &((*rbnode
)->rb_left
);
339 } else if (diff
> 0) {
340 rbnode
= &((*rbnode
)->rb_right
);
344 ret
= nf_conncount_add(net
, &rbconn
->list
, tuple
, zone
);
346 count
= 0; /* hotdrop */
348 count
= rbconn
->list
.count
;
349 tree_nodes_free(root
, gc_nodes
, gc_count
);
353 if (gc_count
>= ARRAY_SIZE(gc_nodes
))
356 if (do_gc
&& nf_conncount_gc_list(net
, &rbconn
->list
))
357 gc_nodes
[gc_count
++] = rbconn
;
361 tree_nodes_free(root
, gc_nodes
, gc_count
);
362 schedule_gc_worker(data
, hash
);
368 /* expected case: match, insert new node */
369 rbconn
= kmem_cache_alloc(conncount_rb_cachep
, GFP_ATOMIC
);
373 conn
= kmem_cache_alloc(conncount_conn_cachep
, GFP_ATOMIC
);
375 kmem_cache_free(conncount_rb_cachep
, rbconn
);
379 conn
->tuple
= *tuple
;
381 memcpy(rbconn
->key
, key
, sizeof(u32
) * keylen
);
383 nf_conncount_list_init(&rbconn
->list
);
384 list_add(&conn
->node
, &rbconn
->list
.head
);
386 rbconn
->list
.count
= count
;
388 rb_link_node_rcu(&rbconn
->node
, parent
, rbnode
);
389 rb_insert_color(&rbconn
->node
, root
);
391 spin_unlock_bh(&nf_conncount_locks
[hash
]);
396 count_tree(struct net
*net
,
397 struct nf_conncount_data
*data
,
399 const struct nf_conntrack_tuple
*tuple
,
400 const struct nf_conntrack_zone
*zone
)
402 struct rb_root
*root
;
403 struct rb_node
*parent
;
404 struct nf_conncount_rb
*rbconn
;
406 u8 keylen
= data
->keylen
;
408 hash
= jhash2(key
, data
->keylen
, conncount_rnd
) % CONNCOUNT_SLOTS
;
409 root
= &data
->root
[hash
];
411 parent
= rcu_dereference_raw(root
->rb_node
);
415 rbconn
= rb_entry(parent
, struct nf_conncount_rb
, node
);
417 diff
= key_diff(key
, rbconn
->key
, keylen
);
419 parent
= rcu_dereference_raw(parent
->rb_left
);
420 } else if (diff
> 0) {
421 parent
= rcu_dereference_raw(parent
->rb_right
);
426 nf_conncount_gc_list(net
, &rbconn
->list
);
427 return rbconn
->list
.count
;
430 spin_lock_bh(&rbconn
->list
.list_lock
);
431 /* Node might be about to be free'd.
432 * We need to defer to insert_tree() in this case.
434 if (rbconn
->list
.count
== 0) {
435 spin_unlock_bh(&rbconn
->list
.list_lock
);
439 /* same source network -> be counted! */
440 ret
= __nf_conncount_add(net
, &rbconn
->list
, tuple
, zone
);
441 spin_unlock_bh(&rbconn
->list
.list_lock
);
443 return 0; /* hotdrop */
445 return rbconn
->list
.count
;
452 return insert_tree(net
, data
, root
, hash
, key
, tuple
, zone
);
455 static void tree_gc_worker(struct work_struct
*work
)
457 struct nf_conncount_data
*data
= container_of(work
, struct nf_conncount_data
, gc_work
);
458 struct nf_conncount_rb
*gc_nodes
[CONNCOUNT_GC_MAX_NODES
], *rbconn
;
459 struct rb_root
*root
;
460 struct rb_node
*node
;
461 unsigned int tree
, next_tree
, gc_count
= 0;
463 tree
= data
->gc_tree
% CONNCOUNT_SLOTS
;
464 root
= &data
->root
[tree
];
468 for (node
= rb_first(root
); node
!= NULL
; node
= rb_next(node
)) {
469 rbconn
= rb_entry(node
, struct nf_conncount_rb
, node
);
470 if (nf_conncount_gc_list(data
->net
, &rbconn
->list
))
478 spin_lock_bh(&nf_conncount_locks
[tree
]);
479 if (gc_count
< ARRAY_SIZE(gc_nodes
))
480 goto next
; /* do not bother */
483 node
= rb_first(root
);
484 while (node
!= NULL
) {
485 rbconn
= rb_entry(node
, struct nf_conncount_rb
, node
);
486 node
= rb_next(node
);
488 if (rbconn
->list
.count
> 0)
491 gc_nodes
[gc_count
++] = rbconn
;
492 if (gc_count
>= ARRAY_SIZE(gc_nodes
)) {
493 tree_nodes_free(root
, gc_nodes
, gc_count
);
498 tree_nodes_free(root
, gc_nodes
, gc_count
);
500 clear_bit(tree
, data
->pending_trees
);
502 next_tree
= (tree
+ 1) % CONNCOUNT_SLOTS
;
503 next_tree
= find_next_bit(data
->pending_trees
, CONNCOUNT_SLOTS
, next_tree
);
505 if (next_tree
< CONNCOUNT_SLOTS
) {
506 data
->gc_tree
= next_tree
;
510 spin_unlock_bh(&nf_conncount_locks
[tree
]);
513 /* Count and return number of conntrack entries in 'net' with particular 'key'.
514 * If 'tuple' is not null, insert it into the accounting data structure.
515 * Call with RCU read lock.
517 unsigned int nf_conncount_count(struct net
*net
,
518 struct nf_conncount_data
*data
,
520 const struct nf_conntrack_tuple
*tuple
,
521 const struct nf_conntrack_zone
*zone
)
523 return count_tree(net
, data
, key
, tuple
, zone
);
525 EXPORT_SYMBOL_GPL(nf_conncount_count
);
527 struct nf_conncount_data
*nf_conncount_init(struct net
*net
, unsigned int family
,
530 struct nf_conncount_data
*data
;
533 if (keylen
% sizeof(u32
) ||
534 keylen
/ sizeof(u32
) > MAX_KEYLEN
||
536 return ERR_PTR(-EINVAL
);
538 net_get_random_once(&conncount_rnd
, sizeof(conncount_rnd
));
540 data
= kmalloc(sizeof(*data
), GFP_KERNEL
);
542 return ERR_PTR(-ENOMEM
);
544 ret
= nf_ct_netns_get(net
, family
);
550 for (i
= 0; i
< ARRAY_SIZE(data
->root
); ++i
)
551 data
->root
[i
] = RB_ROOT
;
553 data
->keylen
= keylen
/ sizeof(u32
);
555 INIT_WORK(&data
->gc_work
, tree_gc_worker
);
559 EXPORT_SYMBOL_GPL(nf_conncount_init
);
561 void nf_conncount_cache_free(struct nf_conncount_list
*list
)
563 struct nf_conncount_tuple
*conn
, *conn_n
;
565 list_for_each_entry_safe(conn
, conn_n
, &list
->head
, node
)
566 kmem_cache_free(conncount_conn_cachep
, conn
);
568 EXPORT_SYMBOL_GPL(nf_conncount_cache_free
);
570 static void destroy_tree(struct rb_root
*r
)
572 struct nf_conncount_rb
*rbconn
;
573 struct rb_node
*node
;
575 while ((node
= rb_first(r
)) != NULL
) {
576 rbconn
= rb_entry(node
, struct nf_conncount_rb
, node
);
580 nf_conncount_cache_free(&rbconn
->list
);
582 kmem_cache_free(conncount_rb_cachep
, rbconn
);
586 void nf_conncount_destroy(struct net
*net
, unsigned int family
,
587 struct nf_conncount_data
*data
)
591 cancel_work_sync(&data
->gc_work
);
592 nf_ct_netns_put(net
, family
);
594 for (i
= 0; i
< ARRAY_SIZE(data
->root
); ++i
)
595 destroy_tree(&data
->root
[i
]);
599 EXPORT_SYMBOL_GPL(nf_conncount_destroy
);
601 static int __init
nf_conncount_modinit(void)
605 for (i
= 0; i
< CONNCOUNT_SLOTS
; ++i
)
606 spin_lock_init(&nf_conncount_locks
[i
]);
608 conncount_conn_cachep
= kmem_cache_create("nf_conncount_tuple",
609 sizeof(struct nf_conncount_tuple
),
611 if (!conncount_conn_cachep
)
614 conncount_rb_cachep
= kmem_cache_create("nf_conncount_rb",
615 sizeof(struct nf_conncount_rb
),
617 if (!conncount_rb_cachep
) {
618 kmem_cache_destroy(conncount_conn_cachep
);
625 static void __exit
nf_conncount_modexit(void)
627 kmem_cache_destroy(conncount_conn_cachep
);
628 kmem_cache_destroy(conncount_rb_cachep
);
631 module_init(nf_conncount_modinit
);
632 module_exit(nf_conncount_modexit
);
633 MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>");
634 MODULE_AUTHOR("Florian Westphal <fw@strlen.de>");
635 MODULE_DESCRIPTION("netfilter: count number of connections matching a key");
636 MODULE_LICENSE("GPL");