1 // SPDX-License-Identifier: GPL-2.0-only
2 #include <linux/kernel.h>
3 #include <linux/init.h>
4 #include <linux/module.h>
5 #include <linux/netfilter.h>
6 #include <linux/rhashtable.h>
7 #include <linux/netdevice.h>
9 #include <net/ip6_route.h>
10 #include <net/netfilter/nf_tables.h>
11 #include <net/netfilter/nf_flow_table.h>
12 #include <net/netfilter/nf_conntrack.h>
13 #include <net/netfilter/nf_conntrack_core.h>
14 #include <net/netfilter/nf_conntrack_l4proto.h>
15 #include <net/netfilter/nf_conntrack_tuple.h>
/*
 * File-scope list of flowtables and the mutex used around it.
 * In this file the list is added to, walked and deleted from only while
 * flowtable_lock is held (see nf_flow_table_init(), nf_flow_table_cleanup()
 * and nf_flow_table_free()).
 */
17 static DEFINE_MUTEX(flowtable_lock
);
18 static LIST_HEAD(flowtables
);
/*
 * flow_offload_fill_dir - copy one direction of the conntrack tuple into
 * the flow's own tuple.
 * @flow: flow whose tuplehash[dir].tuple is written; flow->ct is read
 * @dir:  direction index used for both the flow and the conntrack tuplehash
 *
 * Source is flow->ct->tuplehash[dir].tuple, destination is
 * flow->tuplehash[dir].tuple.
 *
 * NOTE(review): the return type, braces and the case labels of the switch
 * are not visible in this extract, so which l3num value selects the IPv4
 * versus IPv6 assignments cannot be confirmed from here.
 */
21 flow_offload_fill_dir(struct flow_offload
*flow
,
22 enum flow_offload_tuple_dir dir
)
24 struct flow_offload_tuple
*ft
= &flow
->tuplehash
[dir
].tuple
;
25 struct nf_conntrack_tuple
*ctt
= &flow
->ct
->tuplehash
[dir
].tuple
;
/* Address copy depends on the layer-3 protocol number of the ct tuple. */
29 switch (ctt
->src
.l3num
) {
/* IPv4 addresses are taken from the .in member of the address union. */
31 ft
->src_v4
= ctt
->src
.u3
.in
;
32 ft
->dst_v4
= ctt
->dst
.u3
.in
;
/* IPv6 addresses are taken from the .in6 member of the address union. */
35 ft
->src_v6
= ctt
->src
.u3
.in6
;
36 ft
->dst_v6
= ctt
->dst
.u3
.in6
;
/* Protocol numbers: l3 from the source half, l4 from the destination half. */
40 ft
->l3proto
= ctt
->src
.l3num
;
41 ft
->l4proto
= ctt
->dst
.protonum
;
/* Ports are read through the .tcp member of the union, whatever l4proto is. */
42 ft
->src_port
= ctt
->src
.u
.tcp
.port
;
43 ft
->dst_port
= ctt
->dst
.u
.tcp
.port
;
/*
 * flow_offload_alloc - allocate a flow_offload for a conntrack entry.
 * @ct: conntrack entry the flow is created for
 *
 * Visible steps: check that @ct is not dying and that its use count can be
 * incremented from a non-zero value, allocate a zeroed flow with GFP_ATOMIC,
 * fill both tuple directions, and mirror the conntrack NAT status bits into
 * the flow flags.
 *
 * NOTE(review): the failure branches, the assignment of flow->ct (which
 * flow_offload_fill_dir() dereferences) and the return statement are not
 * visible in this extract; confirm them against the full source.
 */
46 struct flow_offload
*flow_offload_alloc(struct nf_conn
*ct
)
48 struct flow_offload
*flow
;
/* Bail-out condition: ct is dying, or its use count was already zero. */
50 if (unlikely(nf_ct_is_dying(ct
) ||
51 !atomic_inc_not_zero(&ct
->ct_general
.use
)))
54 flow
= kzalloc(sizeof(*flow
), GFP_ATOMIC
);
60 flow_offload_fill_dir(flow
, FLOW_OFFLOAD_DIR_ORIGINAL
);
61 flow_offload_fill_dir(flow
, FLOW_OFFLOAD_DIR_REPLY
);
/* Non-atomic __set_bit is used on the flags of the newly allocated flow. */
63 if (ct
->status
& IPS_SRC_NAT
)
64 __set_bit(NF_FLOW_SNAT
, &flow
->flags
);
65 if (ct
->status
& IPS_DST_NAT
)
66 __set_bit(NF_FLOW_DNAT
, &flow
->flags
);
75 EXPORT_SYMBOL_GPL(flow_offload_alloc
);
/*
 * flow_offload_fill_route - record routing data for one direction.
 * @flow:  flow whose tuplehash[dir].tuple is updated
 * @route: per-direction route entries; route->tuple[].dst is read
 * @dir:   direction being filled
 *
 * Tries to take a reference on this direction's dst with dst_hold_safe(),
 * stores the forwarding MTU, the ifindex of the opposite direction's dst
 * device as iifidx, and the dst pointer itself as dst_cache.
 *
 * NOTE(review): the case labels, the action taken when dst_hold_safe()
 * fails and the return values are not visible in this extract.
 */
77 static int flow_offload_fill_route(struct flow_offload
*flow
,
78 const struct nf_flow_route
*route
,
79 enum flow_offload_tuple_dir dir
)
81 struct flow_offload_tuple
*flow_tuple
= &flow
->tuplehash
[dir
].tuple
;
/* !dir indexes the other direction's route entry. */
82 struct dst_entry
*other_dst
= route
->tuple
[!dir
].dst
;
83 struct dst_entry
*dst
= route
->tuple
[dir
].dst
;
85 if (!dst_hold_safe(route
->tuple
[dir
].dst
))
/* MTU helper is chosen by the l3proto already stored in the flow tuple. */
88 switch (flow_tuple
->l3proto
) {
90 flow_tuple
->mtu
= ip_dst_mtu_maybe_forward(dst
, true);
93 flow_tuple
->mtu
= ip6_dst_mtu_forward(dst
);
97 flow_tuple
->iifidx
= other_dst
->dev
->ifindex
;
98 flow_tuple
->dst_cache
= dst
;
/*
 * flow_offload_route_init - attach route information to a flow.
 * @flow:  flow to initialise
 * @route: route entries for both directions
 *
 * Calls flow_offload_fill_route() for the original and then the reply
 * direction and marks the flow as NF_FLOW_OFFLOAD_ROUTE.
 *
 * NOTE(review): the declaration of err, the error checks, the
 * err_route_reply label and the return statements are not visible in this
 * extract. The dst_release() of the original-direction dst presumably
 * belongs to the error path after the reply direction fails; confirm
 * against the full source.
 */
103 int flow_offload_route_init(struct flow_offload
*flow
,
104 const struct nf_flow_route
*route
)
108 err
= flow_offload_fill_route(flow
, route
, FLOW_OFFLOAD_DIR_ORIGINAL
);
112 err
= flow_offload_fill_route(flow
, route
, FLOW_OFFLOAD_DIR_REPLY
);
114 goto err_route_reply
;
116 flow
->type
= NF_FLOW_OFFLOAD_ROUTE
;
121 dst_release(route
->tuple
[FLOW_OFFLOAD_DIR_ORIGINAL
].dst
);
125 EXPORT_SYMBOL_GPL(flow_offload_route_init
);
/*
 * flow_offload_fixup_tcp - reset conntrack TCP tracking state.
 * @tcp: TCP state of a conntrack entry
 *
 * Sets the state to TCP_CONNTRACK_ESTABLISHED and zeroes td_maxwin for
 * both entries of seen[].
 */
127 static void flow_offload_fixup_tcp(struct ip_ct_tcp
*tcp
)
129 tcp
->state
= TCP_CONNTRACK_ESTABLISHED
;
130 tcp
->seen
[0].td_maxwin
= 0;
131 tcp
->seen
[1].td_maxwin
= 0;
/* Timeouts, in jiffies (multiples of HZ), applied by the function below. */
134 #define NF_FLOWTABLE_TCP_PICKUP_TIMEOUT (120 * HZ)
135 #define NF_FLOWTABLE_UDP_PICKUP_TIMEOUT (30 * HZ)
/*
 * flow_offload_fixup_ct_timeout - cap the remaining conntrack timeout.
 * @ct: conntrack entry to adjust
 *
 * Picks the TCP or UDP pickup timeout from the entry's protocol number.
 * ct->timeout is rewritten to "now + timeout" only when the time currently
 * remaining is larger than that value, so the timeout is never extended by
 * the visible code.
 *
 * NOTE(review): how l4proto is used, and what happens for protocols other
 * than TCP and UDP, is not visible in this extract.
 */
137 static void flow_offload_fixup_ct_timeout(struct nf_conn
*ct
)
139 const struct nf_conntrack_l4proto
*l4proto
;
140 int l4num
= nf_ct_protonum(ct
);
141 unsigned int timeout
;
143 l4proto
= nf_ct_l4proto_find(l4num
);
147 if (l4num
== IPPROTO_TCP
)
148 timeout
= NF_FLOWTABLE_TCP_PICKUP_TIMEOUT
;
149 else if (l4num
== IPPROTO_UDP
)
150 timeout
= NF_FLOWTABLE_UDP_PICKUP_TIMEOUT
;
/* The unsigned timeout is cast to __s32 for the comparison with the delta. */
154 if (nf_flow_timeout_delta(ct
->timeout
) > (__s32
)timeout
)
155 ct
->timeout
= nfct_time_stamp
+ timeout
;
/*
 * flow_offload_fixup_ct_state - fix up protocol state of a conntrack entry.
 * @ct: conntrack entry
 *
 * Only TCP entries are touched: their ct->proto.tcp state is passed to
 * flow_offload_fixup_tcp().
 */
158 static void flow_offload_fixup_ct_state(struct nf_conn
*ct
)
160 if (nf_ct_protonum(ct
) == IPPROTO_TCP
)
161 flow_offload_fixup_tcp(&ct
->proto
.tcp
);
/*
 * flow_offload_fixup_ct - apply both conntrack fix-ups.
 * @ct: conntrack entry
 *
 * Runs the protocol-state fix-up first, then the timeout fix-up.
 */
164 static void flow_offload_fixup_ct(struct nf_conn
*ct
)
166 flow_offload_fixup_ct_state(ct
);
167 flow_offload_fixup_ct_timeout(ct
);
/*
 * flow_offload_route_release - drop the cached dst of both directions.
 * @flow: flow whose tuplehash[].tuple.dst_cache entries are released
 *
 * Calls dst_release() on the original and the reply direction's dst_cache.
 */
170 static void flow_offload_route_release(struct flow_offload
*flow
)
172 dst_release(flow
->tuplehash
[FLOW_OFFLOAD_DIR_ORIGINAL
].tuple
.dst_cache
);
173 dst_release(flow
->tuplehash
[FLOW_OFFLOAD_DIR_REPLY
].tuple
.dst_cache
);
/*
 * flow_offload_free - release a flow and its type-specific resources.
 * @flow: flow to free
 *
 * For NF_FLOW_OFFLOAD_ROUTE flows the cached routes are released first.
 * The flow memory itself is freed with kfree_rcu() through its rcu_head
 * member.
 *
 * NOTE(review): other switch cases and any statements between the route
 * release and kfree_rcu() are not visible in this extract.
 */
176 void flow_offload_free(struct flow_offload
*flow
)
178 switch (flow
->type
) {
179 case NF_FLOW_OFFLOAD_ROUTE
:
180 flow_offload_route_release(flow
);
186 kfree_rcu(flow
, rcu_head
);
188 EXPORT_SYMBOL_GPL(flow_offload_free
);
/*
 * flow_offload_hash - hash a lookup key (used as .hashfn below).
 * @data: pointer to a struct flow_offload_tuple key
 * @len:  not used by the visible code
 * @seed: hash seed passed through to jhash()
 *
 * Only the bytes that precede the 'dir' member of the tuple are hashed.
 */
190 static u32
flow_offload_hash(const void *data
, u32 len
, u32 seed
)
192 const struct flow_offload_tuple
*tuple
= data
;
194 return jhash(tuple
, offsetof(struct flow_offload_tuple
, dir
), seed
);
/*
 * flow_offload_hash_obj - hash a stored object (used as .obj_hashfn below).
 * @data: pointer to a struct flow_offload_tuple_rhash entry
 * @len:  not used by the visible code
 * @seed: hash seed passed through to jhash()
 *
 * Hashes the same prefix of the embedded tuple (everything before 'dir')
 * as flow_offload_hash() does for a bare key.
 */
197 static u32
flow_offload_hash_obj(const void *data
, u32 len
, u32 seed
)
199 const struct flow_offload_tuple_rhash
*tuplehash
= data
;
201 return jhash(&tuplehash
->tuple
, offsetof(struct flow_offload_tuple
, dir
), seed
);
/*
 * flow_offload_hash_cmp - compare a lookup key with a stored entry
 * (used as .obj_cmpfn below).
 * @arg: compare argument; arg->key is the struct flow_offload_tuple key
 *
 * Compares the bytes that precede the 'dir' member of the tuple, i.e. the
 * same prefix that the hash functions cover.
 *
 * NOTE(review): the second parameter (referenced below as 'ptr') and the
 * return statements are not visible in this extract.
 */
204 static int flow_offload_hash_cmp(struct rhashtable_compare_arg
*arg
,
207 const struct flow_offload_tuple
*tuple
= arg
->key
;
208 const struct flow_offload_tuple_rhash
*x
= ptr
;
210 if (memcmp(&x
->tuple
, tuple
, offsetof(struct flow_offload_tuple
, dir
)))
/*
 * rhashtable parameters shared by every insert, remove and lookup in this
 * file. Entries are linked through the 'node' member of
 * struct flow_offload_tuple_rhash, custom key/object hash and compare
 * callbacks are supplied, and automatic shrinking is enabled.
 */
216 static const struct rhashtable_params nf_flow_offload_rhash_params
= {
217 .head_offset
= offsetof(struct flow_offload_tuple_rhash
, node
),
218 .hashfn
= flow_offload_hash
,
219 .obj_hashfn
= flow_offload_hash_obj
,
220 .obj_cmpfn
= flow_offload_hash_cmp
,
221 .automatic_shrinking
= true,
/*
 * flow_offload_add - insert a flow into a flowtable.
 * @flow_table: table to insert into
 * @flow:       flow to insert
 *
 * Sets the flow timeout to "now + NF_FLOW_TIMEOUT" and inserts the hash
 * nodes of tuplehash[0] and tuplehash[1]. When the table has hardware
 * offload enabled, the flow is flagged NF_FLOW_HW and handed to
 * nf_flow_offload_add().
 *
 * NOTE(review): the declaration of err, the error checks and the return
 * statements are not visible in this extract. The rhashtable_remove_fast()
 * of tuplehash[0] presumably undoes the first insert when the second one
 * fails; confirm against the full source.
 */
224 int flow_offload_add(struct nf_flowtable
*flow_table
, struct flow_offload
*flow
)
228 flow
->timeout
= nf_flowtable_time_stamp
+ NF_FLOW_TIMEOUT
;
230 err
= rhashtable_insert_fast(&flow_table
->rhashtable
,
231 &flow
->tuplehash
[0].node
,
232 nf_flow_offload_rhash_params
);
236 err
= rhashtable_insert_fast(&flow_table
->rhashtable
,
237 &flow
->tuplehash
[1].node
,
238 nf_flow_offload_rhash_params
);
240 rhashtable_remove_fast(&flow_table
->rhashtable
,
241 &flow
->tuplehash
[0].node
,
242 nf_flow_offload_rhash_params
);
246 if (nf_flowtable_hw_offload(flow_table
)) {
247 __set_bit(NF_FLOW_HW
, &flow
->flags
);
248 nf_flow_offload_add(flow_table
, flow
);
253 EXPORT_SYMBOL_GPL(flow_offload_add
);
/*
 * flow_offload_refresh - extend a flow's lifetime.
 * @flow_table: table the flow belongs to
 * @flow:       flow to refresh
 *
 * Resets the flow timeout to "now + NF_FLOW_TIMEOUT". The visible
 * condition is true when the table has no hardware offload or when
 * NF_FLOW_HW_REFRESH was not set; the flag is cleared as a side effect of
 * the test.
 *
 * NOTE(review): the statement guarded by that condition is not visible in
 * this extract. Presumably it returns early so that nf_flow_offload_add()
 * runs only for hardware-offloaded tables with a pending refresh; confirm
 * against the full source.
 */
255 void flow_offload_refresh(struct nf_flowtable
*flow_table
,
256 struct flow_offload
*flow
)
258 flow
->timeout
= nf_flowtable_time_stamp
+ NF_FLOW_TIMEOUT
;
260 if (likely(!nf_flowtable_hw_offload(flow_table
) ||
261 !test_and_clear_bit(NF_FLOW_HW_REFRESH
, &flow
->flags
)))
264 nf_flow_offload_add(flow_table
, flow
);
266 EXPORT_SYMBOL_GPL(flow_offload_refresh
);
/*
 * nf_flow_has_expired - test whether a flow's timeout has passed.
 * @flow: flow to test
 *
 * Returns true when nf_flow_timeout_delta(flow->timeout) is zero or
 * negative.
 */
268 static inline bool nf_flow_has_expired(const struct flow_offload
*flow
)
270 return nf_flow_timeout_delta(flow
->timeout
) <= 0;
/*
 * flow_offload_del - remove a flow from its table and free it.
 * @flow_table: table holding the flow
 * @flow:       flow to remove
 *
 * Unlinks the hash nodes of both directions, clears IPS_OFFLOAD_BIT in the
 * conntrack status, fixes up the conntrack entry and finally calls
 * flow_offload_free().
 */
273 static void flow_offload_del(struct nf_flowtable
*flow_table
,
274 struct flow_offload
*flow
)
276 rhashtable_remove_fast(&flow_table
->rhashtable
,
277 &flow
->tuplehash
[FLOW_OFFLOAD_DIR_ORIGINAL
].node
,
278 nf_flow_offload_rhash_params
);
279 rhashtable_remove_fast(&flow_table
->rhashtable
,
280 &flow
->tuplehash
[FLOW_OFFLOAD_DIR_REPLY
].node
,
281 nf_flow_offload_rhash_params
);
283 clear_bit(IPS_OFFLOAD_BIT
, &flow
->ct
->status
);
/*
 * Expired flows get the state and timeout fix-up; flows that were torn
 * down without expiring get only the timeout fix-up.
 */
285 if (nf_flow_has_expired(flow
))
286 flow_offload_fixup_ct(flow
->ct
);
287 else if (test_bit(NF_FLOW_TEARDOWN
, &flow
->flags
))
288 flow_offload_fixup_ct_timeout(flow
->ct
);
290 flow_offload_free(flow
);
/*
 * flow_offload_teardown - mark a flow for removal.
 * @flow: flow to tear down
 *
 * Atomically sets NF_FLOW_TEARDOWN in the flow flags and fixes up the
 * protocol state of the associated conntrack entry. The flow is not
 * removed from the table here; nf_flow_offload_gc_step() tests the flag.
 */
293 void flow_offload_teardown(struct flow_offload
*flow
)
295 set_bit(NF_FLOW_TEARDOWN
, &flow
->flags
);
297 flow_offload_fixup_ct_state(flow
->ct
);
299 EXPORT_SYMBOL_GPL(flow_offload_teardown
);
/*
 * flow_offload_lookup - find the table entry matching a tuple.
 * @flow_table: table to search
 * @tuple:      key to look up
 *
 * Looks the tuple up in the rhashtable, recovers the owning flow from the
 * entry with container_of() using the entry's own 'dir' as the array
 * index, and then tests the flow's NF_FLOW_TEARDOWN flag and whether its
 * conntrack entry is dying.
 *
 * NOTE(review): the declaration of dir, the not-found check, the
 * statements guarded by the two tests and the return statements are not
 * visible in this extract.
 */
301 struct flow_offload_tuple_rhash
*
302 flow_offload_lookup(struct nf_flowtable
*flow_table
,
303 struct flow_offload_tuple
*tuple
)
305 struct flow_offload_tuple_rhash
*tuplehash
;
306 struct flow_offload
*flow
;
309 tuplehash
= rhashtable_lookup(&flow_table
->rhashtable
, tuple
,
310 nf_flow_offload_rhash_params
);
314 dir
= tuplehash
->tuple
.dir
;
315 flow
= container_of(tuplehash
, struct flow_offload
, tuplehash
[dir
]);
316 if (test_bit(NF_FLOW_TEARDOWN
, &flow
->flags
))
319 if (unlikely(nf_ct_is_dying(flow
->ct
)))
324 EXPORT_SYMBOL_GPL(flow_offload_lookup
);
/*
 * nf_flow_table_iterate - walk the entries of a flowtable.
 * @flow_table: table to walk
 * @iter:       callback taking a flow and an opaque data pointer
 *
 * Uses the rhashtable walker (enter/start ... stop/exit). Error pointers
 * returned by the walker are checked; a value other than -EAGAIN is
 * stored in err. Entries are tested on tuple.dir, and the flow is
 * recovered with container_of() relative to tuplehash[0].
 *
 * NOTE(review): the return type, the remaining parameter(s), the
 * declaration of err, the statements guarded by the -EAGAIN and
 * tuple.dir tests, the call to @iter and the return statement are not
 * visible in this extract. Presumably entries with a non-zero dir are
 * skipped so each flow is visited once; confirm against the full source.
 */
327 nf_flow_table_iterate(struct nf_flowtable
*flow_table
,
328 void (*iter
)(struct flow_offload
*flow
, void *data
),
331 struct flow_offload_tuple_rhash
*tuplehash
;
332 struct rhashtable_iter hti
;
333 struct flow_offload
*flow
;
336 rhashtable_walk_enter(&flow_table
->rhashtable
, &hti
);
337 rhashtable_walk_start(&hti
);
339 while ((tuplehash
= rhashtable_walk_next(&hti
))) {
340 if (IS_ERR(tuplehash
)) {
341 if (PTR_ERR(tuplehash
) != -EAGAIN
) {
342 err
= PTR_ERR(tuplehash
);
347 if (tuplehash
->tuple
.dir
)
350 flow
= container_of(tuplehash
, struct flow_offload
, tuplehash
[0]);
354 rhashtable_walk_stop(&hti
);
355 rhashtable_walk_exit(&hti
);
/*
 * nf_flow_offload_gc_step - per-flow garbage-collection callback.
 * @flow: flow being visited
 * @data: the struct nf_flowtable that owns the flow
 *
 * A flow is up for removal when it has expired, its conntrack entry is
 * dying, or NF_FLOW_TEARDOWN is set. For flows flagged NF_FLOW_HW the
 * visible code first requests hardware removal with nf_flow_offload_del()
 * while NF_FLOW_HW_DYING is unset, and calls flow_offload_del() once
 * NF_FLOW_HW_DEAD is set. Flows that are not up for removal but are
 * flagged NF_FLOW_HW have their stats refreshed via
 * nf_flow_offload_stats().
 *
 * NOTE(review): closing braces and the 'else' that should precede the
 * last flow_offload_del() call are not visible in this extract.
 * Presumably that call handles flows without NF_FLOW_HW; confirm against
 * the full source.
 */
360 static void nf_flow_offload_gc_step(struct flow_offload
*flow
, void *data
)
362 struct nf_flowtable
*flow_table
= data
;
364 if (nf_flow_has_expired(flow
) || nf_ct_is_dying(flow
->ct
) ||
365 test_bit(NF_FLOW_TEARDOWN
, &flow
->flags
)) {
366 if (test_bit(NF_FLOW_HW
, &flow
->flags
)) {
367 if (!test_bit(NF_FLOW_HW_DYING
, &flow
->flags
))
368 nf_flow_offload_del(flow_table
, flow
);
369 else if (test_bit(NF_FLOW_HW_DEAD
, &flow
->flags
))
370 flow_offload_del(flow_table
, flow
);
372 flow_offload_del(flow_table
, flow
);
374 } else if (test_bit(NF_FLOW_HW
, &flow
->flags
)) {
375 nf_flow_offload_stats(flow_table
, flow
);
/*
 * nf_flow_offload_work_gc - delayed-work handler running one GC pass.
 * @work: the work_struct embedded in flow_table->gc_work
 *
 * Recovers the owning flowtable from @work, runs
 * nf_flow_offload_gc_step() over all its flows, then re-queues itself on
 * system_power_efficient_wq with a delay of HZ jiffies.
 */
379 static void nf_flow_offload_work_gc(struct work_struct
*work
)
381 struct nf_flowtable
*flow_table
;
383 flow_table
= container_of(work
, struct nf_flowtable
, gc_work
.work
);
384 nf_flow_table_iterate(flow_table
, nf_flow_offload_gc_step
, flow_table
);
385 queue_delayed_work(system_power_efficient_wq
, &flow_table
->gc_work
, HZ
);
/*
 * nf_flow_table_offload_add_cb - register a callback on the flow block.
 * @flow_table: table whose flow_block receives the callback
 * @cb:         callback function
 * @cb_priv:    private data for @cb; also passed as the third argument of
 *              flow_block_cb_alloc()
 *
 * With flow_block_lock held for writing: looks for an existing
 * (cb, cb_priv) entry, allocates a new flow_block_cb and appends it to
 * the block's cb_list.
 *
 * NOTE(review): the declaration of err, the handling of an already
 * registered callback, the error branch after IS_ERR() and the return
 * statement are not visible in this extract.
 */
388 int nf_flow_table_offload_add_cb(struct nf_flowtable
*flow_table
,
389 flow_setup_cb_t
*cb
, void *cb_priv
)
391 struct flow_block
*block
= &flow_table
->flow_block
;
392 struct flow_block_cb
*block_cb
;
395 down_write(&flow_table
->flow_block_lock
);
396 block_cb
= flow_block_cb_lookup(block
, cb
, cb_priv
);
402 block_cb
= flow_block_cb_alloc(cb
, cb_priv
, cb_priv
, NULL
);
403 if (IS_ERR(block_cb
)) {
404 err
= PTR_ERR(block_cb
);
408 list_add_tail(&block_cb
->list
, &block
->cb_list
);
411 up_write(&flow_table
->flow_block_lock
);
414 EXPORT_SYMBOL_GPL(nf_flow_table_offload_add_cb
);
/*
 * nf_flow_table_offload_del_cb - unregister a flow block callback.
 * @flow_table: table whose flow_block holds the callback
 * @cb:         callback function to remove
 * @cb_priv:    private data the callback was registered with
 *
 * With flow_block_lock held for writing: looks up the (cb, cb_priv)
 * entry, unlinks it from the list and frees it.
 *
 * NOTE(review): the check on the lookup result (and whatever happens
 * when no entry is found) is not visible in this extract.
 */
416 void nf_flow_table_offload_del_cb(struct nf_flowtable
*flow_table
,
417 flow_setup_cb_t
*cb
, void *cb_priv
)
419 struct flow_block
*block
= &flow_table
->flow_block
;
420 struct flow_block_cb
*block_cb
;
422 down_write(&flow_table
->flow_block_lock
);
423 block_cb
= flow_block_cb_lookup(block
, cb
, cb_priv
);
425 list_del(&block_cb
->list
);
426 flow_block_cb_free(block_cb
);
430 up_write(&flow_table
->flow_block_lock
);
432 EXPORT_SYMBOL_GPL(nf_flow_table_offload_del_cb
);
/*
 * nf_flow_nat_port_tcp - update the TCP checksum for a port rewrite.
 * @skb:      packet being mangled
 * @thoff:    offset of the TCP header from the network header
 * @port:     old port value
 * @new_port: replacement port value
 *
 * Ensures the TCP header is in the linear area and writable, then adjusts
 * tcph->check for the port change with inet_proto_csum_replace2().
 *
 * NOTE(review): the declaration of tcph, the failure branch and the
 * return statements are not visible in this extract.
 */
434 static int nf_flow_nat_port_tcp(struct sk_buff
*skb
, unsigned int thoff
,
435 __be16 port
, __be16 new_port
)
439 if (!pskb_may_pull(skb
, thoff
+ sizeof(*tcph
)) ||
440 skb_try_make_writable(skb
, thoff
+ sizeof(*tcph
)))
/* Header pointer is taken only after the pull/make-writable calls above. */
443 tcph
= (void *)(skb_network_header(skb
) + thoff
);
444 inet_proto_csum_replace2(&tcph
->check
, skb
, port
, new_port
, true);
/*
 * nf_flow_nat_port_udp - update the UDP checksum for a port rewrite.
 * @skb:      packet being mangled
 * @thoff:    offset of the UDP header from the network header
 * @port:     old port value
 * @new_port: replacement port value
 *
 * Ensures the UDP header is in the linear area and writable. The checksum
 * is adjusted only when it is non-zero or the skb uses CHECKSUM_PARTIAL.
 *
 * NOTE(review): the declaration of udph, the tail of the
 * inet_proto_csum_replace2() call, the condition guarding the
 * CSUM_MANGLED_0 assignment, the failure branch and the return statements
 * are not visible in this extract.
 */
449 static int nf_flow_nat_port_udp(struct sk_buff
*skb
, unsigned int thoff
,
450 __be16 port
, __be16 new_port
)
454 if (!pskb_may_pull(skb
, thoff
+ sizeof(*udph
)) ||
455 skb_try_make_writable(skb
, thoff
+ sizeof(*udph
)))
458 udph
= (void *)(skb_network_header(skb
) + thoff
);
459 if (udph
->check
|| skb
->ip_summed
== CHECKSUM_PARTIAL
) {
460 inet_proto_csum_replace2(&udph
->check
, skb
, port
,
463 udph
->check
= CSUM_MANGLED_0
;
/*
 * nf_flow_nat_port - checksum fix-up for a port rewrite.
 * @skb:      packet being mangled
 * @thoff:    offset of the transport header from the network header
 * @protocol: layer-4 protocol number
 * @port:     old port value
 * @new_port: replacement port value
 *
 * Calls the TCP or the UDP helper and tests its result for a negative
 * value.
 *
 * NOTE(review): the dispatch on @protocol, the error branches and the
 * return statements are not visible in this extract.
 */
469 static int nf_flow_nat_port(struct sk_buff
*skb
, unsigned int thoff
,
470 u8 protocol
, __be16 port
, __be16 new_port
)
474 if (nf_flow_nat_port_tcp(skb
, thoff
, port
, new_port
) < 0)
478 if (nf_flow_nat_port_udp(skb
, thoff
, port
, new_port
) < 0)
/*
 * nf_flow_snat_port - apply source-NAT port translation to a packet.
 * @flow:     flow providing the translated ports
 * @skb:      packet being mangled
 * @thoff:    offset of the transport header from the network header
 * @protocol: layer-4 protocol number, forwarded to nf_flow_nat_port()
 * @dir:      direction the packet travels in
 *
 * Original direction: the source port is replaced by the reply tuple's
 * destination port. Reply direction: the destination port is replaced by
 * the original tuple's source port. The checksum fix-up is delegated to
 * nf_flow_nat_port(), whose result is returned.
 *
 * NOTE(review): the switch statement, the assignments to 'port' (the old
 * value), the failure branch of the first test and any default case are
 * not visible in this extract.
 */
486 int nf_flow_snat_port(const struct flow_offload
*flow
,
487 struct sk_buff
*skb
, unsigned int thoff
,
488 u8 protocol
, enum flow_offload_tuple_dir dir
)
490 struct flow_ports
*hdr
;
491 __be16 port
, new_port
;
493 if (!pskb_may_pull(skb
, thoff
+ sizeof(*hdr
)) ||
494 skb_try_make_writable(skb
, thoff
+ sizeof(*hdr
)))
497 hdr
= (void *)(skb_network_header(skb
) + thoff
);
500 case FLOW_OFFLOAD_DIR_ORIGINAL
:
502 new_port
= flow
->tuplehash
[FLOW_OFFLOAD_DIR_REPLY
].tuple
.dst_port
;
503 hdr
->source
= new_port
;
505 case FLOW_OFFLOAD_DIR_REPLY
:
507 new_port
= flow
->tuplehash
[FLOW_OFFLOAD_DIR_ORIGINAL
].tuple
.src_port
;
508 hdr
->dest
= new_port
;
514 return nf_flow_nat_port(skb
, thoff
, protocol
, port
, new_port
);
516 EXPORT_SYMBOL_GPL(nf_flow_snat_port
);
/*
 * nf_flow_dnat_port - apply destination-NAT port translation to a packet.
 * @flow:     flow providing the translated ports
 * @skb:      packet being mangled
 * @thoff:    offset of the transport header from the network header
 * @protocol: layer-4 protocol number, forwarded to nf_flow_nat_port()
 * @dir:      direction the packet travels in
 *
 * Original direction: the destination port is replaced by the reply
 * tuple's source port. Reply direction: the source port is replaced by
 * the original tuple's destination port. The checksum fix-up is delegated
 * to nf_flow_nat_port(), whose result is returned.
 *
 * NOTE(review): the switch statement, the assignments to 'port' (the old
 * value), the failure branch of the first test and any default case are
 * not visible in this extract.
 */
518 int nf_flow_dnat_port(const struct flow_offload
*flow
,
519 struct sk_buff
*skb
, unsigned int thoff
,
520 u8 protocol
, enum flow_offload_tuple_dir dir
)
522 struct flow_ports
*hdr
;
523 __be16 port
, new_port
;
525 if (!pskb_may_pull(skb
, thoff
+ sizeof(*hdr
)) ||
526 skb_try_make_writable(skb
, thoff
+ sizeof(*hdr
)))
529 hdr
= (void *)(skb_network_header(skb
) + thoff
);
532 case FLOW_OFFLOAD_DIR_ORIGINAL
:
534 new_port
= flow
->tuplehash
[FLOW_OFFLOAD_DIR_REPLY
].tuple
.src_port
;
535 hdr
->dest
= new_port
;
537 case FLOW_OFFLOAD_DIR_REPLY
:
539 new_port
= flow
->tuplehash
[FLOW_OFFLOAD_DIR_ORIGINAL
].tuple
.dst_port
;
540 hdr
->source
= new_port
;
546 return nf_flow_nat_port(skb
, thoff
, protocol
, port
, new_port
);
548 EXPORT_SYMBOL_GPL(nf_flow_dnat_port
);
/*
 * nf_flow_table_init - set up a flowtable and start its garbage collector.
 * @flowtable: table to initialise
 *
 * Initialises the deferrable GC work item, the flow block and its rwsem,
 * and the rhashtable; queues the first GC run after HZ jiffies; and adds
 * the table to the flowtables list under flowtable_lock.
 *
 * NOTE(review): the declaration of err, the check of rhashtable_init()'s
 * result and the return statements are not visible in this extract.
 */
550 int nf_flow_table_init(struct nf_flowtable
*flowtable
)
554 INIT_DEFERRABLE_WORK(&flowtable
->gc_work
, nf_flow_offload_work_gc
);
555 flow_block_init(&flowtable
->flow_block
);
556 init_rwsem(&flowtable
->flow_block_lock
);
558 err
= rhashtable_init(&flowtable
->rhashtable
,
559 &nf_flow_offload_rhash_params
);
563 queue_delayed_work(system_power_efficient_wq
,
564 &flowtable
->gc_work
, HZ
);
566 mutex_lock(&flowtable_lock
);
567 list_add(&flowtable
->list
, &flowtables
);
568 mutex_unlock(&flowtable_lock
);
572 EXPORT_SYMBOL_GPL(nf_flow_table_init
);
/*
 * nf_flow_table_do_cleanup - iterator callback tearing down flows.
 * @flow: flow being visited
 * @data: a struct net_device pointer; nf_flow_table_free() passes NULL
 *
 * A flow is torn down when its conntrack entry lives in the device's
 * network namespace and either direction's iifidx equals the device's
 * ifindex.
 *
 * NOTE(review): the condition guarding the first flow_offload_teardown()
 * call is not visible in this extract. Presumably it covers the
 * NULL-device case and returns before 'dev' is dereferenced; confirm
 * against the full source.
 */
574 static void nf_flow_table_do_cleanup(struct flow_offload
*flow
, void *data
)
576 struct net_device
*dev
= data
;
579 flow_offload_teardown(flow
);
583 if (net_eq(nf_ct_net(flow
->ct
), dev_net(dev
)) &&
584 (flow
->tuplehash
[0].tuple
.iifidx
== dev
->ifindex
||
585 flow
->tuplehash
[1].tuple
.iifidx
== dev
->ifindex
))
586 flow_offload_teardown(flow
);
/*
 * nf_flow_table_iterate_cleanup - tear down a table's flows for a device.
 * @flowtable: table to clean
 * @dev:       device passed to nf_flow_table_do_cleanup() as its data
 *
 * Marks matching flows for teardown, waits for a GC run via
 * flush_delayed_work() and then calls nf_flow_table_offload_flush().
 */
589 static void nf_flow_table_iterate_cleanup(struct nf_flowtable
*flowtable
,
590 struct net_device
*dev
)
592 nf_flow_table_iterate(flowtable
, nf_flow_table_do_cleanup
, dev
);
593 flush_delayed_work(&flowtable
->gc_work
);
594 nf_flow_table_offload_flush(flowtable
);
/*
 * nf_flow_table_cleanup - clean every registered flowtable for a device.
 * @dev: device whose flows should be torn down
 *
 * Walks the flowtables list under flowtable_lock and runs
 * nf_flow_table_iterate_cleanup() on each table.
 */
597 void nf_flow_table_cleanup(struct net_device
*dev
)
599 struct nf_flowtable
*flowtable
;
601 mutex_lock(&flowtable_lock
);
602 list_for_each_entry(flowtable
, &flowtables
, list
)
603 nf_flow_table_iterate_cleanup(flowtable
, dev
);
604 mutex_unlock(&flowtable_lock
);
606 EXPORT_SYMBOL_GPL(nf_flow_table_cleanup
);
/*
 * nf_flow_table_free - shut down and destroy a flowtable.
 * @flow_table: table to destroy
 *
 * Order of the visible steps: unlink the table from the flowtables list
 * under flowtable_lock, cancel the GC work and wait for it, tear down
 * flows (do_cleanup with a NULL device), run a GC pass by hand, flush
 * pending offload work, run a further GC pass when hardware offload is
 * enabled, and finally destroy the rhashtable.
 *
 * NOTE(review): the last argument of the second GC-pass call is not
 * visible in this extract.
 */
608 void nf_flow_table_free(struct nf_flowtable
*flow_table
)
610 mutex_lock(&flowtable_lock
);
611 list_del(&flow_table
->list
);
612 mutex_unlock(&flowtable_lock
);
614 cancel_delayed_work_sync(&flow_table
->gc_work
);
615 nf_flow_table_iterate(flow_table
, nf_flow_table_do_cleanup
, NULL
);
616 nf_flow_table_iterate(flow_table
, nf_flow_offload_gc_step
, flow_table
);
617 nf_flow_table_offload_flush(flow_table
);
618 if (nf_flowtable_hw_offload(flow_table
))
619 nf_flow_table_iterate(flow_table
, nf_flow_offload_gc_step
,
621 rhashtable_destroy(&flow_table
->rhashtable
);
623 EXPORT_SYMBOL_GPL(nf_flow_table_free
);
/*
 * Module init hook: returns the result of nf_flow_table_offload_init().
 */
625 static int __init
nf_flow_table_module_init(void)
627 return nf_flow_table_offload_init();
/*
 * Module exit hook: calls nf_flow_table_offload_exit().
 */
630 static void __exit
nf_flow_table_module_exit(void)
632 nf_flow_table_offload_exit();
/* Register the module entry/exit hooks and declare module metadata. */
635 module_init(nf_flow_table_module_init
);
636 module_exit(nf_flow_table_module_exit
);
638 MODULE_LICENSE("GPL");
639 MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");