git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
xfrm: Refactor xfrm_input lock to reduce contention with RSS
author Jianbo Liu <jianbol@nvidia.com>
Tue, 21 Oct 2025 01:35:42 +0000 (04:35 +0300)
committer Steffen Klassert <steffen.klassert@secunet.com>
Mon, 27 Oct 2025 09:24:30 +0000 (10:24 +0100)
With newer NICs like mlx5 supporting RSS for IPsec crypto offload,
packets for a single Security Association (SA) are scattered across
multiple CPU cores for parallel processing. The xfrm_state spinlock
(x->lock) is held for each packet during xfrm processing.

When multiple connections or flows share the same SA, this parallelism
causes high lock contention on x->lock, creating a performance
bottleneck and limiting scalability.

The original xfrm_input() function exacerbated this issue by releasing
and immediately re-acquiring x->lock. For hardware crypto offload
paths, this unlock/relock sequence is unnecessary and introduces
significant overhead. This patch refactors the function to relocate
the type_offload->input_tail call for the offload path, performing all
necessary work while continuously holding the lock. This reordering is
safe, since packets which don't pass the checks below will still fail
them with the new code.

Performance testing with iperf using multiple parallel streams over a
single IPsec SA shows significant improvement in throughput as the
number of queues (and thus CPU cores) increases:

+-----------+---------------+--------------+-----------------+
| RX queues | Before (Gbps) | After (Gbps) | Improvement (%) |
+-----------+---------------+--------------+-----------------+
|         2 |          32.3 |         34.4 |             6.5 |
|         4 |          34.4 |         40.0 |            16.3 |
|         6 |          24.5 |         38.3 |            56.3 |
|         8 |          23.1 |         38.3 |            65.8 |
|        12 |          18.1 |         29.9 |            65.2 |
|        16 |          16.0 |         25.2 |            57.5 |
+-----------+---------------+--------------+-----------------+

Signed-off-by: Jianbo Liu <jianbol@nvidia.com>
Reviewed-by: Cosmin Ratiu <cratiu@nvidia.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
net/xfrm/xfrm_input.c

index c9ddef869aa557ba678eaa189d72a22b2d4db7e0..257935cbd2211ec48002428b6ff5a91955fd11bd 100644 (file)
@@ -505,6 +505,7 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
                        async = 1;
                        dev_put(skb->dev);
                        seq = XFRM_SKB_CB(skb)->seq.input.low;
+                       spin_lock(&x->lock);
                        goto resume;
                }
                /* GRO call */
@@ -541,6 +542,8 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
                                XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR);
                                goto drop;
                        }
+
+                       nexthdr = x->type_offload->input_tail(x, skb);
                }
 
                goto lock;
@@ -638,11 +641,9 @@ lock:
                        goto drop_unlock;
                }
 
-               spin_unlock(&x->lock);
-
                if (xfrm_tunnel_check(skb, x, family)) {
                        XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEMODEERROR);
-                       goto drop;
+                       goto drop_unlock;
                }
 
                seq_hi = htonl(xfrm_replay_seqhi(x, seq));
@@ -650,9 +651,8 @@ lock:
                XFRM_SKB_CB(skb)->seq.input.low = seq;
                XFRM_SKB_CB(skb)->seq.input.hi = seq_hi;
 
-               if (crypto_done) {
-                       nexthdr = x->type_offload->input_tail(x, skb);
-               } else {
+               if (!crypto_done) {
+                       spin_unlock(&x->lock);
                        dev_hold(skb->dev);
 
                        nexthdr = x->type->input(x, skb);
@@ -660,9 +660,9 @@ lock:
                                return 0;
 
                        dev_put(skb->dev);
+                       spin_lock(&x->lock);
                }
 resume:
-               spin_lock(&x->lock);
                if (nexthdr < 0) {
                        if (nexthdr == -EBADMSG) {
                                xfrm_audit_state_icvfail(x, skb,