]> git.ipfire.org Git - thirdparty/kernel/stable.git/commitdiff
tcp: trace retransmit failures in tcp_retransmit_skb
authorFan Yu <fan.yu9@zte.com.cn>
Mon, 21 Jul 2025 03:16:07 +0000 (11:16 +0800)
committerJakub Kicinski <kuba@kernel.org>
Wed, 23 Jul 2025 01:19:11 +0000 (18:19 -0700)
Background
==========
When TCP retransmits a packet due to missing ACKs, the
retransmission may fail for various reasons (e.g., packets
stuck in driver queues, receiver zero windows, or routing issues).

The original tcp_retransmit_skb tracepoint:

  'commit e086101b150a ("tcp: add a tracepoint for tcp retransmission")'

lacks visibility into these failure causes, making production
diagnostics difficult.

Solution
========
Adds the retval("err") to the tcp_retransmit_skb tracepoint.
Enables users to know why some tcp retransmission failed and
users can filter retransmission failures by retval.

Compatibility description
=========================
This patch extends the tcp_retransmit_skb tracepoint
by adding a new "err" field at the end of its
existing structure (within TP_STRUCT__entry). The
compatibility implications are detailed as follows:

1) Structural compatibility for legacy user-space tools
Legacy tools/BPF programs accessing existing fields
(by offset or name) can still work without modification
or recompilation.The new field is appended to the end,
preserving original memory layout.

2) Note: semantic changes
The original tracepoint primarily only focused on
successfully retransmitted packets. With this patch,
the tracepoint now can figure out packets that may
terminate early due to specific reasons. For accurate
statistics, users should filter using "err" to
distinguish outcomes.

Before patched:
field:const void * skbaddr; offset:8; size:8; signed:0;
field:const void * skaddr; offset:16; size:8; signed:0;
field:int state; offset:24; size:4; signed:1;
field:__u16 sport; offset:28; size:2; signed:0;
field:__u16 dport; offset:30; size:2; signed:0;
field:__u16 family; offset:32; size:2; signed:0;
field:__u8 saddr[4]; offset:34; size:4; signed:0;
field:__u8 daddr[4]; offset:38; size:4; signed:0;
field:__u8 saddr_v6[16]; offset:42; size:16; signed:0;
field:__u8 daddr_v6[16]; offset:58; size:16; signed:0;

print fmt: "skbaddr=%p skaddr=%p family=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c state=%s"

After patched:
field:const void * skbaddr; offset:8; size:8; signed:0;
field:const void * skaddr; offset:16; size:8; signed:0;
field:int state; offset:24; size:4; signed:1;
field:__u16 sport; offset:28; size:2; signed:0;
field:__u16 dport; offset:30; size:2; signed:0;
field:__u16 family; offset:32; size:2; signed:0;
field:__u8 saddr[4]; offset:34; size:4; signed:0;
field:__u8 daddr[4]; offset:38; size:4; signed:0;
field:__u8 saddr_v6[16]; offset:42; size:16; signed:0;
field:__u8 daddr_v6[16]; offset:58; size:16; signed:0;
field:int err; offset:76; size:4; signed:1;

print fmt: "skbaddr=%p skaddr=%p family=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c state=%s err=%d"

Co-developed-by: xu xin <xu.xin16@zte.com.cn>
Signed-off-by: xu xin <xu.xin16@zte.com.cn>
Signed-off-by: Fan Yu <fan.yu9@zte.com.cn>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20250721111607626_BDnIJB0ywk6FghN63bor@zte.com.cn
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
include/trace/events/tcp.h
net/ipv4/tcp_output.c

index 54e60c6009e3b4715a517c5ced8ab1692f90eafd..9d2c36c6a0ed74c82306f347ff41fabed7ada8c4 100644 (file)
 #include <linux/sock_diag.h>
 #include <net/rstreason.h>
 
-/*
- * tcp event with arguments sk and skb
- *
- * Note: this class requires a valid sk pointer; while skb pointer could
- *       be NULL.
- */
-DECLARE_EVENT_CLASS(tcp_event_sk_skb,
+TRACE_EVENT(tcp_retransmit_skb,
 
-       TP_PROTO(const struct sock *sk, const struct sk_buff *skb),
+       TP_PROTO(const struct sock *sk, const struct sk_buff *skb, int err),
 
-       TP_ARGS(sk, skb),
+       TP_ARGS(sk, skb, err),
 
        TP_STRUCT__entry(
                __field(const void *, skbaddr)
@@ -36,6 +30,7 @@ DECLARE_EVENT_CLASS(tcp_event_sk_skb,
                __array(__u8, daddr, 4)
                __array(__u8, saddr_v6, 16)
                __array(__u8, daddr_v6, 16)
+               __field(int, err)
        ),
 
        TP_fast_assign(
@@ -58,21 +53,17 @@ DECLARE_EVENT_CLASS(tcp_event_sk_skb,
 
                TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr,
                              sk->sk_v6_rcv_saddr, sk->sk_v6_daddr);
+
+               __entry->err = err;
        ),
 
-       TP_printk("skbaddr=%p skaddr=%p family=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c state=%s",
+       TP_printk("skbaddr=%p skaddr=%p family=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c state=%s err=%d",
                  __entry->skbaddr, __entry->skaddr,
                  show_family_name(__entry->family),
                  __entry->sport, __entry->dport, __entry->saddr, __entry->daddr,
                  __entry->saddr_v6, __entry->daddr_v6,
-                 show_tcp_state_name(__entry->state))
-);
-
-DEFINE_EVENT(tcp_event_sk_skb, tcp_retransmit_skb,
-
-       TP_PROTO(const struct sock *sk, const struct sk_buff *skb),
-
-       TP_ARGS(sk, skb)
+                 show_tcp_state_name(__entry->state),
+                 __entry->err)
 );
 
 #undef FN
index b616776e3354c7df23890100def1df729fd33d12..caf11920a87861955b0d7eeb2c0c03973d6e58ee 100644 (file)
@@ -3330,8 +3330,10 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
        if (icsk->icsk_mtup.probe_size)
                icsk->icsk_mtup.probe_size = 0;
 
-       if (skb_still_in_host_queue(sk, skb))
-               return -EBUSY;
+       if (skb_still_in_host_queue(sk, skb)) {
+               err = -EBUSY;
+               goto out;
+       }
 
 start:
        if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {
@@ -3342,14 +3344,19 @@ start:
                }
                if (unlikely(before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))) {
                        WARN_ON_ONCE(1);
-                       return -EINVAL;
+                       err = -EINVAL;
+                       goto out;
+               }
+               if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq)) {
+                       err = -ENOMEM;
+                       goto out;
                }
-               if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
-                       return -ENOMEM;
        }
 
-       if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
-               return -EHOSTUNREACH; /* Routing failure or similar. */
+       if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk)) {
+               err = -EHOSTUNREACH; /* Routing failure or similar. */
+               goto out;
+       }
 
        cur_mss = tcp_current_mss(sk);
        avail_wnd = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
@@ -3360,8 +3367,10 @@ start:
         * our retransmit of one segment serves as a zero window probe.
         */
        if (avail_wnd <= 0) {
-               if (TCP_SKB_CB(skb)->seq != tp->snd_una)
-                       return -EAGAIN;
+               if (TCP_SKB_CB(skb)->seq != tp->snd_una) {
+                       err = -EAGAIN;
+                       goto out;
+               }
                avail_wnd = cur_mss;
        }
 
@@ -3373,11 +3382,15 @@ start:
        }
        if (skb->len > len) {
                if (tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, len,
-                                cur_mss, GFP_ATOMIC))
-                       return -ENOMEM; /* We'll try again later. */
+                                cur_mss, GFP_ATOMIC)) {
+                       err = -ENOMEM;  /* We'll try again later. */
+                       goto out;
+               }
        } else {
-               if (skb_unclone_keeptruesize(skb, GFP_ATOMIC))
-                       return -ENOMEM;
+               if (skb_unclone_keeptruesize(skb, GFP_ATOMIC)) {
+                       err = -ENOMEM;
+                       goto out;
+               }
 
                diff = tcp_skb_pcount(skb);
                tcp_set_skb_tso_segs(skb, cur_mss);
@@ -3431,17 +3444,16 @@ start:
                tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RETRANS_CB,
                                  TCP_SKB_CB(skb)->seq, segs, err);
 
-       if (likely(!err)) {
-               trace_tcp_retransmit_skb(sk, skb);
-       } else if (err != -EBUSY) {
+       if (unlikely(err) && err != -EBUSY)
                NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL, segs);
-       }
 
        /* To avoid taking spuriously low RTT samples based on a timestamp
         * for a transmit that never happened, always mark EVER_RETRANS
         */
        TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;
 
+out:
+       trace_tcp_retransmit_skb(sk, skb, err);
        return err;
 }