git.ipfire.org Git - thirdparty/kernel/stable.git/commitdiff
net: track pfmemalloc drops via SKB_DROP_REASON_PFMEMALLOC
Author: Jesper Dangaard Brouer <hawk@kernel.org>
Wed, 16 Jul 2025 16:26:53 +0000 (18:26 +0200)
Committer: Jakub Kicinski <kuba@kernel.org>
Fri, 18 Jul 2025 23:59:05 +0000 (16:59 -0700)
Add a new SKB drop reason (SKB_DROP_REASON_PFMEMALLOC) to track packets
dropped due to memory pressure. In production environments, we've observed
memory exhaustion reported by memory layer stack traces, but these drops
were not properly tracked in the SKB drop reason infrastructure.

While most network code paths now properly report pfmemalloc drops, some
protocol-specific socket implementations still use sk_filter() without
drop reason tracking:
- Bluetooth L2CAP sockets
- CAIF sockets
- IUCV sockets
- Netlink sockets
- SCTP sockets
- Unix domain sockets

These remaining cases represent less common paths and could be converted
in a follow-up patch if needed. The current implementation provides
significantly improved observability into memory pressure events in the
network stack, especially for key protocols like TCP and UDP, helping to
diagnose problems in production environments.

Reported-by: Matt Fleming <mfleming@cloudflare.com>
Signed-off-by: Jesper Dangaard Brouer <hawk@kernel.org>
Link: https://patch.msgid.link/175268316579.2407873.11634752355644843509.stgit@firesoul
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
12 files changed:
drivers/net/tun.c
include/linux/filter.h
include/net/dropreason-core.h
include/net/tcp.h
net/core/dev.c
net/core/filter.c
net/core/sock.c
net/ipv4/tcp_ipv4.c
net/ipv4/udp.c
net/ipv6/tcp_ipv6.c
net/ipv6/udp.c
net/rose/rose_in.c

index 49bcd12a4ac84b4100ae0870a81b2b2738c2e07c..e65228ba3fae4dd9c540456303b0c65ce185aa98 100644 (file)
@@ -1002,8 +1002,8 @@ static unsigned int run_ebpf_filter(struct tun_struct *tun,
 /* Net device start xmit */
 static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
 {
+       enum skb_drop_reason drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
        struct tun_struct *tun = netdev_priv(dev);
-       enum skb_drop_reason drop_reason;
        int txq = skb->queue_mapping;
        struct netdev_queue *queue;
        struct tun_file *tfile;
@@ -1032,10 +1032,8 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
        }
 
        if (tfile->socket.sk->sk_filter &&
-           sk_filter(tfile->socket.sk, skb)) {
-               drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
+           sk_filter_reason(tfile->socket.sk, skb, &drop_reason))
                goto drop;
-       }
 
        len = run_ebpf_filter(tun, skb, len);
        if (len == 0) {
index f5cf4d35d83e97e6f5ef0eb97ac31855eeb65e9c..4e82332afe039f2bc8d5968e7900491284cd1fea 100644 (file)
@@ -1073,10 +1073,20 @@ bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr)
        return set_memory_rox((unsigned long)hdr, hdr->size >> PAGE_SHIFT);
 }
 
-int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap);
+int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap,
+                      enum skb_drop_reason *reason);
+
 static inline int sk_filter(struct sock *sk, struct sk_buff *skb)
 {
-       return sk_filter_trim_cap(sk, skb, 1);
+       enum skb_drop_reason ignore_reason;
+
+       return sk_filter_trim_cap(sk, skb, 1, &ignore_reason);
+}
+
+static inline int sk_filter_reason(struct sock *sk, struct sk_buff *skb,
+                                  enum skb_drop_reason *reason)
+{
+       return sk_filter_trim_cap(sk, skb, 1, reason);
 }
 
 struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err);
index 229bb1826f2a401ca55257f8dc01e1aa9f8b7c23..e19184dd1b0f6f01b85feb23821e83fa66792ce6 100644 (file)
        FN(CAN_RX_INVALID_FRAME)        \
        FN(CANFD_RX_INVALID_FRAME)      \
        FN(CANXL_RX_INVALID_FRAME)      \
+       FN(PFMEMALLOC)  \
        FNe(MAX)
 
 /**
@@ -598,6 +599,11 @@ enum skb_drop_reason {
         * non conform CAN-XL frame (or device is unable to receive CAN frames)
         */
        SKB_DROP_REASON_CANXL_RX_INVALID_FRAME,
+       /**
+        * @SKB_DROP_REASON_PFMEMALLOC: packet allocated from memory reserve
+        * reached a path or socket not eligible for use of memory reserves
+        */
+       SKB_DROP_REASON_PFMEMALLOC,
        /**
         * @SKB_DROP_REASON_MAX: the maximum of core drop reasons, which
         * shouldn't be used as a real 'reason' - only for tracing code gen
index bc08de49805cf6fc2ffbec96e42bf12378fd10cf..b3815d10434000aad82b415c1c4c135670f9e09f 100644 (file)
@@ -1559,7 +1559,7 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
                     enum skb_drop_reason *reason);
 
 
-int tcp_filter(struct sock *sk, struct sk_buff *skb);
+int tcp_filter(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason *reason);
 void tcp_set_state(struct sock *sk, int state);
 void tcp_done(struct sock *sk);
 int tcp_abort(struct sock *sk, int err);
index 621a639aeba1b7431c4676c3aea8724976006aae..59a9089117de07753ea1a4bb6ba04d15be6e6aec 100644 (file)
@@ -5749,6 +5749,7 @@ static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
 static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
                                    struct packet_type **ppt_prev)
 {
+       enum skb_drop_reason drop_reason = SKB_DROP_REASON_UNHANDLED_PROTO;
        struct packet_type *ptype, *pt_prev;
        rx_handler_func_t *rx_handler;
        struct sk_buff *skb = *pskb;
@@ -5840,8 +5841,10 @@ skip_taps:
 #endif
        skb_reset_redirect(skb);
 skip_classify:
-       if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
+       if (pfmemalloc && !skb_pfmemalloc_protocol(skb)) {
+               drop_reason = SKB_DROP_REASON_PFMEMALLOC;
                goto drop;
+       }
 
        if (skb_vlan_tag_present(skb)) {
                if (pt_prev) {
@@ -5946,7 +5949,8 @@ drop:
                        dev_core_stats_rx_dropped_inc(skb->dev);
                else
                        dev_core_stats_rx_nohandler_inc(skb->dev);
-               kfree_skb_reason(skb, SKB_DROP_REASON_UNHANDLED_PROTO);
+
+               kfree_skb_reason(skb, drop_reason);
                /* Jamal, now you will not able to escape explaining
                 * me how you were going to use this. :-)
                 */
index 7a72f766aacfaf5e7b7b9445485419b608a5e2de..2eb8947d80976c3f4e8ed1d2b0b7ee688973cca1 100644 (file)
@@ -122,6 +122,7 @@ EXPORT_SYMBOL_GPL(copy_bpf_fprog_from_user);
  *     @sk: sock associated with &sk_buff
  *     @skb: buffer to filter
  *     @cap: limit on how short the eBPF program may trim the packet
+ *     @reason: record drop reason on errors (negative return value)
  *
  * Run the eBPF program and then cut skb->data to correct size returned by
  * the program. If pkt_len is 0 we toss packet. If skb->len is smaller
@@ -130,7 +131,8 @@ EXPORT_SYMBOL_GPL(copy_bpf_fprog_from_user);
  * be accepted or -EPERM if the packet should be tossed.
  *
  */
-int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap)
+int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb,
+                      unsigned int cap, enum skb_drop_reason *reason)
 {
        int err;
        struct sk_filter *filter;
@@ -142,15 +144,20 @@ int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap)
         */
        if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC)) {
                NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP);
+               *reason = SKB_DROP_REASON_PFMEMALLOC;
                return -ENOMEM;
        }
        err = BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb);
-       if (err)
+       if (err) {
+               *reason = SKB_DROP_REASON_SOCKET_FILTER;
                return err;
+       }
 
        err = security_sock_rcv_skb(sk, skb);
-       if (err)
+       if (err) {
+               *reason = SKB_DROP_REASON_SECURITY_HOOK;
                return err;
+       }
 
        rcu_read_lock();
        filter = rcu_dereference(sk->sk_filter);
@@ -162,6 +169,8 @@ int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap)
                pkt_len = bpf_prog_run_save_cb(filter->prog, skb);
                skb->sk = save_sk;
                err = pkt_len ? pskb_trim(skb, max(cap, pkt_len)) : -EPERM;
+               if (err)
+                       *reason = SKB_DROP_REASON_SOCKET_FILTER;
        }
        rcu_read_unlock();
 
index 8b7623c7d547dbf20c263aed249e63f62e988447..7c26ec8dce630f0d24a622a418c15e6594d1babb 100644 (file)
@@ -526,11 +526,10 @@ int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb,
        enum skb_drop_reason drop_reason;
        int err;
 
-       err = sk_filter(sk, skb);
-       if (err) {
-               drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
+       err = sk_filter_reason(sk, skb, &drop_reason);
+       if (err)
                goto out;
-       }
+
        err = __sock_queue_rcv_skb(sk, skb);
        switch (err) {
        case -ENOMEM:
@@ -553,15 +552,18 @@ EXPORT_SYMBOL(sock_queue_rcv_skb_reason);
 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
                     const int nested, unsigned int trim_cap, bool refcounted)
 {
+       enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED;
        int rc = NET_RX_SUCCESS;
+       int err;
 
-       if (sk_filter_trim_cap(sk, skb, trim_cap))
+       if (sk_filter_trim_cap(sk, skb, trim_cap, &reason))
                goto discard_and_relse;
 
        skb->dev = NULL;
 
        if (sk_rcvqueues_full(sk, READ_ONCE(sk->sk_rcvbuf))) {
                atomic_inc(&sk->sk_drops);
+               reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
                goto discard_and_relse;
        }
        if (nested)
@@ -577,8 +579,12 @@ int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
                rc = sk_backlog_rcv(sk, skb);
 
                mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
-       } else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
+       } else if ((err = sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf)))) {
                bh_unlock_sock(sk);
+               if (err == -ENOMEM)
+                       reason = SKB_DROP_REASON_PFMEMALLOC;
+               if (err == -ENOBUFS)
+                       reason = SKB_DROP_REASON_SOCKET_BACKLOG;
                atomic_inc(&sk->sk_drops);
                goto discard_and_relse;
        }
@@ -589,7 +595,7 @@ out:
                sock_put(sk);
        return rc;
 discard_and_relse:
-       kfree_skb(skb);
+       sk_skb_reason_drop(sk, skb, reason);
        goto out;
 }
 EXPORT_SYMBOL(__sk_receive_skb);
index 16bf6fdff96b441064157698ed7e684c2fc30de6..84d3d556ed8062d07fe7019bc0dadd90d3b80d96 100644 (file)
@@ -2026,6 +2026,7 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
        u32 gso_size;
        u64 limit;
        int delta;
+       int err;
 
        /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
         * we can fix skb->truesize to its real value to avoid future drops.
@@ -2136,21 +2137,27 @@ no_coalesce:
 
        limit = min_t(u64, limit, UINT_MAX);
 
-       if (unlikely(sk_add_backlog(sk, skb, limit))) {
+       err = sk_add_backlog(sk, skb, limit);
+       if (unlikely(err)) {
                bh_unlock_sock(sk);
-               *reason = SKB_DROP_REASON_SOCKET_BACKLOG;
-               __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
+               if (err == -ENOMEM) {
+                       *reason = SKB_DROP_REASON_PFMEMALLOC;
+                       __NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP);
+               } else {
+                       *reason = SKB_DROP_REASON_SOCKET_BACKLOG;
+                       __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
+               }
                return true;
        }
        return false;
 }
 EXPORT_IPV6_MOD(tcp_add_backlog);
 
-int tcp_filter(struct sock *sk, struct sk_buff *skb)
+int tcp_filter(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason *reason)
 {
        struct tcphdr *th = (struct tcphdr *)skb->data;
 
-       return sk_filter_trim_cap(sk, skb, th->doff * 4);
+       return sk_filter_trim_cap(sk, skb, th->doff * 4, reason);
 }
 EXPORT_IPV6_MOD(tcp_filter);
 
@@ -2277,14 +2284,12 @@ lookup:
                }
                refcounted = true;
                nsk = NULL;
-               if (!tcp_filter(sk, skb)) {
+               if (!tcp_filter(sk, skb, &drop_reason)) {
                        th = (const struct tcphdr *)skb->data;
                        iph = ip_hdr(skb);
                        tcp_v4_fill_cb(skb, iph, th);
                        nsk = tcp_check_req(sk, skb, req, false, &req_stolen,
                                            &drop_reason);
-               } else {
-                       drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
                }
                if (!nsk) {
                        reqsk_put(req);
@@ -2340,10 +2345,9 @@ process:
 
        nf_reset_ct(skb);
 
-       if (tcp_filter(sk, skb)) {
-               drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
+       if (tcp_filter(sk, skb, &drop_reason))
                goto discard_and_relse;
-       }
+
        th = (const struct tcphdr *)skb->data;
        iph = ip_hdr(skb);
        tcp_v4_fill_cb(skb, iph, th);
index 49f43c54cfb0e3ac85c1a6202f3d6a2f1ca6d0ba..cc3ce0f762ec211a963464c2dd7ac329a6be1ffd 100644 (file)
@@ -2347,7 +2347,7 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
  */
 static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
 {
-       int drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
+       enum skb_drop_reason drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
        struct udp_sock *up = udp_sk(sk);
        int is_udplite = IS_UDPLITE(sk);
 
@@ -2436,10 +2436,8 @@ static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
            udp_lib_checksum_complete(skb))
                        goto csum_error;
 
-       if (sk_filter_trim_cap(sk, skb, sizeof(struct udphdr))) {
-               drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
+       if (sk_filter_trim_cap(sk, skb, sizeof(struct udphdr), &drop_reason))
                goto drop;
-       }
 
        udp_csum_pull_header(skb);
 
index 8f2c3cba1f1fa4c1074cb020722508726588c3ac..7577e7eb2c97b821826f633a11dd5567dde7b7cb 100644 (file)
@@ -1834,14 +1834,12 @@ lookup:
                }
                refcounted = true;
                nsk = NULL;
-               if (!tcp_filter(sk, skb)) {
+               if (!tcp_filter(sk, skb, &drop_reason)) {
                        th = (const struct tcphdr *)skb->data;
                        hdr = ipv6_hdr(skb);
                        tcp_v6_fill_cb(skb, hdr, th);
                        nsk = tcp_check_req(sk, skb, req, false, &req_stolen,
                                            &drop_reason);
-               } else {
-                       drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
                }
                if (!nsk) {
                        reqsk_put(req);
@@ -1897,10 +1895,9 @@ process:
 
        nf_reset_ct(skb);
 
-       if (tcp_filter(sk, skb)) {
-               drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
+       if (tcp_filter(sk, skb, &drop_reason))
                goto discard_and_relse;
-       }
+
        th = (const struct tcphdr *)skb->data;
        hdr = ipv6_hdr(skb);
        tcp_v6_fill_cb(skb, hdr, th);
index 6bbdadbd5fecccfb7de99f05c6fb179393e162f2..6a68f77da44b55baed42b44c936902f865754140 100644 (file)
@@ -894,10 +894,8 @@ static int udpv6_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
            udp_lib_checksum_complete(skb))
                goto csum_error;
 
-       if (sk_filter_trim_cap(sk, skb, sizeof(struct udphdr))) {
-               drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
+       if (sk_filter_trim_cap(sk, skb, sizeof(struct udphdr), &drop_reason))
                goto drop;
-       }
 
        udp_csum_pull_header(skb);
 
index 4d67f36dce1b4961aef4ecebe775a9675490832d..3e99181e759f983d3f53f127c5d956863e926c8d 100644 (file)
@@ -101,6 +101,7 @@ static int rose_state2_machine(struct sock *sk, struct sk_buff *skb, int framety
  */
 static int rose_state3_machine(struct sock *sk, struct sk_buff *skb, int frametype, int ns, int nr, int q, int d, int m)
 {
+       enum skb_drop_reason dr; /* ignored */
        struct rose_sock *rose = rose_sk(sk);
        int queued = 0;
 
@@ -162,7 +163,7 @@ static int rose_state3_machine(struct sock *sk, struct sk_buff *skb, int framety
                rose_frames_acked(sk, nr);
                if (ns == rose->vr) {
                        rose_start_idletimer(sk);
-                       if (sk_filter_trim_cap(sk, skb, ROSE_MIN_LEN) == 0 &&
+                       if (!sk_filter_trim_cap(sk, skb, ROSE_MIN_LEN, &dr) &&
                            __sock_queue_rcv_skb(sk, skb) == 0) {
                                rose->vr = (rose->vr + 1) % ROSE_MODULUS;
                                queued = 1;