net: Allow opt-out from global protocol memory accounting.
author     Kuniyuki Iwashima <kuniyu@google.com>
           Tue, 14 Oct 2025 23:54:55 +0000 (23:54 +0000)
committer  Martin KaFai Lau <martin.lau@kernel.org>
           Thu, 16 Oct 2025 19:04:47 +0000 (12:04 -0700)
Some protocols (e.g., TCP, UDP) implement memory accounting for socket
buffers and charge memory to per-protocol global counters pointed to by
sk->sk_prot->memory_allocated.
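
For reference, a protocol opts in to this accounting by wiring the global
counters into its struct proto; a simplified excerpt of TCP's definition
(trimmed to the relevant fields, roughly as in net/ipv4/tcp_ipv4.c):

	/* Simplified sketch: the per-protocol global counters that
	 * sk->sk_prot->memory_allocated and friends point to.  Unrelated
	 * members are omitted.
	 */
	struct proto tcp_prot = {
		.name			= "TCP",
		.memory_allocated	= &tcp_memory_allocated,
		.memory_pressure	= &tcp_memory_pressure,
		.sysctl_mem		= sysctl_tcp_mem,
		/* ... */
	};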

Sometimes, system processes do not want that limitation.  For a similar
purpose, SO_RESERVE_MEM exists for sockets under memcg.
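
For comparison, SO_RESERVE_MEM is set from userspace with setsockopt(); a
minimal sketch (the byte count here is an arbitrary example):

	/* Sketch: pre-reserve 1 MiB of socket memory for this socket.
	 * SO_RESERVE_MEM takes the number of bytes to reserve as an int.
	 */
	int bytes = 1 << 20;

	if (setsockopt(fd, SOL_SOCKET, SO_RESERVE_MEM, &bytes, sizeof(bytes)))
		perror("setsockopt(SO_RESERVE_MEM)");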

Also, by opting out of the per-protocol accounting, sockets under memcg
can avoid paying the cost of two orthogonal memory accounting mechanisms.
A microbenchmark result is included in the subsequent bpf patch.

Let's allow opt-out from the per-protocol memory accounting if
sk->sk_bypass_prot_mem is true.

sk->sk_bypass_prot_mem and sk->sk_prot are placed in the same cache
line, and sk_has_account() always fetches sk->sk_prot before accessing
sk->sk_bypass_prot_mem, so this patch adds no extra cache miss.
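
For context, sk_has_account() in include/net/sock.h is essentially the
following one-liner, so the sk->sk_prot load mentioned above already happens
on every accounting path (simplified sketch):

	/* Sketch of the existing helper: protocols that opt in to memory
	 * accounting set sk_prot->memory_allocated, so this single load of
	 * sk->sk_prot is what pulls in the shared cache line.
	 */
	static inline bool sk_has_account(struct sock *sk)
	{
		return !!sk->sk_prot->memory_allocated;
	}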

The following patches will set sk->sk_bypass_prot_mem to true, after
which the per-protocol memory accounting will be skipped.

Note that this does NOT disable memcg accounting, only the per-protocol one.

Another option that would avoid using the hole in struct sock_common is
to create sk_prot variants like tcp_prot_bypass, but this would complicate
the SOCKMAP logic, tcp_bpf_prots, etc.
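
To illustrate that rejected alternative: such a variant would be a near-copy
of tcp_prot with the accounting pointers cleared, and every path that
rewrites sk->sk_prot (SOCKMAP, tcp_bpf_prots) would need to handle both
variants.  A purely hypothetical sketch, not part of this series:

	/* Hypothetical alternative (never merged): a second struct proto
	 * with NULL accounting pointers so sk_has_account() returns false.
	 * Each sk->sk_prot rewrite would then have to pick the right copy.
	 */
	static struct proto tcp_prot_bypass;

	static void __init tcp_prot_bypass_init(void)
	{
		tcp_prot_bypass = tcp_prot;
		tcp_prot_bypass.memory_allocated = NULL;
		tcp_prot_bypass.memory_pressure  = NULL;
	}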

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Reviewed-by: Shakeel Butt <shakeel.butt@linux.dev>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Acked-by: Roman Gushchin <roman.gushchin@linux.dev>
Link: https://patch.msgid.link/20251014235604.3057003-3-kuniyu@google.com
include/net/proto_memory.h
include/net/sock.h
include/net/tcp.h
net/core/sock.c
net/ipv4/tcp.c
net/ipv4/tcp_output.c
net/mptcp/protocol.c
net/tls/tls_device.c

diff --git a/include/net/proto_memory.h b/include/net/proto_memory.h
index 8e91a8fa31b52aa5c69b66930f1877eefecfa1ce..ad6d703ce6fe1d8ee26fb4e68ebbe73a62227d9f 100644
--- a/include/net/proto_memory.h
+++ b/include/net/proto_memory.h
@@ -35,6 +35,9 @@ static inline bool sk_under_memory_pressure(const struct sock *sk)
            mem_cgroup_sk_under_memory_pressure(sk))
                return true;
 
+       if (sk->sk_bypass_prot_mem)
+               return false;
+
        return !!READ_ONCE(*sk->sk_prot->memory_pressure);
 }
 
diff --git a/include/net/sock.h b/include/net/sock.h
index 30ac2eb4ef9bf73743e3dc9e66c6c3059f34964e..415e7381aa5051a78b1eda5da458e02507318b13 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -118,6 +118,7 @@ typedef __u64 __bitwise __addrpair;
  *     @skc_reuseport: %SO_REUSEPORT setting
  *     @skc_ipv6only: socket is IPV6 only
  *     @skc_net_refcnt: socket is using net ref counting
+ *     @skc_bypass_prot_mem: bypass the per-protocol memory accounting for skb
  *     @skc_bound_dev_if: bound device index if != 0
  *     @skc_bind_node: bind hash linkage for various protocol lookup tables
  *     @skc_portaddr_node: second hash linkage for UDP/UDP-Lite protocol
@@ -174,6 +175,7 @@ struct sock_common {
        unsigned char           skc_reuseport:1;
        unsigned char           skc_ipv6only:1;
        unsigned char           skc_net_refcnt:1;
+       unsigned char           skc_bypass_prot_mem:1;
        int                     skc_bound_dev_if;
        union {
                struct hlist_node       skc_bind_node;
@@ -381,6 +383,7 @@ struct sock {
 #define sk_reuseport           __sk_common.skc_reuseport
 #define sk_ipv6only            __sk_common.skc_ipv6only
 #define sk_net_refcnt          __sk_common.skc_net_refcnt
+#define sk_bypass_prot_mem     __sk_common.skc_bypass_prot_mem
 #define sk_bound_dev_if                __sk_common.skc_bound_dev_if
 #define sk_bind_node           __sk_common.skc_bind_node
 #define sk_prot                        __sk_common.skc_prot
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 1e547138f4fb7f5c47d15990954d4d135f465f73..439e327fdbfad7829d64c08d999dcc1ed73aef93 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -303,6 +303,9 @@ static inline bool tcp_under_memory_pressure(const struct sock *sk)
            mem_cgroup_sk_under_memory_pressure(sk))
                return true;
 
+       if (sk->sk_bypass_prot_mem)
+               return false;
+
        return READ_ONCE(tcp_memory_pressure);
 }
 /*
diff --git a/net/core/sock.c b/net/core/sock.c
index 08ae20069b6d287745800710192396f76c8781b4..5bf208579c02bcda66f0ae90dbb3e1a0cacbf5e8 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1046,9 +1046,13 @@ static int sock_reserve_memory(struct sock *sk, int bytes)
        if (!charged)
                return -ENOMEM;
 
+       if (sk->sk_bypass_prot_mem)
+               goto success;
+
        /* pre-charge to forward_alloc */
        sk_memory_allocated_add(sk, pages);
        allocated = sk_memory_allocated(sk);
+
        /* If the system goes into memory pressure with this
         * precharge, give up and return error.
         */
@@ -1057,6 +1061,8 @@ static int sock_reserve_memory(struct sock *sk, int bytes)
                mem_cgroup_sk_uncharge(sk, pages);
                return -ENOMEM;
        }
+
+success:
        sk_forward_alloc_add(sk, pages << PAGE_SHIFT);
 
        WRITE_ONCE(sk->sk_reserved_mem,
@@ -3145,8 +3151,11 @@ bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
        if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
                return true;
 
-       sk_enter_memory_pressure(sk);
+       if (!sk->sk_bypass_prot_mem)
+               sk_enter_memory_pressure(sk);
+
        sk_stream_moderate_sndbuf(sk);
+
        return false;
 }
 EXPORT_SYMBOL(sk_page_frag_refill);
@@ -3263,10 +3272,12 @@ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
 {
        bool memcg_enabled = false, charged = false;
        struct proto *prot = sk->sk_prot;
-       long allocated;
+       long allocated = 0;
 
-       sk_memory_allocated_add(sk, amt);
-       allocated = sk_memory_allocated(sk);
+       if (!sk->sk_bypass_prot_mem) {
+               sk_memory_allocated_add(sk, amt);
+               allocated = sk_memory_allocated(sk);
+       }
 
        if (mem_cgroup_sk_enabled(sk)) {
                memcg_enabled = true;
@@ -3275,6 +3286,9 @@ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
                        goto suppress_allocation;
        }
 
+       if (!allocated)
+               return 1;
+
        /* Under limit. */
        if (allocated <= sk_prot_mem_limits(sk, 0)) {
                sk_leave_memory_pressure(sk);
@@ -3353,7 +3367,8 @@ suppress_allocation:
 
        trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
 
-       sk_memory_allocated_sub(sk, amt);
+       if (allocated)
+               sk_memory_allocated_sub(sk, amt);
 
        if (charged)
                mem_cgroup_sk_uncharge(sk, amt);
@@ -3392,11 +3407,14 @@ EXPORT_SYMBOL(__sk_mem_schedule);
  */
 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
 {
-       sk_memory_allocated_sub(sk, amount);
-
        if (mem_cgroup_sk_enabled(sk))
                mem_cgroup_sk_uncharge(sk, amount);
 
+       if (sk->sk_bypass_prot_mem)
+               return;
+
+       sk_memory_allocated_sub(sk, amount);
+
        if (sk_under_global_memory_pressure(sk) &&
            (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
                sk_leave_memory_pressure(sk);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 4d720aa09a4c67f0f068ad7cbf72b306a65a3ee2..54def27326f1a41a5ee057913ec93cd5ef06b728 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -928,7 +928,8 @@ struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, gfp_t gfp,
                }
                __kfree_skb(skb);
        } else {
-               sk->sk_prot->enter_memory_pressure(sk);
+               if (!sk->sk_bypass_prot_mem)
+                       tcp_enter_memory_pressure(sk);
                sk_stream_moderate_sndbuf(sk);
        }
        return NULL;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index b94efb3050d2fe49216ef0c8771e403b1d227422..7f5df7a71f62954ef86cc917e1da5e1981e20aa0 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3743,12 +3743,17 @@ void sk_forced_mem_schedule(struct sock *sk, int size)
        delta = size - sk->sk_forward_alloc;
        if (delta <= 0)
                return;
+
        amt = sk_mem_pages(delta);
        sk_forward_alloc_add(sk, amt << PAGE_SHIFT);
-       sk_memory_allocated_add(sk, amt);
 
        if (mem_cgroup_sk_enabled(sk))
                mem_cgroup_sk_charge(sk, amt, gfp_memcg_charge() | __GFP_NOFAIL);
+
+       if (sk->sk_bypass_prot_mem)
+               return;
+
+       sk_memory_allocated_add(sk, amt);
 }
 
 /* Send a FIN. The caller locks the socket for us.
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 0292162a14eedffde166cc2a2d4eaa7c3aa6760d..94a5f6dcc5775e1265bb9f3c925fa80ae8c42924 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -1065,11 +1065,12 @@ static void mptcp_enter_memory_pressure(struct sock *sk)
        mptcp_for_each_subflow(msk, subflow) {
                struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
 
-               if (first)
+               if (first && !ssk->sk_bypass_prot_mem) {
                        tcp_enter_memory_pressure(ssk);
-               sk_stream_moderate_sndbuf(ssk);
+                       first = false;
+               }
 
-               first = false;
+               sk_stream_moderate_sndbuf(ssk);
        }
        __mptcp_sync_sndbuf(sk);
 }
diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index a64ae15b1a60d4065cd2b4def9e28839e759445a..caa2b5d24622354e53a30b9cda922560dcdb2ee7 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -373,7 +373,8 @@ static int tls_do_allocation(struct sock *sk,
        if (!offload_ctx->open_record) {
                if (unlikely(!skb_page_frag_refill(prepend_size, pfrag,
                                                   sk->sk_allocation))) {
-                       READ_ONCE(sk->sk_prot)->enter_memory_pressure(sk);
+                       if (!sk->sk_bypass_prot_mem)
+                               READ_ONCE(sk->sk_prot)->enter_memory_pressure(sk);
                        sk_stream_moderate_sndbuf(sk);
                        return -ENOMEM;
                }