Some protocols (e.g., TCP, UDP) implement memory accounting for socket
buffers and charge memory to per-protocol global counters pointed to by
sk->sk_proto->memory_allocated.
Sometimes, system processes do not want that limitation. For a similar
purpose, there is SO_RESERVE_MEM for sockets under memcg.
Also, by opting out of the per-protocol accounting, sockets under memcg
can avoid paying costs for two orthogonal memory accounting mechanisms.
A microbenchmark result is in the subsequent bpf patch.
Let's allow opt-out from the per-protocol memory accounting if
sk->sk_bypass_prot_mem is true.
sk->sk_bypass_prot_mem and sk->sk_prot are placed in the same cache
line, and sk_has_account() always fetches sk->sk_prot before accessing
sk->sk_bypass_prot_mem, so there is no extra cache miss for this patch.
The following patches will set sk->sk_bypass_prot_mem to true, and
then, the per-protocol memory accounting will be skipped.
Note that this does NOT disable memcg, but rather the per-protocol one.
Another option not to use the hole in struct sock_common is create
sk_prot variants like tcp_prot_bypass, but this would complicate
SOCKMAP logic, tcp_bpf_prots etc.
Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Reviewed-by: Shakeel Butt <shakeel.butt@linux.dev>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Acked-by: Roman Gushchin <roman.gushchin@linux.dev>
Link: https://patch.msgid.link/20251014235604.3057003-3-kuniyu@google.com
mem_cgroup_sk_under_memory_pressure(sk))
return true;
+ if (sk->sk_bypass_prot_mem)
+ return false;
+
return !!READ_ONCE(*sk->sk_prot->memory_pressure);
}
* @skc_reuseport: %SO_REUSEPORT setting
* @skc_ipv6only: socket is IPV6 only
* @skc_net_refcnt: socket is using net ref counting
+ * @skc_bypass_prot_mem: bypass the per-protocol memory accounting for skb
* @skc_bound_dev_if: bound device index if != 0
* @skc_bind_node: bind hash linkage for various protocol lookup tables
* @skc_portaddr_node: second hash linkage for UDP/UDP-Lite protocol
unsigned char skc_reuseport:1;
unsigned char skc_ipv6only:1;
unsigned char skc_net_refcnt:1;
+ unsigned char skc_bypass_prot_mem:1;
int skc_bound_dev_if;
union {
struct hlist_node skc_bind_node;
#define sk_reuseport __sk_common.skc_reuseport
#define sk_ipv6only __sk_common.skc_ipv6only
#define sk_net_refcnt __sk_common.skc_net_refcnt
+#define sk_bypass_prot_mem __sk_common.skc_bypass_prot_mem
#define sk_bound_dev_if __sk_common.skc_bound_dev_if
#define sk_bind_node __sk_common.skc_bind_node
#define sk_prot __sk_common.skc_prot
mem_cgroup_sk_under_memory_pressure(sk))
return true;
+ if (sk->sk_bypass_prot_mem)
+ return false;
+
return READ_ONCE(tcp_memory_pressure);
}
/*
if (!charged)
return -ENOMEM;
+ if (sk->sk_bypass_prot_mem)
+ goto success;
+
/* pre-charge to forward_alloc */
sk_memory_allocated_add(sk, pages);
allocated = sk_memory_allocated(sk);
+
/* If the system goes into memory pressure with this
* precharge, give up and return error.
*/
mem_cgroup_sk_uncharge(sk, pages);
return -ENOMEM;
}
+
+success:
sk_forward_alloc_add(sk, pages << PAGE_SHIFT);
WRITE_ONCE(sk->sk_reserved_mem,
if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
return true;
- sk_enter_memory_pressure(sk);
+ if (!sk->sk_bypass_prot_mem)
+ sk_enter_memory_pressure(sk);
+
sk_stream_moderate_sndbuf(sk);
+
return false;
}
EXPORT_SYMBOL(sk_page_frag_refill);
{
bool memcg_enabled = false, charged = false;
struct proto *prot = sk->sk_prot;
- long allocated;
+ long allocated = 0;
- sk_memory_allocated_add(sk, amt);
- allocated = sk_memory_allocated(sk);
+ if (!sk->sk_bypass_prot_mem) {
+ sk_memory_allocated_add(sk, amt);
+ allocated = sk_memory_allocated(sk);
+ }
if (mem_cgroup_sk_enabled(sk)) {
memcg_enabled = true;
goto suppress_allocation;
}
+ if (!allocated)
+ return 1;
+
/* Under limit. */
if (allocated <= sk_prot_mem_limits(sk, 0)) {
sk_leave_memory_pressure(sk);
trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
- sk_memory_allocated_sub(sk, amt);
+ if (allocated)
+ sk_memory_allocated_sub(sk, amt);
if (charged)
mem_cgroup_sk_uncharge(sk, amt);
*/
void __sk_mem_reduce_allocated(struct sock *sk, int amount)
{
- sk_memory_allocated_sub(sk, amount);
-
if (mem_cgroup_sk_enabled(sk))
mem_cgroup_sk_uncharge(sk, amount);
+ if (sk->sk_bypass_prot_mem)
+ return;
+
+ sk_memory_allocated_sub(sk, amount);
+
if (sk_under_global_memory_pressure(sk) &&
(sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
sk_leave_memory_pressure(sk);
}
__kfree_skb(skb);
} else {
- sk->sk_prot->enter_memory_pressure(sk);
+ if (!sk->sk_bypass_prot_mem)
+ tcp_enter_memory_pressure(sk);
sk_stream_moderate_sndbuf(sk);
}
return NULL;
delta = size - sk->sk_forward_alloc;
if (delta <= 0)
return;
+
amt = sk_mem_pages(delta);
sk_forward_alloc_add(sk, amt << PAGE_SHIFT);
- sk_memory_allocated_add(sk, amt);
if (mem_cgroup_sk_enabled(sk))
mem_cgroup_sk_charge(sk, amt, gfp_memcg_charge() | __GFP_NOFAIL);
+
+ if (sk->sk_bypass_prot_mem)
+ return;
+
+ sk_memory_allocated_add(sk, amt);
}
/* Send a FIN. The caller locks the socket for us.
mptcp_for_each_subflow(msk, subflow) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
- if (first)
+ if (first && !ssk->sk_bypass_prot_mem) {
tcp_enter_memory_pressure(ssk);
- sk_stream_moderate_sndbuf(ssk);
+ first = false;
+ }
- first = false;
+ sk_stream_moderate_sndbuf(ssk);
}
__mptcp_sync_sndbuf(sk);
}
if (!offload_ctx->open_record) {
if (unlikely(!skb_page_frag_refill(prepend_size, pfrag,
sk->sk_allocation))) {
- READ_ONCE(sk->sk_prot)->enter_memory_pressure(sk);
+ if (!sk->sk_bypass_prot_mem)
+ READ_ONCE(sk->sk_prot)->enter_memory_pressure(sk);
sk_stream_moderate_sndbuf(sk);
return -ENOMEM;
}