From: Eric Dumazet Date: Thu, 29 Jan 2026 15:34:58 +0000 (+0000) Subject: tcp: reduce tcp sockets size by one cache line X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=ed9b70040d7b22552f1392bed529ef0861f2a25c;p=thirdparty%2Fkernel%2Flinux.git tcp: reduce tcp sockets size by one cache line By default, when a kmem_cache is created with SLAB_TYPESAFE_BY_RCU, slub has to use extra storage for the freelist pointer after each object, because slub assumes that any bit in the object can be used by RCU readers. Because proto_register() is also using SLAB_HWCACHE_ALIGN, this forces slub to use one extra cache line per object. We can instead put the slub freelist anywhere in the object, granted the concurrent RCU readers are not supposed to use the pointer value. Add a new (struct sock)sk_freeptr field, in an union with sk_rcu: No RCU readers would need to look at sk_rcu, which is only used at free phase. Tested: grep . /sys/kernel/slab/TCP/{object_size,slab_size,objs_per_slab} grep . /sys/kernel/slab/TCPv6/{object_size,slab_size,objs_per_slab} Before: /sys/kernel/slab/TCP/object_size:2368 /sys/kernel/slab/TCP/slab_size:2432 /sys/kernel/slab/TCP/objs_per_slab:13 /sys/kernel/slab/TCPv6/object_size:2496 /sys/kernel/slab/TCPv6/slab_size:2560 /sys/kernel/slab/TCPv6/objs_per_slab:12 After this patch, we can pack one more TCPv6 object per slab, and object_size == slab_size. /sys/kernel/slab/TCP/object_size:2368 /sys/kernel/slab/TCP/slab_size:2368 /sys/kernel/slab/TCP/objs_per_slab:13 /sys/kernel/slab/TCPv6/object_size:2496 /sys/kernel/slab/TCPv6/slab_size:2496 /sys/kernel/slab/TCPv6/objs_per_slab:13 Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20260129153458.4163797-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- diff --git a/include/net/sock.h b/include/net/sock.h index aafe8bdb2c0f..66b56288c1d3 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -341,6 +341,7 @@ struct sk_filter; * @sk_reuseport_cb: reuseport group container * @sk_bpf_storage: ptr to cache and control for bpf_sk_storage * @sk_rcu: used during RCU grace period + * @sk_freeptr: used for SLAB_TYPESAFE_BY_RCU managed sockets * @sk_clockid: clockid used by time-based scheduling (SO_TXTIME) * @sk_txtime_deadline_mode: set deadline mode for SO_TXTIME * @sk_txtime_report_errors: set report errors mode for SO_TXTIME @@ -582,7 +583,14 @@ struct sock { struct bpf_local_storage __rcu *sk_bpf_storage; #endif struct numa_drop_counters *sk_drop_counters; - struct rcu_head sk_rcu; + /* sockets using SLAB_TYPESAFE_BY_RCU can use sk_freeptr. + * By the time kfree() is called, sk_rcu can not be in + * use and can be mangled. + */ + union { + struct rcu_head sk_rcu; + freeptr_t sk_freeptr; + }; netns_tracker ns_tracker; struct xarray sk_user_frags; @@ -1368,6 +1376,7 @@ struct proto { struct kmem_cache *slab; unsigned int obj_size; + unsigned int freeptr_offset; unsigned int ipv6_pinfo_offset; slab_flags_t slab_flags; unsigned int useroffset; /* Usercopy region offset */ diff --git a/net/core/sock.c b/net/core/sock.c index a1c8b47b0d56..693e6d80f501 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -4193,13 +4193,17 @@ int proto_register(struct proto *prot, int alloc_slab) return -EINVAL; } if (alloc_slab) { - prot->slab = kmem_cache_create_usercopy(prot->name, - prot->obj_size, 0, - SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT | - prot->slab_flags, - prot->useroffset, prot->usersize, - NULL); + struct kmem_cache_args args = { + .useroffset = prot->useroffset, + .usersize = prot->usersize, + .freeptr_offset = prot->freeptr_offset, + .use_freeptr_offset = !!prot->freeptr_offset, + }; + prot->slab = kmem_cache_create(prot->name, prot->obj_size, + &args, + SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT | + prot->slab_flags); if (prot->slab == NULL) { pr_crit("%s: Can't create sock SLAB cache!\n", prot->name); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index ffdf52fbf646..0fc8a42921aa 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -3460,6 +3460,8 @@ struct proto tcp_prot = { .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), .max_header = MAX_TCP_HEADER, .obj_size = sizeof(struct tcp_sock), + .freeptr_offset = offsetof(struct tcp_sock, + inet_conn.icsk_inet.sk.sk_freeptr), .slab_flags = SLAB_TYPESAFE_BY_RCU, .twsk_prot = &tcp_timewait_sock_ops, .rsk_prot = &tcp_request_sock_ops, diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 4ae664b05fa9..8bf29186c15f 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -2332,6 +2332,8 @@ struct proto tcpv6_prot = { .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), .max_header = MAX_TCP_HEADER, .obj_size = sizeof(struct tcp6_sock), + .freeptr_offset = offsetof(struct tcp6_sock, + tcp.inet_conn.icsk_inet.sk.sk_freeptr), .ipv6_pinfo_offset = offsetof(struct tcp6_sock, inet6), .slab_flags = SLAB_TYPESAFE_BY_RCU, .twsk_prot = &tcp6_timewait_sock_ops,