It is possible to reorganize struct Qdisc so that the fast path no longer
always dirties two cache lines, but only a single one.
In the current layout, the fast path changes only four fields (six for some
Qdiscs) in the first cache line:
- q.spinlock
- q.qlen
- bstats.bytes
- bstats.packets
- some Qdiscs also change q.next/q.prev
In the second cache line, the fast path changes:
- running
- state
- qstats.backlog
(A simplified sketch of these accesses follows the layout dump below.)
/* --- cacheline 2 boundary (128 bytes) --- */
struct sk_buff_head gso_skb __attribute__((__aligned__(64))); /* 0x80 0x18 */
struct qdisc_skb_head q; /* 0x98 0x18 */
struct gnet_stats_basic_sync bstats __attribute__((__aligned__(16))); /* 0xb0 0x10 */
/* --- cacheline 3 boundary (192 bytes) --- */
struct gnet_stats_queue qstats; /* 0xc0 0x14 */
bool running; /* 0xd4 0x1 */
/* XXX 3 bytes hole, try to pack */
unsigned long state; /* 0xd8 0x8 */
struct Qdisc * next_sched; /* 0xe0 0x8 */
struct sk_buff_head skb_bad_txq; /* 0xe8 0x18 */
/* --- cacheline 4 boundary (256 bytes) --- */
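The accesses listed above map onto helpers such as qdisc_bstats_update() and
qdisc_qstats_backlog_dec() from include/net/sch_generic.h. A simplified,
hypothetical sketch (qdisc_dequeue_and_account() is a made-up name; the real
fast path lives in dequeue_skb()/qdisc_restart() in net/sched/sch_generic.c
and in the per-Qdisc ->dequeue() handlers):

#include <net/sch_generic.h>

/* Illustration only, not actual kernel code. */
static struct sk_buff *qdisc_dequeue_and_account(struct Qdisc *sch)
{
	struct sk_buff *skb = sch->dequeue(sch);	/* typically dirties q.qlen */

	if (skb) {
		qdisc_qstats_backlog_dec(sch, skb);	/* dirties qstats.backlog */
		qdisc_bstats_update(sch, skb);		/* dirties bstats.bytes/packets */
	}
	return skb;
}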
Reorganize things to have a first cache line that is mostly read,
then a mostly written one.
This gives a ~3% performance increase under TX stress.
Note that there is an additional hole because @qstats now spills over into a third cache line.
/* --- cacheline 2 boundary (128 bytes) --- */
__u8 __cacheline_group_begin__Qdisc_read_mostly[0] __attribute__((__aligned__(64))); /* 0x80 0 */
struct sk_buff_head gso_skb; /* 0x80 0x18 */
struct Qdisc * next_sched; /* 0x98 0x8 */
struct sk_buff_head skb_bad_txq; /* 0xa0 0x18 */
__u8 __cacheline_group_end__Qdisc_read_mostly[0]; /* 0xb8 0 */
/* XXX 8 bytes hole, try to pack */
/* --- cacheline 3 boundary (192 bytes) --- */
__u8 __cacheline_group_begin__Qdisc_write[0] __attribute__((__aligned__(64))); /* 0xc0 0 */
struct qdisc_skb_head q; /* 0xc0 0x18 */
unsigned long state; /* 0xd8 0x8 */
struct gnet_stats_basic_sync bstats __attribute__((__aligned__(16))); /* 0xe0 0x10 */
bool running; /* 0xf0 0x1 */
/* XXX 3 bytes hole, try to pack */
struct gnet_stats_queue qstats; /* 0xf4 0x14 */
/* --- cacheline 4 boundary (256 bytes) was 8 bytes ago --- */
__u8 __cacheline_group_end__Qdisc_write[0]; /* 0x108 0 */
/* XXX 56 bytes hole, try to pack */
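The layout dumps above are pahole output; assuming a kernel image built with
debug info, the resulting layout can be re-checked with something like:

	pahole -C Qdisc vmlinux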
Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20251121083256.674562-8-edumazet@google.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
int pad;
refcount_t refcnt;
- /*
- * For performance sake on SMP, we put highly modified fields at the end
- */
- struct sk_buff_head gso_skb ____cacheline_aligned_in_smp;
- struct qdisc_skb_head q;
- struct gnet_stats_basic_sync bstats;
- struct gnet_stats_queue qstats;
- bool running; /* must be written under qdisc spinlock */
- unsigned long state;
- struct Qdisc *next_sched;
- struct sk_buff_head skb_bad_txq;
+ /* Cache line potentially dirtied in dequeue() or __netif_reschedule(). */
+ __cacheline_group_begin(Qdisc_read_mostly) ____cacheline_aligned;
+ struct sk_buff_head gso_skb;
+ struct Qdisc *next_sched;
+ struct sk_buff_head skb_bad_txq;
+ __cacheline_group_end(Qdisc_read_mostly);
+
+ /* Fields dirtied in dequeue() fast path. */
+ __cacheline_group_begin(Qdisc_write) ____cacheline_aligned;
+ struct qdisc_skb_head q;
+ unsigned long state;
+ struct gnet_stats_basic_sync bstats;
+ bool running; /* must be written under qdisc spinlock */
+
+ /* Note : we only change qstats.backlog in fast path. */
+ struct gnet_stats_queue qstats;
+ __cacheline_group_end(Qdisc_write);
+
atomic_long_t defer_count ____cacheline_aligned_in_smp;
struct llist_head defer_list;
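As a possible follow-up (not part of this patch), the new groups could get
build-time membership checks in the style of net_dev_struct_check() in
net/core/dev.c, using the CACHELINE_ASSERT_GROUP_MEMBER() macro from
<linux/cache.h>. qdisc_struct_check() below is a hypothetical name and would
need to be called once from an init path, e.g. pktsched_init():

#include <linux/cache.h>
#include <linux/init.h>
#include <net/sch_generic.h>

/* Hypothetical sketch: assert at build time that each field stays in its
 * declared cacheline group.
 */
static void __init qdisc_struct_check(void)
{
	CACHELINE_ASSERT_GROUP_MEMBER(struct Qdisc, Qdisc_read_mostly, gso_skb);
	CACHELINE_ASSERT_GROUP_MEMBER(struct Qdisc, Qdisc_read_mostly, next_sched);
	CACHELINE_ASSERT_GROUP_MEMBER(struct Qdisc, Qdisc_read_mostly, skb_bad_txq);

	CACHELINE_ASSERT_GROUP_MEMBER(struct Qdisc, Qdisc_write, q);
	CACHELINE_ASSERT_GROUP_MEMBER(struct Qdisc, Qdisc_write, state);
	CACHELINE_ASSERT_GROUP_MEMBER(struct Qdisc, Qdisc_write, bstats);
	CACHELINE_ASSERT_GROUP_MEMBER(struct Qdisc, Qdisc_write, running);
	CACHELINE_ASSERT_GROUP_MEMBER(struct Qdisc, Qdisc_write, qstats);
}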