libeth: xdp, xsk: access adjacent u32s as u64 where applicable
author     Alexander Lobakin <aleksander.lobakin@intel.com>
           Thu, 12 Jun 2025 16:02:34 +0000 (18:02 +0200)
committer  Tony Nguyen <anthony.l.nguyen@intel.com>
           Mon, 16 Jun 2025 18:40:15 +0000 (11:40 -0700)
On 64-bit systems, writing/reading one u64 is faster than two u32s, even
when they are adjacent in a struct. The compilers won't guarantee they
will combine those accesses; I observed both successful and unsuccessful
attempts with both GCC and Clang, and it's not easy to say what it
depends on.
There are a few places in libeth_xdp that win up to several percent from
combined access (in both performance and object code size, especially
when unrolling). Add __LIBETH_WORD_ACCESS and use it there on LE.
Drivers are free to optimize HW-specific callbacks under the same
definition.
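
For illustration, a minimal standalone sketch of the idea (hypothetical
struct and helper names, not the libeth definitions): two adjacent u32
fields are written with a single u64 store, with the layout verified at
compile time. __BYTE_ORDER__/__ORDER_LITTLE_ENDIAN__ are GCC/Clang
predefined macros:

	#include <stdint.h>
	#include <string.h>

	struct pair {
		uint32_t len;	/* low 32 bits on little endian */
		uint32_t flags;	/* high 32 bits on little endian */
	};

	static void pair_set(struct pair *p, uint32_t len, uint32_t flags)
	{
		_Static_assert(sizeof(struct pair) == sizeof(uint64_t),
			       "fields must be adjacent with no padding");
	#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
		/* One 8-byte store; memcpy keeps it well-defined and
		 * typically compiles to a single mov on x86-64.
		 */
		uint64_t v = (uint64_t)len | ((uint64_t)flags << 32);

		memcpy(p, &v, sizeof(v));
	#else
		p->len = len;
		p->flags = flags;
	#endif
	}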

Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
include/net/libeth/xdp.h
include/net/libeth/xsk.h

diff --git a/include/net/libeth/xdp.h b/include/net/libeth/xdp.h
index dba09a9168f175e35d21f8b91febcac05dc2a436..6ce6aec6884ca520715bb2d5bdba73cf861247a8 100644
--- a/include/net/libeth/xdp.h
+++ b/include/net/libeth/xdp.h
@@ -475,6 +475,21 @@ struct libeth_xdp_tx_desc {
        ((const void *)(uintptr_t)(priv));                                    \
 })
 
+/*
+ * On 64-bit systems, assigning one u64 is faster than two u32s. When ::len
+ * occupies lowest 32 bits (LE), whole ::opts can be assigned directly instead.
+ */
+#ifdef __LITTLE_ENDIAN
+#define __LIBETH_WORD_ACCESS           1
+#endif
+#ifdef __LIBETH_WORD_ACCESS
+#define __libeth_xdp_tx_len(flen, ...)                                       \
+       .opts = ((flen) | FIELD_PREP(GENMASK_ULL(63, 32), (__VA_ARGS__ + 0)))
+#else
+#define __libeth_xdp_tx_len(flen, ...)                                       \
+       .len = (flen), .flags = (__VA_ARGS__ + 0)
+#endif
+
 /**
  * libeth_xdp_tx_xmit_bulk - main XDP Tx function
  * @bulk: array of frames to send
@@ -870,8 +885,7 @@ static inline u32 libeth_xdp_xmit_queue_head(struct libeth_xdp_tx_bulk *bq,
 
        bq->bulk[bq->count++] = (typeof(*bq->bulk)){
                .xdpf   = xdpf,
-               .len    = xdpf->len,
-               .flags  = LIBETH_XDP_TX_FIRST,
+               __libeth_xdp_tx_len(xdpf->len, LIBETH_XDP_TX_FIRST),
        };
 
        if (!xdp_frame_has_frags(xdpf))
@@ -902,7 +916,7 @@ static inline bool libeth_xdp_xmit_queue_frag(struct libeth_xdp_tx_bulk *bq,
 
        bq->bulk[bq->count++] = (typeof(*bq->bulk)){
                .dma    = dma,
-               .len    = skb_frag_size(frag),
+               __libeth_xdp_tx_len(skb_frag_size(frag)),
        };
 
        return true;
@@ -1260,6 +1274,7 @@ bool libeth_xdp_buff_add_frag(struct libeth_xdp_buff *xdp,
  * Internal, use libeth_xdp_process_buff() instead. Initializes XDP buffer
  * head with the Rx buffer data: data pointer, length, headroom, and
  * truesize/tailroom. Zeroes the flags.
+ * Uses a faster single u64 write instead of per-field access.
  */
 static inline void libeth_xdp_prepare_buff(struct libeth_xdp_buff *xdp,
                                           const struct libeth_fqe *fqe,
@@ -1267,7 +1282,15 @@ static inline void libeth_xdp_prepare_buff(struct libeth_xdp_buff *xdp,
 {
        const struct page *page = __netmem_to_page(fqe->netmem);
 
+#ifdef __LIBETH_WORD_ACCESS
+       static_assert(offsetofend(typeof(xdp->base), flags) -
+                     offsetof(typeof(xdp->base), frame_sz) ==
+                     sizeof(u64));
+
+       *(u64 *)&xdp->base.frame_sz = fqe->truesize;
+#else
        xdp_init_buff(&xdp->base, fqe->truesize, xdp->base.rxq);
+#endif
        xdp_prepare_buff(&xdp->base, page_address(page) + fqe->offset,
                         page->pp->p.offset, len, true);
 }
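
A note on the helper above (illustrative expansion, not part of the
commit): FIELD_PREP(GENMASK_ULL(63, 32), f) is (u64)f << 32 for a 32-bit
f, so on LE __libeth_xdp_tx_len(len, flags) fills the whole ::opts word
in one assignment:

	/* equivalent to the LE branch of __libeth_xdp_tx_len(len, flags) */
	desc.opts = (u64)flags << 32 | len;	/* flags in 63:32, len in 31:0 */

The (__VA_ARGS__ + 0) trick makes the flags argument optional: with no
flags passed, it degenerates to unary plus on 0, so
__libeth_xdp_tx_len(len) stores just the length.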
diff --git a/include/net/libeth/xsk.h b/include/net/libeth/xsk.h
index 213778a684763ab20aff0a75e64aa74b172441d6..481a7b28e6f248845fd892b62ec1880652c7d87a 100644
--- a/include/net/libeth/xsk.h
+++ b/include/net/libeth/xsk.h
@@ -26,8 +26,8 @@ static inline bool libeth_xsk_tx_queue_head(struct libeth_xdp_tx_bulk *bq,
 {
        bq->bulk[bq->count++] = (typeof(*bq->bulk)){
                .xsk    = xdp,
-               .len    = xdp->base.data_end - xdp->data,
-               .flags  = LIBETH_XDP_TX_FIRST,
+               __libeth_xdp_tx_len(xdp->base.data_end - xdp->data,
+                                   LIBETH_XDP_TX_FIRST),
        };
 
        if (likely(!xdp_buff_has_frags(&xdp->base)))
@@ -48,7 +48,7 @@ static inline void libeth_xsk_tx_queue_frag(struct libeth_xdp_tx_bulk *bq,
 {
        bq->bulk[bq->count++] = (typeof(*bq->bulk)){
                .xsk    = frag,
-               .len    = frag->base.data_end - frag->data,
+               __libeth_xdp_tx_len(frag->base.data_end - frag->data),
        };
 }
 
@@ -199,7 +199,7 @@ __libeth_xsk_xmit_fill_buf_md(const struct xdp_desc *xdesc,
        ctx = xsk_buff_raw_get_ctx(sq->pool, xdesc->addr);
        desc = (typeof(desc)){
                .addr   = ctx.dma,
-               .len    = xdesc->len,
+               __libeth_xdp_tx_len(xdesc->len),
        };
 
        BUILD_BUG_ON(!__builtin_constant_p(tmo == libeth_xsktmo));
@@ -226,7 +226,7 @@ __libeth_xsk_xmit_fill_buf(const struct xdp_desc *xdesc,
 {
        return (struct libeth_xdp_tx_desc){
                .addr   = xsk_buff_raw_get_dma(sq->pool, xdesc->addr),
-               .len    = xdesc->len,
+               __libeth_xdp_tx_len(xdesc->len),
        };
 }
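
As the commit message notes, drivers may key off the same definition in
their HW-specific callbacks. A hedged sketch (the callback context and
variable names are hypothetical) of reading both fields of a
struct libeth_xdp_tx_desc with one load:

	#ifdef __LIBETH_WORD_ACCESS
		/* single u64 load: ::len in bits 31:0, ::flags in 63:32 */
		u64 opts = desc.opts;
		u32 len = opts;		/* truncation keeps the low half */
		u32 flags = opts >> 32;
	#else
		u32 len = desc.len;
		u32 flags = desc.flags;
	#endif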