From: Alexander Lobakin Date: Thu, 12 Jun 2025 16:02:34 +0000 (+0200) Subject: libeth: xdp, xsk: access adjacent u32s as u64 where applicable X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=80bae9df2108cb72a060ee5235614d7c072af1de;p=thirdparty%2Fkernel%2Flinux.git libeth: xdp, xsk: access adjacent u32s as u64 where applicable On 64-bit systems, writing/reading one u64 is faster than two u32s even when they're adjacent in a struct. The compilers won't guarantee they will combine those; I observed both successful and unsuccessful attempts with both GCC and Clang, and it's not easy to say what it depends on. There are a few places in libeth_xdp winning up to several percent from combined access (both performance and object code size, especially when unrolling). Add __LIBETH_WORD_ACCESS and use it there on LE. Drivers are free to optimize HW-specific callbacks under the same definition. Signed-off-by: Alexander Lobakin Signed-off-by: Tony Nguyen --- diff --git a/include/net/libeth/xdp.h b/include/net/libeth/xdp.h index dba09a9168f17..6ce6aec6884ca 100644 --- a/include/net/libeth/xdp.h +++ b/include/net/libeth/xdp.h @@ -475,6 +475,21 @@ struct libeth_xdp_tx_desc { ((const void *)(uintptr_t)(priv)); \ }) +/* + * On 64-bit systems, assigning one u64 is faster than two u32s. When ::len + * occupies lowest 32 bits (LE), whole ::opts can be assigned directly instead. + */ +#ifdef __LITTLE_ENDIAN +#define __LIBETH_WORD_ACCESS 1 +#endif +#ifdef __LIBETH_WORD_ACCESS +#define __libeth_xdp_tx_len(flen, ...) \ + .opts = ((flen) | FIELD_PREP(GENMASK_ULL(63, 32), (__VA_ARGS__ + 0))) +#else +#define __libeth_xdp_tx_len(flen, ...) 
\ + .len = (flen), .flags = (__VA_ARGS__ + 0) +#endif + /** * libeth_xdp_tx_xmit_bulk - main XDP Tx function * @bulk: array of frames to send @@ -870,8 +885,7 @@ static inline u32 libeth_xdp_xmit_queue_head(struct libeth_xdp_tx_bulk *bq, bq->bulk[bq->count++] = (typeof(*bq->bulk)){ .xdpf = xdpf, - .len = xdpf->len, - .flags = LIBETH_XDP_TX_FIRST, + __libeth_xdp_tx_len(xdpf->len, LIBETH_XDP_TX_FIRST), }; if (!xdp_frame_has_frags(xdpf)) @@ -902,7 +916,7 @@ static inline bool libeth_xdp_xmit_queue_frag(struct libeth_xdp_tx_bulk *bq, bq->bulk[bq->count++] = (typeof(*bq->bulk)){ .dma = dma, - .len = skb_frag_size(frag), + __libeth_xdp_tx_len(skb_frag_size(frag)), }; return true; @@ -1260,6 +1274,7 @@ bool libeth_xdp_buff_add_frag(struct libeth_xdp_buff *xdp, * Internal, use libeth_xdp_process_buff() instead. Initializes XDP buffer * head with the Rx buffer data: data pointer, length, headroom, and * truesize/tailroom. Zeroes the flags. + * Uses faster single u64 write instead of per-field access. 
*/ static inline void libeth_xdp_prepare_buff(struct libeth_xdp_buff *xdp, const struct libeth_fqe *fqe, @@ -1267,7 +1282,15 @@ static inline void libeth_xdp_prepare_buff(struct libeth_xdp_buff *xdp, { const struct page *page = __netmem_to_page(fqe->netmem); +#ifdef __LIBETH_WORD_ACCESS + static_assert(offsetofend(typeof(xdp->base), flags) - + offsetof(typeof(xdp->base), frame_sz) == + sizeof(u64)); + + *(u64 *)&xdp->base.frame_sz = fqe->truesize; +#else xdp_init_buff(&xdp->base, fqe->truesize, xdp->base.rxq); +#endif xdp_prepare_buff(&xdp->base, page_address(page) + fqe->offset, page->pp->p.offset, len, true); } diff --git a/include/net/libeth/xsk.h b/include/net/libeth/xsk.h index 213778a684763..481a7b28e6f24 100644 --- a/include/net/libeth/xsk.h +++ b/include/net/libeth/xsk.h @@ -26,8 +26,8 @@ static inline bool libeth_xsk_tx_queue_head(struct libeth_xdp_tx_bulk *bq, { bq->bulk[bq->count++] = (typeof(*bq->bulk)){ .xsk = xdp, - .len = xdp->base.data_end - xdp->data, - .flags = LIBETH_XDP_TX_FIRST, + __libeth_xdp_tx_len(xdp->base.data_end - xdp->data, + LIBETH_XDP_TX_FIRST), }; if (likely(!xdp_buff_has_frags(&xdp->base))) @@ -48,7 +48,7 @@ static inline void libeth_xsk_tx_queue_frag(struct libeth_xdp_tx_bulk *bq, { bq->bulk[bq->count++] = (typeof(*bq->bulk)){ .xsk = frag, - .len = frag->base.data_end - frag->data, + __libeth_xdp_tx_len(frag->base.data_end - frag->data), }; } @@ -199,7 +199,7 @@ __libeth_xsk_xmit_fill_buf_md(const struct xdp_desc *xdesc, ctx = xsk_buff_raw_get_ctx(sq->pool, xdesc->addr); desc = (typeof(desc)){ .addr = ctx.dma, - .len = xdesc->len, + __libeth_xdp_tx_len(xdesc->len), }; BUILD_BUG_ON(!__builtin_constant_p(tmo == libeth_xsktmo)); @@ -226,7 +226,7 @@ __libeth_xsk_xmit_fill_buf(const struct xdp_desc *xdesc, { return (struct libeth_xdp_tx_desc){ .addr = xsk_buff_raw_get_dma(sq->pool, xdesc->addr), - .len = xdesc->len, + __libeth_xdp_tx_len(xdesc->len), }; }