net: fec: add AF_XDP zero-copy support
author    Wei Fang <wei.fang@nxp.com>
          Thu, 5 Feb 2026 08:57:42 +0000 (16:57 +0800)
committer Paolo Abeni <pabeni@redhat.com>
          Tue, 10 Feb 2026 09:58:20 +0000 (10:58 +0100)
This patch adds AF_XDP zero-copy support for both TX and RX to the FEC
driver. It introduces new functions for XSK buffer allocation, RX/TX
queue processing in zero-copy mode, and XSK pool setup/teardown.

For RX, fec_alloc_rxq_buffers_zc() is added to allocate RX buffers from
the XSK pool, and fec_enet_rx_queue_xsk() processes frames on an RX
queue that is bound to an AF_XDP socket. As in copy mode, zero-copy
mode supports the XDP_TX, XDP_PASS, XDP_DROP and XDP_REDIRECT actions.
In addition, fec_enet_xsk_tx_xmit(), the zero-copy counterpart of
fec_enet_xdp_tx_xmit(), handles the XDP_TX action.
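
The per-queue dispatch that selects the RX path is easiest to see in
isolation. The snippet below is only a condensed view of the
fec_enet_rx() hunk further down in this diff, not additional code:

	/* A queue with an XSK pool attached always takes the zero-copy
	 * path (even with no bpf program loaded); otherwise the
	 * copy-mode XDP or plain skb path is used.
	 */
	for (i = fep->num_rx_queues - 1; i >= 0; i--) {
		struct fec_enet_priv_rx_q *rxq = fep->rx_queue[i];
		int batch = budget - done;

		if (rxq->xsk_pool)
			done += fec_enet_rx_queue_xsk(fep, i, batch, prog);
		else if (prog)
			done += fec_enet_rx_queue_xdp(fep, i, batch, prog);
		else
			done += fec_enet_rx_queue(fep, i, batch);
	}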

For TX, there are two cases. The first is frames coming from the AF_XDP
socket itself: fec_enet_xsk_xmit() is added to transmit them directly,
and their buffer type is marked FEC_TXBUF_T_XSK_XMIT. The second is
frames looped back from the RX queue (the XDP_TX action), whose buffer
type is marked FEC_TXBUF_T_XSK_TX. This distinction lets
fec_enet_tx_queue() clean the TX queue correctly based on the buffer
type.
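
The completion side shows why the two buffer types matter; again, this
is a condensed view of the fec_enet_tx_queue() cleanup hunk below, not
additional code:

	switch (tx_buf->type) {
	case FEC_TXBUF_T_XSK_XMIT:
		/* Frame from the XSK TX ring: just count it here; the
		 * descriptors are returned to user space in bulk via
		 * xsk_tx_completed() at the end of the cleanup pass.
		 */
		xsk_cnt++;
		break;
	case FEC_TXBUF_T_XSK_TX:
		/* XDP_TX frame: the looped-back RX xdp_buff has to be
		 * handed back to the pool individually.
		 */
		xsk_buff_free(tx_buf->buf_p);
		break;
	}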

Some tests were also run on an i.MX93-EVK board with the xdpsock tool;
the results are below.

Env: the i.MX93 is connected to a packet generator, the link speed is
1 Gbps, and flow control is off. The RX packet size is 64 bytes
including the FCS. Only one RX queue (one CPU) is used to receive
frames.
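
For context on what the -z and -c flags below select: an AF_XDP
application requests zero-copy at bind time. A minimal sketch using
libxdp's xsk helpers (the same API xdpsock is built on) might look as
follows; the interface name, queue id and frame count are illustrative
and error handling is omitted:

	#include <stdlib.h>
	#include <unistd.h>
	#include <linux/if_xdp.h>	/* XDP_ZEROCOPY / XDP_COPY */
	#include <xdp/xsk.h>		/* libxdp xsk helpers */

	#define NUM_FRAMES 4096

	static struct xsk_umem *umem;
	static struct xsk_socket *xsk;
	static struct xsk_ring_prod fq, tx;
	static struct xsk_ring_cons cq, rx;

	int main(void)
	{
		size_t size = NUM_FRAMES * XSK_UMEM__DEFAULT_FRAME_SIZE;
		void *bufs;

		/* Page-aligned UMEM area shared with the kernel */
		posix_memalign(&bufs, getpagesize(), size);
		xsk_umem__create(&umem, bufs, size, &fq, &cq, NULL);

		struct xsk_socket_config cfg = {
			.rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS,
			.tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS,
			/* xdpsock -z requests XDP_ZEROCOPY, -c requests
			 * XDP_COPY; a zero-copy bind only succeeds on
			 * drivers with XSK pool support such as this
			 * patch adds to the FEC.
			 */
			.bind_flags = XDP_USE_NEED_WAKEUP | XDP_ZEROCOPY,
		};

		/* Binding to (eth0, queue 0) makes the stack call the
		 * driver's ndo_bpf with XDP_SETUP_XSK_POOL.
		 */
		return xsk_socket__create(&xsk, "eth0", 0, umem,
					  &rx, &tx, &cfg);
	}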

1. MAC swap L2 forwarding
1.1 Zero-copy mode
root@imx93evk:~# ./xdpsock -i eth0 -l -z
 sock0@eth0:0 l2fwd xdp-drv
                   pps            pkts           1.00
rx                 414715         415455
tx                 414715         415455

1.2 Copy mode
root@imx93evk:~# ./xdpsock -i eth0 -l -c
 sock0@eth0:0 l2fwd xdp-drv
                   pps            pkts           1.00
rx                 356396         356609
tx                 356396         356609

2. TX only
2.1 Zero-copy mode
root@imx93evk:~# ./xdpsock -i eth0 -t -s 64 -z
 sock0@eth0:0 txonly xdp-drv
                   pps            pkts           1.00
rx                 0              0
tx                 1119573        1126720

2.2 Copy mode
root@imx93evk:~# ./xdpsock -i eth0 -t -s 64 -c
 sock0@eth0:0 txonly xdp-drv
                   pps            pkts           1.00
rx                 0              0
tx                 406864         407616
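
Comparing the two modes above: zero-copy improves MAC-swap L2
forwarding throughput by roughly 16% (414715 vs 356396 pps) and TX-only
throughput by roughly 2.75x (1119573 vs 406864 pps) on this setup.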

Signed-off-by: Wei Fang <wei.fang@nxp.com>
Link: https://patch.msgid.link/20260205085742.2685134-16-wei.fang@nxp.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
drivers/net/ethernet/freescale/fec.h
drivers/net/ethernet/freescale/fec_main.c

diff --git a/drivers/net/ethernet/freescale/fec.h b/drivers/net/ethernet/freescale/fec.h
index ad7aba1a853604a7db2368becb9c3754c05b3c98..7176803146f3d1857b0c10ea1328f604c8fcdc2e 100644
--- a/drivers/net/ethernet/freescale/fec.h
+++ b/drivers/net/ethernet/freescale/fec.h
@@ -340,6 +340,7 @@ struct bufdesc_ex {
 #define FEC_ENET_TX_FRPPG      (PAGE_SIZE / FEC_ENET_TX_FRSIZE)
 #define TX_RING_SIZE           1024    /* Must be power of two */
 #define TX_RING_MOD_MASK       511     /*   for this to work */
+#define FEC_XSK_TX_BUDGET_MAX  256
 
 #define BD_ENET_RX_INT         0x00800000
 #define BD_ENET_RX_PTP         ((ushort)0x0400)
@@ -528,6 +529,8 @@ enum fec_txbuf_type {
        FEC_TXBUF_T_SKB,
        FEC_TXBUF_T_XDP_NDO,
        FEC_TXBUF_T_XDP_TX,
+       FEC_TXBUF_T_XSK_XMIT,
+       FEC_TXBUF_T_XSK_TX,
 };
 
 struct fec_tx_buffer {
@@ -539,6 +542,7 @@ struct fec_enet_priv_tx_q {
        struct bufdesc_prop bd;
        unsigned char *tx_bounce[TX_RING_SIZE];
        struct fec_tx_buffer tx_buf[TX_RING_SIZE];
+       struct xsk_buff_pool *xsk_pool;
 
        unsigned short tx_stop_threshold;
        unsigned short tx_wake_threshold;
@@ -548,9 +552,16 @@ struct fec_enet_priv_tx_q {
        dma_addr_t tso_hdrs_dma;
 };
 
+union fec_rx_buffer {
+       void *buf_p;
+       struct page *page;
+       struct xdp_buff *xdp;
+};
+
 struct fec_enet_priv_rx_q {
        struct bufdesc_prop bd;
-       struct page *rx_buf[RX_RING_SIZE];
+       union fec_rx_buffer rx_buf[RX_RING_SIZE];
+       struct xsk_buff_pool *xsk_pool;
 
        /* page_pool */
        struct page_pool *page_pool;
diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c
index c8e6a56313f5fbc328fce3e984540a4b1488b914..0d926bf18195ee5be95158deaabe429214fa4a77 100644
--- a/drivers/net/ethernet/freescale/fec_main.c
+++ b/drivers/net/ethernet/freescale/fec_main.c
@@ -71,6 +71,7 @@
 #include <net/page_pool/helpers.h>
 #include <net/selftests.h>
 #include <net/tso.h>
+#include <net/xdp_sock_drv.h>
 #include <soc/imx/cpuidle.h>
 
 #include "fec.h"
@@ -1041,6 +1042,9 @@ static void fec_enet_bd_init(struct net_device *dev)
                                page_pool_put_page(pp_page_to_nmdesc(page)->pp,
                                                   page, 0, false);
                                break;
+                       case FEC_TXBUF_T_XSK_TX:
+                               xsk_buff_free(txq->tx_buf[i].buf_p);
+                               break;
                        default:
                                break;
                        }
@@ -1475,8 +1479,91 @@ fec_enet_hwtstamp(struct fec_enet_private *fep, unsigned ts,
        hwtstamps->hwtstamp = ns_to_ktime(ns);
 }
 
-static void fec_enet_tx_queue(struct fec_enet_private *fep,
-                             u16 queue, int budget)
+static bool fec_enet_xsk_xmit(struct fec_enet_private *fep,
+                             struct xsk_buff_pool *pool,
+                             u32 queue)
+{
+       struct fec_enet_priv_tx_q *txq = fep->tx_queue[queue];
+       struct xdp_desc *xsk_desc = pool->tx_descs;
+       int cpu = smp_processor_id();
+       int free_bds, budget, batch;
+       struct netdev_queue *nq;
+       struct bufdesc *bdp;
+       dma_addr_t dma;
+       u32 estatus;
+       u16 status;
+       int i, j;
+
+       nq = netdev_get_tx_queue(fep->netdev, queue);
+       __netif_tx_lock(nq, cpu);
+
+       txq_trans_cond_update(nq);
+       free_bds = fec_enet_get_free_txdesc_num(txq);
+       if (!free_bds)
+               goto tx_unlock;
+
+       budget = min(free_bds, FEC_XSK_TX_BUDGET_MAX);
+       batch = xsk_tx_peek_release_desc_batch(pool, budget);
+       if (!batch)
+               goto tx_unlock;
+
+       bdp = txq->bd.cur;
+       for (i = 0; i < batch; i++) {
+               dma = xsk_buff_raw_get_dma(pool, xsk_desc[i].addr);
+               xsk_buff_raw_dma_sync_for_device(pool, dma, xsk_desc[i].len);
+
+               j = fec_enet_get_bd_index(bdp, &txq->bd);
+               txq->tx_buf[j].type = FEC_TXBUF_T_XSK_XMIT;
+               txq->tx_buf[j].buf_p = NULL;
+
+               status = fec16_to_cpu(bdp->cbd_sc);
+               status &= ~BD_ENET_TX_STATS;
+               status |= BD_ENET_TX_INTR | BD_ENET_TX_LAST;
+               bdp->cbd_datlen = cpu_to_fec16(xsk_desc[i].len);
+               bdp->cbd_bufaddr = cpu_to_fec32(dma);
+
+               if (fep->bufdesc_ex) {
+                       struct bufdesc_ex *ebdp = (struct bufdesc_ex *)bdp;
+
+                       estatus = BD_ENET_TX_INT;
+                       if (fep->quirks & FEC_QUIRK_HAS_AVB)
+                               estatus |= FEC_TX_BD_FTYPE(txq->bd.qid);
+
+                       ebdp->cbd_bdu = 0;
+                       ebdp->cbd_esc = cpu_to_fec32(estatus);
+               }
+
+               /* Make sure the updates to rest of the descriptor are performed
+                * before transferring ownership.
+                */
+               dma_wmb();
+
+               /* Send it on its way.  Tell FEC it's ready, interrupt when done,
+                * it's the last BD of the frame, and to put the CRC on the end.
+                */
+               status |= BD_ENET_TX_READY | BD_ENET_TX_TC;
+               bdp->cbd_sc = cpu_to_fec16(status);
+               dma_wmb();
+
+               bdp = fec_enet_get_nextdesc(bdp, &txq->bd);
+               txq->bd.cur = bdp;
+       }
+
+       /* Trigger transmission start */
+       fec_txq_trigger_xmit(fep, txq);
+
+       __netif_tx_unlock(nq);
+
+       return batch < budget;
+
+tx_unlock:
+       __netif_tx_unlock(nq);
+
+       return true;
+}
+
+static int fec_enet_tx_queue(struct fec_enet_private *fep,
+                            u16 queue, int budget)
 {
        struct netdev_queue *nq = netdev_get_tx_queue(fep->netdev, queue);
        struct fec_enet_priv_tx_q *txq = fep->tx_queue[queue];
@@ -1487,6 +1574,7 @@ static void fec_enet_tx_queue(struct fec_enet_private *fep,
        unsigned short status;
        struct sk_buff *skb;
        struct page *page;
+       int xsk_cnt = 0;
 
        /* get next bdp of dirty_tx */
        bdp = fec_enet_get_nextdesc(bdp, &txq->bd);
@@ -1560,6 +1648,14 @@ static void fec_enet_tx_queue(struct fec_enet_private *fep,
                        page_pool_put_page(pp_page_to_nmdesc(page)->pp, page,
                                           0, true);
                        break;
+               case FEC_TXBUF_T_XSK_XMIT:
+                       bdp->cbd_bufaddr = cpu_to_fec32(0);
+                       xsk_cnt++;
+                       break;
+               case FEC_TXBUF_T_XSK_TX:
+                       bdp->cbd_bufaddr = cpu_to_fec32(0);
+                       xsk_buff_free(tx_buf->buf_p);
+                       break;
                default:
                        break;
                }
@@ -1619,16 +1715,37 @@ out:
        if (bdp != txq->bd.cur &&
            readl(txq->bd.reg_desc_active) == 0)
                writel(0, txq->bd.reg_desc_active);
+
+       if (txq->xsk_pool) {
+               struct xsk_buff_pool *pool = txq->xsk_pool;
+
+               if (xsk_cnt)
+                       xsk_tx_completed(pool, xsk_cnt);
+
+               if (xsk_uses_need_wakeup(pool))
+                       xsk_set_tx_need_wakeup(pool);
+
+               /* If the condition is true, it indicates that there are still
+                * packets to be transmitted, so return "budget" to make the
+                * NAPI continue polling.
+                */
+               if (!fec_enet_xsk_xmit(fep, pool, queue))
+                       return budget;
+       }
+
+       return 0;
 }
 
-static void fec_enet_tx(struct net_device *ndev, int budget)
+static int fec_enet_tx(struct net_device *ndev, int budget)
 {
        struct fec_enet_private *fep = netdev_priv(ndev);
-       int i;
+       int i, count = 0;
 
        /* Make sure that AVB queues are processed first. */
        for (i = fep->num_tx_queues - 1; i >= 0; i--)
-               fec_enet_tx_queue(fep, i, budget);
+               count += fec_enet_tx_queue(fep, i, budget);
+
+       return count;
 }
 
 static int fec_enet_update_cbd(struct fec_enet_priv_rx_q *rxq,
@@ -1641,13 +1758,30 @@ static int fec_enet_update_cbd(struct fec_enet_priv_rx_q *rxq,
        if (unlikely(!new_page))
                return -ENOMEM;
 
-       rxq->rx_buf[index] = new_page;
+       rxq->rx_buf[index].page = new_page;
        phys_addr = page_pool_get_dma_addr(new_page) + FEC_ENET_XDP_HEADROOM;
        bdp->cbd_bufaddr = cpu_to_fec32(phys_addr);
 
        return 0;
 }
 
+static int fec_enet_update_cbd_zc(struct fec_enet_priv_rx_q *rxq,
+                                 struct bufdesc *bdp, int index)
+{
+       struct xdp_buff *new_xdp;
+       dma_addr_t phys_addr;
+
+       new_xdp = xsk_buff_alloc(rxq->xsk_pool);
+       if (unlikely(!new_xdp))
+               return -ENOMEM;
+
+       rxq->rx_buf[index].xdp = new_xdp;
+       phys_addr = xsk_buff_xdp_get_dma(new_xdp);
+       bdp->cbd_bufaddr = cpu_to_fec32(phys_addr);
+
+       return 0;
+}
+
 static void fec_enet_rx_vlan(const struct net_device *ndev, struct sk_buff *skb)
 {
        if (ndev->features & NETIF_F_HW_VLAN_CTAG_RX) {
@@ -1802,7 +1936,7 @@ static int fec_enet_rx_queue(struct fec_enet_private *fep,
                ndev->stats.rx_bytes += pkt_len - fep->rx_shift;
 
                index = fec_enet_get_bd_index(bdp, &rxq->bd);
-               page = rxq->rx_buf[index];
+               page = rxq->rx_buf[index].page;
                dma = fec32_to_cpu(bdp->cbd_bufaddr);
                if (fec_enet_update_cbd(rxq, bdp, index)) {
                        ndev->stats.rx_dropped++;
@@ -1932,7 +2066,7 @@ static int fec_enet_rx_queue_xdp(struct fec_enet_private *fep, int queue,
                ndev->stats.rx_bytes += pkt_len - fep->rx_shift;
 
                index = fec_enet_get_bd_index(bdp, &rxq->bd);
-               page = rxq->rx_buf[index];
+               page = rxq->rx_buf[index].page;
                dma = fec32_to_cpu(bdp->cbd_bufaddr);
 
                if (fec_enet_update_cbd(rxq, bdp, index)) {
@@ -2047,6 +2181,274 @@ rx_processing_done:
        return pkt_received;
 }
 
+static struct sk_buff *fec_build_skb_zc(struct xdp_buff *xsk,
+                                       struct napi_struct *napi)
+{
+       size_t len = xdp_get_buff_len(xsk);
+       struct sk_buff *skb;
+
+       skb = napi_alloc_skb(napi, len);
+       if (unlikely(!skb)) {
+               xsk_buff_free(xsk);
+               return NULL;
+       }
+
+       skb_put_data(skb, xsk->data, len);
+       xsk_buff_free(xsk);
+
+       return skb;
+}
+
+static int fec_enet_xsk_tx_xmit(struct fec_enet_private *fep,
+                               struct xdp_buff *xsk, int cpu,
+                               int queue)
+{
+       struct netdev_queue *nq = netdev_get_tx_queue(fep->netdev, queue);
+       struct fec_enet_priv_tx_q *txq = fep->tx_queue[queue];
+       u32 offset = xsk->data - xsk->data_hard_start;
+       u32 headroom = txq->xsk_pool->headroom;
+       u32 len = xsk->data_end - xsk->data;
+       u32 index, status, estatus;
+       struct bufdesc *bdp;
+       dma_addr_t dma;
+
+       __netif_tx_lock(nq, cpu);
+
+       /* Avoid tx timeout as XDP shares the queue with kernel stack */
+       txq_trans_cond_update(nq);
+
+       if (!fec_enet_get_free_txdesc_num(txq)) {
+               __netif_tx_unlock(nq);
+
+               return -EBUSY;
+       }
+
+       /* Fill in a Tx ring entry */
+       bdp = txq->bd.cur;
+       status = fec16_to_cpu(bdp->cbd_sc);
+       status &= ~BD_ENET_TX_STATS;
+
+       index = fec_enet_get_bd_index(bdp, &txq->bd);
+       dma = xsk_buff_xdp_get_frame_dma(xsk) + headroom + offset;
+
+       xsk_buff_raw_dma_sync_for_device(txq->xsk_pool, dma, len);
+
+       txq->tx_buf[index].buf_p = xsk;
+       txq->tx_buf[index].type = FEC_TXBUF_T_XSK_TX;
+
+       status |= (BD_ENET_TX_INTR | BD_ENET_TX_LAST);
+       if (fep->bufdesc_ex)
+               estatus = BD_ENET_TX_INT;
+
+       bdp->cbd_bufaddr = cpu_to_fec32(dma);
+       bdp->cbd_datlen = cpu_to_fec16(len);
+
+       if (fep->bufdesc_ex) {
+               struct bufdesc_ex *ebdp = (struct bufdesc_ex *)bdp;
+
+               if (fep->quirks & FEC_QUIRK_HAS_AVB)
+                       estatus |= FEC_TX_BD_FTYPE(txq->bd.qid);
+
+               ebdp->cbd_bdu = 0;
+               ebdp->cbd_esc = cpu_to_fec32(estatus);
+       }
+
+       dma_wmb();
+       status |= BD_ENET_TX_READY | BD_ENET_TX_TC;
+       bdp->cbd_sc = cpu_to_fec16(status);
+       dma_wmb();
+
+       bdp = fec_enet_get_nextdesc(bdp, &txq->bd);
+       txq->bd.cur = bdp;
+
+       __netif_tx_unlock(nq);
+
+       return 0;
+}
+
+static int fec_enet_rx_queue_xsk(struct fec_enet_private *fep, int queue,
+                                int budget, struct bpf_prog *prog)
+{
+       u32 data_start = FEC_ENET_XDP_HEADROOM + fep->rx_shift;
+       struct fec_enet_priv_rx_q *rxq = fep->rx_queue[queue];
+       struct net_device *ndev = fep->netdev;
+       struct bufdesc *bdp = rxq->bd.cur;
+       u32 sub_len = 4 + fep->rx_shift;
+       int cpu = smp_processor_id();
+       bool wakeup_xsk = false;
+       struct xdp_buff *xsk;
+       int pkt_received = 0;
+       struct sk_buff *skb;
+       u16 status, pkt_len;
+       u32 xdp_res = 0;
+       int index, err;
+       u32 act;
+
+#if defined(CONFIG_COLDFIRE) && !defined(CONFIG_COLDFIRE_COHERENT_DMA)
+       /*
+        * Hacky flush of all caches instead of using the DMA API for the TSO
+        * headers.
+        */
+       flush_cache_all();
+#endif
+
+       while (!((status = fec16_to_cpu(bdp->cbd_sc)) & BD_ENET_RX_EMPTY)) {
+               if (unlikely(pkt_received >= budget))
+                       break;
+
+               writel(FEC_ENET_RXF_GET(queue), fep->hwp + FEC_IEVENT);
+
+               index = fec_enet_get_bd_index(bdp, &rxq->bd);
+               xsk = rxq->rx_buf[index].xdp;
+               if (unlikely(!xsk)) {
+                       if (fec_enet_update_cbd_zc(rxq, bdp, index))
+                               break;
+
+                       if (fep->bufdesc_ex) {
+                               struct bufdesc_ex *ebdp = (struct bufdesc_ex *)bdp;
+
+                               ebdp->cbd_esc = cpu_to_fec32(BD_ENET_RX_INT);
+                               ebdp->cbd_prot = 0;
+                               ebdp->cbd_bdu = 0;
+                       }
+
+                       dma_wmb();
+                       status &= ~BD_ENET_RX_STATS;
+                       status |= BD_ENET_RX_EMPTY;
+                       bdp->cbd_sc = cpu_to_fec16(status);
+                       break;
+               }
+
+               pkt_received++;
+               /* Check for errors. */
+               status ^= BD_ENET_RX_LAST;
+               if (unlikely(fec_rx_error_check(ndev, status)))
+                       goto rx_processing_done;
+
+               /* Process the incoming frame. */
+               ndev->stats.rx_packets++;
+               pkt_len = fec16_to_cpu(bdp->cbd_datlen);
+               ndev->stats.rx_bytes += pkt_len - fep->rx_shift;
+
+               if (fec_enet_update_cbd_zc(rxq, bdp, index)) {
+                       ndev->stats.rx_dropped++;
+                       goto rx_processing_done;
+               }
+
+               pkt_len -= sub_len;
+               xsk->data = xsk->data_hard_start + data_start;
+               /* Subtract FCS and 16bit shift */
+               xsk->data_end = xsk->data + pkt_len;
+               xsk->data_meta = xsk->data;
+               xsk_buff_dma_sync_for_cpu(xsk);
+
+               /* If the XSK pool is enabled before the bpf program is
+                * installed, or the bpf program is uninstalled before
+                * the XSK pool is disabled, prog will be NULL, so fall
+                * back to a default XDP_PASS action.
+                */
+               if (unlikely(!prog))
+                       act = XDP_PASS;
+               else
+                       act = bpf_prog_run_xdp(prog, xsk);
+
+               switch (act) {
+               case XDP_PASS:
+                       rxq->stats[RX_XDP_PASS]++;
+                       skb = fec_build_skb_zc(xsk, &fep->napi);
+                       if (unlikely(!skb)) {
+                               ndev->stats.rx_dropped++;
+                               trace_xdp_exception(ndev, prog, XDP_PASS);
+                       } else {
+                               napi_gro_receive(&fep->napi, skb);
+                       }
+
+                       break;
+               case XDP_TX:
+                       rxq->stats[RX_XDP_TX]++;
+                       err = fec_enet_xsk_tx_xmit(fep, xsk, cpu, queue);
+                       if (unlikely(err)) {
+                               rxq->stats[RX_XDP_TX_ERRORS]++;
+                               xsk_buff_free(xsk);
+                               trace_xdp_exception(ndev, prog, XDP_TX);
+                       } else {
+                               xdp_res |= FEC_ENET_XDP_TX;
+                       }
+                       break;
+               case XDP_REDIRECT:
+                       rxq->stats[RX_XDP_REDIRECT]++;
+                       err = xdp_do_redirect(ndev, xsk, prog);
+                       if (unlikely(err)) {
+                               if (err == -ENOBUFS)
+                                       wakeup_xsk = true;
+
+                               rxq->stats[RX_XDP_DROP]++;
+                               xsk_buff_free(xsk);
+                               trace_xdp_exception(ndev, prog, XDP_REDIRECT);
+                       } else {
+                               xdp_res |= FEC_ENET_XDP_REDIR;
+                       }
+                       break;
+               default:
+                       bpf_warn_invalid_xdp_action(ndev, prog, act);
+                       fallthrough;
+               case XDP_ABORTED:
+                       trace_xdp_exception(ndev, prog, act);
+                       fallthrough;
+               case XDP_DROP:
+                       rxq->stats[RX_XDP_DROP]++;
+                       xsk_buff_free(xsk);
+                       break;
+               }
+
+rx_processing_done:
+               /* Clear the status flags for this buffer */
+               status &= ~BD_ENET_RX_STATS;
+               /* Mark the buffer empty */
+               status |= BD_ENET_RX_EMPTY;
+
+               if (fep->bufdesc_ex) {
+                       struct bufdesc_ex *ebdp = (struct bufdesc_ex *)bdp;
+
+                       ebdp->cbd_esc = cpu_to_fec32(BD_ENET_RX_INT);
+                       ebdp->cbd_prot = 0;
+                       ebdp->cbd_bdu = 0;
+               }
+
+               /* Make sure the updates to rest of the descriptor are
+                * performed before transferring ownership.
+                */
+               dma_wmb();
+               bdp->cbd_sc = cpu_to_fec16(status);
+
+               /* Update BD pointer to next entry */
+               bdp = fec_enet_get_nextdesc(bdp, &rxq->bd);
+
+               /* Doing this here will keep the FEC running while we process
+                * incoming frames. On a heavily loaded network, we should be
+                * able to keep up at the expense of system resources.
+                */
+               writel(0, rxq->bd.reg_desc_active);
+       }
+
+       rxq->bd.cur = bdp;
+
+       if (xdp_res & FEC_ENET_XDP_REDIR)
+               xdp_do_flush();
+
+       if (xdp_res & FEC_ENET_XDP_TX)
+               fec_txq_trigger_xmit(fep, fep->tx_queue[queue]);
+
+       if (rxq->xsk_pool && xsk_uses_need_wakeup(rxq->xsk_pool)) {
+               if (wakeup_xsk)
+                       xsk_set_rx_need_wakeup(rxq->xsk_pool);
+               else
+                       xsk_clear_rx_need_wakeup(rxq->xsk_pool);
+       }
+
+       return pkt_received;
+}
+
 static int fec_enet_rx(struct net_device *ndev, int budget)
 {
        struct fec_enet_private *fep = netdev_priv(ndev);
@@ -2055,11 +2457,15 @@ static int fec_enet_rx(struct net_device *ndev, int budget)
 
        /* Make sure that AVB queues are processed first. */
        for (i = fep->num_rx_queues - 1; i >= 0; i--) {
-               if (prog)
-                       done += fec_enet_rx_queue_xdp(fep, i, budget - done,
-                                                     prog);
+               struct fec_enet_priv_rx_q *rxq = fep->rx_queue[i];
+               int batch = budget - done;
+
+               if (rxq->xsk_pool)
+                       done += fec_enet_rx_queue_xsk(fep, i, batch, prog);
+               else if (prog)
+                       done += fec_enet_rx_queue_xdp(fep, i, batch, prog);
                else
-                       done += fec_enet_rx_queue(fep, i, budget - done);
+                       done += fec_enet_rx_queue(fep, i, batch);
        }
 
        return done;
@@ -2103,19 +2509,22 @@ static int fec_enet_rx_napi(struct napi_struct *napi, int budget)
 {
        struct net_device *ndev = napi->dev;
        struct fec_enet_private *fep = netdev_priv(ndev);
-       int done = 0;
+       int rx_done = 0, tx_done = 0;
+       int max_done;
 
        do {
-               done += fec_enet_rx(ndev, budget - done);
-               fec_enet_tx(ndev, budget);
-       } while ((done < budget) && fec_enet_collect_events(fep));
+               rx_done += fec_enet_rx(ndev, budget - rx_done);
+               tx_done += fec_enet_tx(ndev, budget);
+               max_done = max(rx_done, tx_done);
+       } while ((max_done < budget) && fec_enet_collect_events(fep));
 
-       if (done < budget) {
-               napi_complete_done(napi, done);
+       if (max_done < budget) {
+               napi_complete_done(napi, max_done);
                writel(FEC_DEFAULT_IMASK, fep->hwp + FEC_IMASK);
+               return max_done;
        }
 
-       return done;
+       return budget;
 }
 
 /* ------------------------------------------------------------------------- */
@@ -3406,7 +3815,8 @@ static int fec_xdp_rxq_info_reg(struct fec_enet_private *fep,
                                struct fec_enet_priv_rx_q *rxq)
 {
        struct net_device *ndev = fep->netdev;
-       int err;
+       void *allocator;
+       int type, err;
 
        err = xdp_rxq_info_reg(&rxq->xdp_rxq, ndev, rxq->id, 0);
        if (err) {
@@ -3414,8 +3824,9 @@ static int fec_xdp_rxq_info_reg(struct fec_enet_private *fep,
                return err;
        }
 
-       err = xdp_rxq_info_reg_mem_model(&rxq->xdp_rxq, MEM_TYPE_PAGE_POOL,
-                                        rxq->page_pool);
+       allocator = rxq->xsk_pool ? NULL : rxq->page_pool;
+       type = rxq->xsk_pool ? MEM_TYPE_XSK_BUFF_POOL : MEM_TYPE_PAGE_POOL;
+       err = xdp_rxq_info_reg_mem_model(&rxq->xdp_rxq, type, allocator);
        if (err) {
                netdev_err(ndev, "Failed to register XDP mem model\n");
                xdp_rxq_info_unreg(&rxq->xdp_rxq);
@@ -3423,6 +3834,9 @@ static int fec_xdp_rxq_info_reg(struct fec_enet_private *fep,
                return err;
        }
 
+       if (rxq->xsk_pool)
+               xsk_pool_set_rxq_info(rxq->xsk_pool, &rxq->xdp_rxq);
+
        return 0;
 }
 
@@ -3436,20 +3850,28 @@ static void fec_xdp_rxq_info_unreg(struct fec_enet_priv_rx_q *rxq)
 
 static void fec_free_rxq_buffers(struct fec_enet_priv_rx_q *rxq)
 {
+       bool xsk = !!rxq->xsk_pool;
        int i;
 
        for (i = 0; i < rxq->bd.ring_size; i++) {
-               struct page *page = rxq->rx_buf[i];
+               union fec_rx_buffer *buf = &rxq->rx_buf[i];
 
-               if (!page)
+               if (!buf->buf_p)
                        continue;
 
-               page_pool_put_full_page(rxq->page_pool, page, false);
-               rxq->rx_buf[i] = NULL;
+               if (xsk)
+                       xsk_buff_free(buf->xdp);
+               else
+                       page_pool_put_full_page(rxq->page_pool,
+                                               buf->page, false);
+
+               rxq->rx_buf[i].buf_p = NULL;
        }
 
-       page_pool_destroy(rxq->page_pool);
-       rxq->page_pool = NULL;
+       if (!xsk) {
+               page_pool_destroy(rxq->page_pool);
+               rxq->page_pool = NULL;
+       }
 }
 
 static void fec_enet_free_buffers(struct net_device *ndev)
@@ -3489,6 +3911,9 @@ static void fec_enet_free_buffers(struct net_device *ndev)
                                page_pool_put_page(pp_page_to_nmdesc(page)->pp,
                                                   page, 0, false);
                                break;
+                       case FEC_TXBUF_T_XSK_TX:
+                               xsk_buff_free(txq->tx_buf[i].buf_p);
+                               break;
                        default:
                                break;
                        }
@@ -3604,7 +4029,7 @@ static int fec_alloc_rxq_buffers_pp(struct fec_enet_private *fep,
 
                phys_addr = page_pool_get_dma_addr(page) + FEC_ENET_XDP_HEADROOM;
                bdp->cbd_bufaddr = cpu_to_fec32(phys_addr);
-               rxq->rx_buf[i] = page;
+               rxq->rx_buf[i].page = page;
                bdp = fec_enet_get_nextdesc(bdp, &rxq->bd);
        }
 
@@ -3616,6 +4041,33 @@ free_rx_buffers:
        return err;
 }
 
+static int fec_alloc_rxq_buffers_zc(struct fec_enet_private *fep,
+                                   struct fec_enet_priv_rx_q *rxq)
+{
+       union fec_rx_buffer *buf = &rxq->rx_buf[0];
+       struct bufdesc *bdp = rxq->bd.base;
+       dma_addr_t phys_addr;
+       int i;
+
+       for (i = 0; i < rxq->bd.ring_size; i++) {
+               buf[i].xdp = xsk_buff_alloc(rxq->xsk_pool);
+               if (!buf[i].xdp)
+                       break;
+
+               phys_addr = xsk_buff_xdp_get_dma(buf[i].xdp);
+               bdp->cbd_bufaddr = cpu_to_fec32(phys_addr);
+               bdp = fec_enet_get_nextdesc(bdp, &rxq->bd);
+       }
+
+       for (; i < rxq->bd.ring_size; i++) {
+               buf[i].xdp = NULL;
+               bdp->cbd_bufaddr = cpu_to_fec32(0);
+               bdp = fec_enet_get_nextdesc(bdp, &rxq->bd);
+       }
+
+       return 0;
+}
+
 static int
 fec_enet_alloc_rxq_buffers(struct net_device *ndev, unsigned int queue)
 {
@@ -3624,9 +4076,16 @@ fec_enet_alloc_rxq_buffers(struct net_device *ndev, unsigned int queue)
        int err;
 
        rxq = fep->rx_queue[queue];
-       err = fec_alloc_rxq_buffers_pp(fep, rxq);
-       if (err)
-               goto free_buffers;
+       if (rxq->xsk_pool) {
+               /* RX XDP ZC buffer pool may not be populated, e.g.
+                * xdpsock TX-only.
+                */
+               fec_alloc_rxq_buffers_zc(fep, rxq);
+       } else {
+               err = fec_alloc_rxq_buffers_pp(fep, rxq);
+               if (err)
+                       goto free_buffers;
+       }
 
        err = fec_xdp_rxq_info_reg(fep, rxq);
        if (err)
@@ -3951,21 +4410,237 @@ static u16 fec_enet_select_queue(struct net_device *ndev, struct sk_buff *skb,
        return fec_enet_vlan_pri_to_queue[vlan_tag >> 13];
 }
 
+static void fec_free_rxq(struct fec_enet_priv_rx_q *rxq)
+{
+       fec_xdp_rxq_info_unreg(rxq);
+       fec_free_rxq_buffers(rxq);
+       kfree(rxq);
+}
+
+static struct fec_enet_priv_rx_q *
+fec_alloc_new_rxq_xsk(struct fec_enet_private *fep, int queue,
+                     struct xsk_buff_pool *pool)
+{
+       struct fec_enet_priv_rx_q *old_rxq = fep->rx_queue[queue];
+       struct fec_enet_priv_rx_q *rxq;
+       union fec_rx_buffer *buf;
+       int i;
+
+       rxq = kzalloc(sizeof(*rxq), GFP_KERNEL);
+       if (!rxq)
+               return NULL;
+
+       /* Copy the BD ring to the new rxq */
+       rxq->bd = old_rxq->bd;
+       rxq->id = queue;
+       rxq->xsk_pool = pool;
+       buf = &rxq->rx_buf[0];
+
+       for (i = 0; i < rxq->bd.ring_size; i++) {
+               buf[i].xdp = xsk_buff_alloc(pool);
+               /* RX XDP ZC buffer pool may not be populated, e.g.
+                * xdpsock TX-only.
+                */
+               if (!buf[i].xdp)
+                       break;
+       }
+
+       if (fec_xdp_rxq_info_reg(fep, rxq))
+               goto free_buffers;
+
+       return rxq;
+
+free_buffers:
+       while (--i >= 0)
+               xsk_buff_free(buf[i].xdp);
+
+       kfree(rxq);
+
+       return NULL;
+}
+
+static struct fec_enet_priv_rx_q *
+fec_alloc_new_rxq_pp(struct fec_enet_private *fep, int queue)
+{
+       struct fec_enet_priv_rx_q *old_rxq = fep->rx_queue[queue];
+       struct fec_enet_priv_rx_q *rxq;
+       union fec_rx_buffer *buf;
+       int i = 0;
+
+       rxq = kzalloc(sizeof(*rxq), GFP_KERNEL);
+       if (!rxq)
+               return NULL;
+
+       rxq->bd = old_rxq->bd;
+       rxq->id = queue;
+
+       if (fec_enet_create_page_pool(fep, rxq))
+               goto free_rxq;
+
+       buf = &rxq->rx_buf[0];
+       for (; i < rxq->bd.ring_size; i++) {
+               buf[i].page = page_pool_dev_alloc_pages(rxq->page_pool);
+               if (!buf[i].page)
+                       goto free_buffers;
+       }
+
+       if (fec_xdp_rxq_info_reg(fep, rxq))
+               goto free_buffers;
+
+       return rxq;
+
+free_buffers:
+       while (--i >= 0)
+               page_pool_put_full_page(rxq->page_pool,
+                                       buf[i].page, false);
+
+       page_pool_destroy(rxq->page_pool);
+free_rxq:
+       kfree(rxq);
+
+       return NULL;
+}
+
+static void fec_init_rxq_bd_buffers(struct fec_enet_priv_rx_q *rxq, bool xsk)
+{
+       union fec_rx_buffer *buf = &rxq->rx_buf[0];
+       struct bufdesc *bdp = rxq->bd.base;
+       dma_addr_t dma;
+
+       for (int i = 0; i < rxq->bd.ring_size; i++) {
+               if (xsk)
+                       dma = buf[i].xdp ?
+                             xsk_buff_xdp_get_dma(buf[i].xdp) : 0;
+               else
+                       dma = page_pool_get_dma_addr(buf[i].page) +
+                             FEC_ENET_XDP_HEADROOM;
+
+               bdp->cbd_bufaddr = cpu_to_fec32(dma);
+               bdp = fec_enet_get_nextdesc(bdp, &rxq->bd);
+       }
+}
+
+static int fec_xsk_restart_napi(struct fec_enet_private *fep,
+                               struct xsk_buff_pool *pool,
+                               u16 queue)
+{
+       struct fec_enet_priv_tx_q *txq = fep->tx_queue[queue];
+       struct net_device *ndev = fep->netdev;
+       struct fec_enet_priv_rx_q *rxq;
+       int err;
+
+       napi_disable(&fep->napi);
+       netif_tx_disable(ndev);
+       synchronize_rcu();
+
+       rxq = pool ? fec_alloc_new_rxq_xsk(fep, queue, pool) :
+                    fec_alloc_new_rxq_pp(fep, queue);
+       if (!rxq) {
+               err = -ENOMEM;
+               goto err_alloc_new_rxq;
+       }
+
+       /* Replace the old rxq with the new rxq */
+       fec_free_rxq(fep->rx_queue[queue]);
+       fep->rx_queue[queue] = rxq;
+       fec_init_rxq_bd_buffers(rxq, !!pool);
+       txq->xsk_pool = pool;
+
+       fec_restart(ndev);
+       napi_enable(&fep->napi);
+       netif_tx_start_all_queues(ndev);
+
+       return 0;
+
+err_alloc_new_rxq:
+       napi_enable(&fep->napi);
+       netif_tx_start_all_queues(ndev);
+
+       return err;
+}
+
+static int fec_enable_xsk_pool(struct fec_enet_private *fep,
+                              struct xsk_buff_pool *pool,
+                              u16 queue)
+{
+       int err;
+
+       err = xsk_pool_dma_map(pool, &fep->pdev->dev, 0);
+       if (err) {
+               netdev_err(fep->netdev, "Failed to map xsk pool\n");
+               return err;
+       }
+
+       if (!netif_running(fep->netdev)) {
+               struct fec_enet_priv_rx_q *rxq = fep->rx_queue[queue];
+               struct fec_enet_priv_tx_q *txq = fep->tx_queue[queue];
+
+               rxq->xsk_pool = pool;
+               txq->xsk_pool = pool;
+
+               return 0;
+       }
+
+       err = fec_xsk_restart_napi(fep, pool, queue);
+       if (err) {
+               xsk_pool_dma_unmap(pool, 0);
+               return err;
+       }
+
+       return 0;
+}
+
+static int fec_disable_xsk_pool(struct fec_enet_private *fep,
+                               u16 queue)
+{
+       struct fec_enet_priv_tx_q *txq = fep->tx_queue[queue];
+       struct xsk_buff_pool *old_pool = txq->xsk_pool;
+       int err;
+
+       if (!netif_running(fep->netdev)) {
+               struct fec_enet_priv_rx_q *rxq = fep->rx_queue[queue];
+
+               xsk_pool_dma_unmap(old_pool, 0);
+               rxq->xsk_pool = NULL;
+               txq->xsk_pool = NULL;
+
+               return 0;
+       }
+
+       err = fec_xsk_restart_napi(fep, NULL, queue);
+       if (err)
+               return err;
+
+       xsk_pool_dma_unmap(old_pool, 0);
+
+       return 0;
+}
+
+static int fec_setup_xsk_pool(struct fec_enet_private *fep,
+                             struct xsk_buff_pool *pool,
+                             u16 queue)
+{
+       if (queue >= fep->num_rx_queues || queue >= fep->num_tx_queues)
+               return -ERANGE;
+
+       return pool ? fec_enable_xsk_pool(fep, pool, queue) :
+                     fec_disable_xsk_pool(fep, queue);
+}
+
 static int fec_enet_bpf(struct net_device *dev, struct netdev_bpf *bpf)
 {
        struct fec_enet_private *fep = netdev_priv(dev);
        bool is_run = netif_running(dev);
        struct bpf_prog *old_prog;
 
+       /* No need to support SoCs that require frame swapping, because
+        * the performance would be no better than skb mode.
+        */
+       if (fep->quirks & FEC_QUIRK_SWAP_FRAME)
+               return -EOPNOTSUPP;
+
        switch (bpf->command) {
        case XDP_SETUP_PROG:
-               /* No need to support the SoCs that require to
-                * do the frame swap because the performance wouldn't be
-                * better than the skb mode.
-                */
-               if (fep->quirks & FEC_QUIRK_SWAP_FRAME)
-                       return -EOPNOTSUPP;
-
                if (!bpf->prog)
                        xdp_features_clear_redirect_target(dev);
 
@@ -3989,10 +4664,9 @@ static int fec_enet_bpf(struct net_device *dev, struct netdev_bpf *bpf)
                        xdp_features_set_redirect_target(dev, false);
 
                return 0;
-
        case XDP_SETUP_XSK_POOL:
-               return -EOPNOTSUPP;
-
+               return fec_setup_xsk_pool(fep, bpf->xsk.pool,
+                                         bpf->xsk.queue_id);
        default:
                return -EOPNOTSUPP;
        }
@@ -4140,6 +4814,29 @@ static int fec_enet_xdp_xmit(struct net_device *dev,
        return sent_frames;
 }
 
+static int fec_enet_xsk_wakeup(struct net_device *ndev, u32 queue, u32 flags)
+{
+       struct fec_enet_private *fep = netdev_priv(ndev);
+       struct fec_enet_priv_rx_q *rxq;
+
+       if (!netif_running(ndev) || !netif_carrier_ok(ndev))
+               return -ENETDOWN;
+
+       if (queue >= fep->num_rx_queues || queue >= fep->num_tx_queues)
+               return -ERANGE;
+
+       rxq = fep->rx_queue[queue];
+       if (!rxq->xsk_pool)
+               return -EINVAL;
+
+       if (!napi_if_scheduled_mark_missed(&fep->napi)) {
+               if (likely(napi_schedule_prep(&fep->napi)))
+                       __napi_schedule(&fep->napi);
+       }
+
+       return 0;
+}
+
 static int fec_hwtstamp_get(struct net_device *ndev,
                            struct kernel_hwtstamp_config *config)
 {
@@ -4202,6 +4899,7 @@ static const struct net_device_ops fec_netdev_ops = {
        .ndo_set_features       = fec_set_features,
        .ndo_bpf                = fec_enet_bpf,
        .ndo_xdp_xmit           = fec_enet_xdp_xmit,
+       .ndo_xsk_wakeup         = fec_enet_xsk_wakeup,
        .ndo_hwtstamp_get       = fec_hwtstamp_get,
        .ndo_hwtstamp_set       = fec_hwtstamp_set,
 };
@@ -4329,7 +5027,8 @@ static int fec_enet_init(struct net_device *ndev)
 
        if (!(fep->quirks & FEC_QUIRK_SWAP_FRAME))
                ndev->xdp_features = NETDEV_XDP_ACT_BASIC |
-                                    NETDEV_XDP_ACT_REDIRECT;
+                                    NETDEV_XDP_ACT_REDIRECT |
+                                    NETDEV_XDP_ACT_XSK_ZEROCOPY;
 
        fec_restart(ndev);