ice: implement configurable header split for regular Rx
author    Alexander Lobakin <aleksander.lobakin@intel.com>
          Mon, 6 Oct 2025 16:20:53 +0000 (18:20 +0200)
committer Tony Nguyen <anthony.l.nguyen@intel.com>
          Wed, 29 Oct 2025 20:55:21 +0000 (13:55 -0700)
Add a second page_pool for header buffers to each Rx queue and the
ability to toggle header split on/off via Ethtool (it defaults to off
to match the current behaviour).

Unlike idpf, none of the HW supported by ice requires any workarounds
and it correctly splits all types of packets as configured: after the
L4 header for TCP/UDP/SCTP, after the L3 header for other IPv4/IPv6
frames, and after the Ethernet header otherwise (for tunneled traffic,
same as above, but after the innermost headers).

This doesn't affect the XSk path, as there is no benefit to having
header split there.
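
For reference, a minimal usage sketch via the standard ethtool ringparam
interface (the interface name "eth0" is only a placeholder and the exact
output wording may differ between ethtool versions):

    # query the current setting
    ethtool -g eth0 | grep -i 'tcp data split'
    # enable header split (maps to the tcp-data-split ringparam)
    ethtool -G eth0 tcp-data-split on
    # revert to the default
    ethtool -G eth0 tcp-data-split off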

Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Tested-by: Alexander Nowlin <alexander.nowlin@intel.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
drivers/net/ethernet/intel/ice/ice.h
drivers/net/ethernet/intel/ice/ice_base.c
drivers/net/ethernet/intel/ice/ice_ethtool.c
drivers/net/ethernet/intel/ice/ice_lan_tx_rx.h
drivers/net/ethernet/intel/ice/ice_txrx.c
drivers/net/ethernet/intel/ice/ice_txrx.h

diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h
index 3d4d8b88631b24943a16a001928885c78c3c6e03..147aaee192a791d4c721f69d4e59fc13b2c07a3d 100644 (file)
--- a/drivers/net/ethernet/intel/ice/ice.h
+++ b/drivers/net/ethernet/intel/ice/ice.h
@@ -351,6 +351,7 @@ struct ice_vsi {
        u16 num_q_vectors;
        /* tell if only dynamic irq allocation is allowed */
        bool irq_dyn_alloc;
+       bool hsplit:1;
 
        u16 vsi_num;                    /* HW (absolute) index of this VSI */
        u16 idx;                        /* software index in pf->vsi[] */
diff --git a/drivers/net/ethernet/intel/ice/ice_base.c b/drivers/net/ethernet/intel/ice/ice_base.c
index eabab50fab33d97ed72d96d24163504410d24489..eadb1e3d12b3a839da4a91a263908eb87350c1e7 100644 (file)
--- a/drivers/net/ethernet/intel/ice/ice_base.c
+++ b/drivers/net/ethernet/intel/ice/ice_base.c
@@ -524,8 +524,29 @@ static int ice_setup_rx_ctx(struct ice_rx_ring *ring)
        else
                rlan_ctx.l2tsel = 1;
 
-       rlan_ctx.dtype = ICE_RX_DTYPE_NO_SPLIT;
-       rlan_ctx.hsplit_0 = ICE_RLAN_RX_HSPLIT_0_NO_SPLIT;
+       if (ring->hdr_pp) {
+               rlan_ctx.hbuf = ring->rx_hdr_len >> ICE_RLAN_CTX_HBUF_S;
+               rlan_ctx.dtype = ICE_RX_DTYPE_HEADER_SPLIT;
+
+               /*
+                * If the frame is TCP/UDP/SCTP, it is split right after the
+                * L4 header, i.e. at the start of the payload.
+                * If not, but it's an IPv4/IPv6 frame, it is split right
+                * after the IP header.
+                * If not IP, it is split right after the Ethernet header.
+                *
+                * In any case, the header buffer will never be left empty.
+                */
+               rlan_ctx.hsplit_0 = ICE_RLAN_RX_HSPLIT_0_SPLIT_L2 |
+                                   ICE_RLAN_RX_HSPLIT_0_SPLIT_IP |
+                                   ICE_RLAN_RX_HSPLIT_0_SPLIT_TCP_UDP |
+                                   ICE_RLAN_RX_HSPLIT_0_SPLIT_SCTP;
+       } else {
+               rlan_ctx.hbuf = 0;
+               rlan_ctx.dtype = ICE_RX_DTYPE_NO_SPLIT;
+               rlan_ctx.hsplit_0 = ICE_RLAN_RX_HSPLIT_0_NO_SPLIT;
+       }
+
        rlan_ctx.hsplit_1 = ICE_RLAN_RX_HSPLIT_1_NO_SPLIT;
 
        /* This controls whether VLAN is stripped from inner headers
@@ -581,6 +602,53 @@ static int ice_setup_rx_ctx(struct ice_rx_ring *ring)
        return 0;
 }
 
+static int ice_rxq_pp_create(struct ice_rx_ring *rq)
+{
+       struct libeth_fq fq = {
+               .count          = rq->count,
+               .nid            = NUMA_NO_NODE,
+               .hsplit         = rq->vsi->hsplit,
+               .xdp            = ice_is_xdp_ena_vsi(rq->vsi),
+               .buf_len        = LIBIE_MAX_RX_BUF_LEN,
+       };
+       int err;
+
+       err = libeth_rx_fq_create(&fq, &rq->q_vector->napi);
+       if (err)
+               return err;
+
+       rq->pp = fq.pp;
+       rq->rx_fqes = fq.fqes;
+       rq->truesize = fq.truesize;
+       rq->rx_buf_len = fq.buf_len;
+
+       if (!fq.hsplit)
+               return 0;
+
+       fq = (struct libeth_fq){
+               .count          = rq->count,
+               .type           = LIBETH_FQE_HDR,
+               .nid            = NUMA_NO_NODE,
+               .xdp            = ice_is_xdp_ena_vsi(rq->vsi),
+       };
+
+       err = libeth_rx_fq_create(&fq, &rq->q_vector->napi);
+       if (err)
+               goto destroy;
+
+       rq->hdr_pp = fq.pp;
+       rq->hdr_fqes = fq.fqes;
+       rq->hdr_truesize = fq.truesize;
+       rq->rx_hdr_len = fq.buf_len;
+
+       return 0;
+
+destroy:
+       ice_rxq_pp_destroy(rq);
+
+       return err;
+}
+
 /**
  * ice_vsi_cfg_rxq - Configure an Rx queue
  * @ring: the ring being configured
@@ -589,12 +657,6 @@ static int ice_setup_rx_ctx(struct ice_rx_ring *ring)
  */
 static int ice_vsi_cfg_rxq(struct ice_rx_ring *ring)
 {
-       struct libeth_fq fq = {
-               .count          = ring->count,
-               .nid            = NUMA_NO_NODE,
-               .xdp            = ice_is_xdp_ena_vsi(ring->vsi),
-               .buf_len        = LIBIE_MAX_RX_BUF_LEN,
-       };
        struct device *dev = ice_pf_to_dev(ring->vsi->back);
        u32 num_bufs = ICE_DESC_UNUSED(ring);
        u32 rx_buf_len;
@@ -636,15 +698,10 @@ static int ice_vsi_cfg_rxq(struct ice_rx_ring *ring)
                        dev_info(dev, "Registered XDP mem model MEM_TYPE_XSK_BUFF_POOL on Rx ring %d\n",
                                 ring->q_index);
                } else {
-                       err = libeth_rx_fq_create(&fq, &ring->q_vector->napi);
+                       err = ice_rxq_pp_create(ring);
                        if (err)
                                return err;
 
-                       ring->pp = fq.pp;
-                       ring->rx_fqes = fq.fqes;
-                       ring->truesize = fq.truesize;
-                       ring->rx_buf_len = fq.buf_len;
-
                        if (!xdp_rxq_info_is_reg(&ring->xdp_rxq)) {
                                err = __xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev,
                                                         ring->q_index,
@@ -699,9 +756,7 @@ static int ice_vsi_cfg_rxq(struct ice_rx_ring *ring)
        return 0;
 
 err_destroy_fq:
-       libeth_rx_fq_destroy(&fq);
-       ring->rx_fqes = NULL;
-       ring->pp = NULL;
+       ice_rxq_pp_destroy(ring);
 
        return err;
 }
diff --git a/drivers/net/ethernet/intel/ice/ice_ethtool.c b/drivers/net/ethernet/intel/ice/ice_ethtool.c
index 36fdac4fddc3cd3bbbce6502f3db92668b29cf4d..a1d9abee97e5f22ad60b518da9af740b3f60e65d 100644 (file)
--- a/drivers/net/ethernet/intel/ice/ice_ethtool.c
+++ b/drivers/net/ethernet/intel/ice/ice_ethtool.c
@@ -3151,6 +3151,10 @@ ice_get_ringparam(struct net_device *netdev, struct ethtool_ringparam *ring,
        ring->rx_jumbo_max_pending = 0;
        ring->rx_mini_pending = 0;
        ring->rx_jumbo_pending = 0;
+
+       kernel_ring->tcp_data_split = vsi->hsplit ?
+                                     ETHTOOL_TCP_DATA_SPLIT_ENABLED :
+                                     ETHTOOL_TCP_DATA_SPLIT_DISABLED;
 }
 
 static int
@@ -3167,6 +3171,7 @@ ice_set_ringparam(struct net_device *netdev, struct ethtool_ringparam *ring,
        int i, timeout = 50, err = 0;
        struct ice_hw *hw = &pf->hw;
        u16 new_rx_cnt, new_tx_cnt;
+       bool hsplit;
 
        if (ring->tx_pending > ICE_MAX_NUM_DESC_BY_MAC(hw) ||
            ring->tx_pending < ICE_MIN_NUM_DESC ||
@@ -3192,9 +3197,12 @@ ice_set_ringparam(struct net_device *netdev, struct ethtool_ringparam *ring,
                netdev_info(netdev, "Requested Rx descriptor count rounded up to %d\n",
                            new_rx_cnt);
 
+       hsplit = kernel_ring->tcp_data_split == ETHTOOL_TCP_DATA_SPLIT_ENABLED;
+
        /* if nothing to do return success */
        if (new_tx_cnt == vsi->tx_rings[0]->count &&
-           new_rx_cnt == vsi->rx_rings[0]->count) {
+           new_rx_cnt == vsi->rx_rings[0]->count &&
+           hsplit == vsi->hsplit) {
                netdev_dbg(netdev, "Nothing to change, descriptor count is same as requested\n");
                return 0;
        }
@@ -3224,6 +3232,8 @@ ice_set_ringparam(struct net_device *netdev, struct ethtool_ringparam *ring,
                                vsi->xdp_rings[i]->count = new_tx_cnt;
                vsi->num_tx_desc = (u16)new_tx_cnt;
                vsi->num_rx_desc = (u16)new_rx_cnt;
+               vsi->hsplit = hsplit;
+
                netdev_dbg(netdev, "Link is down, descriptor count change happens when link is brought up\n");
                goto done;
        }
@@ -3330,6 +3340,8 @@ rx_unwind:
        }
 
 process_link:
+       vsi->hsplit = hsplit;
+
        /* Bring interface down, copy in the new ring info, then restore the
         * interface. if VSI is up, bring it down and then back up
         */
@@ -4811,6 +4823,7 @@ static const struct ethtool_ops ice_ethtool_ops = {
                                     ETHTOOL_COALESCE_USE_ADAPTIVE |
                                     ETHTOOL_COALESCE_RX_USECS_HIGH,
        .supported_input_xfrm   = RXH_XFRM_SYM_XOR,
+       .supported_ring_params  = ETHTOOL_RING_USE_TCP_DATA_SPLIT,
        .get_link_ksettings     = ice_get_link_ksettings,
        .set_link_ksettings     = ice_set_link_ksettings,
        .get_fec_stats          = ice_get_fec_stats,
diff --git a/drivers/net/ethernet/intel/ice/ice_lan_tx_rx.h b/drivers/net/ethernet/intel/ice/ice_lan_tx_rx.h
index 10c312d49e052e7d49a7a59cada95f9609c30bf6..185672c7e17d017aef41943db81fa0b14341f88b 100644 (file)
--- a/drivers/net/ethernet/intel/ice/ice_lan_tx_rx.h
+++ b/drivers/net/ethernet/intel/ice/ice_lan_tx_rx.h
@@ -342,6 +342,9 @@ enum ice_flg64_bits {
 /* for ice_32byte_rx_flex_desc.pkt_length member */
 #define ICE_RX_FLX_DESC_PKT_LEN_M      (0x3FFF) /* 14-bits */
 
+/* ice_32byte_rx_flex_desc::hdr_len_sph_flex_flags1 */
+#define ICE_RX_FLEX_DESC_HDR_LEN_M     GENMASK(10, 0)
+
 enum ice_rx_flex_desc_status_error_0_bits {
        /* Note: These are predefined bit offsets */
        ICE_RX_FLEX_DESC_STATUS0_DD_S = 0,
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c
index 5a966138eacfcf72fbd361ba456bbebe504eef70..ad76768a42323f7e1058a78156d18c5104a28a1e 100644 (file)
--- a/drivers/net/ethernet/intel/ice/ice_txrx.c
+++ b/drivers/net/ethernet/intel/ice/ice_txrx.c
@@ -508,16 +508,34 @@ err:
        return -ENOMEM;
 }
 
+void ice_rxq_pp_destroy(struct ice_rx_ring *rq)
+{
+       struct libeth_fq fq = {
+               .fqes   = rq->rx_fqes,
+               .pp     = rq->pp,
+       };
+
+       libeth_rx_fq_destroy(&fq);
+       rq->rx_fqes = NULL;
+       rq->pp = NULL;
+
+       if (!rq->hdr_pp)
+               return;
+
+       fq.fqes = rq->hdr_fqes;
+       fq.pp = rq->hdr_pp;
+
+       libeth_rx_fq_destroy(&fq);
+       rq->hdr_fqes = NULL;
+       rq->hdr_pp = NULL;
+}
+
 /**
  * ice_clean_rx_ring - Free Rx buffers
  * @rx_ring: ring to be cleaned
  */
 void ice_clean_rx_ring(struct ice_rx_ring *rx_ring)
 {
-       struct libeth_fq fq = {
-               .fqes   = rx_ring->rx_fqes,
-               .pp     = rx_ring->pp,
-       };
        u32 size;
 
        if (rx_ring->xsk_pool) {
@@ -533,9 +551,10 @@ void ice_clean_rx_ring(struct ice_rx_ring *rx_ring)
 
        /* Free all the Rx ring sk_buffs */
        for (u32 i = rx_ring->next_to_clean; i != rx_ring->next_to_use; ) {
-               const struct libeth_fqe *rx_fqes = &rx_ring->rx_fqes[i];
+               libeth_rx_recycle_slow(rx_ring->rx_fqes[i].netmem);
 
-               libeth_rx_recycle_slow(rx_fqes->netmem);
+               if (rx_ring->hdr_pp)
+                       libeth_rx_recycle_slow(rx_ring->hdr_fqes[i].netmem);
 
                if (unlikely(++i == rx_ring->count))
                        i = 0;
@@ -547,12 +566,9 @@ void ice_clean_rx_ring(struct ice_rx_ring *rx_ring)
                xdp_rxq_info_unreg(&rx_ring->xdp_rxq);
        }
 
-       libeth_rx_fq_destroy(&fq);
-       rx_ring->rx_fqes = NULL;
-       rx_ring->pp = NULL;
+       ice_rxq_pp_destroy(rx_ring);
 
 rx_skip_free:
-
        /* Zero out the descriptor ring */
        size = ALIGN(rx_ring->count * sizeof(union ice_32byte_rx_desc),
                     PAGE_SIZE);
@@ -806,6 +822,12 @@ void ice_init_ctrl_rx_descs(struct ice_rx_ring *rx_ring, u32 count)
  */
 bool ice_alloc_rx_bufs(struct ice_rx_ring *rx_ring, unsigned int cleaned_count)
 {
+       const struct libeth_fq_fp hdr_fq = {
+               .pp             = rx_ring->hdr_pp,
+               .fqes           = rx_ring->hdr_fqes,
+               .truesize       = rx_ring->hdr_truesize,
+               .count          = rx_ring->count,
+       };
        const struct libeth_fq_fp fq = {
                .pp             = rx_ring->pp,
                .fqes           = rx_ring->rx_fqes,
@@ -836,6 +858,20 @@ bool ice_alloc_rx_bufs(struct ice_rx_ring *rx_ring, unsigned int cleaned_count)
                 */
                rx_desc->read.pkt_addr = cpu_to_le64(addr);
 
+               if (!hdr_fq.pp)
+                       goto next;
+
+               addr = libeth_rx_alloc(&hdr_fq, ntu);
+               if (addr == DMA_MAPPING_ERROR) {
+                       rx_ring->ring_stats->rx_stats.alloc_page_failed++;
+
+                       libeth_rx_recycle_slow(fq.fqes[ntu].netmem);
+                       break;
+               }
+
+               rx_desc->read.hdr_addr = cpu_to_le64(addr);
+
+next:
                rx_desc++;
                ntu++;
                if (unlikely(ntu == rx_ring->count)) {
@@ -933,14 +969,16 @@ static int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget)
                unsigned int size;
                u16 stat_err_bits;
                u16 vlan_tci;
+               bool rxe;
 
                /* get the Rx desc from Rx ring based on 'next_to_clean' */
                rx_desc = ICE_RX_DESC(rx_ring, ntc);
 
-               /* status_error_len will always be zero for unused descriptors
-                * because it's cleared in cleanup, and overlaps with hdr_addr
-                * which is always zero because packet split isn't used, if the
-                * hardware wrote DD then it will be non-zero
+               /*
+                * The DD bit will always be zero for unused descriptors
+                * because it's cleared in cleanup or when setting the DMA
+                * address of the header buffer, which never uses the DD bit.
+                * If the hardware wrote the descriptor, it will be non-zero.
                 */
                stat_err_bits = BIT(ICE_RX_FLEX_DESC_STATUS0_DD_S);
                if (!ice_test_staterr(rx_desc->wb.status_error0, stat_err_bits))
@@ -954,12 +992,27 @@ static int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget)
 
                ice_trace(clean_rx_irq, rx_ring, rx_desc);
 
+               stat_err_bits = BIT(ICE_RX_FLEX_DESC_STATUS0_HBO_S) |
+                               BIT(ICE_RX_FLEX_DESC_STATUS0_RXE_S);
+               rxe = ice_test_staterr(rx_desc->wb.status_error0,
+                                      stat_err_bits);
+
+               if (!rx_ring->hdr_pp)
+                       goto payload;
+
+               size = le16_get_bits(rx_desc->wb.hdr_len_sph_flex_flags1,
+                                    ICE_RX_FLEX_DESC_HDR_LEN_M);
+               if (unlikely(rxe))
+                       size = 0;
+
+               rx_buf = &rx_ring->hdr_fqes[ntc];
+               libeth_xdp_process_buff(xdp, rx_buf, size);
+               rx_buf->netmem = 0;
+
+payload:
                size = le16_to_cpu(rx_desc->wb.pkt_len) &
                        ICE_RX_FLX_DESC_PKT_LEN_M;
-
-               stat_err_bits = BIT(ICE_RX_FLEX_DESC_STATUS0_RXE_S);
-               if (unlikely(ice_test_staterr(rx_desc->wb.status_error0,
-                                             stat_err_bits)))
+               if (unlikely(rxe))
                        size = 0;
 
                /* retrieve a buffer from the ring */
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.h b/drivers/net/ethernet/intel/ice/ice_txrx.h
index e97a38ef3fe7e9e4f06cbab711b55e8d85ca3cfc..e440c55d9e9f0f0913954da91646658e9efb1ef9 100644 (file)
--- a/drivers/net/ethernet/intel/ice/ice_txrx.h
+++ b/drivers/net/ethernet/intel/ice/ice_txrx.h
@@ -255,6 +255,9 @@ struct ice_rx_ring {
        };
 
        /* CL2 - 2nd cacheline starts here */
+       struct libeth_fqe *hdr_fqes;
+       struct page_pool *hdr_pp;
+
        union {
                struct libeth_xdp_buff_stash xdp;
                struct libeth_xdp_buff *xsk;
@@ -273,6 +276,8 @@ struct ice_rx_ring {
        /* used in interrupt processing */
        u16 next_to_use;
        u16 next_to_clean;
+
+       u32 hdr_truesize;
        u32 truesize;
 
        /* stats structs */
@@ -284,6 +289,7 @@ struct ice_rx_ring {
        struct ice_tx_ring *xdp_ring;
        struct ice_rx_ring *next;       /* pointer to next ring in q_vector */
        struct xsk_buff_pool *xsk_pool;
+       u16 rx_hdr_len;
        u16 rx_buf_len;
        dma_addr_t dma;                 /* physical address of ring */
        u8 dcb_tc;                      /* Traffic class of ring */
@@ -396,6 +402,7 @@ static inline unsigned int ice_rx_pg_order(struct ice_rx_ring *ring)
 union ice_32b_rx_flex_desc;
 
 void ice_init_ctrl_rx_descs(struct ice_rx_ring *rx_ring, u32 num_descs);
+void ice_rxq_pp_destroy(struct ice_rx_ring *rq);
 bool ice_alloc_rx_bufs(struct ice_rx_ring *rxr, unsigned int cleaned_count);
 netdev_tx_t ice_start_xmit(struct sk_buff *skb, struct net_device *netdev);
 u16