From 322617a06c97153f7b0681ecaa55490abccff7fa Mon Sep 17 00:00:00 2001 From: Koichiro Den Date: Thu, 23 Oct 2025 16:21:05 +0900 Subject: [PATCH] NTB: ntb_transport: Add 'tx_memcpy_offload' module option Some platforms (e.g. R-Car S4) do not gain from using a DMAC on TX path in ntb_transport and end up CPU-bound on memcpy_toio(). Add a module parameter 'tx_memcpy_offload' that moves the TX memcpy_toio() and descriptor writes to a per-QP kernel thread. It is disabled by default. This change also fixes a rare ordering hazard in ntb_tx_copy_callback() that was observed on R-Car S4 once throughput improved with the new module parameter: the DONE flag write to the peer MW, which is WC mapped, could be observed after the DB/MSI trigger. Both operations are posted PCIe MWr (often via different OB iATUs), so WC buffering and bridges may reorder visibility. Insert dma_mb() to enforce store->load ordering and then read back hdr->flags to flush the posted write before ringing the doorbell / issuing MSI. While at it, update tx_index with WRITE_ONCE() at the earliest possible location to make ntb_transport_tx_free_entry() robust. 
Signed-off-by: Koichiro Den Signed-off-by: Jon Mason --- drivers/ntb/ntb_transport.c | 104 ++++++++++++++++++++++++++++++++++-- 1 file changed, 100 insertions(+), 4 deletions(-) diff --git a/drivers/ntb/ntb_transport.c b/drivers/ntb/ntb_transport.c index 4483160851d4..cf64c6d0275a 100644 --- a/drivers/ntb/ntb_transport.c +++ b/drivers/ntb/ntb_transport.c @@ -54,12 +54,14 @@ #include #include #include +#include #include #include #include #include #include #include +#include #include "linux/ntb.h" #include "linux/ntb_transport.h" @@ -100,6 +102,10 @@ module_param(use_msi, bool, 0644); MODULE_PARM_DESC(use_msi, "Use MSI interrupts instead of doorbells"); #endif +static bool tx_memcpy_offload; +module_param(tx_memcpy_offload, bool, 0644); +MODULE_PARM_DESC(tx_memcpy_offload, "Offload TX memcpy_toio() to a kernel thread"); + static struct dentry *nt_debugfs_dir; /* Only two-ports NTB devices are supported */ @@ -148,7 +154,9 @@ struct ntb_transport_qp { void (*tx_handler)(struct ntb_transport_qp *qp, void *qp_data, void *data, int len); struct list_head tx_free_q; + struct list_head tx_offl_q; spinlock_t ntb_tx_free_q_lock; + spinlock_t ntb_tx_offl_q_lock; void __iomem *tx_mw; phys_addr_t tx_mw_phys; size_t tx_mw_size; @@ -199,6 +207,9 @@ struct ntb_transport_qp { int msi_irq; struct ntb_msi_desc msi_desc; struct ntb_msi_desc peer_msi_desc; + + struct task_struct *tx_offload_thread; + wait_queue_head_t tx_offload_wq; }; struct ntb_transport_mw { @@ -284,7 +295,13 @@ static int ntb_async_tx_submit(struct ntb_transport_qp *qp, static void ntb_memcpy_tx(struct ntb_queue_entry *entry, void __iomem *offset); static int ntb_async_rx_submit(struct ntb_queue_entry *entry, void *offset); static void ntb_memcpy_rx(struct ntb_queue_entry *entry, void *offset); +static int ntb_tx_memcpy_kthread(void *data); + +static inline bool ntb_tx_offload_enabled(struct ntb_transport_qp *qp) +{ + return tx_memcpy_offload && qp && qp->tx_offload_thread; +} static int 
ntb_transport_bus_match(struct device *dev, const struct device_driver *drv) @@ -1254,11 +1271,13 @@ static int ntb_transport_init_queue(struct ntb_transport_ctx *nt, spin_lock_init(&qp->ntb_rx_q_lock); spin_lock_init(&qp->ntb_tx_free_q_lock); + spin_lock_init(&qp->ntb_tx_offl_q_lock); INIT_LIST_HEAD(&qp->rx_post_q); INIT_LIST_HEAD(&qp->rx_pend_q); INIT_LIST_HEAD(&qp->rx_free_q); INIT_LIST_HEAD(&qp->tx_free_q); + INIT_LIST_HEAD(&qp->tx_offl_q); tasklet_init(&qp->rxc_db_work, ntb_transport_rxc_db, (unsigned long)qp); @@ -1785,6 +1804,13 @@ static void ntb_tx_copy_callback(void *data, iowrite32(entry->flags | DESC_DONE_FLAG, &hdr->flags); + /* + * Make DONE flag visible before DB/MSI. WC + posted MWr may reorder + * across iATU/bridge (platform-dependent). Order and flush here. + */ + dma_mb(); + ioread32(&hdr->flags); + if (qp->use_msi) ntb_msi_peer_trigger(qp->ndev, PIDX, &qp->peer_msi_desc); else @@ -1805,7 +1831,7 @@ static void ntb_tx_copy_callback(void *data, ntb_list_add(&qp->ntb_tx_free_q_lock, &entry->entry, &qp->tx_free_q); } -static void ntb_memcpy_tx(struct ntb_queue_entry *entry, void __iomem *offset) +static void ntb_memcpy_tx_on_stack(struct ntb_queue_entry *entry, void __iomem *offset) { #ifdef ARCH_HAS_NOCACHE_UACCESS /* @@ -1823,6 +1849,54 @@ static void ntb_memcpy_tx(struct ntb_queue_entry *entry, void __iomem *offset) ntb_tx_copy_callback(entry, NULL); } +static int ntb_tx_memcpy_kthread(void *data) +{ + struct ntb_transport_qp *qp = data; + struct ntb_queue_entry *entry, *tmp; + const int resched_nr = 64; + LIST_HEAD(local_list); + void __iomem *offset; + int processed = 0; + + while (!kthread_should_stop()) { + spin_lock_irq(&qp->ntb_tx_offl_q_lock); + wait_event_interruptible_lock_irq_timeout(qp->tx_offload_wq, + kthread_should_stop() || + !list_empty(&qp->tx_offl_q), + qp->ntb_tx_offl_q_lock, 5*HZ); + list_splice_tail_init(&qp->tx_offl_q, &local_list); + spin_unlock_irq(&qp->ntb_tx_offl_q_lock); + + list_for_each_entry_safe(entry, tmp, 
&local_list, entry) { + list_del(&entry->entry); + offset = qp->tx_mw + qp->tx_max_frame * entry->tx_index; + ntb_memcpy_tx_on_stack(entry, offset); + if (++processed >= resched_nr) { + cond_resched(); + processed = 0; + } + } + cond_resched(); + } + + return 0; +} + +static void ntb_memcpy_tx(struct ntb_queue_entry *entry, void __iomem *offset) +{ + struct ntb_transport_qp *qp = entry->qp; + + if (WARN_ON_ONCE(!qp)) + return; + + if (ntb_tx_offload_enabled(qp)) { + ntb_list_add(&qp->ntb_tx_offl_q_lock, &entry->entry, + &qp->tx_offl_q); + wake_up(&qp->tx_offload_wq); + } else + ntb_memcpy_tx_on_stack(entry, offset); +} + static int ntb_async_tx_submit(struct ntb_transport_qp *qp, struct ntb_queue_entry *entry) { @@ -1895,6 +1969,9 @@ static void ntb_async_tx(struct ntb_transport_qp *qp, hdr = offset + qp->tx_max_frame - sizeof(struct ntb_payload_header); entry->tx_hdr = hdr; + WARN_ON_ONCE(!ntb_transport_tx_free_entry(qp)); + WRITE_ONCE(qp->tx_index, (qp->tx_index + 1) % qp->tx_max_entry); + iowrite32(entry->len, &hdr->len); iowrite32((u32)qp->tx_pkts, &hdr->ver); @@ -1935,9 +2012,6 @@ static int ntb_process_tx(struct ntb_transport_qp *qp, ntb_async_tx(qp, entry); - qp->tx_index++; - qp->tx_index %= qp->tx_max_entry; - qp->tx_pkts++; return 0; @@ -2034,6 +2108,20 @@ ntb_transport_create_queue(void *data, struct device *client_dev, qp->tx_handler = handlers->tx_handler; qp->event_handler = handlers->event_handler; + init_waitqueue_head(&qp->tx_offload_wq); + if (tx_memcpy_offload) { + qp->tx_offload_thread = kthread_run(ntb_tx_memcpy_kthread, qp, + "ntb-txcpy/%s/%u", + pci_name(ndev->pdev), qp->qp_num); + if (IS_ERR(qp->tx_offload_thread)) { + dev_warn(&nt->ndev->dev, + "tx memcpy offload thread creation failed: %ld; falling back to inline copy\n", + PTR_ERR(qp->tx_offload_thread)); + qp->tx_offload_thread = NULL; + } + } else + qp->tx_offload_thread = NULL; + dma_cap_zero(dma_mask); dma_cap_set(DMA_MEMCPY, dma_mask); @@ -2141,6 +2229,11 @@ void 
ntb_transport_free_queue(struct ntb_transport_qp *qp) qp->active = false; + if (qp->tx_offload_thread) { + kthread_stop(qp->tx_offload_thread); + qp->tx_offload_thread = NULL; + } + if (qp->tx_dma_chan) { struct dma_chan *chan = qp->tx_dma_chan; /* Putting the dma_chan to NULL will force any new traffic to be @@ -2204,6 +2297,9 @@ void ntb_transport_free_queue(struct ntb_transport_qp *qp) while ((entry = ntb_list_rm(&qp->ntb_tx_free_q_lock, &qp->tx_free_q))) kfree(entry); + while ((entry = ntb_list_rm(&qp->ntb_tx_offl_q_lock, &qp->tx_offl_q))) + kfree(entry); + qp->transport->qp_bitmap_free |= qp_bit; dev_info(&pdev->dev, "NTB Transport QP %d freed\n", qp->qp_num); -- 2.47.3