git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
scsi: target: Move LUN stats to per-CPU
author Mike Christie <michael.christie@oracle.com>
Wed, 17 Sep 2025 22:12:55 +0000 (17:12 -0500)
committer Martin K. Petersen <martin.petersen@oracle.com>
Mon, 3 Nov 2025 03:06:12 +0000 (22:06 -0500)
The use of atomics in the main I/O path causes performance issues when
using higher-performance backend devices and multiple queues (more than
10 when using vhost-scsi), for example with this fio workload:

[global]
bs=4K
iodepth=128
direct=1
ioengine=libaio
group_reporting
time_based
runtime=120
name=standard-iops
rw=randread
numjobs=16
cpus_allowed=0-15

To fix this issue, move the LUN stats to per-CPU counters.

Note: I forgot to include this patch with the delayed/ordered per-CPU
tracking and per-device/device-entry per-CPU stats. With this patch you
get the full 33% improvement when using fast backends, multiple queues
and multiple I/O submitters.

Signed-off-by: Mike Christie <michael.christie@oracle.com>
Reviewed-by: Dmitry Bogdanov <d.bogdanov@yadro.com>
Link: https://patch.msgid.link/20250917221338.14813-4-michael.christie@oracle.com
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
drivers/target/target_core_device.c
drivers/target/target_core_fabric_configfs.c
drivers/target/target_core_internal.h
drivers/target/target_core_stat.c
drivers/target/target_core_tpg.c
drivers/target/target_core_transport.c
include/target/target_core_base.h

index 7bb711b24c0d72d487600fde0b25addaf1d41dfb..2d4a7c0c69ce7b401c6ec114cb73e197c1005099 100644 (file)
@@ -814,6 +814,7 @@ struct se_device *target_alloc_device(struct se_hba *hba, const char *name)
        dev->dev_attrib.max_write_same_len = DA_MAX_WRITE_SAME_LEN;
        dev->dev_attrib.submit_type = TARGET_FABRIC_DEFAULT_SUBMIT;
 
+       /* Skip allocating lun_stats since we can't export them. */
        xcopy_lun = &dev->xcopy_lun;
        rcu_assign_pointer(xcopy_lun->lun_se_dev, dev);
        init_completion(&xcopy_lun->lun_shutdown_comp);
index 7156a4dc1ca7d9ddebe469fa59791b52e63b653b..13159928e3657bf9c1dbdc5ae36bee7f92225db4 100644 (file)
@@ -697,7 +697,7 @@ static void target_fabric_port_release(struct config_item *item)
        struct se_lun *lun = container_of(to_config_group(item),
                                          struct se_lun, lun_group);
 
-       kfree_rcu(lun, rcu_head);
+       call_rcu(&lun->rcu_head, target_tpg_free_lun);
 }
 
 static struct configfs_item_operations target_fabric_port_item_ops = {
index 20aab1f505655c9a76c2991c27e8780412d77b00..763e6d26e187725164033e6ae43c8c34d336fdab 100644 (file)
@@ -125,6 +125,7 @@ void        core_tpg_add_node_to_devs(struct se_node_acl *, struct se_portal_group *,
                                  struct se_lun *);
 void   core_tpg_wait_for_nacl_pr_ref(struct se_node_acl *);
 struct se_lun *core_tpg_alloc_lun(struct se_portal_group *, u64);
+void   target_tpg_free_lun(struct rcu_head *head);
 int    core_tpg_add_lun(struct se_portal_group *, struct se_lun *,
                bool, struct se_device *);
 void core_tpg_remove_lun(struct se_portal_group *, struct se_lun *);
index e29d43dacaf7f895116dbeeece1eb10037925478..083205052be22d0ba4d6a4923ddacd7331d649e7 100644 (file)
@@ -606,53 +606,30 @@ static ssize_t target_stat_tgt_port_port_index_show(struct config_item *item,
        return ret;
 }
 
-static ssize_t target_stat_tgt_port_in_cmds_show(struct config_item *item,
-               char *page)
-{
-       struct se_lun *lun = to_stat_tgt_port(item);
-       struct se_device *dev;
-       ssize_t ret = -ENODEV;
-
-       rcu_read_lock();
-       dev = rcu_dereference(lun->lun_se_dev);
-       if (dev)
-               ret = snprintf(page, PAGE_SIZE, "%lu\n",
-                              atomic_long_read(&lun->lun_stats.cmd_pdus));
-       rcu_read_unlock();
-       return ret;
-}
-
-static ssize_t target_stat_tgt_port_write_mbytes_show(struct config_item *item,
-               char *page)
-{
-       struct se_lun *lun = to_stat_tgt_port(item);
-       struct se_device *dev;
-       ssize_t ret = -ENODEV;
-
-       rcu_read_lock();
-       dev = rcu_dereference(lun->lun_se_dev);
-       if (dev)
-               ret = snprintf(page, PAGE_SIZE, "%u\n",
-                       (u32)(atomic_long_read(&lun->lun_stats.rx_data_octets) >> 20));
-       rcu_read_unlock();
-       return ret;
+#define tgt_port_show_per_cpu_stat(prefix, field, shift)               \
+per_cpu_stat_snprintf(scsi_port_stats, prefix, field, shift);          \
+static ssize_t                                                         \
+target_stat_##prefix##_show(struct config_item *item, char *page)      \
+{                                                                      \
+       struct se_lun *lun = to_stat_tgt_port(item);                    \
+       struct se_device *dev;                                          \
+       int ret;                                                        \
+                                                                       \
+       rcu_read_lock();                                                \
+       dev = rcu_dereference(lun->lun_se_dev);                         \
+       if (!dev) {                                                     \
+               rcu_read_unlock();                                      \
+               return -ENODEV;                                         \
+       }                                                               \
+                                                                       \
+       ret = per_cpu_stat_##prefix##_snprintf(lun->lun_stats, page);   \
+       rcu_read_unlock();                                              \
+       return ret;                                                     \
 }
 
-static ssize_t target_stat_tgt_port_read_mbytes_show(struct config_item *item,
-               char *page)
-{
-       struct se_lun *lun = to_stat_tgt_port(item);
-       struct se_device *dev;
-       ssize_t ret = -ENODEV;
-
-       rcu_read_lock();
-       dev = rcu_dereference(lun->lun_se_dev);
-       if (dev)
-               ret = snprintf(page, PAGE_SIZE, "%u\n",
-                               (u32)(atomic_long_read(&lun->lun_stats.tx_data_octets) >> 20));
-       rcu_read_unlock();
-       return ret;
-}
+tgt_port_show_per_cpu_stat(tgt_port_in_cmds, cmd_pdus, 0);
+tgt_port_show_per_cpu_stat(tgt_port_write_mbytes, rx_data_octets, 20);
+tgt_port_show_per_cpu_stat(tgt_port_read_mbytes, tx_data_octets, 20);
 
 static ssize_t target_stat_tgt_port_hs_in_cmds_show(struct config_item *item,
                char *page)
index c0e429e5ef310104c292747efb16f0d920252ba9..8b5ad50baa4360b92e63567ea47a7213b2e55bbe 100644 (file)
@@ -548,7 +548,7 @@ int core_tpg_register(
                ret = core_tpg_add_lun(se_tpg, se_tpg->tpg_virt_lun0,
                                true, g_lun0_dev);
                if (ret < 0) {
-                       kfree(se_tpg->tpg_virt_lun0);
+                       target_tpg_free_lun(&se_tpg->tpg_virt_lun0->rcu_head);
                        return ret;
                }
        }
@@ -595,7 +595,7 @@ int core_tpg_deregister(struct se_portal_group *se_tpg)
 
        if (se_tpg->proto_id >= 0) {
                core_tpg_remove_lun(se_tpg, se_tpg->tpg_virt_lun0);
-               kfree_rcu(se_tpg->tpg_virt_lun0, rcu_head);
+               call_rcu(&se_tpg->tpg_virt_lun0->rcu_head, target_tpg_free_lun);
        }
 
        target_tpg_deregister_rtpi(se_tpg);
@@ -615,6 +615,13 @@ struct se_lun *core_tpg_alloc_lun(
                pr_err("Unable to allocate se_lun memory\n");
                return ERR_PTR(-ENOMEM);
        }
+
+       lun->lun_stats = alloc_percpu(struct scsi_port_stats);
+       if (!lun->lun_stats) {
+               pr_err("Unable to allocate se_lun stats memory\n");
+               goto free_lun;
+       }
+
        lun->unpacked_lun = unpacked_lun;
        atomic_set(&lun->lun_acl_count, 0);
        init_completion(&lun->lun_shutdown_comp);
@@ -628,6 +635,18 @@ struct se_lun *core_tpg_alloc_lun(
        lun->lun_tpg = tpg;
 
        return lun;
+
+free_lun:
+       kfree(lun);
+       return ERR_PTR(-ENOMEM);
+}
+
+void target_tpg_free_lun(struct rcu_head *head)
+{
+       struct se_lun *lun = container_of(head, struct se_lun, rcu_head);
+
+       free_percpu(lun->lun_stats);
+       kfree(lun);
 }
 
 int core_tpg_add_lun(
index 0a76bdfe55282073c6cb423051d279d7ff1693b7..fca9b44288bcc011a7798b702d33bbc1e8db3a5f 100644 (file)
@@ -1571,7 +1571,12 @@ target_cmd_parse_cdb(struct se_cmd *cmd)
                return ret;
 
        cmd->se_cmd_flags |= SCF_SUPPORTED_SAM_OPCODE;
-       atomic_long_inc(&cmd->se_lun->lun_stats.cmd_pdus);
+       /*
+        * If this is the xcopy_lun then we won't have lun_stats since we
+        * can't export them.
+        */
+       if (cmd->se_lun->lun_stats)
+               this_cpu_inc(cmd->se_lun->lun_stats->cmd_pdus);
        return 0;
 }
 EXPORT_SYMBOL(target_cmd_parse_cdb);
@@ -2597,8 +2602,9 @@ queue_rsp:
                    !(cmd->se_cmd_flags & SCF_TREAT_READ_AS_NORMAL))
                        goto queue_status;
 
-               atomic_long_add(cmd->data_length,
-                               &cmd->se_lun->lun_stats.tx_data_octets);
+               if (cmd->se_lun->lun_stats)
+                       this_cpu_add(cmd->se_lun->lun_stats->tx_data_octets,
+                                    cmd->data_length);
                /*
                 * Perform READ_STRIP of PI using software emulation when
                 * backend had PI enabled, if the transport will not be
@@ -2621,14 +2627,16 @@ queue_rsp:
                        goto queue_full;
                break;
        case DMA_TO_DEVICE:
-               atomic_long_add(cmd->data_length,
-                               &cmd->se_lun->lun_stats.rx_data_octets);
+               if (cmd->se_lun->lun_stats)
+                       this_cpu_add(cmd->se_lun->lun_stats->rx_data_octets,
+                                    cmd->data_length);
                /*
                 * Check if we need to send READ payload for BIDI-COMMAND
                 */
                if (cmd->se_cmd_flags & SCF_BIDI) {
-                       atomic_long_add(cmd->data_length,
-                                       &cmd->se_lun->lun_stats.tx_data_octets);
+                       if (cmd->se_lun->lun_stats)
+                               this_cpu_add(cmd->se_lun->lun_stats->tx_data_octets,
+                                            cmd->data_length);
                        ret = cmd->se_tfo->queue_data_in(cmd);
                        if (ret)
                                goto queue_full;
index 27e1f9d5f0c6cdebd3a9fbfd8447fd50ef24c02e..372da2eadf541a8e503cf97b839e24bb7045a9aa 100644 (file)
@@ -744,9 +744,9 @@ struct se_port_stat_grps {
 };
 
 struct scsi_port_stats {
-       atomic_long_t   cmd_pdus;
-       atomic_long_t   tx_data_octets;
-       atomic_long_t   rx_data_octets;
+       u64                     cmd_pdus;
+       u64                     tx_data_octets;
+       u64                     rx_data_octets;
 };
 
 struct se_lun {
@@ -773,7 +773,7 @@ struct se_lun {
        spinlock_t              lun_tg_pt_gp_lock;
 
        struct se_portal_group  *lun_tpg;
-       struct scsi_port_stats  lun_stats;
+       struct scsi_port_stats  __percpu *lun_stats;
        struct config_group     lun_group;
        struct se_port_stat_grps port_stat_grps;
        struct completion       lun_shutdown_comp;