smb: server: allocate enough space for RW WRs and ib_drain_qp()
Author:     Stefan Metzmacher <metze@samba.org>
AuthorDate: Fri, 17 Oct 2025 09:55:02 +0000 (11:55 +0200)
Commit:     Steve French <stfrench@microsoft.com>
CommitDate: Thu, 23 Oct 2025 01:10:12 +0000 (20:10 -0500)
Make use of rdma_rw_mr_factor() to calculate the number of rw
credits and the number of pages per RDMA RW operation.

For iWarp connections we get the same numbers as before, tested
with siw.ko and irdma.ko (in iWarp mode).

siw:

CIFS: max_qp_rd_atom=128, max_fast_reg_page_list_len = 256
CIFS: max_sgl_rd=0, max_sge_rd=1
CIFS: responder_resources=32 max_frmr_depth=256 mr_io.type=0
CIFS: max_send_wr 384, device reporting max_cqe 3276800 max_qp_wr 32768
ksmbd: max_fast_reg_page_list_len = 256, max_sgl_rd=0, max_sge_rd=1
ksmbd: device reporting max_cqe 3276800 max_qp_wr 32768
ksmbd: Old sc->rw_io.credits: max = 9, num_pages = 256
ksmbd: New sc->rw_io.credits: max = 9, num_pages = 256, maxpages=2048
ksmbd: Info: rdma_send_wr 27 + max_send_wr 256 = 283

irdma (in iWarp mode):

CIFS: max_qp_rd_atom=127, max_fast_reg_page_list_len = 262144
CIFS: max_sgl_rd=0, max_sge_rd=13
CIFS: responder_resources=32 max_frmr_depth=2048 mr_io.type=0
CIFS: max_send_wr 384, device reporting max_cqe 1048574 max_qp_wr 4063
ksmbd: max_fast_reg_page_list_len = 262144, max_sgl_rd=0, max_sge_rd=13
ksmbd: device reporting max_cqe 1048574 max_qp_wr 4063
ksmbd: Old sc->rw_io.credits: max = 9, num_pages = 256
ksmbd: New sc->rw_io.credits: max = 9, num_pages = 256, maxpages=2048
ksmbd: rdma_send_wr 27 + max_send_wr 256 = 283

For RoCE, on the other hand, we now get different (and correct)
numbers, tested with rdma_rxe.ko and irdma.ko (in RoCEv2 mode).

rxe:

CIFS: max_qp_rd_atom=128, max_fast_reg_page_list_len = 512
CIFS: max_sgl_rd=0, max_sge_rd=32
CIFS: responder_resources=32 max_frmr_depth=512 mr_io.type=0
CIFS: max_send_wr 384, device reporting max_cqe 32767 max_qp_wr 1048576
ksmbd: max_fast_reg_page_list_len = 512, max_sgl_rd=0, max_sge_rd=32
ksmbd: device reporting max_cqe 32767 max_qp_wr 1048576
ksmbd: Old sc->rw_io.credits: max = 9, num_pages = 256
ksmbd: New sc->rw_io.credits: max = 65, num_pages = 32, maxpages=2048
ksmbd: rdma_send_wr 65 + max_send_wr 256 = 321

irdma (in RoCEv2 mode):

CIFS: max_qp_rd_atom=127, max_fast_reg_page_list_len = 262144,
CIFS: max_sgl_rd=0, max_sge_rd=13
CIFS: responder_resources=32 max_frmr_depth=2048 mr_io.type=0
CIFS: max_send_wr 384, device reporting max_cqe 1048574 max_qp_wr 4063
ksmbd: max_fast_reg_page_list_len = 262144, max_sgl_rd=0, max_sge_rd=13
ksmbd: device reporting max_cqe 1048574 max_qp_wr 4063
ksmbd: Old sc->rw_io.credits: max = 9, num_pages = 256,
ksmbd: New sc->rw_io.credits: max = 159, num_pages = 13, maxpages=2048
ksmbd: rdma_send_wr 159 + max_send_wr 256 = 415
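
As a cross-check, the new numbers follow directly from the device
attributes in the logs above. A minimal worked example for the rxe
case (assuming PAGE_SIZE = 4096, so maxpages = 2048 corresponds to
max_read_write_size = 8 MiB, and assuming rdma_rw_mr_factor()
divides by max_sge_rd when no MRs are needed for RDMA R/W):

  maxpages          = DIV_ROUND_UP(8 MiB, 4096)               = 2048
  credits.max       = rdma_rw_mr_factor(dev, port, maxpages)
                    = DIV_ROUND_UP(2048, max_sge_rd = 32)      =   64
  credits.num_pages = DIV_ROUND_UP(maxpages, credits.max)      =   32
  credits.max      += 1  (extra credit for unaligned pages)    =   65

which matches the "New sc->rw_io.credits" line for rxe.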

And rely on rdma_rw_init_qp() to set up the MR pool (via
ib_mr_pool_init()) for the RW MRs; ib_mr_pool_destroy() will be
called by rdma_rw_cleanup_mrs().

It seems the code was implemented before the rdma_rw_* layer
was fully established in the kernel.

While there, also reserve additional space for ib_drain_qp().

This should ensure that ib_post_send() never fails because
the send queue is full.
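
For reference, the resulting send queue sizing (condensed from the
patch below, not a complete implementation):

	/* +1 for ib_drain_qp() */
	qp_cap.max_send_wr   = sp->send_credit_target + 1;
	qp_cap.max_recv_wr   = sp->recv_credit_max + 1;
	qp_cap.max_rdma_ctxs = sc->rw_io.credits.max;

	/* WRs rdma_rw_init_qp() will add on top of max_send_wr */
	rdma_send_wr = smb_direct_rdma_rw_send_wrs(sc->ib.dev, &qp_attr);
	max_send_wr  = qp_cap.max_send_wr + rdma_send_wr;

	/* the send CQ must cover everything posted to the send queue */
	sc->ib.send_cq = ib_alloc_cq_any(sc->ib.dev, sc, max_send_wr,
					 IB_POLL_WORKQUEUE);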

Fixes: ddbdc861e37c ("ksmbd: smbd: introduce read/write credits for RDMA read/write")
Fixes: 4c564f03e23b ("smb: server: make use of common smbdirect_socket")
Fixes: 177368b99243 ("smb: server: make use of common smbdirect_socket_parameters")
Fixes: 95475d8886bd ("smb: server: make use smbdirect_socket.rw_io.credits")
Cc: Steve French <smfrench@gmail.com>
Cc: Tom Talpey <tom@talpey.com>
Cc: linux-cifs@vger.kernel.org
Cc: samba-technical@lists.samba.org
Signed-off-by: Stefan Metzmacher <metze@samba.org>
Acked-by: Namjae Jeon <linkinjeon@kernel.org>
Signed-off-by: Steve French <stfrench@microsoft.com>
diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c
index a201c5871a77cd49af6431b16e30fd0eeeef15e3..19b51205dc8c3c763e1f225d2f11d4222a43fc4f 100644
--- a/fs/smb/server/transport_rdma.c
+++ b/fs/smb/server/transport_rdma.c
@@ -471,7 +471,6 @@ static void free_transport(struct smb_direct_transport *t)
 
        if (sc->ib.qp) {
                ib_drain_qp(sc->ib.qp);
-               ib_mr_pool_destroy(sc->ib.qp, &sc->ib.qp->rdma_mrs);
                sc->ib.qp = NULL;
                rdma_destroy_qp(sc->rdma.cm_id);
        }
@@ -1871,20 +1870,11 @@ out_err:
        return ret;
 }
 
-static unsigned int smb_direct_get_max_fr_pages(struct smbdirect_socket *sc)
-{
-       return min_t(unsigned int,
-                    sc->ib.dev->attrs.max_fast_reg_page_list_len,
-                    256);
-}
-
-static int smb_direct_init_params(struct smbdirect_socket *sc,
-                                 struct ib_qp_cap *cap)
+static int smb_direct_init_params(struct smbdirect_socket *sc)
 {
        struct smbdirect_socket_parameters *sp = &sc->parameters;
-       struct ib_device *device = sc->ib.dev;
-       int max_send_sges, max_rw_wrs, max_send_wrs;
-       unsigned int max_sge_per_wr, wrs_per_credit;
+       int max_send_sges;
+       unsigned int maxpages;
 
        /* need 3 more sge. because a SMB_DIRECT header, SMB2 header,
         * SMB2 response could be mapped.
@@ -1895,67 +1885,18 @@ static int smb_direct_init_params(struct smbdirect_socket *sc,
                return -EINVAL;
        }
 
-       /* Calculate the number of work requests for RDMA R/W.
-        * The maximum number of pages which can be registered
-        * with one Memory region can be transferred with one
-        * R/W credit. And at least 4 work requests for each credit
-        * are needed for MR registration, RDMA R/W, local & remote
-        * MR invalidation.
-        */
-       sc->rw_io.credits.num_pages = smb_direct_get_max_fr_pages(sc);
-       sc->rw_io.credits.max = DIV_ROUND_UP(sp->max_read_write_size,
-                                        (sc->rw_io.credits.num_pages - 1) *
-                                        PAGE_SIZE);
-
-       max_sge_per_wr = min_t(unsigned int, device->attrs.max_send_sge,
-                              device->attrs.max_sge_rd);
-       max_sge_per_wr = max_t(unsigned int, max_sge_per_wr,
-                              max_send_sges);
-       wrs_per_credit = max_t(unsigned int, 4,
-                              DIV_ROUND_UP(sc->rw_io.credits.num_pages,
-                                           max_sge_per_wr) + 1);
-       max_rw_wrs = sc->rw_io.credits.max * wrs_per_credit;
-
-       max_send_wrs = sp->send_credit_target + max_rw_wrs;
-       if (max_send_wrs > device->attrs.max_cqe ||
-           max_send_wrs > device->attrs.max_qp_wr) {
-               pr_err("consider lowering send_credit_target = %d\n",
-                      sp->send_credit_target);
-               pr_err("Possible CQE overrun, device reporting max_cqe %d max_qp_wr %d\n",
-                      device->attrs.max_cqe, device->attrs.max_qp_wr);
-               return -EINVAL;
-       }
-
-       if (sp->recv_credit_max > device->attrs.max_cqe ||
-           sp->recv_credit_max > device->attrs.max_qp_wr) {
-               pr_err("consider lowering receive_credit_max = %d\n",
-                      sp->recv_credit_max);
-               pr_err("Possible CQE overrun, device reporting max_cpe %d max_qp_wr %d\n",
-                      device->attrs.max_cqe, device->attrs.max_qp_wr);
-               return -EINVAL;
-       }
-
-       if (device->attrs.max_send_sge < SMBDIRECT_SEND_IO_MAX_SGE) {
-               pr_err("warning: device max_send_sge = %d too small\n",
-                      device->attrs.max_send_sge);
-               return -EINVAL;
-       }
-       if (device->attrs.max_recv_sge < SMBDIRECT_RECV_IO_MAX_SGE) {
-               pr_err("warning: device max_recv_sge = %d too small\n",
-                      device->attrs.max_recv_sge);
-               return -EINVAL;
-       }
+       maxpages = DIV_ROUND_UP(sp->max_read_write_size, PAGE_SIZE);
+       sc->rw_io.credits.max = rdma_rw_mr_factor(sc->ib.dev,
+                                                 sc->rdma.cm_id->port_num,
+                                                 maxpages);
+       sc->rw_io.credits.num_pages = DIV_ROUND_UP(maxpages, sc->rw_io.credits.max);
+       /* add one extra in order to handle unaligned pages */
+       sc->rw_io.credits.max += 1;
 
        sc->recv_io.credits.target = 1;
 
        atomic_set(&sc->rw_io.credits.count, sc->rw_io.credits.max);
 
-       cap->max_send_wr = max_send_wrs;
-       cap->max_recv_wr = sp->recv_credit_max;
-       cap->max_send_sge = SMBDIRECT_SEND_IO_MAX_SGE;
-       cap->max_recv_sge = SMBDIRECT_RECV_IO_MAX_SGE;
-       cap->max_inline_data = 0;
-       cap->max_rdma_ctxs = sc->rw_io.credits.max;
        return 0;
 }
 
@@ -2029,13 +1970,129 @@ err:
        return -ENOMEM;
 }
 
-static int smb_direct_create_qpair(struct smbdirect_socket *sc,
-                                  struct ib_qp_cap *cap)
+static u32 smb_direct_rdma_rw_send_wrs(struct ib_device *dev, const struct ib_qp_init_attr *attr)
+{
+       /*
+        * This could be split out of rdma_rw_init_qp()
+        * and be a helper function next to rdma_rw_mr_factor()
+        *
+        * We can't check unlikely(rdma_rw_force_mr) here,
+        * but that is most likely 0 anyway.
+        */
+       u32 factor;
+
+       WARN_ON_ONCE(attr->port_num == 0);
+
+       /*
+        * Each context needs at least one RDMA READ or WRITE WR.
+        *
+        * For some hardware we might need more, eventually we should ask the
+        * HCA driver for a multiplier here.
+        */
+       factor = 1;
+
+       /*
+        * If the device needs MRs to perform RDMA READ or WRITE operations,
+        * we'll need two additional MRs for the registrations and the
+        * invalidation.
+        */
+       if (rdma_protocol_iwarp(dev, attr->port_num) || dev->attrs.max_sgl_rd)
+               factor += 2;    /* inv + reg */
+
+       return factor * attr->cap.max_rdma_ctxs;
+}
+
+static int smb_direct_create_qpair(struct smbdirect_socket *sc)
 {
        struct smbdirect_socket_parameters *sp = &sc->parameters;
        int ret;
+       struct ib_qp_cap qp_cap;
        struct ib_qp_init_attr qp_attr;
-       int pages_per_rw;
+       u32 max_send_wr;
+       u32 rdma_send_wr;
+
+       /*
+        * Note that {rdma,ib}_create_qp() will call
+        * rdma_rw_init_qp() if cap->max_rdma_ctxs is not 0.
+        * It will adjust cap->max_send_wr to the required
+        * number of additional WRs for the RDMA RW operations.
+        * It will cap cap->max_send_wr to the device limit.
+        *
+        * +1 for ib_drain_qp
+        */
+       qp_cap.max_send_wr = sp->send_credit_target + 1;
+       qp_cap.max_recv_wr = sp->recv_credit_max + 1;
+       qp_cap.max_send_sge = SMBDIRECT_SEND_IO_MAX_SGE;
+       qp_cap.max_recv_sge = SMBDIRECT_RECV_IO_MAX_SGE;
+       qp_cap.max_inline_data = 0;
+       qp_cap.max_rdma_ctxs = sc->rw_io.credits.max;
+
+       /*
+        * Find out the number of max_send_wr
+        * after rdma_rw_init_qp() adjusted it.
+        *
+        * We only do it on a temporary variable,
+        * as rdma_create_qp() will trigger
+        * rdma_rw_init_qp() again.
+        */
+       memset(&qp_attr, 0, sizeof(qp_attr));
+       qp_attr.cap = qp_cap;
+       qp_attr.port_num = sc->rdma.cm_id->port_num;
+       rdma_send_wr = smb_direct_rdma_rw_send_wrs(sc->ib.dev, &qp_attr);
+       max_send_wr = qp_cap.max_send_wr + rdma_send_wr;
+
+       if (qp_cap.max_send_wr > sc->ib.dev->attrs.max_cqe ||
+           qp_cap.max_send_wr > sc->ib.dev->attrs.max_qp_wr) {
+               pr_err("Possible CQE overrun: max_send_wr %d\n",
+                      qp_cap.max_send_wr);
+               pr_err("device %.*s reporting max_cqe %d max_qp_wr %d\n",
+                      IB_DEVICE_NAME_MAX,
+                      sc->ib.dev->name,
+                      sc->ib.dev->attrs.max_cqe,
+                      sc->ib.dev->attrs.max_qp_wr);
+               pr_err("consider lowering send_credit_target = %d\n",
+                      sp->send_credit_target);
+               return -EINVAL;
+       }
+
+       if (qp_cap.max_rdma_ctxs &&
+           (max_send_wr >= sc->ib.dev->attrs.max_cqe ||
+            max_send_wr >= sc->ib.dev->attrs.max_qp_wr)) {
+               pr_err("Possible CQE overrun: rdma_send_wr %d + max_send_wr %d = %d\n",
+                      rdma_send_wr, qp_cap.max_send_wr, max_send_wr);
+               pr_err("device %.*s reporting max_cqe %d max_qp_wr %d\n",
+                      IB_DEVICE_NAME_MAX,
+                      sc->ib.dev->name,
+                      sc->ib.dev->attrs.max_cqe,
+                      sc->ib.dev->attrs.max_qp_wr);
+               pr_err("consider lowering send_credit_target = %d, max_rdma_ctxs = %d\n",
+                      sp->send_credit_target, qp_cap.max_rdma_ctxs);
+               return -EINVAL;
+       }
+
+       if (qp_cap.max_recv_wr > sc->ib.dev->attrs.max_cqe ||
+           qp_cap.max_recv_wr > sc->ib.dev->attrs.max_qp_wr) {
+               pr_err("Possible CQE overrun: max_recv_wr %d\n",
+                      qp_cap.max_recv_wr);
+               pr_err("device %.*s reporting max_cqe %d max_qp_wr %d\n",
+                      IB_DEVICE_NAME_MAX,
+                      sc->ib.dev->name,
+                      sc->ib.dev->attrs.max_cqe,
+                      sc->ib.dev->attrs.max_qp_wr);
+               pr_err("consider lowering receive_credit_max = %d\n",
+                      sp->recv_credit_max);
+               return -EINVAL;
+       }
+
+       if (qp_cap.max_send_sge > sc->ib.dev->attrs.max_send_sge ||
+           qp_cap.max_recv_sge > sc->ib.dev->attrs.max_recv_sge) {
+               pr_err("device %.*s max_send_sge/max_recv_sge = %d/%d too small\n",
+                      IB_DEVICE_NAME_MAX,
+                      sc->ib.dev->name,
+                      sc->ib.dev->attrs.max_send_sge,
+                      sc->ib.dev->attrs.max_recv_sge);
+               return -EINVAL;
+       }
 
        sc->ib.pd = ib_alloc_pd(sc->ib.dev, 0);
        if (IS_ERR(sc->ib.pd)) {
@@ -2046,8 +2103,7 @@ static int smb_direct_create_qpair(struct smbdirect_socket *sc,
        }
 
        sc->ib.send_cq = ib_alloc_cq_any(sc->ib.dev, sc,
-                                        sp->send_credit_target +
-                                        cap->max_rdma_ctxs,
+                                        max_send_wr,
                                         IB_POLL_WORKQUEUE);
        if (IS_ERR(sc->ib.send_cq)) {
                pr_err("Can't create RDMA send CQ\n");
@@ -2057,7 +2113,7 @@ static int smb_direct_create_qpair(struct smbdirect_socket *sc,
        }
 
        sc->ib.recv_cq = ib_alloc_cq_any(sc->ib.dev, sc,
-                                        sp->recv_credit_max,
+                                        qp_cap.max_recv_wr,
                                         IB_POLL_WORKQUEUE);
        if (IS_ERR(sc->ib.recv_cq)) {
                pr_err("Can't create RDMA recv CQ\n");
@@ -2066,10 +2122,18 @@ static int smb_direct_create_qpair(struct smbdirect_socket *sc,
                goto err;
        }
 
+       /*
+        * We reset completely here!
+        * As the above use was just temporary
+        * to calc max_send_wr and rdma_send_wr.
+        *
+        * rdma_create_qp() will trigger rdma_rw_init_qp()
+        * again if max_rdma_ctxs is not 0.
+        */
        memset(&qp_attr, 0, sizeof(qp_attr));
        qp_attr.event_handler = smb_direct_qpair_handler;
        qp_attr.qp_context = sc;
-       qp_attr.cap = *cap;
+       qp_attr.cap = qp_cap;
        qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
        qp_attr.qp_type = IB_QPT_RC;
        qp_attr.send_cq = sc->ib.send_cq;
@@ -2085,18 +2149,6 @@ static int smb_direct_create_qpair(struct smbdirect_socket *sc,
        sc->ib.qp = sc->rdma.cm_id->qp;
        sc->rdma.cm_id->event_handler = smb_direct_cm_handler;
 
-       pages_per_rw = DIV_ROUND_UP(sp->max_read_write_size, PAGE_SIZE) + 1;
-       if (pages_per_rw > sc->ib.dev->attrs.max_sgl_rd) {
-               ret = ib_mr_pool_init(sc->ib.qp, &sc->ib.qp->rdma_mrs,
-                                     sc->rw_io.credits.max, IB_MR_TYPE_MEM_REG,
-                                     sc->rw_io.credits.num_pages, 0);
-               if (ret) {
-                       pr_err("failed to init mr pool count %zu pages %zu\n",
-                              sc->rw_io.credits.max, sc->rw_io.credits.num_pages);
-                       goto err;
-               }
-       }
-
        return 0;
 err:
        if (sc->ib.qp) {
@@ -2183,10 +2235,9 @@ out:
 
 static int smb_direct_connect(struct smbdirect_socket *sc)
 {
-       struct ib_qp_cap qp_cap;
        int ret;
 
-       ret = smb_direct_init_params(sc, &qp_cap);
+       ret = smb_direct_init_params(sc);
        if (ret) {
                pr_err("Can't configure RDMA parameters\n");
                return ret;
@@ -2198,7 +2249,7 @@ static int smb_direct_connect(struct smbdirect_socket *sc)
                return ret;
        }
 
-       ret = smb_direct_create_qpair(sc, &qp_cap);
+       ret = smb_direct_create_qpair(sc);
        if (ret) {
                pr_err("Can't accept RDMA client: %d\n", ret);
                return ret;