]> git.ipfire.org Git - thirdparty/kernel/stable.git/blame - net/sunrpc/xprtrdma/verbs.c
xprtrdma: Double free in rpcrdma_sendctxs_create()
[thirdparty/kernel/stable.git] / net / sunrpc / xprtrdma / verbs.c
CommitLineData
a2268cfb 1// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
f58851e6 2/*
62b56a67 3 * Copyright (c) 2014-2017 Oracle. All rights reserved.
c56c65fb
TT
4 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
5 *
6 * This software is available to you under a choice of one of two
7 * licenses. You may choose to be licensed under the terms of the GNU
8 * General Public License (GPL) Version 2, available from the file
9 * COPYING in the main directory of this source tree, or the BSD-type
10 * license below:
11 *
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
14 * are met:
15 *
16 * Redistributions of source code must retain the above copyright
17 * notice, this list of conditions and the following disclaimer.
18 *
19 * Redistributions in binary form must reproduce the above
20 * copyright notice, this list of conditions and the following
21 * disclaimer in the documentation and/or other materials provided
22 * with the distribution.
23 *
24 * Neither the name of the Network Appliance, Inc. nor the names of
25 * its contributors may be used to endorse or promote products
26 * derived from this software without specific prior written
27 * permission.
28 *
29 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
30 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
31 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
32 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
33 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
34 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
35 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
36 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
37 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
38 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
39 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
f58851e6
TT
40 */
41
c56c65fb
TT
42/*
43 * verbs.c
44 *
45 * Encapsulates the major functions managing:
46 * o adapters
47 * o endpoints
48 * o connections
49 * o buffer memory
50 */
51
a6b7a407 52#include <linux/interrupt.h>
5a0e3ad6 53#include <linux/slab.h>
0dd39cae 54#include <linux/sunrpc/addr.h>
05c97466 55#include <linux/sunrpc/svc_rdma.h>
ae72950a
CL
56
57#include <asm-generic/barrier.h>
65866f82 58#include <asm/bitops.h>
56a6bd15 59
0a90487b 60#include <rdma/ib_cm.h>
c56c65fb 61
f58851e6 62#include "xprt_rdma.h"
b6e717cb 63#include <trace/events/rpcrdma.h>
f58851e6 64
c56c65fb
TT
65/*
66 * Globals/Macros
67 */
68
f895b252 69#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
c56c65fb
TT
70# define RPCDBG_FACILITY RPCDBG_TRANS
71#endif
72
73/*
74 * internal functions
75 */
efd81e90 76static void rpcrdma_sendctx_put_locked(struct rpcrdma_sendctx *sc);
96ceddea
CL
77static void rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt);
78static void rpcrdma_mrs_destroy(struct rpcrdma_buffer *buf);
7c8d9e7c 79static int rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt, bool temp);
bebd0318 80static void rpcrdma_dma_unmap_regbuf(struct rpcrdma_regbuf *rb);
6ceea368 81static void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp);
c56c65fb 82
6d2d0ee2
CL
83/* Wait for outstanding transport work to finish.
84 */
85static void rpcrdma_xprt_drain(struct rpcrdma_xprt *r_xprt)
c56c65fb 86{
6d2d0ee2
CL
87 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
88 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
c56c65fb 89
6d2d0ee2
CL
90 /* Flush Receives, then wait for deferred Reply work
91 * to complete.
92 */
93 ib_drain_qp(ia->ri_id->qp);
94 drain_workqueue(buf->rb_completion_wq);
f1a03b76 95
6d2d0ee2
CL
96 /* Deferred Reply processing might have scheduled
97 * local invalidations.
98 */
99 ib_drain_sq(ia->ri_id->qp);
f1a03b76
CL
100}
101
f9521d53
CL
102/**
103 * rpcrdma_qp_event_handler - Handle one QP event (error notification)
104 * @event: details of the event
105 * @context: ep that owns QP where event occurred
106 *
107 * Called from the RDMA provider (device driver) possibly in an interrupt
108 * context.
109 */
c56c65fb 110static void
f9521d53 111rpcrdma_qp_event_handler(struct ib_event *event, void *context)
c56c65fb
TT
112{
113 struct rpcrdma_ep *ep = context;
643cf323
CL
114 struct rpcrdma_xprt *r_xprt = container_of(ep, struct rpcrdma_xprt,
115 rx_ep);
c56c65fb 116
f9521d53 117 trace_xprtrdma_qp_event(r_xprt, event);
c56c65fb
TT
118}
119
2fa8f88d
CL
120/**
121 * rpcrdma_wc_send - Invoked by RDMA provider for each polled Send WC
122 * @cq: completion queue (ignored)
123 * @wc: completed WR
124 *
fc664485
CL
125 */
126static void
2fa8f88d 127rpcrdma_wc_send(struct ib_cq *cq, struct ib_wc *wc)
fc664485 128{
ae72950a
CL
129 struct ib_cqe *cqe = wc->wr_cqe;
130 struct rpcrdma_sendctx *sc =
131 container_of(cqe, struct rpcrdma_sendctx, sc_cqe);
132
2fa8f88d 133 /* WARNING: Only wr_cqe and status are reliable at this point */
ab03eff5 134 trace_xprtrdma_wc_send(sc, wc);
2fa8f88d
CL
135 if (wc->status != IB_WC_SUCCESS && wc->status != IB_WC_WR_FLUSH_ERR)
136 pr_err("rpcrdma: Send: %s (%u/0x%x)\n",
137 ib_wc_status_msg(wc->status),
138 wc->status, wc->vendor_err);
ae72950a
CL
139
140 rpcrdma_sendctx_put_locked(sc);
fc664485 141}
c56c65fb 142
552bf225 143/**
1519e969 144 * rpcrdma_wc_receive - Invoked by RDMA provider for each polled Receive WC
552bf225
CL
145 * @cq: completion queue (ignored)
146 * @wc: completed WR
147 *
148 */
fc664485 149static void
1519e969 150rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
fc664485 151{
552bf225
CL
152 struct ib_cqe *cqe = wc->wr_cqe;
153 struct rpcrdma_rep *rep = container_of(cqe, struct rpcrdma_rep,
154 rr_cqe);
6ceea368 155 struct rpcrdma_xprt *r_xprt = rep->rr_rxprt;
fc664485 156
6ceea368 157 /* WARNING: Only wr_cqe and status are reliable at this point */
0e0b854c 158 trace_xprtrdma_wc_receive(wc);
6ceea368 159 --r_xprt->rx_ep.rep_receive_count;
8502427c 160 if (wc->status != IB_WC_SUCCESS)
6ceea368 161 goto out_flushed;
fc664485 162
8502427c 163 /* status == SUCCESS means all fields in wc are trustworthy */
96f8778f 164 rpcrdma_set_xdrlen(&rep->rr_hdrbuf, wc->byte_len);
c8b920bb
CL
165 rep->rr_wc_flags = wc->wc_flags;
166 rep->rr_inv_rkey = wc->ex.invalidate_rkey;
167
91a10c52 168 ib_dma_sync_single_for_cpu(rdmab_device(rep->rr_rdmabuf),
6b1184cd 169 rdmab_addr(rep->rr_rdmabuf),
e2a67190 170 wc->byte_len, DMA_FROM_DEVICE);
23826c7a 171
6ceea368 172 rpcrdma_post_recvs(r_xprt, false);
d8f532d2 173 rpcrdma_reply_handler(rep);
8502427c 174 return;
fe97b47c 175
6ceea368 176out_flushed:
8502427c 177 if (wc->status != IB_WC_WR_FLUSH_ERR)
552bf225
CL
178 pr_err("rpcrdma: Recv: %s (%u/0x%x)\n",
179 ib_wc_status_msg(wc->status),
180 wc->status, wc->vendor_err);
6ceea368 181 rpcrdma_recv_buffer_put(rep);
fc664485
CL
182}
183
87cfb9a0
CL
184static void
185rpcrdma_update_connect_private(struct rpcrdma_xprt *r_xprt,
186 struct rdma_conn_param *param)
187{
188 struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
189 const struct rpcrdma_connect_private *pmsg = param->private_data;
190 unsigned int rsize, wsize;
191
c8b920bb 192 /* Default settings for RPC-over-RDMA Version One */
b5f0afbe 193 r_xprt->rx_ia.ri_implicit_roundup = xprt_rdma_pad_optimize;
87cfb9a0
CL
194 rsize = RPCRDMA_V1_DEF_INLINE_SIZE;
195 wsize = RPCRDMA_V1_DEF_INLINE_SIZE;
196
197 if (pmsg &&
198 pmsg->cp_magic == rpcrdma_cmp_magic &&
199 pmsg->cp_version == RPCRDMA_CMP_VERSION) {
c95a3c6b 200 r_xprt->rx_ia.ri_implicit_roundup = true;
87cfb9a0
CL
201 rsize = rpcrdma_decode_buffer_size(pmsg->cp_send_size);
202 wsize = rpcrdma_decode_buffer_size(pmsg->cp_recv_size);
203 }
204
205 if (rsize < cdata->inline_rsize)
206 cdata->inline_rsize = rsize;
207 if (wsize < cdata->inline_wsize)
208 cdata->inline_wsize = wsize;
6d6bf72d
CL
209 dprintk("RPC: %s: max send %u, max recv %u\n",
210 __func__, cdata->inline_wsize, cdata->inline_rsize);
87cfb9a0
CL
211 rpcrdma_set_max_header_sizes(r_xprt);
212}
213
ae38288e
CL
214/**
215 * rpcrdma_cm_event_handler - Handle RDMA CM events
216 * @id: rdma_cm_id on which an event has occurred
217 * @event: details of the event
218 *
219 * Called with @id's mutex held. Returns 1 if caller should
220 * destroy @id, otherwise 0.
221 */
c56c65fb 222static int
ae38288e 223rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event)
c56c65fb 224{
ed97f1f7
CL
225 struct rpcrdma_xprt *r_xprt = id->context;
226 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
227 struct rpcrdma_ep *ep = &r_xprt->rx_ep;
228 struct rpc_xprt *xprt = &r_xprt->rx_xprt;
c56c65fb 229
ae38288e
CL
230 might_sleep();
231
ed97f1f7 232 trace_xprtrdma_cm_event(r_xprt, event);
c56c65fb
TT
233 switch (event->event) {
234 case RDMA_CM_EVENT_ADDR_RESOLVED:
235 case RDMA_CM_EVENT_ROUTE_RESOLVED:
5675add3 236 ia->ri_async_rc = 0;
c56c65fb 237 complete(&ia->ri_done);
316a616e 238 return 0;
c56c65fb 239 case RDMA_CM_EVENT_ADDR_ERROR:
52d28fe4 240 ia->ri_async_rc = -EPROTO;
c56c65fb 241 complete(&ia->ri_done);
316a616e 242 return 0;
c56c65fb
TT
243 case RDMA_CM_EVENT_ROUTE_ERROR:
244 ia->ri_async_rc = -ENETUNREACH;
c56c65fb 245 complete(&ia->ri_done);
316a616e 246 return 0;
bebd0318
CL
247 case RDMA_CM_EVENT_DEVICE_REMOVAL:
248#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
d461f1f2 249 pr_info("rpcrdma: removing device %s for %s:%s\n",
173b8f49 250 ia->ri_device->name,
ed97f1f7 251 rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt));
bebd0318
CL
252#endif
253 set_bit(RPCRDMA_IAF_REMOVING, &ia->ri_flags);
254 ep->rep_connected = -ENODEV;
ed97f1f7 255 xprt_force_disconnect(xprt);
bebd0318
CL
256 wait_for_completion(&ia->ri_remove_done);
257
258 ia->ri_id = NULL;
bebd0318
CL
259 ia->ri_device = NULL;
260 /* Return 1 to ensure the core destroys the id. */
261 return 1;
c56c65fb 262 case RDMA_CM_EVENT_ESTABLISHED:
ed97f1f7 263 ++xprt->connect_cookie;
aadc5a94 264 ep->rep_connected = 1;
ed97f1f7 265 rpcrdma_update_connect_private(r_xprt, &event->param.conn);
31e62d25
CL
266 wake_up_all(&ep->rep_connect_wait);
267 break;
c56c65fb 268 case RDMA_CM_EVENT_CONNECT_ERROR:
aadc5a94 269 ep->rep_connected = -ENOTCONN;
31e62d25 270 goto disconnected;
c56c65fb 271 case RDMA_CM_EVENT_UNREACHABLE:
aadc5a94 272 ep->rep_connected = -ENETUNREACH;
31e62d25 273 goto disconnected;
c56c65fb 274 case RDMA_CM_EVENT_REJECTED:
d461f1f2 275 dprintk("rpcrdma: connection to %s:%s rejected: %s\n",
ed97f1f7 276 rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt),
0a90487b 277 rdma_reject_msg(id, event->status));
aadc5a94 278 ep->rep_connected = -ECONNREFUSED;
0a90487b 279 if (event->status == IB_CM_REJ_STALE_CONN)
aadc5a94 280 ep->rep_connected = -EAGAIN;
31e62d25 281 goto disconnected;
c56c65fb 282 case RDMA_CM_EVENT_DISCONNECTED:
aadc5a94 283 ep->rep_connected = -ECONNABORTED;
31e62d25
CL
284disconnected:
285 xprt_force_disconnect(xprt);
c56c65fb 286 wake_up_all(&ep->rep_connect_wait);
316a616e 287 break;
c56c65fb 288 default:
c56c65fb
TT
289 break;
290 }
291
5f62412b 292 dprintk("RPC: %s: %s:%s on %s/frwr: %s\n", __func__,
316a616e 293 rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt),
5f62412b 294 ia->ri_device->name, rdma_event_msg(event->event));
c56c65fb
TT
295 return 0;
296}
297
298static struct rdma_cm_id *
dd229cee 299rpcrdma_create_id(struct rpcrdma_xprt *xprt, struct rpcrdma_ia *ia)
c56c65fb 300{
109b88ab 301 unsigned long wtimeout = msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1;
c56c65fb
TT
302 struct rdma_cm_id *id;
303 int rc;
304
b4744e00
CL
305 trace_xprtrdma_conn_start(xprt);
306
1a954051 307 init_completion(&ia->ri_done);
bebd0318 308 init_completion(&ia->ri_remove_done);
1a954051 309
ae38288e 310 id = rdma_create_id(xprt->rx_xprt.xprt_net, rpcrdma_cm_event_handler,
107c4beb 311 xprt, RDMA_PS_TCP, IB_QPT_RC);
ddbb347f 312 if (IS_ERR(id))
c56c65fb 313 return id;
c56c65fb 314
5675add3 315 ia->ri_async_rc = -ETIMEDOUT;
dd229cee
CL
316 rc = rdma_resolve_addr(id, NULL,
317 (struct sockaddr *)&xprt->rx_xprt.addr,
318 RDMA_RESOLVE_TIMEOUT);
ddbb347f 319 if (rc)
c56c65fb 320 goto out;
109b88ab
CL
321 rc = wait_for_completion_interruptible_timeout(&ia->ri_done, wtimeout);
322 if (rc < 0) {
b4744e00 323 trace_xprtrdma_conn_tout(xprt);
109b88ab
CL
324 goto out;
325 }
d0f36c46 326
c56c65fb
TT
327 rc = ia->ri_async_rc;
328 if (rc)
329 goto out;
330
5675add3 331 ia->ri_async_rc = -ETIMEDOUT;
c56c65fb 332 rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
ddbb347f 333 if (rc)
56a6bd15 334 goto out;
109b88ab
CL
335 rc = wait_for_completion_interruptible_timeout(&ia->ri_done, wtimeout);
336 if (rc < 0) {
b4744e00 337 trace_xprtrdma_conn_tout(xprt);
56a6bd15 338 goto out;
109b88ab 339 }
c56c65fb
TT
340 rc = ia->ri_async_rc;
341 if (rc)
56a6bd15 342 goto out;
c56c65fb
TT
343
344 return id;
56a6bd15 345
c56c65fb
TT
346out:
347 rdma_destroy_id(id);
348 return ERR_PTR(rc);
349}
350
c56c65fb
TT
351/*
352 * Exported functions.
353 */
354
fff09594
CL
355/**
356 * rpcrdma_ia_open - Open and initialize an Interface Adapter.
dd229cee 357 * @xprt: transport with IA to (re)initialize
fff09594
CL
358 *
359 * Returns 0 on success, negative errno if an appropriate
360 * Interface Adapter could not be found and opened.
c56c65fb
TT
361 */
362int
dd229cee 363rpcrdma_ia_open(struct rpcrdma_xprt *xprt)
c56c65fb 364{
c56c65fb 365 struct rpcrdma_ia *ia = &xprt->rx_ia;
d1ed857e
CL
366 int rc;
367
dd229cee 368 ia->ri_id = rpcrdma_create_id(xprt, ia);
c56c65fb
TT
369 if (IS_ERR(ia->ri_id)) {
370 rc = PTR_ERR(ia->ri_id);
fff09594 371 goto out_err;
c56c65fb 372 }
89e0d112 373 ia->ri_device = ia->ri_id->device;
c56c65fb 374
ed082d36 375 ia->ri_pd = ib_alloc_pd(ia->ri_device, 0);
c56c65fb
TT
376 if (IS_ERR(ia->ri_pd)) {
377 rc = PTR_ERR(ia->ri_pd);
b54054ca 378 pr_err("rpcrdma: ib_alloc_pd() returned %d\n", rc);
fff09594 379 goto out_err;
c56c65fb
TT
380 }
381
fff09594 382 switch (xprt_rdma_memreg_strategy) {
ce5b3717 383 case RPCRDMA_FRWR:
5f62412b 384 if (frwr_is_supported(ia))
b54054ca 385 break;
b54054ca 386 /*FALLTHROUGH*/
bd7ed1d1 387 default:
fff09594
CL
388 pr_err("rpcrdma: Device %s does not support memreg mode %d\n",
389 ia->ri_device->name, xprt_rdma_memreg_strategy);
b54054ca 390 rc = -EINVAL;
fff09594 391 goto out_err;
c56c65fb
TT
392 }
393
c56c65fb 394 return 0;
5ae711a2 395
fff09594
CL
396out_err:
397 rpcrdma_ia_close(ia);
c56c65fb
TT
398 return rc;
399}
400
bebd0318
CL
401/**
402 * rpcrdma_ia_remove - Handle device driver unload
403 * @ia: interface adapter being removed
404 *
405 * Divest transport H/W resources associated with this adapter,
406 * but allow it to be restored later.
407 */
408void
409rpcrdma_ia_remove(struct rpcrdma_ia *ia)
410{
411 struct rpcrdma_xprt *r_xprt = container_of(ia, struct rpcrdma_xprt,
412 rx_ia);
413 struct rpcrdma_ep *ep = &r_xprt->rx_ep;
414 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
415 struct rpcrdma_req *req;
416 struct rpcrdma_rep *rep;
417
418 cancel_delayed_work_sync(&buf->rb_refresh_worker);
419
420 /* This is similar to rpcrdma_ep_destroy, but:
421 * - Don't cancel the connect worker.
422 * - Don't call rpcrdma_ep_disconnect, which waits
423 * for another conn upcall, which will deadlock.
424 * - rdma_disconnect is unneeded, the underlying
425 * connection is already gone.
426 */
427 if (ia->ri_id->qp) {
6d2d0ee2 428 rpcrdma_xprt_drain(r_xprt);
bebd0318
CL
429 rdma_destroy_qp(ia->ri_id);
430 ia->ri_id->qp = NULL;
431 }
432 ib_free_cq(ep->rep_attr.recv_cq);
25524288 433 ep->rep_attr.recv_cq = NULL;
bebd0318 434 ib_free_cq(ep->rep_attr.send_cq);
25524288 435 ep->rep_attr.send_cq = NULL;
bebd0318
CL
436
437 /* The ULP is responsible for ensuring all DMA
438 * mappings and MRs are gone.
439 */
440 list_for_each_entry(rep, &buf->rb_recv_bufs, rr_list)
441 rpcrdma_dma_unmap_regbuf(rep->rr_rdmabuf);
442 list_for_each_entry(req, &buf->rb_allreqs, rl_all) {
443 rpcrdma_dma_unmap_regbuf(req->rl_rdmabuf);
444 rpcrdma_dma_unmap_regbuf(req->rl_sendbuf);
445 rpcrdma_dma_unmap_regbuf(req->rl_recvbuf);
446 }
96ceddea 447 rpcrdma_mrs_destroy(buf);
25524288
CL
448 ib_dealloc_pd(ia->ri_pd);
449 ia->ri_pd = NULL;
bebd0318
CL
450
451 /* Allow waiters to continue */
452 complete(&ia->ri_remove_done);
b4744e00
CL
453
454 trace_xprtrdma_remove(r_xprt);
bebd0318
CL
455}
456
fff09594
CL
457/**
458 * rpcrdma_ia_close - Clean up/close an IA.
459 * @ia: interface adapter to close
460 *
c56c65fb
TT
461 */
462void
463rpcrdma_ia_close(struct rpcrdma_ia *ia)
464{
fee08caf
TT
465 if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
466 if (ia->ri_id->qp)
467 rdma_destroy_qp(ia->ri_id);
56a6bd15 468 rdma_destroy_id(ia->ri_id);
fee08caf 469 }
fff09594
CL
470 ia->ri_id = NULL;
471 ia->ri_device = NULL;
6d44698d
CL
472
473 /* If the pd is still busy, xprtrdma missed freeing a resource */
474 if (ia->ri_pd && !IS_ERR(ia->ri_pd))
7dd78647 475 ib_dealloc_pd(ia->ri_pd);
fff09594 476 ia->ri_pd = NULL;
c56c65fb
TT
477}
478
479/*
480 * Create unconnected endpoint.
481 */
482int
483rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
16f906d6 484 struct rpcrdma_create_data_internal *cdata)
c56c65fb 485{
87cfb9a0 486 struct rpcrdma_connect_private *pmsg = &ep->rep_cm_private;
fc664485 487 struct ib_cq *sendcq, *recvcq;
914fcad9 488 unsigned int max_sge;
2fa8f88d 489 int rc;
c56c65fb 490
33023fb8 491 max_sge = min_t(unsigned int, ia->ri_device->attrs.max_send_sge,
eed50879 492 RPCRDMA_MAX_SEND_SGES);
16f906d6
CL
493 if (max_sge < RPCRDMA_MIN_SEND_SGES) {
494 pr_warn("rpcrdma: HCA provides only %d send SGEs\n", max_sge);
b3221d6a
CL
495 return -ENOMEM;
496 }
1179e2c2 497 ia->ri_max_send_sges = max_sge;
b3221d6a 498
5f62412b 499 rc = frwr_open(ia, ep, cdata);
914fcad9
CL
500 if (rc)
501 return rc;
c56c65fb 502
f9521d53 503 ep->rep_attr.event_handler = rpcrdma_qp_event_handler;
c56c65fb 504 ep->rep_attr.qp_context = ep;
c56c65fb 505 ep->rep_attr.srq = NULL;
16f906d6 506 ep->rep_attr.cap.max_send_sge = max_sge;
c56c65fb
TT
507 ep->rep_attr.cap.max_recv_sge = 1;
508 ep->rep_attr.cap.max_inline_data = 0;
509 ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
510 ep->rep_attr.qp_type = IB_QPT_RC;
511 ep->rep_attr.port_num = ~0;
512
513 dprintk("RPC: %s: requested max: dtos: send %d recv %d; "
514 "iovs: send %d recv %d\n",
515 __func__,
516 ep->rep_attr.cap.max_send_wr,
517 ep->rep_attr.cap.max_recv_wr,
518 ep->rep_attr.cap.max_send_sge,
519 ep->rep_attr.cap.max_recv_sge);
520
521 /* set trigger for requesting send completion */
ae72950a
CL
522 ep->rep_send_batch = min_t(unsigned int, RPCRDMA_MAX_SEND_BATCH,
523 cdata->max_requests >> 2);
524 ep->rep_send_count = ep->rep_send_batch;
c56c65fb 525 init_waitqueue_head(&ep->rep_connect_wait);
6ceea368 526 ep->rep_receive_count = 0;
c56c65fb 527
2fa8f88d
CL
528 sendcq = ib_alloc_cq(ia->ri_device, NULL,
529 ep->rep_attr.cap.max_send_wr + 1,
a4699f56 530 1, IB_POLL_WORKQUEUE);
fc664485
CL
531 if (IS_ERR(sendcq)) {
532 rc = PTR_ERR(sendcq);
c56c65fb
TT
533 goto out1;
534 }
535
552bf225
CL
536 recvcq = ib_alloc_cq(ia->ri_device, NULL,
537 ep->rep_attr.cap.max_recv_wr + 1,
d8f532d2 538 0, IB_POLL_WORKQUEUE);
fc664485
CL
539 if (IS_ERR(recvcq)) {
540 rc = PTR_ERR(recvcq);
fc664485
CL
541 goto out2;
542 }
543
fc664485
CL
544 ep->rep_attr.send_cq = sendcq;
545 ep->rep_attr.recv_cq = recvcq;
c56c65fb
TT
546
547 /* Initialize cma parameters */
b2dde94b 548 memset(&ep->rep_remote_cma, 0, sizeof(ep->rep_remote_cma));
c56c65fb 549
87cfb9a0
CL
550 /* Prepare RDMA-CM private message */
551 pmsg->cp_magic = rpcrdma_cmp_magic;
552 pmsg->cp_version = RPCRDMA_CMP_VERSION;
5f62412b 553 pmsg->cp_flags |= RPCRDMA_CMP_F_SND_W_INV_OK;
87cfb9a0
CL
554 pmsg->cp_send_size = rpcrdma_encode_buffer_size(cdata->inline_wsize);
555 pmsg->cp_recv_size = rpcrdma_encode_buffer_size(cdata->inline_rsize);
556 ep->rep_remote_cma.private_data = pmsg;
557 ep->rep_remote_cma.private_data_len = sizeof(*pmsg);
c56c65fb
TT
558
559 /* Client offers RDMA Read but does not initiate */
b334eaab 560 ep->rep_remote_cma.initiator_depth = 0;
b7e85fff
CL
561 ep->rep_remote_cma.responder_resources =
562 min_t(int, U8_MAX, ia->ri_device->attrs.max_qp_rd_atom);
c56c65fb 563
b2dde94b
CL
564 /* Limit transport retries so client can detect server
565 * GID changes quickly. RPC layer handles re-establishing
566 * transport connection and retransmission.
567 */
568 ep->rep_remote_cma.retry_count = 6;
569
570 /* RPC-over-RDMA handles its own flow control. In addition,
571 * make all RNR NAKs visible so we know that RPC-over-RDMA
572 * flow control is working correctly (no NAKs should be seen).
573 */
c56c65fb
TT
574 ep->rep_remote_cma.flow_control = 0;
575 ep->rep_remote_cma.rnr_retry_count = 0;
576
577 return 0;
578
579out2:
2fa8f88d 580 ib_free_cq(sendcq);
c56c65fb
TT
581out1:
582 return rc;
583}
584
585/*
586 * rpcrdma_ep_destroy
587 *
588 * Disconnect and destroy endpoint. After this, the only
589 * valid operations on the ep are to free it (if dynamically
590 * allocated) or re-create it.
c56c65fb 591 */
7f1d5419 592void
c56c65fb
TT
593rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
594{
25524288 595 if (ia->ri_id && ia->ri_id->qp) {
550d7502 596 rpcrdma_ep_disconnect(ep, ia);
fee08caf
TT
597 rdma_destroy_qp(ia->ri_id);
598 ia->ri_id->qp = NULL;
c56c65fb
TT
599 }
600
25524288
CL
601 if (ep->rep_attr.recv_cq)
602 ib_free_cq(ep->rep_attr.recv_cq);
603 if (ep->rep_attr.send_cq)
604 ib_free_cq(ep->rep_attr.send_cq);
c56c65fb
TT
605}
606
a9b0e381
CL
607/* Re-establish a connection after a device removal event.
608 * Unlike a normal reconnection, a fresh PD and a new set
609 * of MRs and buffers is needed.
610 */
611static int
612rpcrdma_ep_recreate_xprt(struct rpcrdma_xprt *r_xprt,
613 struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
614{
a9b0e381
CL
615 int rc, err;
616
b4744e00 617 trace_xprtrdma_reinsert(r_xprt);
a9b0e381
CL
618
619 rc = -EHOSTUNREACH;
dd229cee 620 if (rpcrdma_ia_open(r_xprt))
a9b0e381
CL
621 goto out1;
622
623 rc = -ENOMEM;
624 err = rpcrdma_ep_create(ep, ia, &r_xprt->rx_data);
625 if (err) {
626 pr_err("rpcrdma: rpcrdma_ep_create returned %d\n", err);
627 goto out2;
628 }
629
630 rc = -ENETUNREACH;
631 err = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
632 if (err) {
633 pr_err("rpcrdma: rdma_create_qp returned %d\n", err);
634 goto out3;
635 }
636
96ceddea 637 rpcrdma_mrs_create(r_xprt);
a9b0e381
CL
638 return 0;
639
640out3:
641 rpcrdma_ep_destroy(ep, ia);
642out2:
643 rpcrdma_ia_close(ia);
644out1:
645 return rc;
646}
647
1890896b
CL
648static int
649rpcrdma_ep_reconnect(struct rpcrdma_xprt *r_xprt, struct rpcrdma_ep *ep,
650 struct rpcrdma_ia *ia)
651{
1890896b
CL
652 struct rdma_cm_id *id, *old;
653 int err, rc;
654
b4744e00 655 trace_xprtrdma_reconnect(r_xprt);
1890896b
CL
656
657 rpcrdma_ep_disconnect(ep, ia);
658
659 rc = -EHOSTUNREACH;
dd229cee 660 id = rpcrdma_create_id(r_xprt, ia);
1890896b
CL
661 if (IS_ERR(id))
662 goto out;
663
664 /* As long as the new ID points to the same device as the
665 * old ID, we can reuse the transport's existing PD and all
666 * previously allocated MRs. Also, the same device means
667 * the transport's previous DMA mappings are still valid.
668 *
669 * This is a sanity check only. There should be no way these
670 * point to two different devices here.
671 */
672 old = id;
673 rc = -ENETUNREACH;
674 if (ia->ri_device != id->device) {
675 pr_err("rpcrdma: can't reconnect on different device!\n");
676 goto out_destroy;
677 }
678
679 err = rdma_create_qp(id, ia->ri_pd, &ep->rep_attr);
ddbb347f 680 if (err)
1890896b 681 goto out_destroy;
1890896b
CL
682
683 /* Atomically replace the transport's ID and QP. */
684 rc = 0;
685 old = ia->ri_id;
686 ia->ri_id = id;
687 rdma_destroy_qp(old);
688
689out_destroy:
56a6bd15 690 rdma_destroy_id(old);
1890896b
CL
691out:
692 return rc;
693}
694
c56c65fb
TT
695/*
696 * Connect unconnected endpoint.
697 */
698int
699rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
700{
0a90487b
CL
701 struct rpcrdma_xprt *r_xprt = container_of(ia, struct rpcrdma_xprt,
702 rx_ia);
31e62d25 703 struct rpc_xprt *xprt = &r_xprt->rx_xprt;
1890896b 704 int rc;
c56c65fb 705
c56c65fb 706retry:
1890896b
CL
707 switch (ep->rep_connected) {
708 case 0:
ec62f40d
CL
709 dprintk("RPC: %s: connecting...\n", __func__);
710 rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
711 if (rc) {
1890896b
CL
712 rc = -ENETUNREACH;
713 goto out_noupdate;
ec62f40d 714 }
1890896b 715 break;
a9b0e381
CL
716 case -ENODEV:
717 rc = rpcrdma_ep_recreate_xprt(r_xprt, ep, ia);
718 if (rc)
719 goto out_noupdate;
720 break;
1890896b
CL
721 default:
722 rc = rpcrdma_ep_reconnect(r_xprt, ep, ia);
723 if (rc)
724 goto out;
c56c65fb
TT
725 }
726
c56c65fb 727 ep->rep_connected = 0;
31e62d25
CL
728 xprt_clear_connected(xprt);
729
8d4fb8ff 730 rpcrdma_post_recvs(r_xprt, true);
c56c65fb
TT
731
732 rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
ddbb347f 733 if (rc)
c56c65fb 734 goto out;
c56c65fb 735
c56c65fb 736 wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);
c56c65fb 737 if (ep->rep_connected <= 0) {
0a90487b 738 if (ep->rep_connected == -EAGAIN)
c56c65fb
TT
739 goto retry;
740 rc = ep->rep_connected;
0a90487b 741 goto out;
c56c65fb
TT
742 }
743
0a90487b 744 dprintk("RPC: %s: connected\n", __func__);
7c8d9e7c 745
c56c65fb
TT
746out:
747 if (rc)
748 ep->rep_connected = rc;
1890896b
CL
749
750out_noupdate:
c56c65fb
TT
751 return rc;
752}
753
6d2d0ee2
CL
754/**
755 * rpcrdma_ep_disconnect - Disconnect underlying transport
756 * @ep: endpoint to disconnect
757 * @ia: associated interface adapter
c56c65fb
TT
758 *
759 * This is separate from destroy to facilitate the ability
760 * to reconnect without recreating the endpoint.
761 *
762 * This call is not reentrant, and must not be made in parallel
763 * on the same endpoint.
764 */
282191cb 765void
c56c65fb
TT
766rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
767{
6d2d0ee2
CL
768 struct rpcrdma_xprt *r_xprt = container_of(ep, struct rpcrdma_xprt,
769 rx_ep);
c56c65fb
TT
770 int rc;
771
6d2d0ee2 772 /* returns without wait if ID is not connected */
c56c65fb 773 rc = rdma_disconnect(ia->ri_id);
b4744e00 774 if (!rc)
c56c65fb
TT
775 wait_event_interruptible(ep->rep_connect_wait,
776 ep->rep_connected != 1);
b4744e00 777 else
c56c65fb 778 ep->rep_connected = rc;
6d2d0ee2 779 trace_xprtrdma_disconnect(r_xprt, rc);
550d7502 780
6d2d0ee2 781 rpcrdma_xprt_drain(r_xprt);
c56c65fb
TT
782}
783
ae72950a
CL
784/* Fixed-size circular FIFO queue. This implementation is wait-free and
785 * lock-free.
786 *
787 * Consumer is the code path that posts Sends. This path dequeues a
788 * sendctx for use by a Send operation. Multiple consumer threads
789 * are serialized by the RPC transport lock, which allows only one
790 * ->send_request call at a time.
791 *
792 * Producer is the code path that handles Send completions. This path
793 * enqueues a sendctx that has been completed. Multiple producer
794 * threads are serialized by the ib_poll_cq() function.
795 */
796
797/* rpcrdma_sendctxs_destroy() assumes caller has already quiesced
798 * queue activity, and ib_drain_qp has flushed all remaining Send
799 * requests.
800 */
801static void rpcrdma_sendctxs_destroy(struct rpcrdma_buffer *buf)
802{
803 unsigned long i;
804
805 for (i = 0; i <= buf->rb_sc_last; i++)
806 kfree(buf->rb_sc_ctxs[i]);
807 kfree(buf->rb_sc_ctxs);
808}
809
810static struct rpcrdma_sendctx *rpcrdma_sendctx_create(struct rpcrdma_ia *ia)
811{
812 struct rpcrdma_sendctx *sc;
813
814 sc = kzalloc(sizeof(*sc) +
815 ia->ri_max_send_sges * sizeof(struct ib_sge),
816 GFP_KERNEL);
817 if (!sc)
818 return NULL;
819
820 sc->sc_wr.wr_cqe = &sc->sc_cqe;
821 sc->sc_wr.sg_list = sc->sc_sges;
822 sc->sc_wr.opcode = IB_WR_SEND;
823 sc->sc_cqe.done = rpcrdma_wc_send;
824 return sc;
825}
826
827static int rpcrdma_sendctxs_create(struct rpcrdma_xprt *r_xprt)
828{
829 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
830 struct rpcrdma_sendctx *sc;
831 unsigned long i;
832
833 /* Maximum number of concurrent outstanding Send WRs. Capping
834 * the circular queue size stops Send Queue overflow by causing
835 * the ->send_request call to fail temporarily before too many
836 * Sends are posted.
837 */
838 i = buf->rb_max_requests + RPCRDMA_MAX_BC_REQUESTS;
839 dprintk("RPC: %s: allocating %lu send_ctxs\n", __func__, i);
840 buf->rb_sc_ctxs = kcalloc(i, sizeof(sc), GFP_KERNEL);
841 if (!buf->rb_sc_ctxs)
842 return -ENOMEM;
843
844 buf->rb_sc_last = i - 1;
845 for (i = 0; i <= buf->rb_sc_last; i++) {
846 sc = rpcrdma_sendctx_create(&r_xprt->rx_ia);
847 if (!sc)
6e17f58c 848 return -ENOMEM;
ae72950a
CL
849
850 sc->sc_xprt = r_xprt;
851 buf->rb_sc_ctxs[i] = sc;
852 }
853
854 return 0;
ae72950a
CL
855}
856
857/* The sendctx queue is not guaranteed to have a size that is a
858 * power of two, thus the helpers in circ_buf.h cannot be used.
859 * The other option is to use modulus (%), which can be expensive.
860 */
861static unsigned long rpcrdma_sendctx_next(struct rpcrdma_buffer *buf,
862 unsigned long item)
863{
864 return likely(item < buf->rb_sc_last) ? item + 1 : 0;
865}
866
867/**
868 * rpcrdma_sendctx_get_locked - Acquire a send context
869 * @buf: transport buffers from which to acquire an unused context
870 *
871 * Returns pointer to a free send completion context; or NULL if
872 * the queue is empty.
873 *
874 * Usage: Called to acquire an SGE array before preparing a Send WR.
875 *
876 * The caller serializes calls to this function (per rpcrdma_buffer),
877 * and provides an effective memory barrier that flushes the new value
878 * of rb_sc_head.
879 */
880struct rpcrdma_sendctx *rpcrdma_sendctx_get_locked(struct rpcrdma_buffer *buf)
881{
882 struct rpcrdma_xprt *r_xprt;
883 struct rpcrdma_sendctx *sc;
884 unsigned long next_head;
885
886 next_head = rpcrdma_sendctx_next(buf, buf->rb_sc_head);
887
888 if (next_head == READ_ONCE(buf->rb_sc_tail))
889 goto out_emptyq;
890
891 /* ORDER: item must be accessed _before_ head is updated */
892 sc = buf->rb_sc_ctxs[next_head];
893
894 /* Releasing the lock in the caller acts as a memory
895 * barrier that flushes rb_sc_head.
896 */
897 buf->rb_sc_head = next_head;
898
899 return sc;
900
901out_emptyq:
902 /* The queue is "empty" if there have not been enough Send
903 * completions recently. This is a sign the Send Queue is
904 * backing up. Cause the caller to pause and try again.
905 */
2fad6592 906 set_bit(RPCRDMA_BUF_F_EMPTY_SCQ, &buf->rb_flags);
ae72950a
CL
907 r_xprt = container_of(buf, struct rpcrdma_xprt, rx_buf);
908 r_xprt->rx_stats.empty_sendctx_q++;
909 return NULL;
910}
911
912/**
913 * rpcrdma_sendctx_put_locked - Release a send context
914 * @sc: send context to release
915 *
916 * Usage: Called from Send completion to return a sendctxt
917 * to the queue.
918 *
919 * The caller serializes calls to this function (per rpcrdma_buffer).
920 */
efd81e90
CL
921static void
922rpcrdma_sendctx_put_locked(struct rpcrdma_sendctx *sc)
ae72950a
CL
923{
924 struct rpcrdma_buffer *buf = &sc->sc_xprt->rx_buf;
925 unsigned long next_tail;
926
927 /* Unmap SGEs of previously completed by unsignaled
928 * Sends by walking up the queue until @sc is found.
929 */
930 next_tail = buf->rb_sc_tail;
931 do {
932 next_tail = rpcrdma_sendctx_next(buf, next_tail);
933
934 /* ORDER: item must be accessed _before_ tail is updated */
935 rpcrdma_unmap_sendctx(buf->rb_sc_ctxs[next_tail]);
936
937 } while (buf->rb_sc_ctxs[next_tail] != sc);
938
939 /* Paired with READ_ONCE */
940 smp_store_release(&buf->rb_sc_tail, next_tail);
2fad6592
CL
941
942 if (test_and_clear_bit(RPCRDMA_BUF_F_EMPTY_SCQ, &buf->rb_flags)) {
943 smp_mb__after_atomic();
944 xprt_write_space(&sc->sc_xprt->rx_xprt);
945 }
ae72950a
CL
946}
947
e2ac236c 948static void
96ceddea 949rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt)
e2ac236c
CL
950{
951 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
952 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
953 unsigned int count;
954 LIST_HEAD(free);
955 LIST_HEAD(all);
956
c421ece6 957 for (count = 0; count < ia->ri_max_segs; count++) {
96ceddea 958 struct rpcrdma_mr *mr;
e2ac236c
CL
959 int rc;
960
96ceddea
CL
961 mr = kzalloc(sizeof(*mr), GFP_KERNEL);
962 if (!mr)
e2ac236c
CL
963 break;
964
5f62412b 965 rc = frwr_init_mr(ia, mr);
e2ac236c 966 if (rc) {
96ceddea 967 kfree(mr);
e2ac236c
CL
968 break;
969 }
970
96ceddea 971 mr->mr_xprt = r_xprt;
e2ac236c 972
96ceddea
CL
973 list_add(&mr->mr_list, &free);
974 list_add(&mr->mr_all, &all);
e2ac236c
CL
975 }
976
96ceddea
CL
977 spin_lock(&buf->rb_mrlock);
978 list_splice(&free, &buf->rb_mrs);
e2ac236c
CL
979 list_splice(&all, &buf->rb_all);
980 r_xprt->rx_stats.mrs_allocated += count;
96ceddea 981 spin_unlock(&buf->rb_mrlock);
1c443eff 982 trace_xprtrdma_createmrs(r_xprt, count);
9e679d5e
CL
983
984 xprt_write_space(&r_xprt->rx_xprt);
e2ac236c
CL
985}
986
987static void
988rpcrdma_mr_refresh_worker(struct work_struct *work)
989{
990 struct rpcrdma_buffer *buf = container_of(work, struct rpcrdma_buffer,
991 rb_refresh_worker.work);
992 struct rpcrdma_xprt *r_xprt = container_of(buf, struct rpcrdma_xprt,
993 rx_buf);
994
96ceddea 995 rpcrdma_mrs_create(r_xprt);
e2ac236c
CL
996}
997
f531a5db 998struct rpcrdma_req *
1392402c
CL
999rpcrdma_create_req(struct rpcrdma_xprt *r_xprt)
1000{
f531a5db 1001 struct rpcrdma_buffer *buffer = &r_xprt->rx_buf;
2dd4a012 1002 struct rpcrdma_regbuf *rb;
1392402c 1003 struct rpcrdma_req *req;
1392402c 1004
85275c87 1005 req = kzalloc(sizeof(*req), GFP_KERNEL);
1392402c 1006 if (req == NULL)
85275c87 1007 return ERR_PTR(-ENOMEM);
1392402c 1008
2dd4a012
CL
1009 rb = rpcrdma_alloc_regbuf(RPCRDMA_HDRBUF_SIZE,
1010 DMA_TO_DEVICE, GFP_KERNEL);
1011 if (IS_ERR(rb)) {
1012 kfree(req);
1013 return ERR_PTR(-ENOMEM);
1014 }
1015 req->rl_rdmabuf = rb;
1016 xdr_buf_init(&req->rl_hdrbuf, rb->rg_base, rdmab_length(rb));
1017 req->rl_buffer = buffer;
1018 INIT_LIST_HEAD(&req->rl_registered);
1019
92f4433e 1020 spin_lock(&buffer->rb_lock);
f531a5db 1021 list_add(&req->rl_all, &buffer->rb_allreqs);
92f4433e 1022 spin_unlock(&buffer->rb_lock);
1392402c 1023 return req;
1392402c
CL
1024}
1025
7c8d9e7c
CL
1026static int
1027rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt, bool temp)
1392402c
CL
1028{
1029 struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
d698c4a0 1030 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
1392402c
CL
1031 struct rpcrdma_rep *rep;
1032 int rc;
1033
1034 rc = -ENOMEM;
6b1184cd 1035 rep = kzalloc(sizeof(*rep), GFP_KERNEL);
1392402c
CL
1036 if (rep == NULL)
1037 goto out;
1392402c 1038
13650c23 1039 rep->rr_rdmabuf = rpcrdma_alloc_regbuf(cdata->inline_rsize,
99ef4db3 1040 DMA_FROM_DEVICE, GFP_KERNEL);
6b1184cd
CL
1041 if (IS_ERR(rep->rr_rdmabuf)) {
1042 rc = PTR_ERR(rep->rr_rdmabuf);
1392402c 1043 goto out_free;
6b1184cd 1044 }
96f8778f
CL
1045 xdr_buf_init(&rep->rr_hdrbuf, rep->rr_rdmabuf->rg_base,
1046 rdmab_length(rep->rr_rdmabuf));
1392402c 1047
1519e969 1048 rep->rr_cqe.done = rpcrdma_wc_receive;
fed171b3 1049 rep->rr_rxprt = r_xprt;
d8f532d2 1050 INIT_WORK(&rep->rr_work, rpcrdma_deferred_completion);
6ea8e711
CL
1051 rep->rr_recv_wr.next = NULL;
1052 rep->rr_recv_wr.wr_cqe = &rep->rr_cqe;
1053 rep->rr_recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov;
1054 rep->rr_recv_wr.num_sge = 1;
7c8d9e7c 1055 rep->rr_temp = temp;
d698c4a0
CL
1056
1057 spin_lock(&buf->rb_lock);
1058 list_add(&rep->rr_list, &buf->rb_recv_bufs);
1059 spin_unlock(&buf->rb_lock);
1060 return 0;
1392402c
CL
1061
1062out_free:
1063 kfree(rep);
1064out:
d698c4a0 1065 return rc;
1392402c
CL
1066}
1067
c56c65fb 1068int
ac920d04 1069rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
c56c65fb 1070{
ac920d04 1071 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
c56c65fb
TT
1072 int i, rc;
1073
512ccfb6 1074 buf->rb_flags = 0;
1e465fd4 1075 buf->rb_max_requests = r_xprt->rx_data.max_requests;
f531a5db 1076 buf->rb_bc_srv_max_requests = 0;
96ceddea 1077 spin_lock_init(&buf->rb_mrlock);
505bbe64 1078 spin_lock_init(&buf->rb_lock);
96ceddea 1079 INIT_LIST_HEAD(&buf->rb_mrs);
e2ac236c 1080 INIT_LIST_HEAD(&buf->rb_all);
e2ac236c
CL
1081 INIT_DELAYED_WORK(&buf->rb_refresh_worker,
1082 rpcrdma_mr_refresh_worker);
c56c65fb 1083
96ceddea 1084 rpcrdma_mrs_create(r_xprt);
c56c65fb 1085
1e465fd4 1086 INIT_LIST_HEAD(&buf->rb_send_bufs);
f531a5db 1087 INIT_LIST_HEAD(&buf->rb_allreqs);
c56c65fb
TT
1088 for (i = 0; i < buf->rb_max_requests; i++) {
1089 struct rpcrdma_req *req;
c56c65fb 1090
1392402c
CL
1091 req = rpcrdma_create_req(r_xprt);
1092 if (IS_ERR(req)) {
c56c65fb
TT
1093 dprintk("RPC: %s: request buffer %d alloc"
1094 " failed\n", __func__, i);
1392402c 1095 rc = PTR_ERR(req);
c56c65fb
TT
1096 goto out;
1097 }
a80d66c9 1098 list_add(&req->rl_list, &buf->rb_send_bufs);
1e465fd4
CL
1099 }
1100
8d4fb8ff 1101 buf->rb_credits = 1;
1e465fd4 1102 INIT_LIST_HEAD(&buf->rb_recv_bufs);
1392402c 1103
ae72950a
CL
1104 rc = rpcrdma_sendctxs_create(r_xprt);
1105 if (rc)
1106 goto out;
1107
6d2d0ee2
CL
1108 buf->rb_completion_wq = alloc_workqueue("rpcrdma-%s",
1109 WQ_MEM_RECLAIM | WQ_HIGHPRI,
1110 0,
1111 r_xprt->rx_xprt.address_strings[RPC_DISPLAY_ADDR]);
4429b668
DC
1112 if (!buf->rb_completion_wq) {
1113 rc = -ENOMEM;
6d2d0ee2 1114 goto out;
4429b668 1115 }
6d2d0ee2 1116
c56c65fb
TT
1117 return 0;
1118out:
1119 rpcrdma_buffer_destroy(buf);
1120 return rc;
1121}
1122
1392402c 1123static void
13650c23 1124rpcrdma_destroy_rep(struct rpcrdma_rep *rep)
1392402c 1125{
13650c23 1126 rpcrdma_free_regbuf(rep->rr_rdmabuf);
1392402c
CL
1127 kfree(rep);
1128}
1129
92f4433e
CL
1130/**
1131 * rpcrdma_req_destroy - Destroy an rpcrdma_req object
1132 * @req: unused object to be destroyed
1133 *
1134 * This function assumes that the caller prevents concurrent device
1135 * unload and transport tear-down.
1136 */
f531a5db 1137void
92f4433e 1138rpcrdma_req_destroy(struct rpcrdma_req *req)
1392402c 1139{
92f4433e
CL
1140 list_del(&req->rl_all);
1141
13650c23
CL
1142 rpcrdma_free_regbuf(req->rl_recvbuf);
1143 rpcrdma_free_regbuf(req->rl_sendbuf);
1144 rpcrdma_free_regbuf(req->rl_rdmabuf);
1392402c
CL
1145 kfree(req);
1146}
1147
e2ac236c 1148static void
96ceddea 1149rpcrdma_mrs_destroy(struct rpcrdma_buffer *buf)
e2ac236c
CL
1150{
1151 struct rpcrdma_xprt *r_xprt = container_of(buf, struct rpcrdma_xprt,
1152 rx_buf);
96ceddea 1153 struct rpcrdma_mr *mr;
e2ac236c
CL
1154 unsigned int count;
1155
1156 count = 0;
96ceddea 1157 spin_lock(&buf->rb_mrlock);
e2ac236c 1158 while (!list_empty(&buf->rb_all)) {
96ceddea
CL
1159 mr = list_entry(buf->rb_all.next, struct rpcrdma_mr, mr_all);
1160 list_del(&mr->mr_all);
e2ac236c 1161
96ceddea 1162 spin_unlock(&buf->rb_mrlock);
054f1557
CL
1163
1164 /* Ensure MW is not on any rl_registered list */
1165 if (!list_empty(&mr->mr_list))
1166 list_del(&mr->mr_list);
1167
5f62412b 1168 frwr_release_mr(mr);
e2ac236c 1169 count++;
96ceddea 1170 spin_lock(&buf->rb_mrlock);
e2ac236c 1171 }
96ceddea 1172 spin_unlock(&buf->rb_mrlock);
e2ac236c
CL
1173 r_xprt->rx_stats.mrs_allocated = 0;
1174
1175 dprintk("RPC: %s: released %u MRs\n", __func__, count);
1176}
1177
af65ed40
CL
1178/**
1179 * rpcrdma_buffer_destroy - Release all hw resources
1180 * @buf: root control block for resources
1181 *
1182 * ORDERING: relies on a prior ib_drain_qp :
1183 * - No more Send or Receive completions can occur
1184 * - All MRs, reps, and reqs are returned to their free lists
1185 */
c56c65fb
TT
1186void
1187rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
1188{
9378b274 1189 cancel_delayed_work_sync(&buf->rb_refresh_worker);
505bbe64 1190
6d2d0ee2
CL
1191 if (buf->rb_completion_wq) {
1192 destroy_workqueue(buf->rb_completion_wq);
1193 buf->rb_completion_wq = NULL;
1194 }
1195
ae72950a
CL
1196 rpcrdma_sendctxs_destroy(buf);
1197
1e465fd4
CL
1198 while (!list_empty(&buf->rb_recv_bufs)) {
1199 struct rpcrdma_rep *rep;
c56c65fb 1200
9d95cd53
CL
1201 rep = list_first_entry(&buf->rb_recv_bufs,
1202 struct rpcrdma_rep, rr_list);
1203 list_del(&rep->rr_list);
13650c23 1204 rpcrdma_destroy_rep(rep);
c56c65fb
TT
1205 }
1206
92f4433e 1207 while (!list_empty(&buf->rb_send_bufs)) {
1e465fd4 1208 struct rpcrdma_req *req;
4034ba04 1209
92f4433e
CL
1210 req = list_first_entry(&buf->rb_send_bufs,
1211 struct rpcrdma_req, rl_list);
1212 list_del(&req->rl_list);
1213 rpcrdma_req_destroy(req);
1e465fd4 1214 }
4034ba04 1215
96ceddea 1216 rpcrdma_mrs_destroy(buf);
c56c65fb
TT
1217}
1218
96ceddea
CL
1219/**
1220 * rpcrdma_mr_get - Allocate an rpcrdma_mr object
1221 * @r_xprt: controlling transport
1222 *
1223 * Returns an initialized rpcrdma_mr or NULL if no free
1224 * rpcrdma_mr objects are available.
1225 */
1226struct rpcrdma_mr *
1227rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt)
c2922c02 1228{
346aa66b 1229 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
96ceddea 1230 struct rpcrdma_mr *mr = NULL;
346aa66b 1231
96ceddea
CL
1232 spin_lock(&buf->rb_mrlock);
1233 if (!list_empty(&buf->rb_mrs))
1234 mr = rpcrdma_mr_pop(&buf->rb_mrs);
1235 spin_unlock(&buf->rb_mrlock);
346aa66b 1236
96ceddea
CL
1237 if (!mr)
1238 goto out_nomrs;
1239 return mr;
e2ac236c 1240
96ceddea 1241out_nomrs:
1c443eff 1242 trace_xprtrdma_nomrs(r_xprt);
bebd0318
CL
1243 if (r_xprt->rx_ep.rep_connected != -ENODEV)
1244 schedule_delayed_work(&buf->rb_refresh_worker, 0);
e2ac236c
CL
1245
1246 /* Allow the reply handler and refresh worker to run */
1247 cond_resched();
1248
1249 return NULL;
c2922c02
CL
1250}
1251
ec12e479
CL
1252static void
1253__rpcrdma_mr_put(struct rpcrdma_buffer *buf, struct rpcrdma_mr *mr)
1254{
1255 spin_lock(&buf->rb_mrlock);
1256 rpcrdma_mr_push(mr, &buf->rb_mrs);
1257 spin_unlock(&buf->rb_mrlock);
1258}
1259
96ceddea
CL
1260/**
1261 * rpcrdma_mr_put - Release an rpcrdma_mr object
1262 * @mr: object to release
1263 *
1264 */
346aa66b 1265void
96ceddea 1266rpcrdma_mr_put(struct rpcrdma_mr *mr)
ec12e479
CL
1267{
1268 __rpcrdma_mr_put(&mr->mr_xprt->rx_buf, mr);
1269}
1270
1271/**
1272 * rpcrdma_mr_unmap_and_put - DMA unmap an MR and release it
1273 * @mr: object to release
1274 *
1275 */
1276void
1277rpcrdma_mr_unmap_and_put(struct rpcrdma_mr *mr)
c2922c02 1278{
96ceddea 1279 struct rpcrdma_xprt *r_xprt = mr->mr_xprt;
c2922c02 1280
e2f34e26
CL
1281 if (mr->mr_dir != DMA_NONE) {
1282 trace_xprtrdma_mr_unmap(mr);
1283 ib_dma_unmap_sg(r_xprt->rx_ia.ri_device,
1284 mr->mr_sg, mr->mr_nents, mr->mr_dir);
1285 mr->mr_dir = DMA_NONE;
1286 }
ec12e479 1287 __rpcrdma_mr_put(&r_xprt->rx_buf, mr);
c2922c02
CL
1288}
1289
7c8d9e7c
CL
1290/**
1291 * rpcrdma_buffer_get - Get a request buffer
1292 * @buffers: Buffer pool from which to obtain a buffer
78d506e1 1293 *
7c8d9e7c 1294 * Returns a fresh rpcrdma_req, or NULL if none are available.
c56c65fb
TT
1295 */
1296struct rpcrdma_req *
1297rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
1298{
1299 struct rpcrdma_req *req;
c14d86e5 1300
a5b027e1 1301 spin_lock(&buffers->rb_lock);
e68699cc
CL
1302 req = list_first_entry_or_null(&buffers->rb_send_bufs,
1303 struct rpcrdma_req, rl_list);
1304 if (req)
1305 list_del_init(&req->rl_list);
a5b027e1 1306 spin_unlock(&buffers->rb_lock);
1e465fd4 1307 return req;
c56c65fb
TT
1308}
1309
7c8d9e7c
CL
1310/**
1311 * rpcrdma_buffer_put - Put request/reply buffers back into pool
1312 * @req: object to return
1313 *
c56c65fb
TT
1314 */
1315void
1316rpcrdma_buffer_put(struct rpcrdma_req *req)
1317{
1318 struct rpcrdma_buffer *buffers = req->rl_buffer;
1e465fd4 1319 struct rpcrdma_rep *rep = req->rl_reply;
c56c65fb 1320
1e465fd4
CL
1321 req->rl_reply = NULL;
1322
a5b027e1 1323 spin_lock(&buffers->rb_lock);
7c8d9e7c 1324 list_add(&req->rl_list, &buffers->rb_send_bufs);
05c97466 1325 if (rep) {
7c8d9e7c
CL
1326 if (!rep->rr_temp) {
1327 list_add(&rep->rr_list, &buffers->rb_recv_bufs);
1328 rep = NULL;
1329 }
05c97466 1330 }
a5b027e1 1331 spin_unlock(&buffers->rb_lock);
7c8d9e7c
CL
1332 if (rep)
1333 rpcrdma_destroy_rep(rep);
c56c65fb
TT
1334}
1335
1336/*
1337 * Put reply buffers back into pool when not attached to
b45ccfd2 1338 * request. This happens in error conditions.
c56c65fb
TT
1339 */
1340void
1341rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
1342{
fed171b3 1343 struct rpcrdma_buffer *buffers = &rep->rr_rxprt->rx_buf;
c56c65fb 1344
7c8d9e7c
CL
1345 if (!rep->rr_temp) {
1346 spin_lock(&buffers->rb_lock);
1347 list_add(&rep->rr_list, &buffers->rb_recv_bufs);
1348 spin_unlock(&buffers->rb_lock);
1349 } else {
1350 rpcrdma_destroy_rep(rep);
1351 }
c56c65fb
TT
1352}
1353
9128c3e7 1354/**
99ef4db3 1355 * rpcrdma_alloc_regbuf - allocate and DMA-map memory for SEND/RECV buffers
9128c3e7 1356 * @size: size of buffer to be allocated, in bytes
99ef4db3 1357 * @direction: direction of data movement
9128c3e7
CL
1358 * @flags: GFP flags
1359 *
54cbd6b0
CL
1360 * Returns an ERR_PTR, or a pointer to a regbuf, a buffer that
1361 * can be persistently DMA-mapped for I/O.
9128c3e7
CL
1362 *
1363 * xprtrdma uses a regbuf for posting an outgoing RDMA SEND, or for
99ef4db3 1364 * receiving the payload of RDMA RECV operations. During Long Calls
5f62412b 1365 * or Replies they may be registered externally via frwr_map.
9128c3e7
CL
1366 */
1367struct rpcrdma_regbuf *
13650c23
CL
1368rpcrdma_alloc_regbuf(size_t size, enum dma_data_direction direction,
1369 gfp_t flags)
9128c3e7
CL
1370{
1371 struct rpcrdma_regbuf *rb;
9128c3e7 1372
9128c3e7
CL
1373 rb = kmalloc(sizeof(*rb) + size, flags);
1374 if (rb == NULL)
54cbd6b0 1375 return ERR_PTR(-ENOMEM);
9128c3e7 1376
54cbd6b0 1377 rb->rg_device = NULL;
99ef4db3 1378 rb->rg_direction = direction;
54cbd6b0 1379 rb->rg_iov.length = size;
9128c3e7
CL
1380
1381 return rb;
54cbd6b0 1382}
9128c3e7 1383
54cbd6b0
CL
1384/**
1385 * __rpcrdma_map_regbuf - DMA-map a regbuf
1386 * @ia: controlling rpcrdma_ia
1387 * @rb: regbuf to be mapped
1388 */
1389bool
1390__rpcrdma_dma_map_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb)
1391{
91a10c52
CL
1392 struct ib_device *device = ia->ri_device;
1393
54cbd6b0
CL
1394 if (rb->rg_direction == DMA_NONE)
1395 return false;
1396
91a10c52 1397 rb->rg_iov.addr = ib_dma_map_single(device,
54cbd6b0
CL
1398 (void *)rb->rg_base,
1399 rdmab_length(rb),
1400 rb->rg_direction);
53b2c1cb
CL
1401 if (ib_dma_mapping_error(device, rdmab_addr(rb))) {
1402 trace_xprtrdma_dma_maperr(rdmab_addr(rb));
54cbd6b0 1403 return false;
53b2c1cb 1404 }
54cbd6b0 1405
91a10c52 1406 rb->rg_device = device;
54cbd6b0
CL
1407 rb->rg_iov.lkey = ia->ri_pd->local_dma_lkey;
1408 return true;
1409}
1410
1411static void
1412rpcrdma_dma_unmap_regbuf(struct rpcrdma_regbuf *rb)
1413{
e89e8d8f
CL
1414 if (!rb)
1415 return;
1416
54cbd6b0
CL
1417 if (!rpcrdma_regbuf_is_mapped(rb))
1418 return;
1419
1420 ib_dma_unmap_single(rb->rg_device, rdmab_addr(rb),
1421 rdmab_length(rb), rb->rg_direction);
1422 rb->rg_device = NULL;
9128c3e7
CL
1423}
1424
/**
 * rpcrdma_free_regbuf - deregister and free registered buffer
 * @rb: regbuf to be deregistered and freed
 *
 * @rb may be NULL, in which case nothing is done.
 */
void
rpcrdma_free_regbuf(struct rpcrdma_regbuf *rb)
{
	/* Undo any DMA mapping before the memory goes away */
	rpcrdma_dma_unmap_regbuf(rb);

	kfree(rb);
}
1435
995d312a
CL
1436/**
1437 * rpcrdma_ep_post - Post WRs to a transport's Send Queue
1438 * @ia: transport's device information
1439 * @ep: transport's RDMA endpoint information
1440 * @req: rpcrdma_req containing the Send WR to post
c56c65fb 1441 *
995d312a
CL
1442 * Returns 0 if the post was successful, otherwise -ENOTCONN
1443 * is returned.
c56c65fb
TT
1444 */
1445int
1446rpcrdma_ep_post(struct rpcrdma_ia *ia,
1447 struct rpcrdma_ep *ep,
1448 struct rpcrdma_req *req)
1449{
ae72950a 1450 struct ib_send_wr *send_wr = &req->rl_sendctx->sc_wr;
655fec69 1451 int rc;
c56c65fb 1452
01bb35c8
CL
1453 if (!ep->rep_send_count ||
1454 test_bit(RPCRDMA_REQ_F_TX_RESOURCES, &req->rl_flags)) {
ae72950a
CL
1455 send_wr->send_flags |= IB_SEND_SIGNALED;
1456 ep->rep_send_count = ep->rep_send_batch;
1457 } else {
1458 send_wr->send_flags &= ~IB_SEND_SIGNALED;
1459 --ep->rep_send_count;
1460 }
ab03eff5 1461
5f62412b 1462 rc = frwr_send(ia, req);
ab03eff5 1463 trace_xprtrdma_post_send(req, rc);
c56c65fb 1464 if (rc)
ab03eff5 1465 return -ENOTCONN;
7a89f9c6 1466 return 0;
c56c65fb
TT
1467}
1468
6ceea368 1469static void
7c8d9e7c 1470rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp)
f531a5db 1471{
7c8d9e7c 1472 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
6ceea368 1473 struct rpcrdma_ep *ep = &r_xprt->rx_ep;
7c8d9e7c
CL
1474 struct ib_recv_wr *wr, *bad_wr;
1475 int needed, count, rc;
f531a5db 1476
61c208a5
CL
1477 rc = 0;
1478 count = 0;
7c8d9e7c 1479 needed = buf->rb_credits + (buf->rb_bc_srv_max_requests << 1);
6ceea368 1480 if (ep->rep_receive_count > needed)
61c208a5 1481 goto out;
6ceea368 1482 needed -= ep->rep_receive_count;
f531a5db 1483
7c8d9e7c
CL
1484 count = 0;
1485 wr = NULL;
1486 while (needed) {
1487 struct rpcrdma_regbuf *rb;
1488 struct rpcrdma_rep *rep;
f531a5db 1489
7c8d9e7c
CL
1490 spin_lock(&buf->rb_lock);
1491 rep = list_first_entry_or_null(&buf->rb_recv_bufs,
1492 struct rpcrdma_rep, rr_list);
1493 if (likely(rep))
1494 list_del(&rep->rr_list);
1495 spin_unlock(&buf->rb_lock);
1496 if (!rep) {
1497 if (rpcrdma_create_rep(r_xprt, temp))
1498 break;
1499 continue;
1500 }
f531a5db 1501
7c8d9e7c
CL
1502 rb = rep->rr_rdmabuf;
1503 if (!rpcrdma_regbuf_is_mapped(rb)) {
1504 if (!__rpcrdma_dma_map_regbuf(&r_xprt->rx_ia, rb)) {
1505 rpcrdma_recv_buffer_put(rep);
1506 break;
1507 }
1508 }
f531a5db 1509
7c8d9e7c
CL
1510 trace_xprtrdma_post_recv(rep->rr_recv_wr.wr_cqe);
1511 rep->rr_recv_wr.next = wr;
1512 wr = &rep->rr_recv_wr;
1513 ++count;
1514 --needed;
1515 }
1516 if (!count)
61c208a5 1517 goto out;
7c8d9e7c 1518
d34ac5cd
BVA
1519 rc = ib_post_recv(r_xprt->rx_ia.ri_id->qp, wr,
1520 (const struct ib_recv_wr **)&bad_wr);
7c8d9e7c
CL
1521 if (rc) {
1522 for (wr = bad_wr; wr; wr = wr->next) {
1523 struct rpcrdma_rep *rep;
1524
1525 rep = container_of(wr, struct rpcrdma_rep, rr_recv_wr);
1526 rpcrdma_recv_buffer_put(rep);
1527 --count;
1528 }
1529 }
6ceea368 1530 ep->rep_receive_count += count;
61c208a5 1531out:
7c8d9e7c 1532 trace_xprtrdma_post_recvs(r_xprt, count, rc);
f531a5db 1533}