// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause

/* Authors: Cheng Xu <chengyou@linux.alibaba.com> */
/*          Kai Shen <kaishen@linux.alibaba.com> */
/* Copyright (c) 2020-2021, Alibaba Group */
/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
/* Copyright (c) 2008-2019, IBM Corporation */

#include <linux/errno.h>
#include <linux/pci.h>
#include <linux/scatterlist.h>
#include <linux/types.h>

#include <rdma/ib_user_verbs.h>
#include <rdma/ib_verbs.h>

#include "erdma.h"
#include "erdma_cm.h"
#include "erdma_verbs.h"
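
/*
 * Called when the underlying LLP (TCP) connection is being closed.
 * Working states are pushed towards CLOSING through the internal modify
 * path; a QP already in CLOSING falls back to IDLE. The connection
 * endpoint reference is dropped so the dead socket cannot be reused.
 */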
void erdma_qp_llp_close(struct erdma_qp *qp)
{
	struct erdma_qp_attrs qp_attrs;

	down_write(&qp->state_lock);

	switch (qp->attrs.state) {
	case ERDMA_QP_STATE_RTS:
	case ERDMA_QP_STATE_RTR:
	case ERDMA_QP_STATE_IDLE:
	case ERDMA_QP_STATE_TERMINATE:
		qp_attrs.state = ERDMA_QP_STATE_CLOSING;
		erdma_modify_qp_internal(qp, &qp_attrs, ERDMA_QP_ATTR_STATE);
		break;
	case ERDMA_QP_STATE_CLOSING:
		qp->attrs.state = ERDMA_QP_STATE_IDLE;
		break;
	default:
		break;
	}

	if (qp->cep) {
		erdma_cep_put(qp->cep);
		qp->cep = NULL;
	}

	up_write(&qp->state_lock);
}

struct ib_qp *erdma_get_ibqp(struct ib_device *ibdev, int id)
{
	struct erdma_qp *qp = find_qp_by_qpn(to_edev(ibdev), id);

	if (qp)
		return &qp->ibqp;

	return NULL;
}
static int erdma_modify_qp_state_to_rts(struct erdma_qp *qp,
					struct erdma_qp_attrs *attrs,
					enum erdma_qp_attr_mask mask)
{
	int ret;
	struct erdma_dev *dev = qp->dev;
	struct erdma_cmdq_modify_qp_req req;
	struct tcp_sock *tp;
	struct erdma_cep *cep = qp->cep;
	struct sockaddr_storage local_addr, remote_addr;

	if (!(mask & ERDMA_QP_ATTR_LLP_HANDLE))
		return -EINVAL;

	if (!(mask & ERDMA_QP_ATTR_MPA))
		return -EINVAL;

	ret = getname_local(cep->sock, &local_addr);
	if (ret < 0)
		return ret;

	ret = getname_peer(cep->sock, &remote_addr);
	if (ret < 0)
		return ret;

	qp->attrs.state = ERDMA_QP_STATE_RTS;

	tp = tcp_sk(qp->cep->sock->sk);

	erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_RDMA,
				CMDQ_OPCODE_MODIFY_QP);

	req.cfg = FIELD_PREP(ERDMA_CMD_MODIFY_QP_STATE_MASK, qp->attrs.state) |
		  FIELD_PREP(ERDMA_CMD_MODIFY_QP_CC_MASK, qp->attrs.cc) |
		  FIELD_PREP(ERDMA_CMD_MODIFY_QP_QPN_MASK, QP_ID(qp));

	req.cookie = be32_to_cpu(qp->cep->mpa.ext_data.cookie);
	req.dip = to_sockaddr_in(remote_addr).sin_addr.s_addr;
	req.sip = to_sockaddr_in(local_addr).sin_addr.s_addr;
	req.dport = to_sockaddr_in(remote_addr).sin_port;
	req.sport = to_sockaddr_in(local_addr).sin_port;

	req.send_nxt = tp->snd_nxt;
	/* rsvd tcp seq for mpa-rsp in server. */
	if (qp->attrs.qp_type == ERDMA_QP_PASSIVE)
		req.send_nxt += MPA_DEFAULT_HDR_LEN + qp->attrs.pd_len;
	req.recv_nxt = tp->rcv_nxt;

	return erdma_post_cmd_wait(&dev->cmdq, (u64 *)&req, sizeof(req), NULL,
				   NULL);
}
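
/*
 * Move the QP out of the working state (CLOSING, TERMINATE or ERROR):
 * update the software state and tell the device to stop processing work
 * requests on this QP.
 */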
static int erdma_modify_qp_state_to_stop(struct erdma_qp *qp,
					 struct erdma_qp_attrs *attrs,
					 enum erdma_qp_attr_mask mask)
{
	struct erdma_dev *dev = qp->dev;
	struct erdma_cmdq_modify_qp_req req;

	qp->attrs.state = attrs->state;

	erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_RDMA,
				CMDQ_OPCODE_MODIFY_QP);

	req.cfg = FIELD_PREP(ERDMA_CMD_MODIFY_QP_STATE_MASK, attrs->state) |
		  FIELD_PREP(ERDMA_CMD_MODIFY_QP_QPN_MASK, QP_ID(qp));

	return erdma_post_cmd_wait(&dev->cmdq, (u64 *)&req, sizeof(req), NULL,
				   NULL);
}
int erdma_modify_qp_internal(struct erdma_qp *qp, struct erdma_qp_attrs *attrs,
			     enum erdma_qp_attr_mask mask)
{
	int drop_conn, ret = 0;

	if (!mask)
		return 0;

	if (!(mask & ERDMA_QP_ATTR_STATE))
		return 0;

	switch (qp->attrs.state) {
	case ERDMA_QP_STATE_IDLE:
	case ERDMA_QP_STATE_RTR:
		if (attrs->state == ERDMA_QP_STATE_RTS) {
			ret = erdma_modify_qp_state_to_rts(qp, attrs, mask);
		} else if (attrs->state == ERDMA_QP_STATE_ERROR) {
			qp->attrs.state = ERDMA_QP_STATE_ERROR;
			if (qp->cep) {
				erdma_cep_put(qp->cep);
				qp->cep = NULL;
			}
			ret = erdma_modify_qp_state_to_stop(qp, attrs, mask);
		}
		break;
	case ERDMA_QP_STATE_RTS:
		drop_conn = 0;

		if (attrs->state == ERDMA_QP_STATE_CLOSING) {
			ret = erdma_modify_qp_state_to_stop(qp, attrs, mask);
			drop_conn = 1;
		} else if (attrs->state == ERDMA_QP_STATE_TERMINATE) {
			qp->attrs.state = ERDMA_QP_STATE_TERMINATE;
			ret = erdma_modify_qp_state_to_stop(qp, attrs, mask);
			drop_conn = 1;
		} else if (attrs->state == ERDMA_QP_STATE_ERROR) {
			ret = erdma_modify_qp_state_to_stop(qp, attrs, mask);
			qp->attrs.state = ERDMA_QP_STATE_ERROR;
			drop_conn = 1;
		}

		if (drop_conn)
			erdma_qp_cm_drop(qp);

		break;
	case ERDMA_QP_STATE_TERMINATE:
		if (attrs->state == ERDMA_QP_STATE_ERROR)
			qp->attrs.state = ERDMA_QP_STATE_ERROR;
		break;
	case ERDMA_QP_STATE_CLOSING:
		if (attrs->state == ERDMA_QP_STATE_IDLE) {
			qp->attrs.state = ERDMA_QP_STATE_IDLE;
		} else if (attrs->state == ERDMA_QP_STATE_ERROR) {
			ret = erdma_modify_qp_state_to_stop(qp, attrs, mask);
			qp->attrs.state = ERDMA_QP_STATE_ERROR;
		} else if (attrs->state != ERDMA_QP_STATE_CLOSING) {
			return -ECONNABORTED;
		}
		break;
	default:
		break;
	}

	return ret;
}
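
/*
 * QP lifetime is kref based: the release callback only signals
 * qp->safe_free, so whoever tears the QP down can wait for the last
 * reference to drain before freeing its memory.
 */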
static void erdma_qp_safe_free(struct kref *ref)
{
	struct erdma_qp *qp = container_of(ref, struct erdma_qp, ref);

	complete(&qp->safe_free);
}

void erdma_qp_put(struct erdma_qp *qp)
{
	WARN_ON(kref_read(&qp->ref) < 1);
	kref_put(&qp->ref, erdma_qp_safe_free);
}

void erdma_qp_get(struct erdma_qp *qp)
{
	kref_get(&qp->ref);
}
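
/*
 * Copy an IB_SEND_INLINE payload directly into the SQE, right behind the
 * opcode-specific section. The SQ is a ring of SQEBB_SIZE basic blocks:
 * whenever the copy crosses a block boundary, wqe_idx is advanced and the
 * destination re-resolved through get_queue_entry() so the payload wraps
 * correctly. Returns the number of bytes copied, or -EINVAL if the total
 * exceeds ERDMA_MAX_INLINE.
 */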
static int fill_inline_data(struct erdma_qp *qp,
			    const struct ib_send_wr *send_wr, u16 wqe_idx,
			    u32 sgl_offset, __le32 *length_field)
{
	u32 remain_size, copy_size, data_off, bytes = 0;
	char *data;
	int i = 0;

	wqe_idx += (sgl_offset >> SQEBB_SHIFT);
	sgl_offset &= (SQEBB_SIZE - 1);
	data = get_queue_entry(qp->kern_qp.sq_buf, wqe_idx, qp->attrs.sq_size,
			       SQEBB_SHIFT);

	while (i < send_wr->num_sge) {
		bytes += send_wr->sg_list[i].length;
		if (bytes > (int)ERDMA_MAX_INLINE)
			return -EINVAL;

		remain_size = send_wr->sg_list[i].length;
		data_off = 0;

		while (1) {
			copy_size = min(remain_size, SQEBB_SIZE - sgl_offset);

			memcpy(data + sgl_offset,
			       (void *)(uintptr_t)send_wr->sg_list[i].addr +
				       data_off,
			       copy_size);

			remain_size -= copy_size;
			data_off += copy_size;
			sgl_offset += copy_size;
			wqe_idx += (sgl_offset >> SQEBB_SHIFT);
			sgl_offset &= (SQEBB_SIZE - 1);

			data = get_queue_entry(qp->kern_qp.sq_buf, wqe_idx,
					       qp->attrs.sq_size, SQEBB_SHIFT);
			if (!remain_size)
				break;
		}

		i++;
	}
	*length_field = cpu_to_le32(bytes);

	return bytes;
}
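
/*
 * Copy the work request's ib_sge array into the SQE. Each SGE is 16
 * bytes, which evenly divides SQEBB_SIZE, so an entry never straddles a
 * basic block; an sgl_offset that is not 16-byte aligned is rejected.
 */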
static int fill_sgl(struct erdma_qp *qp, const struct ib_send_wr *send_wr,
		    u16 wqe_idx, u32 sgl_offset, __le32 *length_field)
{
	int i = 0;
	u32 bytes = 0;
	char *sgl;

	if (send_wr->num_sge > qp->dev->attrs.max_send_sge)
		return -EINVAL;

	if (sgl_offset & 0xF)
		return -EINVAL;

	while (i < send_wr->num_sge) {
		wqe_idx += (sgl_offset >> SQEBB_SHIFT);
		sgl_offset &= (SQEBB_SIZE - 1);
		sgl = get_queue_entry(qp->kern_qp.sq_buf, wqe_idx,
				      qp->attrs.sq_size, SQEBB_SHIFT);

		bytes += send_wr->sg_list[i].length;
		memcpy(sgl + sgl_offset, &send_wr->sg_list[i],
		       sizeof(struct ib_sge));

		sgl_offset += sizeof(struct ib_sge);
		i++;
	}

	*length_field = cpu_to_le32(bytes);
	return 0;
}
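
/*
 * Translate one ib_send_wr into a hardware SQE at producer index *pi.
 * A common 64-bit WQE header carries the completion/solicited/fence/
 * inline flags, QPN, opcode, SGL length and WQEBB count/index; the
 * opcode-specific section and the SGL or inline payload follow. *pi is
 * advanced by the number of basic blocks the SQE occupies, and the
 * header is stored to the queue only after the rest of the SQE is in
 * place.
 */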
static int erdma_push_one_sqe(struct erdma_qp *qp, u16 *pi,
			      const struct ib_send_wr *send_wr)
{
	u32 wqe_size, wqebb_cnt, hw_op, flags, sgl_offset;
	u32 idx = *pi & (qp->attrs.sq_size - 1);
	enum ib_wr_opcode op = send_wr->opcode;
	struct erdma_readreq_sqe *read_sqe;
	struct erdma_reg_mr_sqe *regmr_sge;
	struct erdma_write_sqe *write_sqe;
	struct erdma_send_sqe *send_sqe;
	struct ib_rdma_wr *rdma_wr;
	struct erdma_mr *mr;
	__le32 *length_field;
	u64 wqe_hdr, *entry;
	struct ib_sge *sge;
	u32 attrs;
	int ret;

	entry = get_queue_entry(qp->kern_qp.sq_buf, idx, qp->attrs.sq_size,
				SQEBB_SHIFT);

	/* Clear the SQE header section. */
	*entry = 0;

	qp->kern_qp.swr_tbl[idx] = send_wr->wr_id;
	flags = send_wr->send_flags;
	wqe_hdr = FIELD_PREP(
		ERDMA_SQE_HDR_CE_MASK,
		((flags & IB_SEND_SIGNALED) || qp->kern_qp.sig_all) ? 1 : 0);
	wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_SE_MASK,
			      flags & IB_SEND_SOLICITED ? 1 : 0);
	wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_FENCE_MASK,
			      flags & IB_SEND_FENCE ? 1 : 0);
	wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_INLINE_MASK,
			      flags & IB_SEND_INLINE ? 1 : 0);
	wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_QPN_MASK, QP_ID(qp));

	switch (op) {
	case IB_WR_RDMA_WRITE:
	case IB_WR_RDMA_WRITE_WITH_IMM:
		hw_op = ERDMA_OP_WRITE;
		if (op == IB_WR_RDMA_WRITE_WITH_IMM)
			hw_op = ERDMA_OP_WRITE_WITH_IMM;
		wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_OPCODE_MASK, hw_op);
		rdma_wr = container_of(send_wr, struct ib_rdma_wr, wr);
		write_sqe = (struct erdma_write_sqe *)entry;

		write_sqe->imm_data = send_wr->ex.imm_data;
		write_sqe->sink_stag = cpu_to_le32(rdma_wr->rkey);
		write_sqe->sink_to_h =
			cpu_to_le32(upper_32_bits(rdma_wr->remote_addr));
		write_sqe->sink_to_l =
			cpu_to_le32(lower_32_bits(rdma_wr->remote_addr));

		length_field = &write_sqe->length;
		wqe_size = sizeof(struct erdma_write_sqe);
		sgl_offset = wqe_size;
		break;
	case IB_WR_RDMA_READ:
	case IB_WR_RDMA_READ_WITH_INV:
		read_sqe = (struct erdma_readreq_sqe *)entry;
		if (unlikely(send_wr->num_sge != 1))
			return -EINVAL;
		hw_op = ERDMA_OP_READ;
		if (op == IB_WR_RDMA_READ_WITH_INV) {
			hw_op = ERDMA_OP_READ_WITH_INV;
			read_sqe->invalid_stag =
				cpu_to_le32(send_wr->ex.invalidate_rkey);
		}

		wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_OPCODE_MASK, hw_op);
		rdma_wr = container_of(send_wr, struct ib_rdma_wr, wr);
		read_sqe->length = cpu_to_le32(send_wr->sg_list[0].length);
		read_sqe->sink_stag = cpu_to_le32(send_wr->sg_list[0].lkey);
		read_sqe->sink_to_l =
			cpu_to_le32(lower_32_bits(send_wr->sg_list[0].addr));
		read_sqe->sink_to_h =
			cpu_to_le32(upper_32_bits(send_wr->sg_list[0].addr));

		sge = get_queue_entry(qp->kern_qp.sq_buf, idx + 1,
				      qp->attrs.sq_size, SQEBB_SHIFT);
		sge->addr = rdma_wr->remote_addr;
		sge->lkey = rdma_wr->rkey;
		sge->length = send_wr->sg_list[0].length;
		wqe_size = sizeof(struct erdma_readreq_sqe) +
			   send_wr->num_sge * sizeof(struct ib_sge);

		goto out;
	case IB_WR_SEND:
	case IB_WR_SEND_WITH_IMM:
	case IB_WR_SEND_WITH_INV:
		send_sqe = (struct erdma_send_sqe *)entry;
		hw_op = ERDMA_OP_SEND;
		if (op == IB_WR_SEND_WITH_IMM) {
			hw_op = ERDMA_OP_SEND_WITH_IMM;
			send_sqe->imm_data = send_wr->ex.imm_data;
		} else if (op == IB_WR_SEND_WITH_INV) {
			hw_op = ERDMA_OP_SEND_WITH_INV;
			send_sqe->invalid_stag =
				cpu_to_le32(send_wr->ex.invalidate_rkey);
		}
		wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_OPCODE_MASK, hw_op);
		length_field = &send_sqe->length;
		wqe_size = sizeof(struct erdma_send_sqe);
		sgl_offset = wqe_size;

		break;
	case IB_WR_REG_MR:
		wqe_hdr |=
			FIELD_PREP(ERDMA_SQE_HDR_OPCODE_MASK, ERDMA_OP_REG_MR);
		regmr_sge = (struct erdma_reg_mr_sqe *)entry;
		mr = to_emr(reg_wr(send_wr)->mr);

		mr->access = ERDMA_MR_ACC_LR |
			     to_erdma_access_flags(reg_wr(send_wr)->access);
		regmr_sge->addr = cpu_to_le64(mr->ibmr.iova);
		regmr_sge->length = cpu_to_le32(mr->ibmr.length);
		regmr_sge->stag = cpu_to_le32(mr->ibmr.lkey);
		attrs = FIELD_PREP(ERDMA_SQE_MR_MODE_MASK, 0) |
			FIELD_PREP(ERDMA_SQE_MR_ACCESS_MASK, mr->access) |
			FIELD_PREP(ERDMA_SQE_MR_MTT_CNT_MASK,
				   mr->mem.mtt_nents);

		if (mr->mem.mtt_nents < ERDMA_MAX_INLINE_MTT_ENTRIES) {
			attrs |= FIELD_PREP(ERDMA_SQE_MR_MTT_TYPE_MASK, 0);
			/* Copy SGLs to SQE content to accelerate */
			memcpy(get_queue_entry(qp->kern_qp.sq_buf, idx + 1,
					       qp->attrs.sq_size, SQEBB_SHIFT),
			       mr->mem.mtt_buf, MTT_SIZE(mr->mem.mtt_nents));
			wqe_size = sizeof(struct erdma_reg_mr_sqe) +
				   MTT_SIZE(mr->mem.mtt_nents);
		} else {
			attrs |= FIELD_PREP(ERDMA_SQE_MR_MTT_TYPE_MASK, 1);
			wqe_size = sizeof(struct erdma_reg_mr_sqe);
		}

		regmr_sge->attrs = cpu_to_le32(attrs);
		goto out;
	case IB_WR_LOCAL_INV:
		wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_OPCODE_MASK,
				      ERDMA_OP_LOCAL_INV);
		regmr_sge = (struct erdma_reg_mr_sqe *)entry;
		regmr_sge->stag = cpu_to_le32(send_wr->ex.invalidate_rkey);
		wqe_size = sizeof(struct erdma_reg_mr_sqe);
		goto out;
	default:
		return -EOPNOTSUPP;
	}

	if (flags & IB_SEND_INLINE) {
		ret = fill_inline_data(qp, send_wr, idx, sgl_offset,
				       length_field);
		if (ret < 0)
			return -EINVAL;
		wqe_size += ret;
		wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_SGL_LEN_MASK, ret);
	} else {
		ret = fill_sgl(qp, send_wr, idx, sgl_offset, length_field);
		if (ret)
			return -EINVAL;
		wqe_size += send_wr->num_sge * sizeof(struct ib_sge);
		wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_SGL_LEN_MASK,
				      send_wr->num_sge);
	}

out:
	wqebb_cnt = SQEBB_COUNT(wqe_size);
	wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_WQEBB_CNT_MASK, wqebb_cnt - 1);
	*pi += wqebb_cnt;
	wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_WQEBB_INDEX_MASK, *pi);

	*entry = wqe_hdr;

	return 0;
}
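
/*
 * Ring the SQ doorbell for the given producer index: the doorbell
 * record is mirrored into the kernel-visible sq_db_info area and then
 * written to the hardware doorbell register.
 */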
static void kick_sq_db(struct erdma_qp *qp, u16 pi)
{
	u64 db_data = FIELD_PREP(ERDMA_SQE_HDR_QPN_MASK, QP_ID(qp)) |
		      FIELD_PREP(ERDMA_SQE_HDR_WQEBB_INDEX_MASK, pi);

	*(u64 *)qp->kern_qp.sq_db_info = db_data;
	writeq(db_data, qp->kern_qp.hw_sq_db);
}
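
/*
 * Post a chain of send work requests under qp->lock. Ring fullness is
 * derived from the producer/consumer distance (sq_pi - sq_ci) before
 * each WR is pushed, and the doorbell is kicked after every SQE; on
 * failure *bad_send_wr names the offending WR and posting stops.
 */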
int erdma_post_send(struct ib_qp *ibqp, const struct ib_send_wr *send_wr,
		    const struct ib_send_wr **bad_send_wr)
{
	struct erdma_qp *qp = to_eqp(ibqp);
	int ret = 0;
	const struct ib_send_wr *wr = send_wr;
	unsigned long flags;
	u16 sq_pi;

	if (!send_wr)
		return -EINVAL;

	spin_lock_irqsave(&qp->lock, flags);
	sq_pi = qp->kern_qp.sq_pi;

	while (wr) {
		if ((u16)(sq_pi - qp->kern_qp.sq_ci) >= qp->attrs.sq_size) {
			ret = -ENOMEM;
			*bad_send_wr = send_wr;
			break;
		}

		ret = erdma_push_one_sqe(qp, &sq_pi, wr);
		if (ret) {
			*bad_send_wr = wr;
			break;
		}
		qp->kern_qp.sq_pi = sq_pi;
		kick_sq_db(qp, sq_pi);

		wr = wr->next;
	}
	spin_unlock_irqrestore(&qp->lock, flags);

	return ret;
}
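
/*
 * Build one RQE at the RQ producer index. The hardware receive queue
 * entry holds a single SGE, so at most one scatter element per WR is
 * accepted (zero-length receives are allowed). The first 64 bits of the
 * RQE double as the doorbell record, mirrored to rq_db_info and written
 * to the hardware RQ doorbell.
 */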
static int erdma_post_recv_one(struct erdma_qp *qp,
			       const struct ib_recv_wr *recv_wr)
{
	struct erdma_rqe *rqe =
		get_queue_entry(qp->kern_qp.rq_buf, qp->kern_qp.rq_pi,
				qp->attrs.rq_size, RQE_SHIFT);

	rqe->qe_idx = cpu_to_le16(qp->kern_qp.rq_pi + 1);
	rqe->qpn = cpu_to_le32(QP_ID(qp));

	if (recv_wr->num_sge == 0) {
		rqe->length = 0;
	} else if (recv_wr->num_sge == 1) {
		rqe->stag = cpu_to_le32(recv_wr->sg_list[0].lkey);
		rqe->to = cpu_to_le64(recv_wr->sg_list[0].addr);
		rqe->length = cpu_to_le32(recv_wr->sg_list[0].length);
	} else {
		return -EINVAL;
	}

	*(u64 *)qp->kern_qp.rq_db_info = *(u64 *)rqe;
	writeq(*(u64 *)rqe, qp->kern_qp.hw_rq_db);

	qp->kern_qp.rwr_tbl[qp->kern_qp.rq_pi & (qp->attrs.rq_size - 1)] =
		recv_wr->wr_id;
	qp->kern_qp.rq_pi++;

	return 0;
}
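
/*
 * Post a chain of receive work requests under qp->lock; posting stops
 * at the first failure, reported through *bad_recv_wr.
 */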
int erdma_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *recv_wr,
		    const struct ib_recv_wr **bad_recv_wr)
{
	const struct ib_recv_wr *wr = recv_wr;
	struct erdma_qp *qp = to_eqp(ibqp);
	unsigned long flags;
	int ret = 0;

	spin_lock_irqsave(&qp->lock, flags);

	while (wr) {
		ret = erdma_post_recv_one(qp, wr);
		if (ret) {
			*bad_recv_wr = wr;
			break;
		}
		wr = wr->next;
	}

	spin_unlock_irqrestore(&qp->lock, flags);
	return ret;
}