1 // SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
3 /* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
4 /* Copyright (c) 2008-2019, IBM Corporation */
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/net.h>
#include <linux/scatterlist.h>
#include <linux/highmem.h>
#include <net/tcp.h>

#include <rdma/iw_cm.h>
#include <rdma/ib_verbs.h>
#include <rdma/ib_user_verbs.h>

#include "siw.h"
#include "siw_verbs.h"
#include "siw_mem.h"
21 #define MAX_HDR_INLINE \
22 (((uint32_t)(sizeof(struct siw_rreq_pkt) - \
23 sizeof(struct iwarp_send))) & 0xF8)
25 static struct page
*siw_get_pblpage(struct siw_mem
*mem
, u64 addr
, int *idx
)
27 struct siw_pbl
*pbl
= mem
->pbl
;
28 u64 offset
= addr
- mem
->va
;
29 dma_addr_t paddr
= siw_pbl_get_buffer(pbl
, offset
, NULL
, idx
);
32 return virt_to_page((void *)paddr
);
38 * Copy short payload at provided destination payload address
40 static int siw_try_1seg(struct siw_iwarp_tx
*c_tx
, void *paddr
)
42 struct siw_wqe
*wqe
= &c_tx
->wqe_active
;
43 struct siw_sge
*sge
= &wqe
->sqe
.sge
[0];
44 u32 bytes
= sge
->length
;
46 if (bytes
> MAX_HDR_INLINE
|| wqe
->sqe
.num_sge
!= 1)
47 return MAX_HDR_INLINE
+ 1;
52 if (tx_flags(wqe
) & SIW_WQE_INLINE
) {
53 memcpy(paddr
, &wqe
->sqe
.sge
[1], bytes
);
55 struct siw_mem
*mem
= wqe
->mem
[0];
58 /* Kernel client using kva */
60 (const void *)(uintptr_t)sge
->laddr
, bytes
);
61 } else if (c_tx
->in_syscall
) {
62 if (copy_from_user(paddr
, u64_to_user_ptr(sge
->laddr
),
66 unsigned int off
= sge
->laddr
& ~PAGE_MASK
;
72 p
= siw_get_upage(mem
->umem
, sge
->laddr
);
74 p
= siw_get_pblpage(mem
, sge
->laddr
, &pbl_idx
);
79 buffer
= kmap_local_page(p
);
81 if (likely(PAGE_SIZE
- off
>= bytes
)) {
82 memcpy(paddr
, buffer
+ off
, bytes
);
84 unsigned long part
= bytes
- (PAGE_SIZE
- off
);
86 memcpy(paddr
, buffer
+ off
, part
);
90 p
= siw_get_upage(mem
->umem
,
93 p
= siw_get_pblpage(mem
,
99 buffer
= kmap_local_page(p
);
100 memcpy(paddr
+ part
, buffer
, bytes
- part
);
102 kunmap_local(buffer
);
108 #define PKT_FRAGMENTED 1
109 #define PKT_COMPLETE 0
112 * siw_qp_prepare_tx()
114 * Prepare tx state for sending out one fpdu. Builds complete pkt
115 * if no user data or only immediate data are present.
117 * returns PKT_COMPLETE if complete pkt built, PKT_FRAGMENTED otherwise.
119 static int siw_qp_prepare_tx(struct siw_iwarp_tx
*c_tx
)
121 struct siw_wqe
*wqe
= &c_tx
->wqe_active
;
125 switch (tx_type(wqe
)) {
127 case SIW_OP_READ_LOCAL_INV
:
128 memcpy(&c_tx
->pkt
.ctrl
,
129 &iwarp_pktinfo
[RDMAP_RDMA_READ_REQ
].ctrl
,
130 sizeof(struct iwarp_ctrl
));
132 c_tx
->pkt
.rreq
.rsvd
= 0;
133 c_tx
->pkt
.rreq
.ddp_qn
= htonl(RDMAP_UNTAGGED_QN_RDMA_READ
);
134 c_tx
->pkt
.rreq
.ddp_msn
=
135 htonl(++c_tx
->ddp_msn
[RDMAP_UNTAGGED_QN_RDMA_READ
]);
136 c_tx
->pkt
.rreq
.ddp_mo
= 0;
137 c_tx
->pkt
.rreq
.sink_stag
= htonl(wqe
->sqe
.sge
[0].lkey
);
138 c_tx
->pkt
.rreq
.sink_to
=
139 cpu_to_be64(wqe
->sqe
.sge
[0].laddr
);
140 c_tx
->pkt
.rreq
.source_stag
= htonl(wqe
->sqe
.rkey
);
141 c_tx
->pkt
.rreq
.source_to
= cpu_to_be64(wqe
->sqe
.raddr
);
142 c_tx
->pkt
.rreq
.read_size
= htonl(wqe
->sqe
.sge
[0].length
);
144 c_tx
->ctrl_len
= sizeof(struct iwarp_rdma_rreq
);
145 crc
= (char *)&c_tx
->pkt
.rreq_pkt
.crc
;
149 if (tx_flags(wqe
) & SIW_WQE_SOLICITED
)
150 memcpy(&c_tx
->pkt
.ctrl
,
151 &iwarp_pktinfo
[RDMAP_SEND_SE
].ctrl
,
152 sizeof(struct iwarp_ctrl
));
154 memcpy(&c_tx
->pkt
.ctrl
, &iwarp_pktinfo
[RDMAP_SEND
].ctrl
,
155 sizeof(struct iwarp_ctrl
));
157 c_tx
->pkt
.send
.ddp_qn
= RDMAP_UNTAGGED_QN_SEND
;
158 c_tx
->pkt
.send
.ddp_msn
=
159 htonl(++c_tx
->ddp_msn
[RDMAP_UNTAGGED_QN_SEND
]);
160 c_tx
->pkt
.send
.ddp_mo
= 0;
162 c_tx
->pkt
.send_inv
.inval_stag
= 0;
164 c_tx
->ctrl_len
= sizeof(struct iwarp_send
);
166 crc
= (char *)&c_tx
->pkt
.send_pkt
.crc
;
167 data
= siw_try_1seg(c_tx
, crc
);
170 case SIW_OP_SEND_REMOTE_INV
:
171 if (tx_flags(wqe
) & SIW_WQE_SOLICITED
)
172 memcpy(&c_tx
->pkt
.ctrl
,
173 &iwarp_pktinfo
[RDMAP_SEND_SE_INVAL
].ctrl
,
174 sizeof(struct iwarp_ctrl
));
176 memcpy(&c_tx
->pkt
.ctrl
,
177 &iwarp_pktinfo
[RDMAP_SEND_INVAL
].ctrl
,
178 sizeof(struct iwarp_ctrl
));
180 c_tx
->pkt
.send
.ddp_qn
= RDMAP_UNTAGGED_QN_SEND
;
181 c_tx
->pkt
.send
.ddp_msn
=
182 htonl(++c_tx
->ddp_msn
[RDMAP_UNTAGGED_QN_SEND
]);
183 c_tx
->pkt
.send
.ddp_mo
= 0;
185 c_tx
->pkt
.send_inv
.inval_stag
= cpu_to_be32(wqe
->sqe
.rkey
);
187 c_tx
->ctrl_len
= sizeof(struct iwarp_send_inv
);
189 crc
= (char *)&c_tx
->pkt
.send_pkt
.crc
;
190 data
= siw_try_1seg(c_tx
, crc
);
194 memcpy(&c_tx
->pkt
.ctrl
, &iwarp_pktinfo
[RDMAP_RDMA_WRITE
].ctrl
,
195 sizeof(struct iwarp_ctrl
));
197 c_tx
->pkt
.rwrite
.sink_stag
= htonl(wqe
->sqe
.rkey
);
198 c_tx
->pkt
.rwrite
.sink_to
= cpu_to_be64(wqe
->sqe
.raddr
);
199 c_tx
->ctrl_len
= sizeof(struct iwarp_rdma_write
);
201 crc
= (char *)&c_tx
->pkt
.write_pkt
.crc
;
202 data
= siw_try_1seg(c_tx
, crc
);
205 case SIW_OP_READ_RESPONSE
:
206 memcpy(&c_tx
->pkt
.ctrl
,
207 &iwarp_pktinfo
[RDMAP_RDMA_READ_RESP
].ctrl
,
208 sizeof(struct iwarp_ctrl
));
211 c_tx
->pkt
.rresp
.sink_stag
= cpu_to_be32(wqe
->sqe
.rkey
);
212 c_tx
->pkt
.rresp
.sink_to
= cpu_to_be64(wqe
->sqe
.raddr
);
214 c_tx
->ctrl_len
= sizeof(struct iwarp_rdma_rresp
);
216 crc
= (char *)&c_tx
->pkt
.write_pkt
.crc
;
217 data
= siw_try_1seg(c_tx
, crc
);
221 siw_dbg_qp(tx_qp(c_tx
), "stale wqe type %d\n", tx_type(wqe
));
224 if (unlikely(data
< 0))
229 if (data
<= MAX_HDR_INLINE
) {
231 wqe
->processed
= data
;
233 c_tx
->pkt
.ctrl
.mpa_len
=
234 htons(c_tx
->ctrl_len
+ data
- MPA_HDR_SIZE
);
236 /* Add pad, if needed */
237 data
+= -(int)data
& 0x3;
238 /* advance CRC location after payload */
240 c_tx
->ctrl_len
+= data
;
242 if (!(c_tx
->pkt
.ctrl
.ddp_rdmap_ctrl
& DDP_FLAG_TAGGED
))
243 c_tx
->pkt
.c_untagged
.ddp_mo
= 0;
245 c_tx
->pkt
.c_tagged
.ddp_to
=
246 cpu_to_be64(wqe
->sqe
.raddr
);
251 * Do complete CRC if enabled and short packet
253 if (c_tx
->mpa_crc_hd
) {
254 crypto_shash_init(c_tx
->mpa_crc_hd
);
255 if (crypto_shash_update(c_tx
->mpa_crc_hd
,
259 crypto_shash_final(c_tx
->mpa_crc_hd
, (u8
*)crc
);
261 c_tx
->ctrl_len
+= MPA_CRC_SIZE
;
265 c_tx
->ctrl_len
+= MPA_CRC_SIZE
;
271 * Allow direct sending out of user buffer if WR is non signalled
272 * and payload is over threshold.
273 * Per RDMA verbs, the application should not change the send buffer
274 * until the work completed. In iWarp, work completion is only
275 * local delivery to TCP. TCP may reuse the buffer for
276 * retransmission. Changing unsent data also breaks the CRC,
279 if (c_tx
->zcopy_tx
&& wqe
->bytes
>= SENDPAGE_THRESH
&&
280 !(tx_flags(wqe
) & SIW_WQE_SIGNALLED
))
281 c_tx
->use_sendpage
= 1;
283 c_tx
->use_sendpage
= 0;
285 return PKT_FRAGMENTED
;
289 * Send out one complete control type FPDU, or header of FPDU carrying
290 * data. Used for fixed sized packets like Read.Requests or zero length
291 * SENDs, WRITEs, READ.Responses, or header only.
293 static int siw_tx_ctrl(struct siw_iwarp_tx
*c_tx
, struct socket
*s
,
296 struct msghdr msg
= { .msg_flags
= flags
};
297 struct kvec iov
= { .iov_base
=
298 (char *)&c_tx
->pkt
.ctrl
+ c_tx
->ctrl_sent
,
299 .iov_len
= c_tx
->ctrl_len
- c_tx
->ctrl_sent
};
301 int rv
= kernel_sendmsg(s
, &msg
, &iov
, 1,
302 c_tx
->ctrl_len
- c_tx
->ctrl_sent
);
305 c_tx
->ctrl_sent
+= rv
;
307 if (c_tx
->ctrl_sent
== c_tx
->ctrl_len
)
316 * 0copy TCP transmit interface: Use do_tcp_sendpages.
318 * Using sendpage to push page by page appears to be less efficient
319 * than using sendmsg, even if data are copied.
321 * A general performance limitation might be the extra four bytes
322 * trailer checksum segment to be pushed after user data.
324 static int siw_tcp_sendpages(struct socket
*s
, struct page
**page
, int offset
,
327 struct sock
*sk
= s
->sk
;
328 int i
= 0, rv
= 0, sent
= 0,
329 flags
= MSG_MORE
| MSG_DONTWAIT
| MSG_SENDPAGE_NOTLAST
;
332 size_t bytes
= min_t(size_t, PAGE_SIZE
- offset
, size
);
334 if (size
+ offset
<= PAGE_SIZE
)
335 flags
= MSG_MORE
| MSG_DONTWAIT
;
337 tcp_rate_check_app_limited(sk
);
340 rv
= do_tcp_sendpages(sk
, page
[i
], offset
, bytes
, flags
);
353 if (rv
== -EAGAIN
|| rv
== 0)
365 * Pushes list of pages to TCP socket. If pages from multiple
366 * SGE's, all referenced pages of each SGE are pushed in one
369 static int siw_0copy_tx(struct socket
*s
, struct page
**page
,
370 struct siw_sge
*sge
, unsigned int offset
,
373 int i
= 0, sent
= 0, rv
;
374 int sge_bytes
= min(sge
->length
- offset
, size
);
376 offset
= (sge
->laddr
+ offset
) & ~PAGE_MASK
;
378 while (sent
!= size
) {
379 rv
= siw_tcp_sendpages(s
, &page
[i
], offset
, sge_bytes
);
382 if (size
== sent
|| sge_bytes
> rv
)
385 i
+= PAGE_ALIGN(sge_bytes
+ offset
) >> PAGE_SHIFT
;
387 sge_bytes
= min(sge
->length
, size
- sent
);
388 offset
= sge
->laddr
& ~PAGE_MASK
;
397 #define MAX_TRAILER (MPA_CRC_SIZE + 4)
399 static void siw_unmap_pages(struct kvec
*iov
, unsigned long kmap_mask
, int len
)
404 * Work backwards through the array to honor the kmap_local_page()
405 * ordering requirements.
407 for (i
= (len
-1); i
>= 0; i
--) {
408 if (kmap_mask
& BIT(i
)) {
409 unsigned long addr
= (unsigned long)iov
[i
].iov_base
;
411 kunmap_local((void *)(addr
& PAGE_MASK
));
417 * siw_tx_hdt() tries to push a complete packet to TCP where all
418 * packet fragments are referenced by the elements of one iovec.
419 * For the data portion, each involved page must be referenced by
420 * one extra element. All sge's data can be non-aligned to page
421 * boundaries. Two more elements are referencing iWARP header
423 * MAX_ARRAY = 64KB/PAGE_SIZE + 1 + (2 * (SIW_MAX_SGE - 1) + HDR + TRL
425 #define MAX_ARRAY ((0xffff / PAGE_SIZE) + 1 + (2 * (SIW_MAX_SGE - 1) + 2))
428 * Write out iov referencing hdr, data and trailer of current FPDU.
429 * Update transmit state dependent on write return status
431 static int siw_tx_hdt(struct siw_iwarp_tx
*c_tx
, struct socket
*s
)
433 struct siw_wqe
*wqe
= &c_tx
->wqe_active
;
434 struct siw_sge
*sge
= &wqe
->sqe
.sge
[c_tx
->sge_idx
];
435 struct kvec iov
[MAX_ARRAY
];
436 struct page
*page_array
[MAX_ARRAY
];
437 struct msghdr msg
= { .msg_flags
= MSG_DONTWAIT
| MSG_EOR
};
439 int seg
= 0, do_crc
= c_tx
->do_crc
, is_kva
= 0, rv
;
440 unsigned int data_len
= c_tx
->bytes_unsent
, hdr_len
= 0, trl_len
= 0,
441 sge_off
= c_tx
->sge_off
, sge_idx
= c_tx
->sge_idx
,
442 pbl_idx
= c_tx
->pbl_idx
;
443 unsigned long kmap_mask
= 0L;
445 if (c_tx
->state
== SIW_SEND_HDR
) {
446 if (c_tx
->use_sendpage
) {
447 rv
= siw_tx_ctrl(c_tx
, s
, MSG_DONTWAIT
| MSG_MORE
);
451 c_tx
->state
= SIW_SEND_DATA
;
454 (char *)&c_tx
->pkt
.ctrl
+ c_tx
->ctrl_sent
;
455 iov
[0].iov_len
= hdr_len
=
456 c_tx
->ctrl_len
- c_tx
->ctrl_sent
;
461 wqe
->processed
+= data_len
;
463 while (data_len
) { /* walk the list of SGE's */
464 unsigned int sge_len
= min(sge
->length
- sge_off
, data_len
);
465 unsigned int fp_off
= (sge
->laddr
+ sge_off
) & ~PAGE_MASK
;
468 if (!(tx_flags(wqe
) & SIW_WQE_INLINE
)) {
469 mem
= wqe
->mem
[sge_idx
];
470 is_kva
= mem
->mem_obj
== NULL
? 1 : 0;
474 if (is_kva
&& !c_tx
->use_sendpage
) {
476 * tx from kernel virtual address: either inline data
477 * or memory region with assigned kernel buffer
480 (void *)(uintptr_t)(sge
->laddr
+ sge_off
);
481 iov
[seg
].iov_len
= sge_len
;
484 crypto_shash_update(c_tx
->mpa_crc_hd
,
494 size_t plen
= min((int)PAGE_SIZE
- fp_off
, sge_len
);
502 mem
, sge
->laddr
+ sge_off
,
505 p
= siw_get_upage(mem
->umem
,
506 sge
->laddr
+ sge_off
);
508 siw_unmap_pages(iov
, kmap_mask
, seg
);
509 wqe
->processed
-= c_tx
->bytes_unsent
;
515 if (!c_tx
->use_sendpage
) {
516 void *kaddr
= kmap_local_page(p
);
518 /* Remember for later kunmap() */
519 kmap_mask
|= BIT(seg
);
520 iov
[seg
].iov_base
= kaddr
+ fp_off
;
521 iov
[seg
].iov_len
= plen
;
529 kaddr
= kmap_local_page(p
);
530 crypto_shash_update(c_tx
->mpa_crc_hd
,
537 * Cast to an uintptr_t to preserve all 64 bits
540 uintptr_t va
= (uintptr_t)(sge
->laddr
+ sge_off
);
543 * virt_to_page() takes a (void *) pointer
544 * so cast to a (void *) meaning it will be 64
545 * bits on a 64 bit platform and 32 bits on a
548 page_array
[seg
] = virt_to_page((void *)(va
& PAGE_MASK
));
561 if (++seg
> (int)MAX_ARRAY
) {
562 siw_dbg_qp(tx_qp(c_tx
), "to many fragments\n");
563 siw_unmap_pages(iov
, kmap_mask
, seg
-1);
564 wqe
->processed
-= c_tx
->bytes_unsent
;
570 /* Update SGE variables at end of SGE */
571 if (sge_off
== sge
->length
&&
572 (data_len
!= 0 || wqe
->processed
< wqe
->bytes
)) {
579 if (likely(c_tx
->state
!= SIW_SEND_TRAILER
)) {
580 iov
[seg
].iov_base
= &c_tx
->trailer
.pad
[4 - c_tx
->pad
];
581 iov
[seg
].iov_len
= trl_len
= MAX_TRAILER
- (4 - c_tx
->pad
);
583 iov
[seg
].iov_base
= &c_tx
->trailer
.pad
[c_tx
->ctrl_sent
];
584 iov
[seg
].iov_len
= trl_len
= MAX_TRAILER
- c_tx
->ctrl_sent
;
588 *(u32
*)c_tx
->trailer
.pad
= 0;
590 crypto_shash_update(c_tx
->mpa_crc_hd
,
591 (u8
*)&c_tx
->trailer
.crc
- c_tx
->pad
,
594 if (!c_tx
->mpa_crc_hd
)
595 c_tx
->trailer
.crc
= 0;
597 crypto_shash_final(c_tx
->mpa_crc_hd
, (u8
*)&c_tx
->trailer
.crc
);
599 data_len
= c_tx
->bytes_unsent
;
601 if (c_tx
->use_sendpage
) {
602 rv
= siw_0copy_tx(s
, page_array
, &wqe
->sqe
.sge
[c_tx
->sge_idx
],
603 c_tx
->sge_off
, data_len
);
604 if (rv
== data_len
) {
605 rv
= kernel_sendmsg(s
, &msg
, &iov
[seg
], 1, trl_len
);
612 rv
= kernel_sendmsg(s
, &msg
, iov
, seg
+ 1,
613 hdr_len
+ data_len
+ trl_len
);
614 siw_unmap_pages(iov
, kmap_mask
, seg
);
616 if (rv
< (int)hdr_len
) {
617 /* Not even complete hdr pushed or negative rv */
618 wqe
->processed
-= data_len
;
620 c_tx
->ctrl_sent
+= rv
;
627 if (rv
>= (int)data_len
) {
628 /* all user data pushed to TCP or no data to push */
629 if (data_len
> 0 && wqe
->processed
< wqe
->bytes
) {
630 /* Save the current state for next tx */
631 c_tx
->sge_idx
= sge_idx
;
632 c_tx
->sge_off
= sge_off
;
633 c_tx
->pbl_idx
= pbl_idx
;
637 if (rv
== trl_len
) /* all pushed */
640 c_tx
->state
= SIW_SEND_TRAILER
;
641 c_tx
->ctrl_len
= MAX_TRAILER
;
642 c_tx
->ctrl_sent
= rv
+ 4 - c_tx
->pad
;
643 c_tx
->bytes_unsent
= 0;
647 } else if (data_len
> 0) {
648 /* Maybe some user data pushed to TCP */
649 c_tx
->state
= SIW_SEND_DATA
;
650 wqe
->processed
-= data_len
- rv
;
654 * Some bytes out. Recompute tx state based
655 * on old state and bytes pushed
657 unsigned int sge_unsent
;
659 c_tx
->bytes_unsent
-= rv
;
660 sge
= &wqe
->sqe
.sge
[c_tx
->sge_idx
];
661 sge_unsent
= sge
->length
- c_tx
->sge_off
;
663 while (sge_unsent
<= rv
) {
668 sge_unsent
= sge
->length
;
680 static void siw_update_tcpseg(struct siw_iwarp_tx
*c_tx
,
683 struct tcp_sock
*tp
= tcp_sk(s
->sk
);
686 if (c_tx
->gso_seg_limit
== 0)
687 c_tx
->tcp_seglen
= tp
->mss_cache
* tp
->gso_segs
;
691 min_t(u16
, c_tx
->gso_seg_limit
, tp
->gso_segs
);
693 c_tx
->tcp_seglen
= tp
->mss_cache
;
695 /* Loopback may give odd numbers */
696 c_tx
->tcp_seglen
&= 0xfffffff8;
702 * Prepares transmit context to send out one FPDU if FPDU will contain
703 * user data and user data are not immediate data.
704 * Computes maximum FPDU length to fill up TCP MSS if possible.
706 * @qp: QP from which to transmit
707 * @wqe: Current WQE causing transmission
709 * TODO: Take into account real available sendspace on socket
710 * to avoid header misalignment due to send pausing within
713 static void siw_prepare_fpdu(struct siw_qp
*qp
, struct siw_wqe
*wqe
)
715 struct siw_iwarp_tx
*c_tx
= &qp
->tx_ctx
;
719 iwarp_pktinfo
[__rdmap_get_opcode(&c_tx
->pkt
.ctrl
)].hdr_len
;
723 * Update target buffer offset if any
725 if (!(c_tx
->pkt
.ctrl
.ddp_rdmap_ctrl
& DDP_FLAG_TAGGED
))
726 /* Untagged message */
727 c_tx
->pkt
.c_untagged
.ddp_mo
= cpu_to_be32(wqe
->processed
);
728 else /* Tagged message */
729 c_tx
->pkt
.c_tagged
.ddp_to
=
730 cpu_to_be64(wqe
->sqe
.raddr
+ wqe
->processed
);
732 data_len
= wqe
->bytes
- wqe
->processed
;
733 if (data_len
+ c_tx
->ctrl_len
+ MPA_CRC_SIZE
> c_tx
->tcp_seglen
) {
734 /* Trim DDP payload to fit into current TCP segment */
735 data_len
= c_tx
->tcp_seglen
- (c_tx
->ctrl_len
+ MPA_CRC_SIZE
);
736 c_tx
->pkt
.ctrl
.ddp_rdmap_ctrl
&= ~DDP_FLAG_LAST
;
739 c_tx
->pkt
.ctrl
.ddp_rdmap_ctrl
|= DDP_FLAG_LAST
;
740 c_tx
->pad
= -data_len
& 0x3;
742 c_tx
->bytes_unsent
= data_len
;
744 c_tx
->pkt
.ctrl
.mpa_len
=
745 htons(c_tx
->ctrl_len
+ data_len
- MPA_HDR_SIZE
);
748 * Init MPA CRC computation
750 if (c_tx
->mpa_crc_hd
) {
751 crypto_shash_init(c_tx
->mpa_crc_hd
);
752 crypto_shash_update(c_tx
->mpa_crc_hd
, (u8
*)&c_tx
->pkt
,
761 * Check permissions for a list of SGE's (SGL).
762 * A successful check will have all memory referenced
763 * for transmission resolved and assigned to the WQE.
765 * @pd: Protection Domain SGL should belong to
766 * @wqe: WQE to be checked
767 * @perms: requested access permissions
771 static int siw_check_sgl_tx(struct ib_pd
*pd
, struct siw_wqe
*wqe
,
772 enum ib_access_flags perms
)
774 struct siw_sge
*sge
= &wqe
->sqe
.sge
[0];
775 int i
, len
, num_sge
= wqe
->sqe
.num_sge
;
777 if (unlikely(num_sge
> SIW_MAX_SGE
))
780 for (i
= 0, len
= 0; num_sge
; num_sge
--, i
++, sge
++) {
782 * rdma verbs: do not check stag for a zero length sge
785 int rv
= siw_check_sge(pd
, sge
, &wqe
->mem
[i
], perms
, 0,
788 if (unlikely(rv
!= E_ACCESS_OK
))
797 * siw_qp_sq_proc_tx()
799 * Process one WQE which needs transmission on the wire.
801 static int siw_qp_sq_proc_tx(struct siw_qp
*qp
, struct siw_wqe
*wqe
)
803 struct siw_iwarp_tx
*c_tx
= &qp
->tx_ctx
;
804 struct socket
*s
= qp
->attrs
.sk
;
805 int rv
= 0, burst_len
= qp
->tx_ctx
.burst
;
806 enum rdmap_ecode ecode
= RDMAP_ECODE_CATASTROPHIC_STREAM
;
808 if (unlikely(wqe
->wr_status
== SIW_WR_IDLE
))
812 burst_len
= SQ_USER_MAXBURST
;
814 if (wqe
->wr_status
== SIW_WR_QUEUED
) {
815 if (!(wqe
->sqe
.flags
& SIW_WQE_INLINE
)) {
816 if (tx_type(wqe
) == SIW_OP_READ_RESPONSE
)
817 wqe
->sqe
.num_sge
= 1;
819 if (tx_type(wqe
) != SIW_OP_READ
&&
820 tx_type(wqe
) != SIW_OP_READ_LOCAL_INV
) {
822 * Reference memory to be tx'd w/o checking
823 * access for LOCAL_READ permission, since
824 * not defined in RDMA core.
826 rv
= siw_check_sgl_tx(qp
->pd
, wqe
, 0);
829 SIW_OP_READ_RESPONSE
)
830 ecode
= siw_rdmap_error(-rv
);
839 wqe
->bytes
= wqe
->sqe
.sge
[0].length
;
840 if (!rdma_is_kernel_res(&qp
->base_qp
.res
)) {
841 if (wqe
->bytes
> SIW_MAX_INLINE
) {
845 wqe
->sqe
.sge
[0].laddr
=
846 (u64
)(uintptr_t)&wqe
->sqe
.sge
[1];
849 wqe
->wr_status
= SIW_WR_INPROGRESS
;
852 siw_update_tcpseg(c_tx
, s
);
854 rv
= siw_qp_prepare_tx(c_tx
);
855 if (rv
== PKT_FRAGMENTED
) {
856 c_tx
->state
= SIW_SEND_HDR
;
857 siw_prepare_fpdu(qp
, wqe
);
858 } else if (rv
== PKT_COMPLETE
) {
859 c_tx
->state
= SIW_SEND_SHORT_FPDU
;
866 siw_dbg_qp(qp
, "wr type %d, state %d, data %u, sent %u, id %llx\n",
867 tx_type(wqe
), wqe
->wr_status
, wqe
->bytes
, wqe
->processed
,
870 if (--burst_len
== 0) {
874 if (c_tx
->state
== SIW_SEND_SHORT_FPDU
) {
875 enum siw_opcode tx_type
= tx_type(wqe
);
876 unsigned int msg_flags
;
878 if (siw_sq_empty(qp
) || !siw_tcp_nagle
|| burst_len
== 1)
880 * End current TCP segment, if SQ runs empty,
881 * or siw_tcp_nagle is not set, or we bail out
882 * soon due to no burst credit left.
884 msg_flags
= MSG_DONTWAIT
;
886 msg_flags
= MSG_DONTWAIT
| MSG_MORE
;
888 rv
= siw_tx_ctrl(c_tx
, s
, msg_flags
);
890 if (!rv
&& tx_type
!= SIW_OP_READ
&&
891 tx_type
!= SIW_OP_READ_LOCAL_INV
)
892 wqe
->processed
= wqe
->bytes
;
897 rv
= siw_tx_hdt(c_tx
, s
);
901 * One segment sent. Processing completed if last
902 * segment, Do next segment otherwise.
904 if (unlikely(c_tx
->tx_suspend
)) {
906 * Verbs, 6.4.: Try stopping sending after a full
907 * DDP segment if the connection goes down
908 * (== peer halfclose)
913 if (c_tx
->pkt
.ctrl
.ddp_rdmap_ctrl
& DDP_FLAG_LAST
) {
914 siw_dbg_qp(qp
, "WQE completed\n");
917 c_tx
->state
= SIW_SEND_HDR
;
919 siw_update_tcpseg(c_tx
, s
);
921 siw_prepare_fpdu(qp
, wqe
);
925 qp
->tx_ctx
.burst
= burst_len
;
929 if (ecode
!= RDMAP_ECODE_CATASTROPHIC_STREAM
)
930 siw_init_terminate(qp
, TERM_ERROR_LAYER_RDMAP
,
931 RDMAP_ETYPE_REMOTE_PROTECTION
, ecode
, 1);
933 siw_init_terminate(qp
, TERM_ERROR_LAYER_RDMAP
,
934 RDMAP_ETYPE_CATASTROPHIC
,
935 RDMAP_ECODE_UNSPECIFIED
, 1);
939 static int siw_fastreg_mr(struct ib_pd
*pd
, struct siw_sqe
*sqe
)
941 struct ib_mr
*base_mr
= (struct ib_mr
*)(uintptr_t)sqe
->base_mr
;
942 struct siw_device
*sdev
= to_siw_dev(pd
->device
);
946 siw_dbg_pd(pd
, "STag 0x%08x\n", sqe
->rkey
);
948 if (unlikely(!base_mr
)) {
949 pr_warn("siw: fastreg: STag 0x%08x unknown\n", sqe
->rkey
);
953 if (unlikely(base_mr
->rkey
>> 8 != sqe
->rkey
>> 8)) {
954 pr_warn("siw: fastreg: STag 0x%08x: bad MR\n", sqe
->rkey
);
958 mem
= siw_mem_id2obj(sdev
, sqe
->rkey
>> 8);
959 if (unlikely(!mem
)) {
960 pr_warn("siw: fastreg: STag 0x%08x unknown\n", sqe
->rkey
);
964 if (unlikely(mem
->pd
!= pd
)) {
965 pr_warn("siw: fastreg: PD mismatch\n");
969 if (unlikely(mem
->stag_valid
)) {
970 pr_warn("siw: fastreg: STag 0x%08x already valid\n", sqe
->rkey
);
974 /* Refresh STag since user may have changed key part */
975 mem
->stag
= sqe
->rkey
;
976 mem
->perms
= sqe
->access
;
978 siw_dbg_mem(mem
, "STag 0x%08x now valid\n", sqe
->rkey
);
979 mem
->va
= base_mr
->iova
;
986 static int siw_qp_sq_proc_local(struct siw_qp
*qp
, struct siw_wqe
*wqe
)
990 switch (tx_type(wqe
)) {
992 rv
= siw_fastreg_mr(qp
->pd
, &wqe
->sqe
);
995 case SIW_OP_INVAL_STAG
:
996 rv
= siw_invalidate_stag(qp
->pd
, wqe
->sqe
.rkey
);
1006 * siw_qp_sq_process()
1008 * Core TX path routine for RDMAP/DDP/MPA using a TCP kernel socket.
1009 * Sends RDMAP payload for the current SQ WR @wqe of @qp in one or more
1010 * MPA FPDUs, each containing a DDP segment.
1012 * SQ processing may occur in user context as a result of posting
1013 * new WQE's or from siw_sq_work_handler() context. Processing in
1014 * user context is limited to non-kernel verbs users.
1016 * SQ processing may get paused anytime, possibly in the middle of a WR
1017 * or FPDU, if insufficient send space is available. SQ processing
1018 * gets resumed from siw_sq_work_handler(), if send space becomes
1021 * Must be called with the QP state read-locked.
1024 * An outbound RREQ can be satisfied by the corresponding RRESP
1025 * _before_ it gets assigned to the ORQ. This happens regularly
1026 * in RDMA READ via loopback case. Since both outbound RREQ and
1027 * inbound RRESP can be handled by the same CPU, locking the ORQ
1028 * is dead-lock prone and thus not an option. With that, the
1029 * RREQ gets assigned to the ORQ _before_ being sent - see
1030 * siw_activate_tx() - and pulled back in case of send failure.
1032 int siw_qp_sq_process(struct siw_qp
*qp
)
1034 struct siw_wqe
*wqe
= tx_wqe(qp
);
1035 enum siw_opcode tx_type
;
1036 unsigned long flags
;
1039 siw_dbg_qp(qp
, "enter for type %d\n", tx_type(wqe
));
1043 * Stop QP processing if SQ state changed
1045 if (unlikely(qp
->tx_ctx
.tx_suspend
)) {
1046 siw_dbg_qp(qp
, "tx suspended\n");
1049 tx_type
= tx_type(wqe
);
1051 if (tx_type
<= SIW_OP_READ_RESPONSE
)
1052 rv
= siw_qp_sq_proc_tx(qp
, wqe
);
1054 rv
= siw_qp_sq_proc_local(qp
, wqe
);
1058 * WQE processing done
1062 case SIW_OP_SEND_REMOTE_INV
:
1064 siw_wqe_put_mem(wqe
, tx_type
);
1067 case SIW_OP_INVAL_STAG
:
1069 if (tx_flags(wqe
) & SIW_WQE_SIGNALLED
)
1070 siw_sqe_complete(qp
, &wqe
->sqe
, wqe
->bytes
,
1075 case SIW_OP_READ_LOCAL_INV
:
1077 * already enqueued to ORQ queue
1081 case SIW_OP_READ_RESPONSE
:
1082 siw_wqe_put_mem(wqe
, tx_type
);
1086 WARN(1, "undefined WQE type %d\n", tx_type
);
1091 spin_lock_irqsave(&qp
->sq_lock
, flags
);
1092 wqe
->wr_status
= SIW_WR_IDLE
;
1093 rv
= siw_activate_tx(qp
);
1094 spin_unlock_irqrestore(&qp
->sq_lock
, flags
);
1101 } else if (rv
== -EAGAIN
) {
1102 siw_dbg_qp(qp
, "sq paused: hd/tr %d of %d, data %d\n",
1103 qp
->tx_ctx
.ctrl_sent
, qp
->tx_ctx
.ctrl_len
,
1104 qp
->tx_ctx
.bytes_unsent
);
1107 } else if (rv
== -EINPROGRESS
) {
1108 rv
= siw_sq_start(qp
);
1112 * WQE processing failed.
1114 * o It turns any WQE into a signalled WQE.
1115 * o Local catastrophic error must be surfaced
1116 * o QP must be moved into Terminate state: done by code
1117 * doing socket state change processing
1119 * o TODO: Termination message must be sent.
1120 * o TODO: Implement more precise work completion errors,
1121 * see enum ib_wc_status in ib_verbs.h
1123 siw_dbg_qp(qp
, "wqe type %d processing failed: %d\n",
1126 spin_lock_irqsave(&qp
->sq_lock
, flags
);
1128 * RREQ may have already been completed by inbound RRESP!
1130 if ((tx_type
== SIW_OP_READ
||
1131 tx_type
== SIW_OP_READ_LOCAL_INV
) && qp
->attrs
.orq_size
) {
1132 /* Cleanup pending entry in ORQ */
1134 qp
->orq
[qp
->orq_put
% qp
->attrs
.orq_size
].flags
= 0;
1136 spin_unlock_irqrestore(&qp
->sq_lock
, flags
);
1138 * immediately suspends further TX processing
1140 if (!qp
->tx_ctx
.tx_suspend
)
1141 siw_qp_cm_drop(qp
, 0);
1145 case SIW_OP_SEND_REMOTE_INV
:
1146 case SIW_OP_SEND_WITH_IMM
:
1149 case SIW_OP_READ_LOCAL_INV
:
1150 siw_wqe_put_mem(wqe
, tx_type
);
1153 case SIW_OP_INVAL_STAG
:
1155 siw_sqe_complete(qp
, &wqe
->sqe
, wqe
->bytes
,
1156 SIW_WC_LOC_QP_OP_ERR
);
1158 siw_qp_event(qp
, IB_EVENT_QP_FATAL
);
1162 case SIW_OP_READ_RESPONSE
:
1163 siw_dbg_qp(qp
, "proc. read.response failed: %d\n", rv
);
1165 siw_qp_event(qp
, IB_EVENT_QP_REQ_ERR
);
1167 siw_wqe_put_mem(wqe
, SIW_OP_READ_RESPONSE
);
1172 WARN(1, "undefined WQE type %d\n", tx_type
);
1175 wqe
->wr_status
= SIW_WR_IDLE
;
1181 static void siw_sq_resume(struct siw_qp
*qp
)
1183 if (down_read_trylock(&qp
->state_lock
)) {
1184 if (likely(qp
->attrs
.state
== SIW_QP_STATE_RTS
&&
1185 !qp
->tx_ctx
.tx_suspend
)) {
1186 int rv
= siw_qp_sq_process(qp
);
1188 up_read(&qp
->state_lock
);
1190 if (unlikely(rv
< 0)) {
1191 siw_dbg_qp(qp
, "SQ task failed: err %d\n", rv
);
1193 if (!qp
->tx_ctx
.tx_suspend
)
1194 siw_qp_cm_drop(qp
, 0);
1197 up_read(&qp
->state_lock
);
1200 siw_dbg_qp(qp
, "Resume SQ while QP locked\n");
1206 struct llist_head active
;
1207 wait_queue_head_t waiting
;
1210 static DEFINE_PER_CPU(struct tx_task_t
, siw_tx_task_g
);
1212 void siw_stop_tx_thread(int nr_cpu
)
1214 kthread_stop(siw_tx_thread
[nr_cpu
]);
1215 wake_up(&per_cpu(siw_tx_task_g
, nr_cpu
).waiting
);
1218 int siw_run_sq(void *data
)
1220 const int nr_cpu
= (unsigned int)(long)data
;
1221 struct llist_node
*active
;
1223 struct tx_task_t
*tx_task
= &per_cpu(siw_tx_task_g
, nr_cpu
);
1225 init_llist_head(&tx_task
->active
);
1226 init_waitqueue_head(&tx_task
->waiting
);
1229 struct llist_node
*fifo_list
= NULL
;
1231 wait_event_interruptible(tx_task
->waiting
,
1232 !llist_empty(&tx_task
->active
) ||
1233 kthread_should_stop());
1235 if (kthread_should_stop())
1238 active
= llist_del_all(&tx_task
->active
);
1240 * llist_del_all returns a list with newest entry first.
1241 * Re-order list for fairness among QP's.
1244 struct llist_node
*tmp
= active
;
1246 active
= llist_next(active
);
1247 tmp
->next
= fifo_list
;
1251 qp
= container_of(fifo_list
, struct siw_qp
, tx_list
);
1252 fifo_list
= llist_next(fifo_list
);
1253 qp
->tx_list
.next
= NULL
;
1258 active
= llist_del_all(&tx_task
->active
);
1260 llist_for_each_entry(qp
, active
, tx_list
) {
1261 qp
->tx_list
.next
= NULL
;
1268 int siw_sq_start(struct siw_qp
*qp
)
1270 if (tx_wqe(qp
)->wr_status
== SIW_WR_IDLE
)
1273 if (unlikely(!cpu_online(qp
->tx_cpu
))) {
1274 siw_put_tx_cpu(qp
->tx_cpu
);
1275 qp
->tx_cpu
= siw_get_tx_cpu(qp
->sdev
);
1276 if (qp
->tx_cpu
< 0) {
1277 pr_warn("siw: no tx cpu available\n");
1284 llist_add(&qp
->tx_list
, &per_cpu(siw_tx_task_g
, qp
->tx_cpu
).active
);
1286 wake_up(&per_cpu(siw_tx_task_g
, qp
->tx_cpu
).waiting
);