// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2016-2018 Oracle. All rights reserved.
 *
 * Use the core R/W API to move RPC-over-RDMA Read and Write chunks.
 */

#include <rdma/rw.h>

#include <linux/sunrpc/rpc_rdma.h>
#include <linux/sunrpc/svc_rdma.h>
#include <linux/sunrpc/debug.h>

#include "xprt_rdma.h"
#include <trace/events/rpcrdma.h>

#define RPCDBG_FACILITY		RPCDBG_SVCXPRT

static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc);
static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc);

/* Each R/W context contains state for one chain of RDMA Read or
 * Write Work Requests.
 *
 * Each WR chain handles a single contiguous server-side buffer,
 * because scatterlist entries after the first have to start on
 * page alignment. xdr_buf iovecs cannot guarantee alignment.
 *
 * Each WR chain handles only one R_key. Each RPC-over-RDMA segment
 * from a client may contain a unique R_key, so each WR chain moves
 * up to one segment at a time.
 *
 * The scatterlist makes this data structure over 4KB in size. To
 * make it less likely to fail, and to handle the allocation for
 * smaller I/O requests without disabling bottom-halves, these
 * contexts are created on demand, but cached and reused until the
 * controlling svcxprt_rdma is destroyed.
 */
struct svc_rdma_rw_ctxt {
	struct list_head	rw_list;
	struct rdma_rw_ctx	rw_ctx;
	int			rw_nents;
	struct sg_table		rw_sg_table;
	struct scatterlist	rw_first_sgl[];
};

static inline struct svc_rdma_rw_ctxt *
svc_rdma_next_ctxt(struct list_head *list)
{
	return list_first_entry_or_null(list, struct svc_rdma_rw_ctxt,
					rw_list);
}

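/* Fetch an unused R/W context from the transport's cache, or allocate
 * a new one when the cache is empty. @sges is the number of scatterlist
 * entries the caller needs; entries beyond the inline rw_first_sgl[]
 * array are chained on by sg_alloc_table_chained().
 */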
static struct svc_rdma_rw_ctxt *
svc_rdma_get_rw_ctxt(struct svcxprt_rdma *rdma, unsigned int sges)
{
	struct svc_rdma_rw_ctxt *ctxt;

	spin_lock(&rdma->sc_rw_ctxt_lock);

	ctxt = svc_rdma_next_ctxt(&rdma->sc_rw_ctxts);
	if (ctxt) {
		list_del(&ctxt->rw_list);
		spin_unlock(&rdma->sc_rw_ctxt_lock);
	} else {
		spin_unlock(&rdma->sc_rw_ctxt_lock);
		ctxt = kmalloc(struct_size(ctxt, rw_first_sgl, SG_CHUNK_SIZE),
			       GFP_KERNEL);
		if (!ctxt)
			goto out;
		INIT_LIST_HEAD(&ctxt->rw_list);
	}

	ctxt->rw_sg_table.sgl = ctxt->rw_first_sgl;
	if (sg_alloc_table_chained(&ctxt->rw_sg_table, sges,
				   ctxt->rw_sg_table.sgl,
				   SG_CHUNK_SIZE)) {
		kfree(ctxt);
		ctxt = NULL;
	}
out:
	return ctxt;
}

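/* Release a context's scatterlist, then return the context to the
 * transport's cache for reuse by a later chunk.
 */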
static void svc_rdma_put_rw_ctxt(struct svcxprt_rdma *rdma,
				 struct svc_rdma_rw_ctxt *ctxt)
{
	sg_free_table_chained(&ctxt->rw_sg_table, SG_CHUNK_SIZE);

	spin_lock(&rdma->sc_rw_ctxt_lock);
	list_add(&ctxt->rw_list, &rdma->sc_rw_ctxts);
	spin_unlock(&rdma->sc_rw_ctxt_lock);
}

/**
 * svc_rdma_destroy_rw_ctxts - Free accumulated R/W contexts
 * @rdma: transport about to be destroyed
 *
 */
void svc_rdma_destroy_rw_ctxts(struct svcxprt_rdma *rdma)
{
	struct svc_rdma_rw_ctxt *ctxt;

	while ((ctxt = svc_rdma_next_ctxt(&rdma->sc_rw_ctxts)) != NULL) {
		list_del(&ctxt->rw_list);
		kfree(ctxt);
	}
}

/* A chunk context tracks all I/O for moving one Read or Write
 * chunk. This is a set of rdma_rw's that handle data movement
 * for all segments of one chunk.
 *
 * These are small, acquired with a single allocator call, and
 * no more than one is needed per chunk. They are allocated on
 * demand, and not cached.
 */
struct svc_rdma_chunk_ctxt {
	struct ib_cqe		cc_cqe;
	struct svcxprt_rdma	*cc_rdma;
	struct list_head	cc_rwctxts;
	int			cc_sqecount;
};

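/* Initialize a chunk context. The transport reference taken here is
 * dropped in svc_rdma_cc_release(), keeping the svcxprt_rdma alive
 * while this context's Work Requests are in flight.
 */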
static void svc_rdma_cc_init(struct svcxprt_rdma *rdma,
			     struct svc_rdma_chunk_ctxt *cc)
{
	cc->cc_rdma = rdma;
	svc_xprt_get(&rdma->sc_xprt);

	INIT_LIST_HEAD(&cc->cc_rwctxts);
	cc->cc_sqecount = 0;
}

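/* Unmap and destroy every rdma_rw context accumulated on @cc, using
 * DMA direction @dir, then drop the transport reference taken in
 * svc_rdma_cc_init().
 */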
static void svc_rdma_cc_release(struct svc_rdma_chunk_ctxt *cc,
				enum dma_data_direction dir)
{
	struct svcxprt_rdma *rdma = cc->cc_rdma;
	struct svc_rdma_rw_ctxt *ctxt;

	while ((ctxt = svc_rdma_next_ctxt(&cc->cc_rwctxts)) != NULL) {
		list_del(&ctxt->rw_list);

		rdma_rw_ctx_destroy(&ctxt->rw_ctx, rdma->sc_qp,
				    rdma->sc_port_num, ctxt->rw_sg_table.sgl,
				    ctxt->rw_nents, dir);
		svc_rdma_put_rw_ctxt(rdma, ctxt);
	}
	svc_xprt_put(&rdma->sc_xprt);
}

/* State for sending a Write or Reply chunk.
 *  - Tracks progress of writing one chunk over all its segments
 *  - Stores arguments for the SGL constructor functions
 */
struct svc_rdma_write_info {
	/* write state of this chunk */
	unsigned int		wi_seg_off;
	unsigned int		wi_seg_no;
	unsigned int		wi_nsegs;
	__be32			*wi_segs;

	/* SGL constructor arguments */
	struct xdr_buf		*wi_xdr;
	unsigned char		*wi_base;
	unsigned int		wi_next_off;

	struct svc_rdma_chunk_ctxt	wi_cc;
};

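/* @chunk points to a Write chunk in the received transport header.
 * On the wire (see RFC 8166), the chunk is one discriminator word
 * followed by a counted array of segments; the two pre-increments
 * below pick up the segment count and then the start of the
 * segment array.
 */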
static struct svc_rdma_write_info *
svc_rdma_write_info_alloc(struct svcxprt_rdma *rdma, __be32 *chunk)
{
	struct svc_rdma_write_info *info;

	info = kmalloc(sizeof(*info), GFP_KERNEL);
	if (!info)
		return info;

	info->wi_seg_off = 0;
	info->wi_seg_no = 0;
	info->wi_nsegs = be32_to_cpup(++chunk);
	info->wi_segs = ++chunk;
	svc_rdma_cc_init(rdma, &info->wi_cc);
	info->wi_cc.cc_cqe.done = svc_rdma_write_done;
	return info;
}

static void svc_rdma_write_info_free(struct svc_rdma_write_info *info)
{
	svc_rdma_cc_release(&info->wi_cc, DMA_TO_DEVICE);
	kfree(info);
}

/**
 * svc_rdma_write_done - Write chunk completion
 * @cq: controlling Completion Queue
 * @wc: Work Completion
 *
 * Pages under I/O are freed by a subsequent Send completion.
 */
static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc)
{
	struct ib_cqe *cqe = wc->wr_cqe;
	struct svc_rdma_chunk_ctxt *cc =
			container_of(cqe, struct svc_rdma_chunk_ctxt, cc_cqe);
	struct svcxprt_rdma *rdma = cc->cc_rdma;
	struct svc_rdma_write_info *info =
			container_of(cc, struct svc_rdma_write_info, wi_cc);

	trace_svcrdma_wc_write(wc);

	atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail);
	wake_up(&rdma->sc_send_wait);

	if (unlikely(wc->status != IB_WC_SUCCESS))
		set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);

	svc_rdma_write_info_free(info);
}

/* State for pulling a Read chunk.
 */
struct svc_rdma_read_info {
	struct svc_rdma_recv_ctxt	*ri_readctxt;
	unsigned int			ri_position;
	unsigned int			ri_pageno;
	unsigned int			ri_pageoff;
	unsigned int			ri_chunklen;

	struct svc_rdma_chunk_ctxt	ri_cc;
};

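/* Allocate state for pulling one Read chunk. Freed by
 * svc_rdma_read_info_free() once the chunk's RDMA Reads complete.
 */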
static struct svc_rdma_read_info *
svc_rdma_read_info_alloc(struct svcxprt_rdma *rdma)
{
	struct svc_rdma_read_info *info;

	info = kmalloc(sizeof(*info), GFP_KERNEL);
	if (!info)
		return info;

	svc_rdma_cc_init(rdma, &info->ri_cc);
	info->ri_cc.cc_cqe.done = svc_rdma_wc_read_done;
	return info;
}

static void svc_rdma_read_info_free(struct svc_rdma_read_info *info)
{
	svc_rdma_cc_release(&info->ri_cc, DMA_FROM_DEVICE);
	kfree(info);
}

/**
 * svc_rdma_wc_read_done - Handle completion of an RDMA Read ctx
 * @cq: controlling Completion Queue
 * @wc: Work Completion
 *
 */
static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc)
{
	struct ib_cqe *cqe = wc->wr_cqe;
	struct svc_rdma_chunk_ctxt *cc =
			container_of(cqe, struct svc_rdma_chunk_ctxt, cc_cqe);
	struct svcxprt_rdma *rdma = cc->cc_rdma;
	struct svc_rdma_read_info *info =
			container_of(cc, struct svc_rdma_read_info, ri_cc);

	trace_svcrdma_wc_read(wc);

	atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail);
	wake_up(&rdma->sc_send_wait);

	if (unlikely(wc->status != IB_WC_SUCCESS)) {
		set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);
		svc_rdma_recv_ctxt_put(rdma, info->ri_readctxt);
	} else {
		spin_lock(&rdma->sc_rq_dto_lock);
		list_add_tail(&info->ri_readctxt->rc_list,
			      &rdma->sc_read_complete_q);
		/* Note the unlock pairs with the smp_rmb in svc_xprt_ready: */
		set_bit(XPT_DATA, &rdma->sc_xprt.xpt_flags);
		spin_unlock(&rdma->sc_rq_dto_lock);

		svc_xprt_enqueue(&rdma->sc_xprt);
	}

	svc_rdma_read_info_free(info);
}

/* This function sleeps when the transport's Send Queue is congested.
 *
 * Assumptions:
 * - If ib_post_send() succeeds, only one completion is expected,
 *   even if one or more WRs are flushed. This is true when posting
 *   an rdma_rw_ctx or when posting a single signaled WR.
 */
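/* Send Queue accounting: sc_sq_avail counts the SQEs still
 * available on the Send Queue. The chunk's SQE count is debited
 * up front; if the debit overdraws the queue, it is rolled back
 * and the caller sleeps on sc_send_wait until completion handlers
 * return enough credits.
 */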
static int svc_rdma_post_chunk_ctxt(struct svc_rdma_chunk_ctxt *cc)
{
	struct svcxprt_rdma *rdma = cc->cc_rdma;
	struct svc_xprt *xprt = &rdma->sc_xprt;
	struct ib_send_wr *first_wr;
	const struct ib_send_wr *bad_wr;
	struct list_head *tmp;
	struct ib_cqe *cqe;
	int ret;

	if (cc->cc_sqecount > rdma->sc_sq_depth)
		return -EINVAL;

	first_wr = NULL;
	cqe = &cc->cc_cqe;
	list_for_each(tmp, &cc->cc_rwctxts) {
		struct svc_rdma_rw_ctxt *ctxt;

		ctxt = list_entry(tmp, struct svc_rdma_rw_ctxt, rw_list);
		first_wr = rdma_rw_ctx_wrs(&ctxt->rw_ctx, rdma->sc_qp,
					   rdma->sc_port_num, cqe, first_wr);
		cqe = NULL;
	}

	do {
		if (atomic_sub_return(cc->cc_sqecount,
				      &rdma->sc_sq_avail) > 0) {
			ret = ib_post_send(rdma->sc_qp, first_wr, &bad_wr);
			if (ret)
				break;
			return 0;
		}

		trace_svcrdma_sq_full(rdma);
		atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail);
		wait_event(rdma->sc_send_wait,
			   atomic_read(&rdma->sc_sq_avail) > cc->cc_sqecount);
		trace_svcrdma_sq_retry(rdma);
	} while (1);

	trace_svcrdma_sq_post_err(rdma, ret);
	set_bit(XPT_CLOSE, &xprt->xpt_flags);

	/* If even one was posted, there will be a completion. */
	if (bad_wr != first_wr)
		return 0;

	atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail);
	wake_up(&rdma->sc_send_wait);
	return -ENOTCONN;
}

/* Build and DMA-map an SGL that covers one kvec in an xdr_buf
 */
static void svc_rdma_vec_to_sg(struct svc_rdma_write_info *info,
			       unsigned int len,
			       struct svc_rdma_rw_ctxt *ctxt)
{
	struct scatterlist *sg = ctxt->rw_sg_table.sgl;

	sg_set_buf(&sg[0], info->wi_base, len);
	info->wi_base += len;

	ctxt->rw_nents = 1;
}

/* Build and DMA-map an SGL that covers part of an xdr_buf's pagelist.
 */
static void svc_rdma_pagelist_to_sg(struct svc_rdma_write_info *info,
				    unsigned int remaining,
				    struct svc_rdma_rw_ctxt *ctxt)
{
	unsigned int sge_no, sge_bytes, page_off, page_no;
	struct xdr_buf *xdr = info->wi_xdr;
	struct scatterlist *sg;
	struct page **page;

	page_off = info->wi_next_off + xdr->page_base;
	page_no = page_off >> PAGE_SHIFT;
	page_off = offset_in_page(page_off);
	page = xdr->pages + page_no;
	info->wi_next_off += remaining;
	sg = ctxt->rw_sg_table.sgl;
	sge_no = 0;
	do {
		sge_bytes = min_t(unsigned int, remaining,
				  PAGE_SIZE - page_off);
		sg_set_page(sg, *page, sge_bytes, page_off);

		remaining -= sge_bytes;
		sg = sg_next(sg);
		page_off = 0;
		sge_no++;
		page++;
	} while (remaining);

	ctxt->rw_nents = sge_no;
}

/* Construct RDMA Write WRs to send a portion of an xdr_buf containing
 * an RPC Reply.
 */
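/* Each RPC-over-RDMA segment is rpcrdma_segment_maxsz (four) XDR
 * words on the wire (RFC 8166): a 32-bit R_key (handle), a 32-bit
 * byte count (length), and a 64-bit offset into the client's
 * memory region. Hence "seg += 4" below to advance to the next
 * segment.
 */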
static int
svc_rdma_build_writes(struct svc_rdma_write_info *info,
		      void (*constructor)(struct svc_rdma_write_info *info,
					  unsigned int len,
					  struct svc_rdma_rw_ctxt *ctxt),
		      unsigned int remaining)
{
	struct svc_rdma_chunk_ctxt *cc = &info->wi_cc;
	struct svcxprt_rdma *rdma = cc->cc_rdma;
	struct svc_rdma_rw_ctxt *ctxt;
	__be32 *seg;
	int ret;

	seg = info->wi_segs + info->wi_seg_no * rpcrdma_segment_maxsz;
	do {
		unsigned int write_len;
		u32 seg_length, seg_handle;
		u64 seg_offset;

		if (info->wi_seg_no >= info->wi_nsegs)
			goto out_overflow;

		seg_handle = be32_to_cpup(seg);
		seg_length = be32_to_cpup(seg + 1);
		xdr_decode_hyper(seg + 2, &seg_offset);
		seg_offset += info->wi_seg_off;

		write_len = min(remaining, seg_length - info->wi_seg_off);
		ctxt = svc_rdma_get_rw_ctxt(rdma,
					    (write_len >> PAGE_SHIFT) + 2);
		if (!ctxt)
			goto out_noctx;

		constructor(info, write_len, ctxt);
		ret = rdma_rw_ctx_init(&ctxt->rw_ctx, rdma->sc_qp,
				       rdma->sc_port_num, ctxt->rw_sg_table.sgl,
				       ctxt->rw_nents, 0, seg_offset,
				       seg_handle, DMA_TO_DEVICE);
		if (ret < 0)
			goto out_initerr;

		trace_svcrdma_send_wseg(seg_handle, write_len, seg_offset);

		list_add(&ctxt->rw_list, &cc->cc_rwctxts);
		cc->cc_sqecount += ret;
		if (write_len == seg_length - info->wi_seg_off) {
			seg += 4;
			info->wi_seg_no++;
			info->wi_seg_off = 0;
		} else {
			info->wi_seg_off += write_len;
		}
		remaining -= write_len;
	} while (remaining);

	return 0;

out_overflow:
	dprintk("svcrdma: inadequate space in Write chunk (%u)\n",
		info->wi_nsegs);
	return -E2BIG;

out_noctx:
	dprintk("svcrdma: no R/W ctxs available\n");
	return -ENOMEM;

out_initerr:
	svc_rdma_put_rw_ctxt(rdma, ctxt);
	trace_svcrdma_dma_map_rwctx(rdma, ret);
	return -EIO;
}

/* Send one of an xdr_buf's kvecs by itself. To send a Reply
 * chunk, the whole RPC Reply is written back to the client.
 * This function writes either the head or tail of the xdr_buf
 * containing the Reply.
 */
static int svc_rdma_send_xdr_kvec(struct svc_rdma_write_info *info,
				  struct kvec *vec)
{
	info->wi_base = vec->iov_base;
	return svc_rdma_build_writes(info, svc_rdma_vec_to_sg,
				     vec->iov_len);
}

/* Send an xdr_buf's page list by itself. A Write chunk is just
 * the page list. A Reply chunk is @xdr's head, page list, and
 * tail. This function is shared between the two types of chunk.
 */
static int svc_rdma_send_xdr_pagelist(struct svc_rdma_write_info *info,
				      struct xdr_buf *xdr,
				      unsigned int offset,
				      unsigned long length)
{
	info->wi_xdr = xdr;
	info->wi_next_off = offset - xdr->head[0].iov_len;
	return svc_rdma_build_writes(info, svc_rdma_pagelist_to_sg,
				     length);
}

/**
 * svc_rdma_send_write_chunk - Write all segments in a Write chunk
 * @rdma: controlling RDMA transport
 * @wr_ch: Write chunk provided by client
 * @xdr: xdr_buf containing the data payload
 * @offset: payload's byte offset in @xdr
 * @length: size of payload, in bytes
 *
 * Returns a non-negative number of bytes the chunk consumed, or
 *	%-E2BIG if the payload was larger than the Write chunk,
 *	%-EINVAL if client provided too many segments,
 *	%-ENOMEM if rdma_rw context pool was exhausted,
 *	%-ENOTCONN if posting failed (connection is lost),
 *	%-EIO if rdma_rw initialization failed (DMA mapping, etc).
 */
int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, __be32 *wr_ch,
			      struct xdr_buf *xdr,
			      unsigned int offset, unsigned long length)
{
	struct svc_rdma_write_info *info;
	int ret;

	if (!length)
		return 0;

	info = svc_rdma_write_info_alloc(rdma, wr_ch);
	if (!info)
		return -ENOMEM;

	ret = svc_rdma_send_xdr_pagelist(info, xdr, offset, length);
	if (ret < 0)
		goto out_err;

	ret = svc_rdma_post_chunk_ctxt(&info->wi_cc);
	if (ret < 0)
		goto out_err;

	trace_svcrdma_send_write_chunk(xdr->page_len);
	return length;

out_err:
	svc_rdma_write_info_free(info);
	return ret;
}

/**
 * svc_rdma_send_reply_chunk - Write all segments in the Reply chunk
 * @rdma: controlling RDMA transport
 * @rctxt: Write and Reply chunks from client
 * @xdr: xdr_buf containing an RPC Reply
 *
 * Returns a non-negative number of bytes the chunk consumed, or
 *	%-E2BIG if the payload was larger than the Reply chunk,
 *	%-EINVAL if client provided too many segments,
 *	%-ENOMEM if rdma_rw context pool was exhausted,
 *	%-ENOTCONN if posting failed (connection is lost),
 *	%-EIO if rdma_rw initialization failed (DMA mapping, etc).
 */
int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma,
			      const struct svc_rdma_recv_ctxt *rctxt,
			      struct xdr_buf *xdr)
{
	struct svc_rdma_write_info *info;
	int consumed, ret;

	info = svc_rdma_write_info_alloc(rdma, rctxt->rc_reply_chunk);
	if (!info)
		return -ENOMEM;

	ret = svc_rdma_send_xdr_kvec(info, &xdr->head[0]);
	if (ret < 0)
		goto out_err;
	consumed = xdr->head[0].iov_len;

	/* Send the page list in the Reply chunk only if the
	 * client did not provide Write chunks.
	 */
	if (!rctxt->rc_write_list && xdr->page_len) {
		ret = svc_rdma_send_xdr_pagelist(info, xdr,
						 xdr->head[0].iov_len,
						 xdr->page_len);
		if (ret < 0)
			goto out_err;
		consumed += xdr->page_len;
	}

	if (xdr->tail[0].iov_len) {
		ret = svc_rdma_send_xdr_kvec(info, &xdr->tail[0]);
		if (ret < 0)
			goto out_err;
		consumed += xdr->tail[0].iov_len;
	}

	ret = svc_rdma_post_chunk_ctxt(&info->wi_cc);
	if (ret < 0)
		goto out_err;

	trace_svcrdma_send_reply_chunk(consumed);
	return consumed;

out_err:
	svc_rdma_write_info_free(info);
	return ret;
}

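/* Build one rdma_rw context for a single Read segment: borrow sink
 * pages from rqstp->rq_pages, record them in head->rc_arg.pages,
 * and map them for an RDMA Read from the client region named by
 * @rkey.
 */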
static int svc_rdma_build_read_segment(struct svc_rdma_read_info *info,
				       struct svc_rqst *rqstp,
				       u32 rkey, u32 len, u64 offset)
{
	struct svc_rdma_recv_ctxt *head = info->ri_readctxt;
	struct svc_rdma_chunk_ctxt *cc = &info->ri_cc;
	struct svc_rdma_rw_ctxt *ctxt;
	unsigned int sge_no, seg_len;
	struct scatterlist *sg;
	int ret;

	sge_no = PAGE_ALIGN(info->ri_pageoff + len) >> PAGE_SHIFT;
	ctxt = svc_rdma_get_rw_ctxt(cc->cc_rdma, sge_no);
	if (!ctxt)
		goto out_noctx;
	ctxt->rw_nents = sge_no;

	sg = ctxt->rw_sg_table.sgl;
	for (sge_no = 0; sge_no < ctxt->rw_nents; sge_no++) {
		seg_len = min_t(unsigned int, len,
				PAGE_SIZE - info->ri_pageoff);

		head->rc_arg.pages[info->ri_pageno] =
			rqstp->rq_pages[info->ri_pageno];
		if (!info->ri_pageoff)
			head->rc_page_count++;

		sg_set_page(sg, rqstp->rq_pages[info->ri_pageno],
			    seg_len, info->ri_pageoff);
		sg = sg_next(sg);

		info->ri_pageoff += seg_len;
		if (info->ri_pageoff == PAGE_SIZE) {
			info->ri_pageno++;
			info->ri_pageoff = 0;
		}
		len -= seg_len;

		/* Safety check */
		if (len &&
		    &rqstp->rq_pages[info->ri_pageno + 1] > rqstp->rq_page_end)
			goto out_overrun;
	}

	ret = rdma_rw_ctx_init(&ctxt->rw_ctx, cc->cc_rdma->sc_qp,
			       cc->cc_rdma->sc_port_num,
			       ctxt->rw_sg_table.sgl, ctxt->rw_nents,
			       0, offset, rkey, DMA_FROM_DEVICE);
	if (ret < 0)
		goto out_initerr;

	list_add(&ctxt->rw_list, &cc->cc_rwctxts);
	cc->cc_sqecount += ret;
	return 0;

out_noctx:
	dprintk("svcrdma: no R/W ctxs available\n");
	return -ENOMEM;

out_overrun:
	dprintk("svcrdma: request overruns rq_pages\n");
	return -EINVAL;

out_initerr:
	trace_svcrdma_dma_map_rwctx(cc->cc_rdma, ret);
	svc_rdma_put_rw_ctxt(cc->cc_rdma, ctxt);
	return -EIO;
}

/* Walk the segments in the Read chunk starting at @p and construct
 * RDMA Read operations to pull the chunk to the server.
 */
static int svc_rdma_build_read_chunk(struct svc_rqst *rqstp,
				     struct svc_rdma_read_info *info,
				     __be32 *p)
{
	unsigned int i;
	int ret;

	ret = -EINVAL;
	info->ri_chunklen = 0;
	while (*p++ != xdr_zero && be32_to_cpup(p++) == info->ri_position) {
		u32 rs_handle, rs_length;
		u64 rs_offset;

		rs_handle = be32_to_cpup(p++);
		rs_length = be32_to_cpup(p++);
		p = xdr_decode_hyper(p, &rs_offset);

		ret = svc_rdma_build_read_segment(info, rqstp,
						  rs_handle, rs_length,
						  rs_offset);
		if (ret < 0)
			break;

		trace_svcrdma_send_rseg(rs_handle, rs_length, rs_offset);
		info->ri_chunklen += rs_length;
	}

	/* Pages under I/O have been copied to head->rc_pages.
	 * Prevent their premature release by svc_xprt_release().
	 */
	for (i = 0; i < info->ri_readctxt->rc_page_count; i++)
		rqstp->rq_pages[i] = NULL;

	return ret;
}

/* Construct RDMA Reads to pull over a normal Read chunk. The chunk
 * data lands in the page list of head->rc_arg.pages.
 *
 * Currently NFSD does not look at the head->rc_arg.tail[0] iovec.
 * Therefore, XDR round-up of the Read chunk and trailing
 * inline content must both be added at the end of the pagelist.
 */
static int svc_rdma_build_normal_read_chunk(struct svc_rqst *rqstp,
					    struct svc_rdma_read_info *info,
					    __be32 *p)
{
	struct svc_rdma_recv_ctxt *head = info->ri_readctxt;
	int ret;

	ret = svc_rdma_build_read_chunk(rqstp, info, p);
	if (ret < 0)
		goto out;

	trace_svcrdma_send_read_chunk(info->ri_chunklen, info->ri_position);

	head->rc_hdr_count = 0;

	/* Split the Receive buffer between the head and tail
	 * buffers at Read chunk's position. XDR roundup of the
	 * chunk is not included in either the pagelist or in
	 * the tail.
	 */
	head->rc_arg.tail[0].iov_base =
		head->rc_arg.head[0].iov_base + info->ri_position;
	head->rc_arg.tail[0].iov_len =
		head->rc_arg.head[0].iov_len - info->ri_position;
	head->rc_arg.head[0].iov_len = info->ri_position;

	/* Read chunk may need XDR roundup (see RFC 8166, s. 3.4.5.2).
	 *
	 * If the client already rounded up the chunk length, the
	 * length does not change. Otherwise, the length of the page
	 * list is increased to include XDR round-up.
	 *
	 * Currently these chunks always start at page offset 0,
	 * thus the rounded-up length never crosses a page boundary.
	 */
	info->ri_chunklen = XDR_QUADLEN(info->ri_chunklen) << 2;

	head->rc_arg.page_len = info->ri_chunklen;
	head->rc_arg.len += info->ri_chunklen;
	head->rc_arg.buflen += info->ri_chunklen;

out:
	return ret;
}

/* Construct RDMA Reads to pull over a Position Zero Read chunk.
 * The start of the data lands in the first page just after
 * the Transport header, and the rest lands in the page list of
 * head->rc_arg.pages.
 *
 * Assumptions:
 *	- A PZRC has an XDR-aligned length (no implicit round-up).
 *	- There can be no trailing inline content (IOW, we assume
 *	  a PZRC is never sent in an RDMA_MSG message, though it's
 *	  allowed by spec).
 */
static int svc_rdma_build_pz_read_chunk(struct svc_rqst *rqstp,
					struct svc_rdma_read_info *info,
					__be32 *p)
{
	struct svc_rdma_recv_ctxt *head = info->ri_readctxt;
	int ret;

	ret = svc_rdma_build_read_chunk(rqstp, info, p);
	if (ret < 0)
		goto out;

	trace_svcrdma_send_pzr(info->ri_chunklen);

	head->rc_arg.len += info->ri_chunklen;
	head->rc_arg.buflen += info->ri_chunklen;

	head->rc_hdr_count = 1;
	head->rc_arg.head[0].iov_base = page_address(head->rc_pages[0]);
	head->rc_arg.head[0].iov_len = min_t(size_t, PAGE_SIZE,
					     info->ri_chunklen);

	head->rc_arg.page_len = info->ri_chunklen -
			head->rc_arg.head[0].iov_len;

out:
	return ret;
}

/**
 * svc_rdma_recv_read_chunk - Pull a Read chunk from the client
 * @rdma: controlling RDMA transport
 * @rqstp: set of pages to use as Read sink buffers
 * @head: pages under I/O collect here
 * @p: pointer to start of Read chunk
 *
 * Returns:
 *	%0 if all needed RDMA Reads were posted successfully,
 *	%-EINVAL if client provided too many segments,
 *	%-ENOMEM if rdma_rw context pool was exhausted,
 *	%-ENOTCONN if posting failed (connection is lost),
 *	%-EIO if rdma_rw initialization failed (DMA mapping, etc).
 *
 * Assumptions:
 *	- All Read segments in @p have the same Position value.
 */
int svc_rdma_recv_read_chunk(struct svcxprt_rdma *rdma, struct svc_rqst *rqstp,
			     struct svc_rdma_recv_ctxt *head, __be32 *p)
{
	struct svc_rdma_read_info *info;
	int ret;

	/* The request (with page list) is constructed in
	 * head->rc_arg. Pages involved with RDMA Read I/O are
	 * transferred there.
	 */
	head->rc_arg.head[0] = rqstp->rq_arg.head[0];
	head->rc_arg.tail[0] = rqstp->rq_arg.tail[0];
	head->rc_arg.pages = head->rc_pages;
	head->rc_arg.page_base = 0;
	head->rc_arg.page_len = 0;
	head->rc_arg.len = rqstp->rq_arg.len;
	head->rc_arg.buflen = rqstp->rq_arg.buflen;

	info = svc_rdma_read_info_alloc(rdma);
	if (!info)
		return -ENOMEM;
	info->ri_readctxt = head;
	info->ri_pageno = 0;
	info->ri_pageoff = 0;

	info->ri_position = be32_to_cpup(p + 1);
	if (info->ri_position)
		ret = svc_rdma_build_normal_read_chunk(rqstp, info, p);
	else
		ret = svc_rdma_build_pz_read_chunk(rqstp, info, p);
	if (ret < 0)
		goto out_err;

	ret = svc_rdma_post_chunk_ctxt(&info->ri_cc);
	if (ret < 0)
		goto out_err;
	return 0;

out_err:
	svc_rdma_read_info_free(info);
	return ret;
}