1 // SPDX-License-Identifier: GPL-2.0-only
3 * Module for pnfs flexfile layout driver.
5 * Copyright (c) 2014, Primary Data, Inc. All rights reserved.
7 * Tao Peng <bergwolf@primarydata.com>
10 #include <linux/nfs_fs.h>
11 #include <linux/nfs_mount.h>
12 #include <linux/nfs_page.h>
13 #include <linux/module.h>
14 #include <linux/sched/mm.h>
16 #include <linux/sunrpc/metrics.h>
18 #include "flexfilelayout.h"
19 #include "../nfs4session.h"
20 #include "../nfs4idmap.h"
21 #include "../internal.h"
22 #include "../delegation.h"
23 #include "../nfs4trace.h"
24 #include "../iostat.h"
28 #define NFSDBG_FACILITY NFSDBG_PNFS_LD
30 #define FF_LAYOUT_POLL_RETRY_MAX (15*HZ)
31 #define FF_LAYOUTRETURN_MAXERR 20
33 enum nfs4_ff_op_type
{
34 NFS4_FF_OP_LAYOUTSTATS
,
35 NFS4_FF_OP_LAYOUTRETURN
,
38 static unsigned short io_maxretrans
;
40 static const struct pnfs_commit_ops ff_layout_commit_ops
;
41 static void ff_layout_read_record_layoutstats_done(struct rpc_task
*task
,
42 struct nfs_pgio_header
*hdr
);
44 ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr
*lo
,
45 struct nfs42_layoutstat_devinfo
*devinfo
,
46 int dev_limit
, enum nfs4_ff_op_type type
);
47 static void ff_layout_encode_ff_layoutupdate(struct xdr_stream
*xdr
,
48 const struct nfs42_layoutstat_devinfo
*devinfo
,
49 struct nfs4_ff_layout_mirror
*mirror
);
51 static struct pnfs_layout_hdr
*
52 ff_layout_alloc_layout_hdr(struct inode
*inode
, gfp_t gfp_flags
)
54 struct nfs4_flexfile_layout
*ffl
;
56 ffl
= kzalloc(sizeof(*ffl
), gfp_flags
);
58 pnfs_init_ds_commit_info(&ffl
->commit_info
);
59 INIT_LIST_HEAD(&ffl
->error_list
);
60 INIT_LIST_HEAD(&ffl
->mirrors
);
61 ffl
->last_report_time
= ktime_get();
62 ffl
->commit_info
.ops
= &ff_layout_commit_ops
;
63 return &ffl
->generic_hdr
;
69 ff_layout_free_layout_hdr(struct pnfs_layout_hdr
*lo
)
71 struct nfs4_flexfile_layout
*ffl
= FF_LAYOUT_FROM_HDR(lo
);
72 struct nfs4_ff_layout_ds_err
*err
, *n
;
74 list_for_each_entry_safe(err
, n
, &ffl
->error_list
, list
) {
78 kfree_rcu(ffl
, generic_hdr
.plh_rcu
);
81 static int decode_pnfs_stateid(struct xdr_stream
*xdr
, nfs4_stateid
*stateid
)
85 p
= xdr_inline_decode(xdr
, NFS4_STATEID_SIZE
);
86 if (unlikely(p
== NULL
))
88 stateid
->type
= NFS4_PNFS_DS_STATEID_TYPE
;
89 memcpy(stateid
->data
, p
, NFS4_STATEID_SIZE
);
90 dprintk("%s: stateid id= [%x%x%x%x]\n", __func__
,
91 p
[0], p
[1], p
[2], p
[3]);
95 static int decode_deviceid(struct xdr_stream
*xdr
, struct nfs4_deviceid
*devid
)
99 p
= xdr_inline_decode(xdr
, NFS4_DEVICEID4_SIZE
);
102 memcpy(devid
, p
, NFS4_DEVICEID4_SIZE
);
103 nfs4_print_deviceid(devid
);
107 static int decode_nfs_fh(struct xdr_stream
*xdr
, struct nfs_fh
*fh
)
111 p
= xdr_inline_decode(xdr
, 4);
114 fh
->size
= be32_to_cpup(p
++);
115 if (fh
->size
> NFS_MAXFHSIZE
) {
116 printk(KERN_ERR
"NFS flexfiles: Too big fh received %d\n",
121 p
= xdr_inline_decode(xdr
, fh
->size
);
124 memcpy(&fh
->data
, p
, fh
->size
);
125 dprintk("%s: fh len %d\n", __func__
, fh
->size
);
131 * Currently only stringified uids and gids are accepted.
132 * I.e., kerberos is not supported to the DSes, so no principals.
134 * That means that one common function will suffice, but when
135 * principals are added, this should be split to accommodate
136 * calls to both nfs_map_name_to_uid() and nfs_map_group_to_gid().
139 decode_name(struct xdr_stream
*xdr
, u32
*id
)
144 /* opaque_length(4)*/
145 p
= xdr_inline_decode(xdr
, 4);
148 len
= be32_to_cpup(p
++);
152 dprintk("%s: len %u\n", __func__
, len
);
155 p
= xdr_inline_decode(xdr
, len
);
159 if (!nfs_map_string_to_numeric((char *)p
, len
, id
))
165 static bool ff_mirror_match_fh(const struct nfs4_ff_layout_mirror
*m1
,
166 const struct nfs4_ff_layout_mirror
*m2
)
170 if (m1
->fh_versions_cnt
!= m2
->fh_versions_cnt
)
172 for (i
= 0; i
< m1
->fh_versions_cnt
; i
++) {
173 bool found_fh
= false;
174 for (j
= 0; j
< m2
->fh_versions_cnt
; j
++) {
175 if (nfs_compare_fh(&m1
->fh_versions
[i
],
176 &m2
->fh_versions
[j
]) == 0) {
187 static struct nfs4_ff_layout_mirror
*
188 ff_layout_add_mirror(struct pnfs_layout_hdr
*lo
,
189 struct nfs4_ff_layout_mirror
*mirror
)
191 struct nfs4_flexfile_layout
*ff_layout
= FF_LAYOUT_FROM_HDR(lo
);
192 struct nfs4_ff_layout_mirror
*pos
;
193 struct inode
*inode
= lo
->plh_inode
;
195 spin_lock(&inode
->i_lock
);
196 list_for_each_entry(pos
, &ff_layout
->mirrors
, mirrors
) {
197 if (memcmp(&mirror
->devid
, &pos
->devid
, sizeof(pos
->devid
)) != 0)
199 if (!ff_mirror_match_fh(mirror
, pos
))
201 if (refcount_inc_not_zero(&pos
->ref
)) {
202 spin_unlock(&inode
->i_lock
);
206 list_add(&mirror
->mirrors
, &ff_layout
->mirrors
);
208 spin_unlock(&inode
->i_lock
);
213 ff_layout_remove_mirror(struct nfs4_ff_layout_mirror
*mirror
)
216 if (mirror
->layout
== NULL
)
218 inode
= mirror
->layout
->plh_inode
;
219 spin_lock(&inode
->i_lock
);
220 list_del(&mirror
->mirrors
);
221 spin_unlock(&inode
->i_lock
);
222 mirror
->layout
= NULL
;
225 static struct nfs4_ff_layout_mirror
*ff_layout_alloc_mirror(gfp_t gfp_flags
)
227 struct nfs4_ff_layout_mirror
*mirror
;
229 mirror
= kzalloc(sizeof(*mirror
), gfp_flags
);
230 if (mirror
!= NULL
) {
231 spin_lock_init(&mirror
->lock
);
232 refcount_set(&mirror
->ref
, 1);
233 INIT_LIST_HEAD(&mirror
->mirrors
);
238 static void ff_layout_free_mirror(struct nfs4_ff_layout_mirror
*mirror
)
240 const struct cred
*cred
;
242 ff_layout_remove_mirror(mirror
);
243 kfree(mirror
->fh_versions
);
244 cred
= rcu_access_pointer(mirror
->ro_cred
);
246 cred
= rcu_access_pointer(mirror
->rw_cred
);
248 nfs4_ff_layout_put_deviceid(mirror
->mirror_ds
);
252 static void ff_layout_put_mirror(struct nfs4_ff_layout_mirror
*mirror
)
254 if (mirror
!= NULL
&& refcount_dec_and_test(&mirror
->ref
))
255 ff_layout_free_mirror(mirror
);
258 static void ff_layout_free_mirror_array(struct nfs4_ff_layout_segment
*fls
)
262 for (i
= 0; i
< fls
->mirror_array_cnt
; i
++)
263 ff_layout_put_mirror(fls
->mirror_array
[i
]);
266 static void _ff_layout_free_lseg(struct nfs4_ff_layout_segment
*fls
)
269 ff_layout_free_mirror_array(fls
);
275 ff_lseg_match_mirrors(struct pnfs_layout_segment
*l1
,
276 struct pnfs_layout_segment
*l2
)
278 const struct nfs4_ff_layout_segment
*fl1
= FF_LAYOUT_LSEG(l1
);
279 const struct nfs4_ff_layout_segment
*fl2
= FF_LAYOUT_LSEG(l1
);
282 if (fl1
->mirror_array_cnt
!= fl2
->mirror_array_cnt
)
284 for (i
= 0; i
< fl1
->mirror_array_cnt
; i
++) {
285 if (fl1
->mirror_array
[i
] != fl2
->mirror_array
[i
])
292 ff_lseg_range_is_after(const struct pnfs_layout_range
*l1
,
293 const struct pnfs_layout_range
*l2
)
297 if (l1
->iomode
!= l2
->iomode
)
298 return l1
->iomode
!= IOMODE_READ
;
299 end1
= pnfs_calc_offset_end(l1
->offset
, l1
->length
);
300 end2
= pnfs_calc_offset_end(l2
->offset
, l2
->length
);
301 if (end1
< l2
->offset
)
303 if (end2
< l1
->offset
)
305 return l2
->offset
<= l1
->offset
;
309 ff_lseg_merge(struct pnfs_layout_segment
*new,
310 struct pnfs_layout_segment
*old
)
312 u64 new_end
, old_end
;
314 if (test_bit(NFS_LSEG_LAYOUTRETURN
, &old
->pls_flags
))
316 if (new->pls_range
.iomode
!= old
->pls_range
.iomode
)
318 old_end
= pnfs_calc_offset_end(old
->pls_range
.offset
,
319 old
->pls_range
.length
);
320 if (old_end
< new->pls_range
.offset
)
322 new_end
= pnfs_calc_offset_end(new->pls_range
.offset
,
323 new->pls_range
.length
);
324 if (new_end
< old
->pls_range
.offset
)
326 if (!ff_lseg_match_mirrors(new, old
))
329 /* Mergeable: copy info from 'old' to 'new' */
330 if (new_end
< old_end
)
332 if (new->pls_range
.offset
< old
->pls_range
.offset
)
333 new->pls_range
.offset
= old
->pls_range
.offset
;
334 new->pls_range
.length
= pnfs_calc_offset_length(new->pls_range
.offset
,
336 if (test_bit(NFS_LSEG_ROC
, &old
->pls_flags
))
337 set_bit(NFS_LSEG_ROC
, &new->pls_flags
);
342 ff_layout_add_lseg(struct pnfs_layout_hdr
*lo
,
343 struct pnfs_layout_segment
*lseg
,
344 struct list_head
*free_me
)
346 pnfs_generic_layout_insert_lseg(lo
, lseg
,
347 ff_lseg_range_is_after
,
352 static void ff_layout_sort_mirrors(struct nfs4_ff_layout_segment
*fls
)
356 for (i
= 0; i
< fls
->mirror_array_cnt
- 1; i
++) {
357 for (j
= i
+ 1; j
< fls
->mirror_array_cnt
; j
++)
358 if (fls
->mirror_array
[i
]->efficiency
<
359 fls
->mirror_array
[j
]->efficiency
)
360 swap(fls
->mirror_array
[i
],
361 fls
->mirror_array
[j
]);
365 static struct pnfs_layout_segment
*
366 ff_layout_alloc_lseg(struct pnfs_layout_hdr
*lh
,
367 struct nfs4_layoutget_res
*lgr
,
370 struct pnfs_layout_segment
*ret
;
371 struct nfs4_ff_layout_segment
*fls
= NULL
;
372 struct xdr_stream stream
;
374 struct page
*scratch
;
376 u32 mirror_array_cnt
;
380 dprintk("--> %s\n", __func__
);
381 scratch
= alloc_page(gfp_flags
);
383 return ERR_PTR(-ENOMEM
);
385 xdr_init_decode_pages(&stream
, &buf
, lgr
->layoutp
->pages
,
387 xdr_set_scratch_page(&stream
, scratch
);
389 /* stripe unit and mirror_array_cnt */
391 p
= xdr_inline_decode(&stream
, 8 + 4);
395 p
= xdr_decode_hyper(p
, &stripe_unit
);
396 mirror_array_cnt
= be32_to_cpup(p
++);
397 dprintk("%s: stripe_unit=%llu mirror_array_cnt=%u\n", __func__
,
398 stripe_unit
, mirror_array_cnt
);
400 if (mirror_array_cnt
> NFS4_FLEXFILE_LAYOUT_MAX_MIRROR_CNT
||
401 mirror_array_cnt
== 0)
405 fls
= kzalloc(struct_size(fls
, mirror_array
, mirror_array_cnt
),
410 fls
->mirror_array_cnt
= mirror_array_cnt
;
411 fls
->stripe_unit
= stripe_unit
;
413 for (i
= 0; i
< fls
->mirror_array_cnt
; i
++) {
414 struct nfs4_ff_layout_mirror
*mirror
;
416 const struct cred __rcu
*cred
;
419 u32 ds_count
, fh_count
, id
;
423 p
= xdr_inline_decode(&stream
, 4);
426 ds_count
= be32_to_cpup(p
);
428 /* FIXME: allow for striping? */
432 fls
->mirror_array
[i
] = ff_layout_alloc_mirror(gfp_flags
);
433 if (fls
->mirror_array
[i
] == NULL
) {
438 fls
->mirror_array
[i
]->ds_count
= ds_count
;
441 rc
= decode_deviceid(&stream
, &fls
->mirror_array
[i
]->devid
);
447 p
= xdr_inline_decode(&stream
, 4);
450 fls
->mirror_array
[i
]->efficiency
= be32_to_cpup(p
);
453 rc
= decode_pnfs_stateid(&stream
, &fls
->mirror_array
[i
]->stateid
);
459 p
= xdr_inline_decode(&stream
, 4);
462 fh_count
= be32_to_cpup(p
);
464 fls
->mirror_array
[i
]->fh_versions
=
465 kcalloc(fh_count
, sizeof(struct nfs_fh
),
467 if (fls
->mirror_array
[i
]->fh_versions
== NULL
) {
472 for (j
= 0; j
< fh_count
; j
++) {
473 rc
= decode_nfs_fh(&stream
,
474 &fls
->mirror_array
[i
]->fh_versions
[j
]);
479 fls
->mirror_array
[i
]->fh_versions_cnt
= fh_count
;
482 rc
= decode_name(&stream
, &id
);
486 uid
= make_kuid(&init_user_ns
, id
);
489 rc
= decode_name(&stream
, &id
);
493 gid
= make_kgid(&init_user_ns
, id
);
495 if (gfp_flags
& __GFP_FS
)
496 kcred
= prepare_kernel_cred(&init_task
);
498 unsigned int nofs_flags
= memalloc_nofs_save();
499 kcred
= prepare_kernel_cred(&init_task
);
500 memalloc_nofs_restore(nofs_flags
);
507 cred
= RCU_INITIALIZER(kcred
);
509 if (lgr
->range
.iomode
== IOMODE_READ
)
510 rcu_assign_pointer(fls
->mirror_array
[i
]->ro_cred
, cred
);
512 rcu_assign_pointer(fls
->mirror_array
[i
]->rw_cred
, cred
);
514 mirror
= ff_layout_add_mirror(lh
, fls
->mirror_array
[i
]);
515 if (mirror
!= fls
->mirror_array
[i
]) {
516 /* swap cred ptrs so free_mirror will clean up old */
517 if (lgr
->range
.iomode
== IOMODE_READ
) {
518 cred
= xchg(&mirror
->ro_cred
, cred
);
519 rcu_assign_pointer(fls
->mirror_array
[i
]->ro_cred
, cred
);
521 cred
= xchg(&mirror
->rw_cred
, cred
);
522 rcu_assign_pointer(fls
->mirror_array
[i
]->rw_cred
, cred
);
524 ff_layout_free_mirror(fls
->mirror_array
[i
]);
525 fls
->mirror_array
[i
] = mirror
;
528 dprintk("%s: iomode %s uid %u gid %u\n", __func__
,
529 lgr
->range
.iomode
== IOMODE_READ
? "READ" : "RW",
530 from_kuid(&init_user_ns
, uid
),
531 from_kgid(&init_user_ns
, gid
));
534 p
= xdr_inline_decode(&stream
, 4);
536 goto out_sort_mirrors
;
537 fls
->flags
= be32_to_cpup(p
);
539 p
= xdr_inline_decode(&stream
, 4);
541 goto out_sort_mirrors
;
542 for (i
=0; i
< fls
->mirror_array_cnt
; i
++)
543 fls
->mirror_array
[i
]->report_interval
= be32_to_cpup(p
);
546 ff_layout_sort_mirrors(fls
);
547 ret
= &fls
->generic_hdr
;
548 dprintk("<-- %s (success)\n", __func__
);
550 __free_page(scratch
);
553 _ff_layout_free_lseg(fls
);
555 dprintk("<-- %s (%d)\n", __func__
, rc
);
560 ff_layout_free_lseg(struct pnfs_layout_segment
*lseg
)
562 struct nfs4_ff_layout_segment
*fls
= FF_LAYOUT_LSEG(lseg
);
564 dprintk("--> %s\n", __func__
);
566 if (lseg
->pls_range
.iomode
== IOMODE_RW
) {
567 struct nfs4_flexfile_layout
*ffl
;
570 ffl
= FF_LAYOUT_FROM_HDR(lseg
->pls_layout
);
571 inode
= ffl
->generic_hdr
.plh_inode
;
572 spin_lock(&inode
->i_lock
);
573 pnfs_generic_ds_cinfo_release_lseg(&ffl
->commit_info
, lseg
);
574 spin_unlock(&inode
->i_lock
);
576 _ff_layout_free_lseg(fls
);
580 nfs4_ff_start_busy_timer(struct nfs4_ff_busy_timer
*timer
, ktime_t now
)
582 /* first IO request? */
583 if (atomic_inc_return(&timer
->n_ops
) == 1) {
584 timer
->start_time
= now
;
589 nfs4_ff_end_busy_timer(struct nfs4_ff_busy_timer
*timer
, ktime_t now
)
593 if (atomic_dec_return(&timer
->n_ops
) < 0)
596 start
= timer
->start_time
;
597 timer
->start_time
= now
;
598 return ktime_sub(now
, start
);
602 nfs4_ff_layoutstat_start_io(struct nfs4_ff_layout_mirror
*mirror
,
603 struct nfs4_ff_layoutstat
*layoutstat
,
606 s64 report_interval
= FF_LAYOUTSTATS_REPORT_INTERVAL
;
607 struct nfs4_flexfile_layout
*ffl
= FF_LAYOUT_FROM_HDR(mirror
->layout
);
609 nfs4_ff_start_busy_timer(&layoutstat
->busy_timer
, now
);
610 if (!mirror
->start_time
)
611 mirror
->start_time
= now
;
612 if (mirror
->report_interval
!= 0)
613 report_interval
= (s64
)mirror
->report_interval
* 1000LL;
614 else if (layoutstats_timer
!= 0)
615 report_interval
= (s64
)layoutstats_timer
* 1000LL;
616 if (ktime_to_ms(ktime_sub(now
, ffl
->last_report_time
)) >=
618 ffl
->last_report_time
= now
;
626 nfs4_ff_layout_stat_io_update_requested(struct nfs4_ff_layoutstat
*layoutstat
,
629 struct nfs4_ff_io_stat
*iostat
= &layoutstat
->io_stat
;
631 iostat
->ops_requested
++;
632 iostat
->bytes_requested
+= requested
;
636 nfs4_ff_layout_stat_io_update_completed(struct nfs4_ff_layoutstat
*layoutstat
,
639 ktime_t time_completed
,
640 ktime_t time_started
)
642 struct nfs4_ff_io_stat
*iostat
= &layoutstat
->io_stat
;
643 ktime_t completion_time
= ktime_sub(time_completed
, time_started
);
646 iostat
->ops_completed
++;
647 iostat
->bytes_completed
+= completed
;
648 iostat
->bytes_not_delivered
+= requested
- completed
;
650 timer
= nfs4_ff_end_busy_timer(&layoutstat
->busy_timer
, time_completed
);
651 iostat
->total_busy_time
=
652 ktime_add(iostat
->total_busy_time
, timer
);
653 iostat
->aggregate_completion_time
=
654 ktime_add(iostat
->aggregate_completion_time
,
659 nfs4_ff_layout_stat_io_start_read(struct inode
*inode
,
660 struct nfs4_ff_layout_mirror
*mirror
,
661 __u64 requested
, ktime_t now
)
665 spin_lock(&mirror
->lock
);
666 report
= nfs4_ff_layoutstat_start_io(mirror
, &mirror
->read_stat
, now
);
667 nfs4_ff_layout_stat_io_update_requested(&mirror
->read_stat
, requested
);
668 set_bit(NFS4_FF_MIRROR_STAT_AVAIL
, &mirror
->flags
);
669 spin_unlock(&mirror
->lock
);
672 pnfs_report_layoutstat(inode
, nfs_io_gfp_mask());
676 nfs4_ff_layout_stat_io_end_read(struct rpc_task
*task
,
677 struct nfs4_ff_layout_mirror
*mirror
,
681 spin_lock(&mirror
->lock
);
682 nfs4_ff_layout_stat_io_update_completed(&mirror
->read_stat
,
683 requested
, completed
,
684 ktime_get(), task
->tk_start
);
685 set_bit(NFS4_FF_MIRROR_STAT_AVAIL
, &mirror
->flags
);
686 spin_unlock(&mirror
->lock
);
690 nfs4_ff_layout_stat_io_start_write(struct inode
*inode
,
691 struct nfs4_ff_layout_mirror
*mirror
,
692 __u64 requested
, ktime_t now
)
696 spin_lock(&mirror
->lock
);
697 report
= nfs4_ff_layoutstat_start_io(mirror
, &mirror
->write_stat
, now
);
698 nfs4_ff_layout_stat_io_update_requested(&mirror
->write_stat
, requested
);
699 set_bit(NFS4_FF_MIRROR_STAT_AVAIL
, &mirror
->flags
);
700 spin_unlock(&mirror
->lock
);
703 pnfs_report_layoutstat(inode
, nfs_io_gfp_mask());
707 nfs4_ff_layout_stat_io_end_write(struct rpc_task
*task
,
708 struct nfs4_ff_layout_mirror
*mirror
,
711 enum nfs3_stable_how committed
)
713 if (committed
== NFS_UNSTABLE
)
714 requested
= completed
= 0;
716 spin_lock(&mirror
->lock
);
717 nfs4_ff_layout_stat_io_update_completed(&mirror
->write_stat
,
718 requested
, completed
, ktime_get(), task
->tk_start
);
719 set_bit(NFS4_FF_MIRROR_STAT_AVAIL
, &mirror
->flags
);
720 spin_unlock(&mirror
->lock
);
724 ff_layout_mark_ds_unreachable(struct pnfs_layout_segment
*lseg
, u32 idx
)
726 struct nfs4_deviceid_node
*devid
= FF_LAYOUT_DEVID_NODE(lseg
, idx
);
729 nfs4_mark_deviceid_unavailable(devid
);
733 ff_layout_mark_ds_reachable(struct pnfs_layout_segment
*lseg
, u32 idx
)
735 struct nfs4_deviceid_node
*devid
= FF_LAYOUT_DEVID_NODE(lseg
, idx
);
738 nfs4_mark_deviceid_available(devid
);
741 static struct nfs4_pnfs_ds
*
742 ff_layout_choose_ds_for_read(struct pnfs_layout_segment
*lseg
,
743 u32 start_idx
, u32
*best_idx
,
746 struct nfs4_ff_layout_segment
*fls
= FF_LAYOUT_LSEG(lseg
);
747 struct nfs4_ff_layout_mirror
*mirror
;
748 struct nfs4_pnfs_ds
*ds
;
751 /* mirrors are initially sorted by efficiency */
752 for (idx
= start_idx
; idx
< fls
->mirror_array_cnt
; idx
++) {
753 mirror
= FF_LAYOUT_COMP(lseg
, idx
);
754 ds
= nfs4_ff_layout_prepare_ds(lseg
, mirror
, false);
759 nfs4_test_deviceid_unavailable(&mirror
->mirror_ds
->id_node
))
769 static struct nfs4_pnfs_ds
*
770 ff_layout_choose_any_ds_for_read(struct pnfs_layout_segment
*lseg
,
771 u32 start_idx
, u32
*best_idx
)
773 return ff_layout_choose_ds_for_read(lseg
, start_idx
, best_idx
, false);
776 static struct nfs4_pnfs_ds
*
777 ff_layout_choose_valid_ds_for_read(struct pnfs_layout_segment
*lseg
,
778 u32 start_idx
, u32
*best_idx
)
780 return ff_layout_choose_ds_for_read(lseg
, start_idx
, best_idx
, true);
783 static struct nfs4_pnfs_ds
*
784 ff_layout_choose_best_ds_for_read(struct pnfs_layout_segment
*lseg
,
785 u32 start_idx
, u32
*best_idx
)
787 struct nfs4_pnfs_ds
*ds
;
789 ds
= ff_layout_choose_valid_ds_for_read(lseg
, start_idx
, best_idx
);
792 return ff_layout_choose_any_ds_for_read(lseg
, start_idx
, best_idx
);
795 static struct nfs4_pnfs_ds
*
796 ff_layout_get_ds_for_read(struct nfs_pageio_descriptor
*pgio
,
799 struct pnfs_layout_segment
*lseg
= pgio
->pg_lseg
;
800 struct nfs4_pnfs_ds
*ds
;
802 ds
= ff_layout_choose_best_ds_for_read(lseg
, pgio
->pg_mirror_idx
,
804 if (ds
|| !pgio
->pg_mirror_idx
)
806 return ff_layout_choose_best_ds_for_read(lseg
, 0, best_idx
);
810 ff_layout_pg_get_read(struct nfs_pageio_descriptor
*pgio
,
811 struct nfs_page
*req
,
814 pnfs_put_lseg(pgio
->pg_lseg
);
816 pnfs_update_layout(pgio
->pg_inode
, nfs_req_openctx(req
),
817 req_offset(req
), req
->wb_bytes
, IOMODE_READ
,
818 strict_iomode
, nfs_io_gfp_mask());
819 if (IS_ERR(pgio
->pg_lseg
)) {
820 pgio
->pg_error
= PTR_ERR(pgio
->pg_lseg
);
821 pgio
->pg_lseg
= NULL
;
826 ff_layout_pg_check_layout(struct nfs_pageio_descriptor
*pgio
,
827 struct nfs_page
*req
)
829 pnfs_generic_pg_check_layout(pgio
);
830 pnfs_generic_pg_check_range(pgio
, req
);
834 ff_layout_pg_init_read(struct nfs_pageio_descriptor
*pgio
,
835 struct nfs_page
*req
)
837 struct nfs_pgio_mirror
*pgm
;
838 struct nfs4_ff_layout_mirror
*mirror
;
839 struct nfs4_pnfs_ds
*ds
;
843 ff_layout_pg_check_layout(pgio
, req
);
844 /* Use full layout for now */
845 if (!pgio
->pg_lseg
) {
846 ff_layout_pg_get_read(pgio
, req
, false);
850 if (ff_layout_avoid_read_on_rw(pgio
->pg_lseg
)) {
851 ff_layout_pg_get_read(pgio
, req
, true);
856 ds
= ff_layout_get_ds_for_read(pgio
, &ds_idx
);
858 if (!ff_layout_no_fallback_to_mds(pgio
->pg_lseg
))
860 pnfs_generic_pg_cleanup(pgio
);
861 /* Sleep for 1 second before retrying */
866 mirror
= FF_LAYOUT_COMP(pgio
->pg_lseg
, ds_idx
);
867 pgm
= &pgio
->pg_mirrors
[0];
868 pgm
->pg_bsize
= mirror
->mirror_ds
->ds_versions
[0].rsize
;
870 pgio
->pg_mirror_idx
= ds_idx
;
872 if (NFS_SERVER(pgio
->pg_inode
)->flags
&
873 (NFS_MOUNT_SOFT
|NFS_MOUNT_SOFTERR
))
874 pgio
->pg_maxretrans
= io_maxretrans
;
877 if (pgio
->pg_error
< 0)
880 trace_pnfs_mds_fallback_pg_init_read(pgio
->pg_inode
,
881 0, NFS4_MAX_UINT64
, IOMODE_READ
,
882 NFS_I(pgio
->pg_inode
)->layout
,
884 pgio
->pg_maxretrans
= 0;
885 nfs_pageio_reset_read_mds(pgio
);
889 ff_layout_pg_init_write(struct nfs_pageio_descriptor
*pgio
,
890 struct nfs_page
*req
)
892 struct nfs4_ff_layout_mirror
*mirror
;
893 struct nfs_pgio_mirror
*pgm
;
894 struct nfs4_pnfs_ds
*ds
;
898 ff_layout_pg_check_layout(pgio
, req
);
899 if (!pgio
->pg_lseg
) {
901 pnfs_update_layout(pgio
->pg_inode
, nfs_req_openctx(req
),
902 req_offset(req
), req
->wb_bytes
,
903 IOMODE_RW
, false, nfs_io_gfp_mask());
904 if (IS_ERR(pgio
->pg_lseg
)) {
905 pgio
->pg_error
= PTR_ERR(pgio
->pg_lseg
);
906 pgio
->pg_lseg
= NULL
;
910 /* If no lseg, fall back to write through mds */
911 if (pgio
->pg_lseg
== NULL
)
914 /* Use a direct mapping of ds_idx to pgio mirror_idx */
915 if (pgio
->pg_mirror_count
!= FF_LAYOUT_MIRROR_COUNT(pgio
->pg_lseg
))
918 for (i
= 0; i
< pgio
->pg_mirror_count
; i
++) {
919 mirror
= FF_LAYOUT_COMP(pgio
->pg_lseg
, i
);
920 ds
= nfs4_ff_layout_prepare_ds(pgio
->pg_lseg
, mirror
, true);
922 if (!ff_layout_no_fallback_to_mds(pgio
->pg_lseg
))
924 pnfs_generic_pg_cleanup(pgio
);
925 /* Sleep for 1 second before retrying */
929 pgm
= &pgio
->pg_mirrors
[i
];
930 pgm
->pg_bsize
= mirror
->mirror_ds
->ds_versions
[0].wsize
;
933 if (NFS_SERVER(pgio
->pg_inode
)->flags
&
934 (NFS_MOUNT_SOFT
|NFS_MOUNT_SOFTERR
))
935 pgio
->pg_maxretrans
= io_maxretrans
;
938 pnfs_generic_pg_cleanup(pgio
);
939 pgio
->pg_error
= -EAGAIN
;
942 trace_pnfs_mds_fallback_pg_init_write(pgio
->pg_inode
,
943 0, NFS4_MAX_UINT64
, IOMODE_RW
,
944 NFS_I(pgio
->pg_inode
)->layout
,
946 pgio
->pg_maxretrans
= 0;
947 nfs_pageio_reset_write_mds(pgio
);
948 pgio
->pg_error
= -EAGAIN
;
952 ff_layout_pg_get_mirror_count_write(struct nfs_pageio_descriptor
*pgio
,
953 struct nfs_page
*req
)
955 if (!pgio
->pg_lseg
) {
957 pnfs_update_layout(pgio
->pg_inode
, nfs_req_openctx(req
),
958 req_offset(req
), req
->wb_bytes
,
959 IOMODE_RW
, false, nfs_io_gfp_mask());
960 if (IS_ERR(pgio
->pg_lseg
)) {
961 pgio
->pg_error
= PTR_ERR(pgio
->pg_lseg
);
962 pgio
->pg_lseg
= NULL
;
967 return FF_LAYOUT_MIRROR_COUNT(pgio
->pg_lseg
);
969 trace_pnfs_mds_fallback_pg_get_mirror_count(pgio
->pg_inode
,
970 0, NFS4_MAX_UINT64
, IOMODE_RW
,
971 NFS_I(pgio
->pg_inode
)->layout
,
973 /* no lseg means that pnfs is not in use, so no mirroring here */
974 nfs_pageio_reset_write_mds(pgio
);
980 ff_layout_pg_set_mirror_write(struct nfs_pageio_descriptor
*desc
, u32 idx
)
982 u32 old
= desc
->pg_mirror_idx
;
984 desc
->pg_mirror_idx
= idx
;
988 static struct nfs_pgio_mirror
*
989 ff_layout_pg_get_mirror_write(struct nfs_pageio_descriptor
*desc
, u32 idx
)
991 return &desc
->pg_mirrors
[idx
];
994 static const struct nfs_pageio_ops ff_layout_pg_read_ops
= {
995 .pg_init
= ff_layout_pg_init_read
,
996 .pg_test
= pnfs_generic_pg_test
,
997 .pg_doio
= pnfs_generic_pg_readpages
,
998 .pg_cleanup
= pnfs_generic_pg_cleanup
,
1001 static const struct nfs_pageio_ops ff_layout_pg_write_ops
= {
1002 .pg_init
= ff_layout_pg_init_write
,
1003 .pg_test
= pnfs_generic_pg_test
,
1004 .pg_doio
= pnfs_generic_pg_writepages
,
1005 .pg_get_mirror_count
= ff_layout_pg_get_mirror_count_write
,
1006 .pg_cleanup
= pnfs_generic_pg_cleanup
,
1007 .pg_get_mirror
= ff_layout_pg_get_mirror_write
,
1008 .pg_set_mirror
= ff_layout_pg_set_mirror_write
,
1011 static void ff_layout_reset_write(struct nfs_pgio_header
*hdr
, bool retry_pnfs
)
1013 struct rpc_task
*task
= &hdr
->task
;
1015 pnfs_layoutcommit_inode(hdr
->inode
, false);
1018 dprintk("%s Reset task %5u for i/o through pNFS "
1019 "(req %s/%llu, %u bytes @ offset %llu)\n", __func__
,
1021 hdr
->inode
->i_sb
->s_id
,
1022 (unsigned long long)NFS_FILEID(hdr
->inode
),
1024 (unsigned long long)hdr
->args
.offset
);
1026 hdr
->completion_ops
->reschedule_io(hdr
);
1030 if (!test_and_set_bit(NFS_IOHDR_REDO
, &hdr
->flags
)) {
1031 dprintk("%s Reset task %5u for i/o through MDS "
1032 "(req %s/%llu, %u bytes @ offset %llu)\n", __func__
,
1034 hdr
->inode
->i_sb
->s_id
,
1035 (unsigned long long)NFS_FILEID(hdr
->inode
),
1037 (unsigned long long)hdr
->args
.offset
);
1039 trace_pnfs_mds_fallback_write_done(hdr
->inode
,
1040 hdr
->args
.offset
, hdr
->args
.count
,
1041 IOMODE_RW
, NFS_I(hdr
->inode
)->layout
,
1043 task
->tk_status
= pnfs_write_done_resend_to_mds(hdr
);
1047 static void ff_layout_resend_pnfs_read(struct nfs_pgio_header
*hdr
)
1049 u32 idx
= hdr
->pgio_mirror_idx
+ 1;
1052 if (ff_layout_choose_any_ds_for_read(hdr
->lseg
, idx
, &new_idx
))
1053 ff_layout_send_layouterror(hdr
->lseg
);
1055 pnfs_error_mark_layout_for_return(hdr
->inode
, hdr
->lseg
);
1056 pnfs_read_resend_pnfs(hdr
, new_idx
);
1059 static void ff_layout_reset_read(struct nfs_pgio_header
*hdr
)
1061 struct rpc_task
*task
= &hdr
->task
;
1063 pnfs_layoutcommit_inode(hdr
->inode
, false);
1064 pnfs_error_mark_layout_for_return(hdr
->inode
, hdr
->lseg
);
1066 if (!test_and_set_bit(NFS_IOHDR_REDO
, &hdr
->flags
)) {
1067 dprintk("%s Reset task %5u for i/o through MDS "
1068 "(req %s/%llu, %u bytes @ offset %llu)\n", __func__
,
1070 hdr
->inode
->i_sb
->s_id
,
1071 (unsigned long long)NFS_FILEID(hdr
->inode
),
1073 (unsigned long long)hdr
->args
.offset
);
1075 trace_pnfs_mds_fallback_read_done(hdr
->inode
,
1076 hdr
->args
.offset
, hdr
->args
.count
,
1077 IOMODE_READ
, NFS_I(hdr
->inode
)->layout
,
1079 task
->tk_status
= pnfs_read_done_resend_to_mds(hdr
);
1083 static int ff_layout_async_handle_error_v4(struct rpc_task
*task
,
1084 struct nfs4_state
*state
,
1085 struct nfs_client
*clp
,
1086 struct pnfs_layout_segment
*lseg
,
1089 struct pnfs_layout_hdr
*lo
= lseg
->pls_layout
;
1090 struct inode
*inode
= lo
->plh_inode
;
1091 struct nfs4_deviceid_node
*devid
= FF_LAYOUT_DEVID_NODE(lseg
, idx
);
1092 struct nfs4_slot_table
*tbl
= &clp
->cl_session
->fc_slot_table
;
1094 switch (task
->tk_status
) {
1095 case -NFS4ERR_BADSESSION
:
1096 case -NFS4ERR_BADSLOT
:
1097 case -NFS4ERR_BAD_HIGH_SLOT
:
1098 case -NFS4ERR_DEADSESSION
:
1099 case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION
:
1100 case -NFS4ERR_SEQ_FALSE_RETRY
:
1101 case -NFS4ERR_SEQ_MISORDERED
:
1102 dprintk("%s ERROR %d, Reset session. Exchangeid "
1103 "flags 0x%x\n", __func__
, task
->tk_status
,
1104 clp
->cl_exchange_flags
);
1105 nfs4_schedule_session_recovery(clp
->cl_session
, task
->tk_status
);
1107 case -NFS4ERR_DELAY
:
1108 case -NFS4ERR_GRACE
:
1109 rpc_delay(task
, FF_LAYOUT_POLL_RETRY_MAX
);
1111 case -NFS4ERR_RETRY_UNCACHED_REP
:
1113 /* Invalidate Layout errors */
1114 case -NFS4ERR_PNFS_NO_LAYOUT
:
1115 case -ESTALE
: /* mapped NFS4ERR_STALE */
1116 case -EBADHANDLE
: /* mapped NFS4ERR_BADHANDLE */
1117 case -EISDIR
: /* mapped NFS4ERR_ISDIR */
1118 case -NFS4ERR_FHEXPIRED
:
1119 case -NFS4ERR_WRONG_TYPE
:
1120 dprintk("%s Invalid layout error %d\n", __func__
,
1123 * Destroy layout so new i/o will get a new layout.
1124 * Layout will not be destroyed until all current lseg
1125 * references are put. Mark layout as invalid to resend failed
1126 * i/o and all i/o waiting on the slot table to the MDS until
1127 * layout is destroyed and a new valid layout is obtained.
1129 pnfs_destroy_layout(NFS_I(inode
));
1130 rpc_wake_up(&tbl
->slot_tbl_waitq
);
1132 /* RPC connection errors */
1142 dprintk("%s DS connection error %d\n", __func__
,
1144 nfs4_delete_deviceid(devid
->ld
, devid
->nfs_client
,
1146 rpc_wake_up(&tbl
->slot_tbl_waitq
);
1149 if (ff_layout_avoid_mds_available_ds(lseg
))
1150 return -NFS4ERR_RESET_TO_PNFS
;
1152 dprintk("%s Retry through MDS. Error %d\n", __func__
,
1154 return -NFS4ERR_RESET_TO_MDS
;
1156 task
->tk_status
= 0;
1160 /* Retry all errors through either pNFS or MDS except for -EJUKEBOX */
1161 static int ff_layout_async_handle_error_v3(struct rpc_task
*task
,
1162 struct pnfs_layout_segment
*lseg
,
1165 struct nfs4_deviceid_node
*devid
= FF_LAYOUT_DEVID_NODE(lseg
, idx
);
1167 switch (task
->tk_status
) {
1168 /* File access problems. Don't mark the device as unavailable */
1177 nfs_inc_stats(lseg
->pls_layout
->plh_inode
, NFSIOS_DELAY
);
1180 dprintk("%s DS connection error %d\n", __func__
,
1182 nfs4_delete_deviceid(devid
->ld
, devid
->nfs_client
,
1185 /* FIXME: Need to prevent infinite looping here. */
1186 return -NFS4ERR_RESET_TO_PNFS
;
1188 task
->tk_status
= 0;
1189 rpc_restart_call_prepare(task
);
1190 rpc_delay(task
, NFS_JUKEBOX_RETRY_TIME
);
1194 static int ff_layout_async_handle_error(struct rpc_task
*task
,
1195 struct nfs4_state
*state
,
1196 struct nfs_client
*clp
,
1197 struct pnfs_layout_segment
*lseg
,
1200 int vers
= clp
->cl_nfs_mod
->rpc_vers
->number
;
1202 if (task
->tk_status
>= 0) {
1203 ff_layout_mark_ds_reachable(lseg
, idx
);
1207 /* Handle the case of an invalid layout segment */
1208 if (!pnfs_is_valid_lseg(lseg
))
1209 return -NFS4ERR_RESET_TO_PNFS
;
1213 return ff_layout_async_handle_error_v3(task
, lseg
, idx
);
1215 return ff_layout_async_handle_error_v4(task
, state
, clp
,
1218 /* should never happen */
1224 static void ff_layout_io_track_ds_error(struct pnfs_layout_segment
*lseg
,
1225 u32 idx
, u64 offset
, u64 length
,
1226 u32
*op_status
, int opnum
, int error
)
1228 struct nfs4_ff_layout_mirror
*mirror
;
1229 u32 status
= *op_status
;
1236 case -EPROTONOSUPPORT
:
1250 *op_status
= status
= NFS4ERR_NXIO
;
1253 *op_status
= status
= NFS4ERR_ACCESS
;
1260 mirror
= FF_LAYOUT_COMP(lseg
, idx
);
1261 err
= ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg
->pls_layout
),
1262 mirror
, offset
, length
, status
, opnum
,
1270 ff_layout_mark_ds_unreachable(lseg
, idx
);
1272 * Don't return the layout if this is a read and we still
1273 * have layouts to try
1275 if (opnum
== OP_READ
)
1279 pnfs_error_mark_layout_for_return(lseg
->pls_layout
->plh_inode
,
1283 dprintk("%s: err %d op %d status %u\n", __func__
, err
, opnum
, status
);
1286 /* NFS_PROTO call done callback routines */
1287 static int ff_layout_read_done_cb(struct rpc_task
*task
,
1288 struct nfs_pgio_header
*hdr
)
1292 if (task
->tk_status
< 0) {
1293 ff_layout_io_track_ds_error(hdr
->lseg
, hdr
->pgio_mirror_idx
,
1294 hdr
->args
.offset
, hdr
->args
.count
,
1295 &hdr
->res
.op_status
, OP_READ
,
1297 trace_ff_layout_read_error(hdr
);
1300 err
= ff_layout_async_handle_error(task
, hdr
->args
.context
->state
,
1301 hdr
->ds_clp
, hdr
->lseg
,
1302 hdr
->pgio_mirror_idx
);
1304 trace_nfs4_pnfs_read(hdr
, err
);
1305 clear_bit(NFS_IOHDR_RESEND_PNFS
, &hdr
->flags
);
1306 clear_bit(NFS_IOHDR_RESEND_MDS
, &hdr
->flags
);
1308 case -NFS4ERR_RESET_TO_PNFS
:
1309 set_bit(NFS_IOHDR_RESEND_PNFS
, &hdr
->flags
);
1310 return task
->tk_status
;
1311 case -NFS4ERR_RESET_TO_MDS
:
1312 set_bit(NFS_IOHDR_RESEND_MDS
, &hdr
->flags
);
1313 return task
->tk_status
;
1320 rpc_restart_call_prepare(task
);
1325 ff_layout_need_layoutcommit(struct pnfs_layout_segment
*lseg
)
1327 return !(FF_LAYOUT_LSEG(lseg
)->flags
& FF_FLAGS_NO_LAYOUTCOMMIT
);
1331 * We reference the rpc_cred of the first WRITE that triggers the need for
1332 * a LAYOUTCOMMIT, and use it to send the layoutcommit compound.
1333 * rfc5661 is not clear about which credential should be used.
1335 * Flexlayout client should treat DS replied FILE_SYNC as DATA_SYNC, so
1336 * to follow http://www.rfc-editor.org/errata_search.php?rfc=5661&eid=2751
1337 * we always send layoutcommit after DS writes.
1340 ff_layout_set_layoutcommit(struct inode
*inode
,
1341 struct pnfs_layout_segment
*lseg
,
1344 if (!ff_layout_need_layoutcommit(lseg
))
1347 pnfs_set_layoutcommit(inode
, lseg
, end_offset
);
1348 dprintk("%s inode %lu pls_end_pos %llu\n", __func__
, inode
->i_ino
,
1349 (unsigned long long) NFS_I(inode
)->layout
->plh_lwb
);
/*
 * Starts read I/O accounting for the mirror selected by
 * hdr->pgio_mirror_idx via nfs4_ff_layout_stat_io_start_read().
 * NFS_IOHDR_STAT is test-and-set on hdr->flags beforehand; the matching
 * "done" helper test-and-clears the same bit.
 */
1352 static void ff_layout_read_record_layoutstats_start(struct rpc_task
*task
,
1353 struct nfs_pgio_header
*hdr
)
1355 if (test_and_set_bit(NFS_IOHDR_STAT
, &hdr
->flags
))
1357 nfs4_ff_layout_stat_io_start_read(hdr
->inode
,
1358 FF_LAYOUT_COMP(hdr
->lseg
, hdr
->pgio_mirror_idx
),
/*
 * Finishes read I/O accounting started by the "start" helper:
 * test-and-clears NFS_IOHDR_STAT, reports the end of the read to
 * nfs4_ff_layout_stat_io_end_read() for the mirror at pgio_mirror_idx,
 * and sets NFS_LSEG_LAYOUTRETURN on the segment's pls_flags.
 */
1363 static void ff_layout_read_record_layoutstats_done(struct rpc_task
*task
,
1364 struct nfs_pgio_header
*hdr
)
1366 if (!test_and_clear_bit(NFS_IOHDR_STAT
, &hdr
->flags
))
1368 nfs4_ff_layout_stat_io_end_read(task
,
1369 FF_LAYOUT_COMP(hdr
->lseg
, hdr
->pgio_mirror_idx
),
1372 set_bit(NFS_LSEG_LAYOUTRETURN
, &hdr
->lseg
->pls_flags
);
/*
 * Prepare step shared by the v3 and v4 read prepare callbacks.
 *
 * Exits the task with -EIO when the open context has NFS_CONTEXT_BAD set,
 * and with -EAGAIN when hdr->lseg is no longer a valid segment.
 * Otherwise starts layoutstats accounting for the read.  The int result
 * is tested by ff_layout_read_prepare_v3() before it starts the call.
 */
1375 static int ff_layout_read_prepare_common(struct rpc_task
*task
,
1376 struct nfs_pgio_header
*hdr
)
1378 if (unlikely(test_bit(NFS_CONTEXT_BAD
, &hdr
->args
.context
->flags
))) {
1379 rpc_exit(task
, -EIO
);
1383 if (!pnfs_is_valid_lseg(hdr
->lseg
)) {
1384 rpc_exit(task
, -EAGAIN
);
1388 ff_layout_read_record_layoutstats_start(task
, hdr
);
1393 * Call ops for the async read/write cases
1394 * In the case of dense layouts, the offset needs to be reset to its
/*
 * rpc_call_prepare callback for v3 reads: @data is the nfs_pgio_header.
 * Runs the common prepare checks and then calls rpc_call_start().
 */
1397 static void ff_layout_read_prepare_v3(struct rpc_task
*task
, void *data
)
1399 struct nfs_pgio_header
*hdr
= data
;
1401 if (ff_layout_read_prepare_common(task
, hdr
))
1404 rpc_call_start(task
);
/*
 * rpc_call_prepare callback for v4 reads: @data is the nfs_pgio_header.
 * Calls nfs4_setup_sequence() on the DS client (hdr->ds_clp) with the
 * header's seq_args, then runs the common prepare checks.
 */
1407 static void ff_layout_read_prepare_v4(struct rpc_task
*task
, void *data
)
1409 struct nfs_pgio_header
*hdr
= data
;
1411 if (nfs4_setup_sequence(hdr
->ds_clp
,
1412 &hdr
->args
.seq_args
,
1417 ff_layout_read_prepare_common(task
, hdr
);
/*
 * rpc_call_done callback for reads.  When NFS_IOHDR_REDO is set and the
 * task status is 0, nfs4_sequence_done() is invoked on the header's
 * seq_res; completion is then handed to hdr->mds_ops->rpc_call_done().
 */
1420 static void ff_layout_read_call_done(struct rpc_task
*task
, void *data
)
1422 struct nfs_pgio_header
*hdr
= data
;
1424 if (test_bit(NFS_IOHDR_REDO
, &hdr
->flags
) &&
1425 task
->tk_status
== 0) {
1426 nfs4_sequence_done(task
, &hdr
->res
.seq_res
);
1430 /* Note this may cause RPC to be resent */
1431 hdr
->mds_ops
->rpc_call_done(task
, hdr
);
/*
 * rpc_count_stats callback for reads: closes layoutstats accounting for
 * the header, then counts the task in the inode's client metrics slot for
 * NFSPROC4_CLNT_READ.
 */
1434 static void ff_layout_read_count_stats(struct rpc_task
*task
, void *data
)
1436 struct nfs_pgio_header
*hdr
= data
;
1438 ff_layout_read_record_layoutstats_done(task
, hdr
);
1439 rpc_count_iostats_metrics(task
,
1440 &NFS_CLIENT(hdr
->inode
)->cl_metrics
[NFSPROC4_CLNT_READ
]);
/*
 * rpc_release callback for reads.  Closes layoutstats accounting, then
 * acts on the resend flags left by the done callback:
 * NFS_IOHDR_RESEND_PNFS -> ff_layout_resend_pnfs_read(),
 * NFS_IOHDR_RESEND_MDS  -> ff_layout_reset_read().
 * Finally hands @data to pnfs_generic_rw_release().
 */
1443 static void ff_layout_read_release(void *data
)
1445 struct nfs_pgio_header
*hdr
= data
;
1447 ff_layout_read_record_layoutstats_done(&hdr
->task
, hdr
);
1448 if (test_bit(NFS_IOHDR_RESEND_PNFS
, &hdr
->flags
))
1449 ff_layout_resend_pnfs_read(hdr
);
1450 else if (test_bit(NFS_IOHDR_RESEND_MDS
, &hdr
->flags
))
1451 ff_layout_reset_read(hdr
);
1452 pnfs_generic_rw_release(data
);
/*
 * Completion callback for a WRITE issued through this layout driver.
 *
 * Error handling mirrors the read callback: a negative tk_status is
 * tracked (opnum OP_WRITE) and traced, ff_layout_async_handle_error()
 * is consulted, both NFS_IOHDR_RESEND_* bits are cleared, and the
 * RESET_TO_PNFS / RESET_TO_MDS cases set the matching bit and return
 * tk_status.
 *
 * On the remaining path end_offs (initially 0) is set to
 * mds_offset + res.count only when the verifier reports NFS_FILE_SYNC or
 * NFS_DATA_SYNC, the layoutcommit state is updated, the returned fattr is
 * invalidated, and nfs_writeback_update_inode() runs if tk_status >= 0.
 */
1456 static int ff_layout_write_done_cb(struct rpc_task
*task
,
1457 struct nfs_pgio_header
*hdr
)
1459 loff_t end_offs
= 0;
1462 if (task
->tk_status
< 0) {
1463 ff_layout_io_track_ds_error(hdr
->lseg
, hdr
->pgio_mirror_idx
,
1464 hdr
->args
.offset
, hdr
->args
.count
,
1465 &hdr
->res
.op_status
, OP_WRITE
,
1467 trace_ff_layout_write_error(hdr
);
1470 err
= ff_layout_async_handle_error(task
, hdr
->args
.context
->state
,
1471 hdr
->ds_clp
, hdr
->lseg
,
1472 hdr
->pgio_mirror_idx
);
1474 trace_nfs4_pnfs_write(hdr
, err
);
1475 clear_bit(NFS_IOHDR_RESEND_PNFS
, &hdr
->flags
);
1476 clear_bit(NFS_IOHDR_RESEND_MDS
, &hdr
->flags
);
1478 case -NFS4ERR_RESET_TO_PNFS
:
1479 set_bit(NFS_IOHDR_RESEND_PNFS
, &hdr
->flags
);
1480 return task
->tk_status
;
1481 case -NFS4ERR_RESET_TO_MDS
:
1482 set_bit(NFS_IOHDR_RESEND_MDS
, &hdr
->flags
);
1483 return task
->tk_status
;
1488 if (hdr
->res
.verf
->committed
== NFS_FILE_SYNC
||
1489 hdr
->res
.verf
->committed
== NFS_DATA_SYNC
)
1490 end_offs
= hdr
->mds_offset
+ (loff_t
)hdr
->res
.count
;
1492 /* Note: if the write is unstable, don't set end_offs until commit */
1493 ff_layout_set_layoutcommit(hdr
->inode
, hdr
->lseg
, end_offs
);
1495 /* zero out fattr since we don't care DS attr at all */
1496 hdr
->fattr
.valid
= 0;
1497 if (task
->tk_status
>= 0)
1498 nfs_writeback_update_inode(hdr
);
/*
 * Completion callback for a COMMIT issued through this layout driver.
 *
 * A negative tk_status is tracked (opnum OP_COMMIT, mirror index
 * data->ds_commit_index) and traced.  ff_layout_async_handle_error() is
 * called with a NULL state argument; both the RESET_TO_PNFS and
 * RESET_TO_MDS cases call pnfs_generic_prepare_to_resend_writes(), and one
 * path restarts the call.  The remaining path records a layoutcommit up
 * to data->lwb.
 */
1503 static int ff_layout_commit_done_cb(struct rpc_task
*task
,
1504 struct nfs_commit_data
*data
)
1508 if (task
->tk_status
< 0) {
1509 ff_layout_io_track_ds_error(data
->lseg
, data
->ds_commit_index
,
1510 data
->args
.offset
, data
->args
.count
,
1511 &data
->res
.op_status
, OP_COMMIT
,
1513 trace_ff_layout_commit_error(data
);
1516 err
= ff_layout_async_handle_error(task
, NULL
, data
->ds_clp
,
1517 data
->lseg
, data
->ds_commit_index
);
1519 trace_nfs4_pnfs_commit_ds(data
, err
);
1521 case -NFS4ERR_RESET_TO_PNFS
:
1522 pnfs_generic_prepare_to_resend_writes(data
);
1524 case -NFS4ERR_RESET_TO_MDS
:
1525 pnfs_generic_prepare_to_resend_writes(data
);
1528 rpc_restart_call_prepare(task
);
1532 ff_layout_set_layoutcommit(data
->inode
, data
->lseg
, data
->lwb
);
/*
 * Starts write I/O accounting for the mirror selected by
 * hdr->pgio_mirror_idx via nfs4_ff_layout_stat_io_start_write().
 * NFS_IOHDR_STAT is test-and-set on hdr->flags beforehand; the matching
 * "done" helper test-and-clears the same bit.
 */
1537 static void ff_layout_write_record_layoutstats_start(struct rpc_task
*task
,
1538 struct nfs_pgio_header
*hdr
)
1540 if (test_and_set_bit(NFS_IOHDR_STAT
, &hdr
->flags
))
1542 nfs4_ff_layout_stat_io_start_write(hdr
->inode
,
1543 FF_LAYOUT_COMP(hdr
->lseg
, hdr
->pgio_mirror_idx
),
/*
 * Finishes write I/O accounting: test-and-clears NFS_IOHDR_STAT, reports
 * the requested byte count (args.count), the completed byte count
 * (res.count) and the verifier's committed level to
 * nfs4_ff_layout_stat_io_end_write(), then sets NFS_LSEG_LAYOUTRETURN on
 * the segment's pls_flags.
 */
1548 static void ff_layout_write_record_layoutstats_done(struct rpc_task
*task
,
1549 struct nfs_pgio_header
*hdr
)
1551 if (!test_and_clear_bit(NFS_IOHDR_STAT
, &hdr
->flags
))
1553 nfs4_ff_layout_stat_io_end_write(task
,
1554 FF_LAYOUT_COMP(hdr
->lseg
, hdr
->pgio_mirror_idx
),
1555 hdr
->args
.count
, hdr
->res
.count
,
1556 hdr
->res
.verf
->committed
);
1557 set_bit(NFS_LSEG_LAYOUTRETURN
, &hdr
->lseg
->pls_flags
);
/*
 * Prepare step shared by the v3 and v4 write prepare callbacks.
 *
 * Exits the task with -EIO when the open context has NFS_CONTEXT_BAD set,
 * and with -EAGAIN when hdr->lseg is no longer a valid segment.
 * Otherwise starts layoutstats accounting for the write.  The int result
 * is tested by ff_layout_write_prepare_v3() before it starts the call.
 */
1560 static int ff_layout_write_prepare_common(struct rpc_task
*task
,
1561 struct nfs_pgio_header
*hdr
)
1563 if (unlikely(test_bit(NFS_CONTEXT_BAD
, &hdr
->args
.context
->flags
))) {
1564 rpc_exit(task
, -EIO
);
1568 if (!pnfs_is_valid_lseg(hdr
->lseg
)) {
1569 rpc_exit(task
, -EAGAIN
);
1573 ff_layout_write_record_layoutstats_start(task
, hdr
);
/*
 * rpc_call_prepare callback for v3 writes: @data is the nfs_pgio_header.
 * Runs the common prepare checks and then calls rpc_call_start().
 */
1577 static void ff_layout_write_prepare_v3(struct rpc_task
*task
, void *data
)
1579 struct nfs_pgio_header
*hdr
= data
;
1581 if (ff_layout_write_prepare_common(task
, hdr
))
1584 rpc_call_start(task
);
/*
 * rpc_call_prepare callback for v4 writes: @data is the nfs_pgio_header.
 * Calls nfs4_setup_sequence() on the DS client (hdr->ds_clp) with the
 * header's seq_args, then runs the common prepare checks.
 */
1587 static void ff_layout_write_prepare_v4(struct rpc_task
*task
, void *data
)
1589 struct nfs_pgio_header
*hdr
= data
;
1591 if (nfs4_setup_sequence(hdr
->ds_clp
,
1592 &hdr
->args
.seq_args
,
1597 ff_layout_write_prepare_common(task
, hdr
);
/*
 * rpc_call_done callback for writes.  When NFS_IOHDR_REDO is set and the
 * task status is 0, nfs4_sequence_done() is invoked on the header's
 * seq_res; completion is then handed to hdr->mds_ops->rpc_call_done().
 */
1600 static void ff_layout_write_call_done(struct rpc_task
*task
, void *data
)
1602 struct nfs_pgio_header
*hdr
= data
;
1604 if (test_bit(NFS_IOHDR_REDO
, &hdr
->flags
) &&
1605 task
->tk_status
== 0) {
1606 nfs4_sequence_done(task
, &hdr
->res
.seq_res
);
1610 /* Note this may cause RPC to be resent */
1611 hdr
->mds_ops
->rpc_call_done(task
, hdr
);
/*
 * rpc_count_stats callback for writes: closes layoutstats accounting for
 * the header, then counts the task in the inode's client metrics slot for
 * NFSPROC4_CLNT_WRITE.
 */
1614 static void ff_layout_write_count_stats(struct rpc_task
*task
, void *data
)
1616 struct nfs_pgio_header
*hdr
= data
;
1618 ff_layout_write_record_layoutstats_done(task
, hdr
);
1619 rpc_count_iostats_metrics(task
,
1620 &NFS_CLIENT(hdr
->inode
)->cl_metrics
[NFSPROC4_CLNT_WRITE
]);
/*
 * rpc_release callback for writes.  Closes layoutstats accounting, then
 * acts on the resend flags left by the done callback:
 * NFS_IOHDR_RESEND_PNFS -> send a layouterror for the segment and call
 *                          ff_layout_reset_write(hdr, true);
 * NFS_IOHDR_RESEND_MDS  -> ff_layout_reset_write(hdr, false).
 * Finally hands @data to pnfs_generic_rw_release().
 */
1623 static void ff_layout_write_release(void *data
)
1625 struct nfs_pgio_header
*hdr
= data
;
1627 ff_layout_write_record_layoutstats_done(&hdr
->task
, hdr
);
1628 if (test_bit(NFS_IOHDR_RESEND_PNFS
, &hdr
->flags
)) {
1629 ff_layout_send_layouterror(hdr
->lseg
);
1630 ff_layout_reset_write(hdr
, true);
1631 } else if (test_bit(NFS_IOHDR_RESEND_MDS
, &hdr
->flags
))
1632 ff_layout_reset_write(hdr
, false);
1633 pnfs_generic_rw_release(data
);
/*
 * Starts accounting for a COMMIT.  The commit is accounted with the
 * write-side helper nfs4_ff_layout_stat_io_start_write(), against the
 * mirror at cdata->ds_commit_index.  NFS_IOHDR_STAT is test-and-set on
 * cdata->flags beforehand.
 */
1636 static void ff_layout_commit_record_layoutstats_start(struct rpc_task
*task
,
1637 struct nfs_commit_data
*cdata
)
1639 if (test_and_set_bit(NFS_IOHDR_STAT
, &cdata
->flags
))
1641 nfs4_ff_layout_stat_io_start_write(cdata
->inode
,
1642 FF_LAYOUT_COMP(cdata
->lseg
, cdata
->ds_commit_index
),
/*
 * Finishes accounting for a COMMIT: test-and-clears NFS_IOHDR_STAT, and
 * when the task succeeded (tk_status == 0) sums wb_bytes over every
 * request on cdata->pages.  That total is reported to
 * nfs4_ff_layout_stat_io_end_write() as both the requested and the
 * completed byte count, with NFS_FILE_SYNC as the committed level.
 * Finally sets NFS_LSEG_LAYOUTRETURN on the segment's pls_flags.
 */
1646 static void ff_layout_commit_record_layoutstats_done(struct rpc_task
*task
,
1647 struct nfs_commit_data
*cdata
)
1649 struct nfs_page
*req
;
1652 if (!test_and_clear_bit(NFS_IOHDR_STAT
, &cdata
->flags
))
1655 if (task
->tk_status
== 0) {
1656 list_for_each_entry(req
, &cdata
->pages
, wb_list
)
1657 count
+= req
->wb_bytes
;
1659 nfs4_ff_layout_stat_io_end_write(task
,
1660 FF_LAYOUT_COMP(cdata
->lseg
, cdata
->ds_commit_index
),
1661 count
, count
, NFS_FILE_SYNC
);
1662 set_bit(NFS_LSEG_LAYOUTRETURN
, &cdata
->lseg
->pls_flags
);
/*
 * Prepare step shared by the v3 and v4 commit prepare callbacks: exits
 * the task with -EAGAIN when cdata->lseg is no longer a valid segment,
 * otherwise starts layoutstats accounting for the commit.
 */
1665 static int ff_layout_commit_prepare_common(struct rpc_task
*task
,
1666 struct nfs_commit_data
*cdata
)
1668 if (!pnfs_is_valid_lseg(cdata
->lseg
)) {
1669 rpc_exit(task
, -EAGAIN
);
1673 ff_layout_commit_record_layoutstats_start(task
, cdata
);
/*
 * rpc_call_prepare callback for v3 commits: runs the common prepare
 * checks on @data and then calls rpc_call_start().
 */
1677 static void ff_layout_commit_prepare_v3(struct rpc_task
*task
, void *data
)
1679 if (ff_layout_commit_prepare_common(task
, data
))
1682 rpc_call_start(task
);
/*
 * rpc_call_prepare callback for v4 commits: @data is the nfs_commit_data.
 * Calls nfs4_setup_sequence() on the DS client with the commit's seq_args
 * and seq_res, then runs the common prepare checks.
 */
1685 static void ff_layout_commit_prepare_v4(struct rpc_task
*task
, void *data
)
1687 struct nfs_commit_data
*wdata
= data
;
1689 if (nfs4_setup_sequence(wdata
->ds_clp
,
1690 &wdata
->args
.seq_args
,
1691 &wdata
->res
.seq_res
,
1694 ff_layout_commit_prepare_common(task
, data
);
/*
 * rpc_call_done callback for commits: forwards to
 * pnfs_generic_write_commit_done().
 */
1697 static void ff_layout_commit_done(struct rpc_task
*task
, void *data
)
1699 pnfs_generic_write_commit_done(task
, data
);
/*
 * rpc_count_stats callback for commits: closes layoutstats accounting,
 * then counts the task in the inode's client metrics slot for
 * NFSPROC4_CLNT_COMMIT.
 */
1702 static void ff_layout_commit_count_stats(struct rpc_task
*task
, void *data
)
1704 struct nfs_commit_data
*cdata
= data
;
1706 ff_layout_commit_record_layoutstats_done(task
, cdata
);
1707 rpc_count_iostats_metrics(task
,
1708 &NFS_CLIENT(cdata
->inode
)->cl_metrics
[NFSPROC4_CLNT_COMMIT
]);
/*
 * rpc_release callback for commits: closes layoutstats accounting (using
 * the task embedded in the commit data) and then hands @data to
 * pnfs_generic_commit_release().
 */
1711 static void ff_layout_commit_release(void *data
)
1713 struct nfs_commit_data
*cdata
= data
;
1715 ff_layout_commit_record_layoutstats_done(&cdata
->task
, cdata
);
1716 pnfs_generic_commit_release(data
);
/*
 * RPC callbacks for reads when the DS version is 3.  Differs from the v4
 * table only in the prepare hook.
 */
1719 static const struct rpc_call_ops ff_layout_read_call_ops_v3
= {
1720 .rpc_call_prepare
= ff_layout_read_prepare_v3
,
1721 .rpc_call_done
= ff_layout_read_call_done
,
1722 .rpc_count_stats
= ff_layout_read_count_stats
,
1723 .rpc_release
= ff_layout_read_release
,
/*
 * RPC callbacks for reads when the DS version is not 3.  Differs from the
 * v3 table only in the prepare hook.
 */
1726 static const struct rpc_call_ops ff_layout_read_call_ops_v4
= {
1727 .rpc_call_prepare
= ff_layout_read_prepare_v4
,
1728 .rpc_call_done
= ff_layout_read_call_done
,
1729 .rpc_count_stats
= ff_layout_read_count_stats
,
1730 .rpc_release
= ff_layout_read_release
,
/*
 * RPC callbacks for writes when the DS version is 3.  Differs from the v4
 * table only in the prepare hook.
 */
1733 static const struct rpc_call_ops ff_layout_write_call_ops_v3
= {
1734 .rpc_call_prepare
= ff_layout_write_prepare_v3
,
1735 .rpc_call_done
= ff_layout_write_call_done
,
1736 .rpc_count_stats
= ff_layout_write_count_stats
,
1737 .rpc_release
= ff_layout_write_release
,
/*
 * RPC callbacks for writes when the DS version is not 3.  Differs from
 * the v3 table only in the prepare hook.
 */
1740 static const struct rpc_call_ops ff_layout_write_call_ops_v4
= {
1741 .rpc_call_prepare
= ff_layout_write_prepare_v4
,
1742 .rpc_call_done
= ff_layout_write_call_done
,
1743 .rpc_count_stats
= ff_layout_write_count_stats
,
1744 .rpc_release
= ff_layout_write_release
,
/*
 * RPC callbacks for commits when the DS version is 3.  Differs from the
 * v4 table only in the prepare hook.
 */
1747 static const struct rpc_call_ops ff_layout_commit_call_ops_v3
= {
1748 .rpc_call_prepare
= ff_layout_commit_prepare_v3
,
1749 .rpc_call_done
= ff_layout_commit_done
,
1750 .rpc_count_stats
= ff_layout_commit_count_stats
,
1751 .rpc_release
= ff_layout_commit_release
,
/*
 * RPC callbacks for commits when the DS version is not 3.  Differs from
 * the v3 table only in the prepare hook.
 */
1754 static const struct rpc_call_ops ff_layout_commit_call_ops_v4
= {
1755 .rpc_call_prepare
= ff_layout_commit_prepare_v4
,
1756 .rpc_call_done
= ff_layout_commit_done
,
1757 .rpc_count_stats
= ff_layout_commit_count_stats
,
1758 .rpc_release
= ff_layout_commit_release
,
/*
 * Layout driver read_pagelist hook: issues an asynchronous READ for @hdr.
 *
 * Looks up the mirror at hdr->pgio_mirror_idx, prepares its DS
 * (nfs4_ff_layout_prepare_ds() with a false third argument), obtains an
 * RPC client and credential for it, and determines the DS version.  It
 * then installs ff_layout_read_done_cb, takes a reference on the DS
 * client's cl_count, stores the client in hdr->ds_clp, selects the file
 * handle and stateid, and starts the I/O with nfs_initiate_pgio() using
 * the v3 or v4 call ops and RPC_TASK_SOFTCONN.
 *
 * Returns PNFS_ATTEMPTED once the I/O has been initiated.  The other
 * visible exit returns PNFS_TRY_AGAIN when
 * ff_layout_avoid_mds_available_ds() is true, else traces the MDS
 * fallback and returns PNFS_NOT_ATTEMPTED.
 */
1761 static enum pnfs_try_status
1762 ff_layout_read_pagelist(struct nfs_pgio_header
*hdr
)
1764 struct pnfs_layout_segment
*lseg
= hdr
->lseg
;
1765 struct nfs4_pnfs_ds
*ds
;
1766 struct rpc_clnt
*ds_clnt
;
1767 struct nfs4_ff_layout_mirror
*mirror
;
1768 const struct cred
*ds_cred
;
1769 loff_t offset
= hdr
->args
.offset
;
1770 u32 idx
= hdr
->pgio_mirror_idx
;
1774 dprintk("--> %s ino %lu pgbase %u req %zu@%llu\n",
1775 __func__
, hdr
->inode
->i_ino
,
1776 hdr
->args
.pgbase
, (size_t)hdr
->args
.count
, offset
);
1778 mirror
= FF_LAYOUT_COMP(lseg
, idx
);
1779 ds
= nfs4_ff_layout_prepare_ds(lseg
, mirror
, false);
1783 ds_clnt
= nfs4_ff_find_or_create_ds_client(mirror
, ds
->ds_clp
,
1785 if (IS_ERR(ds_clnt
))
1788 ds_cred
= ff_layout_get_ds_cred(mirror
, &lseg
->pls_range
, hdr
->cred
);
1792 vers
= nfs4_ff_layout_ds_version(mirror
);
1794 dprintk("%s USE DS: %s cl_count %d vers %d\n", __func__
,
1795 ds
->ds_remotestr
, refcount_read(&ds
->ds_clp
->cl_count
), vers
);
1797 hdr
->pgio_done_cb
= ff_layout_read_done_cb
;
1798 refcount_inc(&ds
->ds_clp
->cl_count
);
1799 hdr
->ds_clp
= ds
->ds_clp
;
1800 fh
= nfs4_ff_layout_select_ds_fh(mirror
);
1804 nfs4_ff_layout_select_ds_stateid(mirror
, &hdr
->args
.stateid
);
1807 * Note that if we ever decide to split across DSes,
1808 * then we may need to handle dense-like offsets.
1810 hdr
->args
.offset
= offset
;
1811 hdr
->mds_offset
= offset
;
1813 /* Perform an asynchronous read to ds */
1814 nfs_initiate_pgio(ds_clnt
, hdr
, ds_cred
, ds
->ds_clp
->rpc_ops
,
1815 vers
== 3 ? &ff_layout_read_call_ops_v3
:
1816 &ff_layout_read_call_ops_v4
,
1817 0, RPC_TASK_SOFTCONN
);
1819 return PNFS_ATTEMPTED
;
1822 if (ff_layout_avoid_mds_available_ds(lseg
))
1823 return PNFS_TRY_AGAIN
;
1824 trace_pnfs_mds_fallback_read_pagelist(hdr
->inode
,
1825 hdr
->args
.offset
, hdr
->args
.count
,
1826 IOMODE_READ
, NFS_I(hdr
->inode
)->layout
, lseg
);
1827 return PNFS_NOT_ATTEMPTED
;
1830 /* Perform async writes. */
/*
 * Layout driver write_pagelist hook: issues an asynchronous WRITE for
 * @hdr, passing @sync through to nfs_initiate_pgio().
 *
 * Follows the same sequence as the read path, with these differences:
 * nfs4_ff_layout_prepare_ds() is called with a true third argument,
 * ff_layout_write_done_cb is installed, hdr->ds_commit_idx is set to the
 * mirror index, and the write call ops are used.
 *
 * Returns PNFS_ATTEMPTED once the I/O has been initiated.  The other
 * visible exit returns PNFS_TRY_AGAIN when
 * ff_layout_avoid_mds_available_ds() is true, else traces the MDS
 * fallback (IOMODE_RW) and returns PNFS_NOT_ATTEMPTED.
 */
1831 static enum pnfs_try_status
1832 ff_layout_write_pagelist(struct nfs_pgio_header
*hdr
, int sync
)
1834 struct pnfs_layout_segment
*lseg
= hdr
->lseg
;
1835 struct nfs4_pnfs_ds
*ds
;
1836 struct rpc_clnt
*ds_clnt
;
1837 struct nfs4_ff_layout_mirror
*mirror
;
1838 const struct cred
*ds_cred
;
1839 loff_t offset
= hdr
->args
.offset
;
1842 u32 idx
= hdr
->pgio_mirror_idx
;
1844 mirror
= FF_LAYOUT_COMP(lseg
, idx
);
1845 ds
= nfs4_ff_layout_prepare_ds(lseg
, mirror
, true);
1849 ds_clnt
= nfs4_ff_find_or_create_ds_client(mirror
, ds
->ds_clp
,
1851 if (IS_ERR(ds_clnt
))
1854 ds_cred
= ff_layout_get_ds_cred(mirror
, &lseg
->pls_range
, hdr
->cred
);
1858 vers
= nfs4_ff_layout_ds_version(mirror
);
1860 dprintk("%s ino %lu sync %d req %zu@%llu DS: %s cl_count %d vers %d\n",
1861 __func__
, hdr
->inode
->i_ino
, sync
, (size_t) hdr
->args
.count
,
1862 offset
, ds
->ds_remotestr
, refcount_read(&ds
->ds_clp
->cl_count
),
1865 hdr
->pgio_done_cb
= ff_layout_write_done_cb
;
1866 refcount_inc(&ds
->ds_clp
->cl_count
);
1867 hdr
->ds_clp
= ds
->ds_clp
;
1868 hdr
->ds_commit_idx
= idx
;
1869 fh
= nfs4_ff_layout_select_ds_fh(mirror
);
1873 nfs4_ff_layout_select_ds_stateid(mirror
, &hdr
->args
.stateid
);
1876 * Note that if we ever decide to split across DSes,
1877 * then we may need to handle dense-like offsets.
1879 hdr
->args
.offset
= offset
;
1881 /* Perform an asynchronous write */
1882 nfs_initiate_pgio(ds_clnt
, hdr
, ds_cred
, ds
->ds_clp
->rpc_ops
,
1883 vers
== 3 ? &ff_layout_write_call_ops_v3
:
1884 &ff_layout_write_call_ops_v4
,
1885 sync
, RPC_TASK_SOFTCONN
);
1887 return PNFS_ATTEMPTED
;
1890 if (ff_layout_avoid_mds_available_ds(lseg
))
1891 return PNFS_TRY_AGAIN
;
1892 trace_pnfs_mds_fallback_write_pagelist(hdr
->inode
,
1893 hdr
->args
.offset
, hdr
->args
.count
,
1894 IOMODE_RW
, NFS_I(hdr
->inode
)->layout
, lseg
);
1895 return PNFS_NOT_ATTEMPTED
;
/*
 * Yields the u32 index that ff_layout_initiate_commit() passes to
 * FF_LAYOUT_COMP() for a commit whose ds_commit_index is @i.
 * NOTE(review): the body is not visible here, so the exact mapping from
 * @i to the result is unconfirmed.
 */
1898 static u32
calc_ds_index_from_commit(struct pnfs_layout_segment
*lseg
, u32 i
)
/*
 * Returns the address of the first file handle (fh_versions[0]) of mirror
 * @i in the flexfile segment behind @lseg.  No bounds check on @i is
 * visible in this function.
 */
1903 static struct nfs_fh
*
1904 select_ds_fh_from_commit(struct pnfs_layout_segment
*lseg
, u32 i
)
1906 struct nfs4_ff_layout_segment
*flseg
= FF_LAYOUT_LSEG(lseg
);
1908 /* FIXME: Assume that there is only one NFS version available
1911 return &flseg
->mirror_array
[i
]->fh_versions
[0];
/*
 * Initiates a COMMIT for @data; used as the per-bucket callback passed to
 * pnfs_generic_commit_pagelist().
 *
 * The first check looks for a missing segment, or one that is neither
 * valid nor flagged NFS_LSEG_LAYOUTRETURN.  The mirror is then looked up
 * from data->ds_commit_index, its DS prepared (true third argument), and
 * an RPC client, credential and DS version obtained.
 * ff_layout_commit_done_cb is installed, data->cred is replaced with the
 * DS credential, a reference is taken on the DS client's cl_count, and
 * the commit is started with nfs_initiate_commit() using the v3 or v4
 * call ops and RPC_TASK_SOFTCONN.
 *
 * The trailing path calls pnfs_generic_prepare_to_resend_writes() and
 * pnfs_generic_commit_release() on @data.
 */
1914 static int ff_layout_initiate_commit(struct nfs_commit_data
*data
, int how
)
1916 struct pnfs_layout_segment
*lseg
= data
->lseg
;
1917 struct nfs4_pnfs_ds
*ds
;
1918 struct rpc_clnt
*ds_clnt
;
1919 struct nfs4_ff_layout_mirror
*mirror
;
1920 const struct cred
*ds_cred
;
1925 if (!lseg
|| !(pnfs_is_valid_lseg(lseg
) ||
1926 test_bit(NFS_LSEG_LAYOUTRETURN
, &lseg
->pls_flags
)))
1929 idx
= calc_ds_index_from_commit(lseg
, data
->ds_commit_index
);
1930 mirror
= FF_LAYOUT_COMP(lseg
, idx
);
1931 ds
= nfs4_ff_layout_prepare_ds(lseg
, mirror
, true);
1935 ds_clnt
= nfs4_ff_find_or_create_ds_client(mirror
, ds
->ds_clp
,
1937 if (IS_ERR(ds_clnt
))
1940 ds_cred
= ff_layout_get_ds_cred(mirror
, &lseg
->pls_range
, data
->cred
);
1944 vers
= nfs4_ff_layout_ds_version(mirror
);
1946 dprintk("%s ino %lu, how %d cl_count %d vers %d\n", __func__
,
1947 data
->inode
->i_ino
, how
, refcount_read(&ds
->ds_clp
->cl_count
),
1949 data
->commit_done_cb
= ff_layout_commit_done_cb
;
1950 data
->cred
= ds_cred
;
1951 refcount_inc(&ds
->ds_clp
->cl_count
);
1952 data
->ds_clp
= ds
->ds_clp
;
1953 fh
= select_ds_fh_from_commit(lseg
, data
->ds_commit_index
);
1957 ret
= nfs_initiate_commit(ds_clnt
, data
, ds
->ds_clp
->rpc_ops
,
1958 vers
== 3 ? &ff_layout_commit_call_ops_v3
:
1959 &ff_layout_commit_call_ops_v4
,
1960 how
, RPC_TASK_SOFTCONN
);
1964 pnfs_generic_prepare_to_resend_writes(data
);
1965 pnfs_generic_commit_release(data
);
/*
 * commit_pagelist hook: delegates to pnfs_generic_commit_pagelist(),
 * supplying ff_layout_initiate_commit as the initiate callback.
 */
1970 ff_layout_commit_pagelist(struct inode
*inode
, struct list_head
*mds_pages
,
1971 int how
, struct nfs_commit_info
*cinfo
)
1973 return pnfs_generic_commit_pagelist(inode
, mds_pages
, how
, cinfo
,
1974 ff_layout_initiate_commit
);
/*
 * True when the read/write header @hdr belongs to @lseg.  @task is not
 * consulted.
 */
1977 static bool ff_layout_match_rw(const struct rpc_task
*task
,
1978 const struct nfs_pgio_header
*hdr
,
1979 const struct pnfs_layout_segment
*lseg
)
1981 return hdr
->lseg
== lseg
;
/*
 * True when the commit data @cdata belongs to @lseg.  @task is not
 * consulted.
 */
1984 static bool ff_layout_match_commit(const struct rpc_task
*task
,
1985 const struct nfs_commit_data
*cdata
,
1986 const struct pnfs_layout_segment
*lseg
)
1988 return cdata
->lseg
== lseg
;
/*
 * Task-matching predicate handed to rpc_cancel_tasks(); @data is the
 * layout segment being cancelled.
 *
 * The task's tk_ops pointer identifies what kind of calldata it carries:
 * tasks using one of this file's read/write call-ops tables are matched
 * with ff_layout_match_rw(), tasks using a commit table with
 * ff_layout_match_commit().
 */
1991 static bool ff_layout_match_io(const struct rpc_task
*task
, const void *data
)
1993 const struct rpc_call_ops
*ops
= task
->tk_ops
;
1995 if (ops
== &ff_layout_read_call_ops_v3
||
1996 ops
== &ff_layout_read_call_ops_v4
||
1997 ops
== &ff_layout_write_call_ops_v3
||
1998 ops
== &ff_layout_write_call_ops_v4
)
1999 return ff_layout_match_rw(task
, task
->tk_calldata
, data
);
2000 if (ops
== &ff_layout_commit_call_ops_v3
||
2001 ops
== &ff_layout_commit_call_ops_v4
)
2002 return ff_layout_match_commit(task
, task
->tk_calldata
, data
);
/*
 * cancel_io hook: walks every mirror of the flexfile segment behind @lseg
 * and, via mirror_ds -> ds -> ds_clp -> cl_rpcclient, reaches the RPC
 * client used for that mirror.  Tasks on that client that match @lseg
 * (see ff_layout_match_io) are cancelled with -EAGAIN; the result of
 * rpc_cancel_tasks() is tested before rpc_clnt_disconnect() is called.
 */
2006 static void ff_layout_cancel_io(struct pnfs_layout_segment
*lseg
)
2008 struct nfs4_ff_layout_segment
*flseg
= FF_LAYOUT_LSEG(lseg
);
2009 struct nfs4_ff_layout_mirror
*mirror
;
2010 struct nfs4_ff_layout_ds
*mirror_ds
;
2011 struct nfs4_pnfs_ds
*ds
;
2012 struct nfs_client
*ds_clp
;
2013 struct rpc_clnt
*clnt
;
2016 for (idx
= 0; idx
< flseg
->mirror_array_cnt
; idx
++) {
2017 mirror
= flseg
->mirror_array
[idx
];
2018 mirror_ds
= mirror
->mirror_ds
;
2021 ds
= mirror
->mirror_ds
->ds
;
2024 ds_clp
= ds
->ds_clp
;
2027 clnt
= ds_clp
->cl_rpcclient
;
2030 if (!rpc_cancel_tasks(clnt
, -EAGAIN
, ff_layout_match_io
, lseg
))
2032 rpc_clnt_disconnect(clnt
);
/*
 * get_ds_info hook: returns the commit_info embedded in the flexfile
 * layout attached to @inode.
 */
2036 static struct pnfs_ds_commit_info
*
2037 ff_layout_get_ds_info(struct inode
*inode
)
2039 struct pnfs_layout_hdr
*layout
= NFS_I(inode
)->layout
;
2044 return &FF_LAYOUT_FROM_HDR(layout
)->commit_info
;
/*
 * setup_ds_info hook: allocates a commit array sized by the segment's
 * mirror count and offers it to pnfs_add_commit_array() under the inode's
 * i_lock.  pnfs_free_commit_array(new) is called afterwards on a path
 * whose condition is not visible here (presumably when the returned array
 * is not the new one - confirm).
 */
2048 ff_layout_setup_ds_info(struct pnfs_ds_commit_info
*fl_cinfo
,
2049 struct pnfs_layout_segment
*lseg
)
2051 struct nfs4_ff_layout_segment
*flseg
= FF_LAYOUT_LSEG(lseg
);
2052 struct inode
*inode
= lseg
->pls_layout
->plh_inode
;
2053 struct pnfs_commit_array
*array
, *new;
2055 new = pnfs_alloc_commit_array(flseg
->mirror_array_cnt
,
2058 spin_lock(&inode
->i_lock
);
2059 array
= pnfs_add_commit_array(fl_cinfo
, new, lseg
);
2060 spin_unlock(&inode
->i_lock
);
2062 pnfs_free_commit_array(new);
/*
 * release_ds_info hook: destroys the commit info with
 * pnfs_generic_ds_cinfo_destroy() while holding the inode's i_lock.
 */
2067 ff_layout_release_ds_info(struct pnfs_ds_commit_info
*fl_cinfo
,
2068 struct inode
*inode
)
2070 spin_lock(&inode
->i_lock
);
2071 pnfs_generic_ds_cinfo_destroy(fl_cinfo
);
2072 spin_unlock(&inode
->i_lock
);
/*
 * free_deviceid_node hook: recovers the enclosing nfs4_ff_layout_ds from
 * the generic node @d and frees it with nfs4_ff_layout_free_deviceid().
 */
2076 ff_layout_free_deviceid_node(struct nfs4_deviceid_node
*d
)
2078 nfs4_ff_layout_free_deviceid(container_of(d
, struct nfs4_ff_layout_ds
,
/*
 * Encodes the I/O error list of a LAYOUTRETURN: a 4-byte big-endian count
 * (ff_args->num_errors) followed by the entries written by
 * ff_layout_encode_ds_ioerr(), whose result is returned.  A failed
 * 4-byte reservation is checked for explicitly.  @args is not used in the
 * visible lines.
 */
2082 static int ff_layout_encode_ioerr(struct xdr_stream
*xdr
,
2083 const struct nfs4_layoutreturn_args
*args
,
2084 const struct nfs4_flexfile_layoutreturn_args
*ff_args
)
2088 start
= xdr_reserve_space(xdr
, 4);
2089 if (unlikely(!start
))
2092 *start
= cpu_to_be32(ff_args
->num_errors
);
2093 /* This assume we always return _ALL_ layouts */
2094 return ff_layout_encode_ds_ioerr(xdr
, &ff_args
->errors
);
/*
 * Writes @len bytes from @buf as a fixed-length opaque.  A negative
 * result from xdr_stream_encode_opaque_fixed() triggers a one-time
 * warning; no error is propagated to the caller.
 */
2098 encode_opaque_fixed(struct xdr_stream
*xdr
, const void *buf
, size_t len
)
2100 WARN_ON_ONCE(xdr_stream_encode_opaque_fixed(xdr
, buf
, len
) < 0);
/*
 * Encodes the fixed leading part of one per-device I/O statistics entry,
 * in this order: offset and length (two 64-bit values), the stateid as a
 * fixed opaque of NFS4_STATEID_SIZE bytes, read_count, read_bytes,
 * write_count and write_bytes (four 64-bit values), and the device id as
 * a fixed opaque of NFS4_DEVICEID4_SIZE bytes.
 * The visible lines do not test the xdr_reserve_space() results.
 */
2104 ff_layout_encode_ff_iostat_head(struct xdr_stream
*xdr
,
2105 const nfs4_stateid
*stateid
,
2106 const struct nfs42_layoutstat_devinfo
*devinfo
)
2110 p
= xdr_reserve_space(xdr
, 8 + 8);
2111 p
= xdr_encode_hyper(p
, devinfo
->offset
);
2112 p
= xdr_encode_hyper(p
, devinfo
->length
);
2113 encode_opaque_fixed(xdr
, stateid
->data
, NFS4_STATEID_SIZE
);
2114 p
= xdr_reserve_space(xdr
, 4*8);
2115 p
= xdr_encode_hyper(p
, devinfo
->read_count
);
2116 p
= xdr_encode_hyper(p
, devinfo
->read_bytes
);
2117 p
= xdr_encode_hyper(p
, devinfo
->write_count
);
2118 p
= xdr_encode_hyper(p
, devinfo
->write_bytes
);
2119 encode_opaque_fixed(xdr
, devinfo
->dev_id
.data
, NFS4_DEVICEID4_SIZE
);
/*
 * Encodes one complete per-device statistics entry: the fixed head
 * followed by the layoutupdate body.  devinfo->ld_private.data is passed
 * as the mirror argument of ff_layout_encode_ff_layoutupdate().
 */
2123 ff_layout_encode_ff_iostat(struct xdr_stream
*xdr
,
2124 const nfs4_stateid
*stateid
,
2125 const struct nfs42_layoutstat_devinfo
*devinfo
)
2127 ff_layout_encode_ff_iostat_head(xdr
, stateid
, devinfo
);
2128 ff_layout_encode_ff_layoutupdate(xdr
, devinfo
,
2129 devinfo
->ld_private
.data
);
2132 /* report nothing for now */
/*
 * Encodes the statistics array of a LAYOUTRETURN: a 4-byte big-endian
 * count (ff_args->num_dev) followed by one entry per device, each encoded
 * with the layout's plh_stateid.
 */
2133 static void ff_layout_encode_iostats_array(struct xdr_stream
*xdr
,
2134 const struct nfs4_layoutreturn_args
*args
,
2135 struct nfs4_flexfile_layoutreturn_args
*ff_args
)
2140 p
= xdr_reserve_space(xdr
, 4);
2141 *p
= cpu_to_be32(ff_args
->num_dev
);
2142 for (i
= 0; i
< ff_args
->num_dev
; i
++)
2143 ff_layout_encode_ff_iostat(xdr
,
2144 &args
->layout
->plh_stateid
,
2145 &ff_args
->devinfo
[i
]);
/*
 * Releases the private data of the first @num_entries elements of
 * @devinfo by calling each element's ld_private.ops->free().  Both the
 * ops pointer and its free member are tested for NULL before the call.
 */
2149 ff_layout_free_iostats_array(struct nfs42_layoutstat_devinfo
*devinfo
,
2150 unsigned int num_entries
)
2154 for (i
= 0; i
< num_entries
; i
++) {
2155 if (!devinfo
[i
].ld_private
.ops
)
2157 if (!devinfo
[i
].ld_private
.ops
->free
)
2159 devinfo
[i
].ld_private
.ops
->free(&devinfo
[i
].ld_private
);
/*
 * alloc_deviceid_node hook: builds a flexfile DS from @pdev with
 * nfs4_ff_alloc_deviceid_node() and returns its embedded generic id_node.
 */
2163 static struct nfs4_deviceid_node
*
2164 ff_layout_alloc_deviceid_node(struct nfs_server
*server
,
2165 struct pnfs_device
*pdev
, gfp_t gfp_flags
)
2167 struct nfs4_ff_layout_ds
*dsaddr
;
2169 dsaddr
= nfs4_ff_alloc_deviceid_node(server
, pdev
, gfp_flags
);
2172 return &dsaddr
->id_node
;
/*
 * Encode callback for the flexfile LAYOUTRETURN body.
 *
 * The error list and statistics array are first encoded into a temporary
 * xdr_buf backed by ff_args->pages[0], with buflen set to PAGE_SIZE.
 * The resulting length is then written to @xdr as a 4-byte big-endian
 * value and the page is attached with xdr_write_pages().
 * The visible lines do not check the results of the two encode helpers or
 * of xdr_reserve_space().
 */
2176 ff_layout_encode_layoutreturn(struct xdr_stream
*xdr
,
2177 const void *voidargs
,
2178 const struct nfs4_xdr_opaque_data
*ff_opaque
)
2180 const struct nfs4_layoutreturn_args
*args
= voidargs
;
2181 struct nfs4_flexfile_layoutreturn_args
*ff_args
= ff_opaque
->data
;
2182 struct xdr_buf tmp_buf
= {
2185 .iov_base
= page_address(ff_args
->pages
[0]),
2188 .buflen
= PAGE_SIZE
,
2190 struct xdr_stream tmp_xdr
;
2193 dprintk("%s: Begin\n", __func__
);
2195 xdr_init_encode(&tmp_xdr
, &tmp_buf
, NULL
, NULL
);
2197 ff_layout_encode_ioerr(&tmp_xdr
, args
, ff_args
);
2198 ff_layout_encode_iostats_array(&tmp_xdr
, args
, ff_args
);
2200 start
= xdr_reserve_space(xdr
, 4);
2201 *start
= cpu_to_be32(tmp_buf
.len
);
2202 xdr_write_pages(xdr
, ff_args
->pages
, 0, tmp_buf
.len
);
2204 dprintk("%s: Return\n", __func__
);
/*
 * Free callback for the flexfile LAYOUTRETURN private data: releases the
 * collected DS error list, the per-device statistics entries, and the
 * scratch page held in ff_args->pages[0].
 */
2208 ff_layout_free_layoutreturn(struct nfs4_xdr_opaque_data
*args
)
2210 struct nfs4_flexfile_layoutreturn_args
*ff_args
;
2214 ff_args
= args
->data
;
2217 ff_layout_free_ds_ioerr(&ff_args
->errors
);
2218 ff_layout_free_iostats_array(ff_args
->devinfo
, ff_args
->num_dev
);
2220 put_page(ff_args
->pages
[0]);
/*
 * Opaque-data callbacks installed on a LAYOUTRETURN's ld_private by
 * ff_layout_prepare_layoutreturn().
 */
2224 static const struct nfs4_xdr_opaque_ops layoutreturn_ops
= {
2225 .encode
= ff_layout_encode_layoutreturn
,
2226 .free
= ff_layout_free_layoutreturn
,
/*
 * prepare_layoutreturn hook: gathers the flexfile-specific payload for a
 * LAYOUTRETURN.
 *
 * Allocates the argument structure and one scratch page (both with
 * nfs_io_gfp_mask()), collects up to FF_LAYOUTRETURN_MAXERR DS errors for
 * args->range, and - under the inode's i_lock - fills the devinfo array
 * with mirror statistics (NFS4_FF_OP_LAYOUTRETURN).  The result is
 * published through args->ld_private with layoutreturn_ops.
 * A failed page allocation jumps to out_nomem_free.
 */
2230 ff_layout_prepare_layoutreturn(struct nfs4_layoutreturn_args
*args
)
2232 struct nfs4_flexfile_layoutreturn_args
*ff_args
;
2233 struct nfs4_flexfile_layout
*ff_layout
= FF_LAYOUT_FROM_HDR(args
->layout
);
2235 ff_args
= kmalloc(sizeof(*ff_args
), nfs_io_gfp_mask());
2238 ff_args
->pages
[0] = alloc_page(nfs_io_gfp_mask());
2239 if (!ff_args
->pages
[0])
2240 goto out_nomem_free
;
2242 INIT_LIST_HEAD(&ff_args
->errors
);
2243 ff_args
->num_errors
= ff_layout_fetch_ds_ioerr(args
->layout
,
2244 &args
->range
, &ff_args
->errors
,
2245 FF_LAYOUTRETURN_MAXERR
);
2247 spin_lock(&args
->inode
->i_lock
);
2248 ff_args
->num_dev
= ff_layout_mirror_prepare_stats(
2249 &ff_layout
->generic_hdr
, &ff_args
->devinfo
[0],
2250 ARRAY_SIZE(ff_args
->devinfo
), NFS4_FF_OP_LAYOUTRETURN
);
2251 spin_unlock(&args
->inode
->i_lock
);
2253 args
->ld_private
->ops
= &layoutreturn_ops
;
2254 args
->ld_private
->data
= ff_args
;
2262 #ifdef CONFIG_NFS_V4_2
/*
 * CONFIG_NFS_V4_2 variant: reports the DS errors recorded for @lseg's
 * range to the server with LAYOUTERROR.
 *
 * The server's NFS_CAP_LAYOUTERROR capability is tested first.
 * The errors are moved onto a local list (limit argument -1) and, if any
 * exist, copied into an array of NFS42_LAYOUTERROR_MAX entries: offset,
 * length, stateid, and in errors[0] the device id, status and opnum.
 * The array is handed to nfs42_proc_layouterror() with the current count
 * n; the visible condition before that call tests whether the current
 * entry is the last one and whether n is still below
 * NFS42_LAYOUTERROR_MAX.  The local list is freed at the end.
 */
2264 ff_layout_send_layouterror(struct pnfs_layout_segment
*lseg
)
2266 struct pnfs_layout_hdr
*lo
= lseg
->pls_layout
;
2267 struct nfs42_layout_error
*errors
;
2270 if (!nfs_server_capable(lo
->plh_inode
, NFS_CAP_LAYOUTERROR
))
2272 ff_layout_fetch_ds_ioerr(lo
, &lseg
->pls_range
, &head
, -1);
2273 if (list_empty(&head
))
2276 errors
= kmalloc_array(NFS42_LAYOUTERROR_MAX
, sizeof(*errors
),
2278 if (errors
!= NULL
) {
2279 const struct nfs4_ff_layout_ds_err
*pos
;
2282 list_for_each_entry(pos
, &head
, list
) {
2283 errors
[n
].offset
= pos
->offset
;
2284 errors
[n
].length
= pos
->length
;
2285 nfs4_stateid_copy(&errors
[n
].stateid
, &pos
->stateid
);
2286 errors
[n
].errors
[0].dev_id
= pos
->deviceid
;
2287 errors
[n
].errors
[0].status
= pos
->status
;
2288 errors
[n
].errors
[0].opnum
= pos
->opnum
;
2290 if (!list_is_last(&pos
->list
, &head
) &&
2291 n
< NFS42_LAYOUTERROR_MAX
)
2293 if (nfs42_proc_layouterror(lseg
, errors
, n
) < 0)
2299 ff_layout_free_ds_ioerr(&head
);
/*
 * Second definition of ff_layout_send_layouterror(); the first one above
 * follows the #ifdef CONFIG_NFS_V4_2 line.
 * NOTE(review): the body is not visible here - presumably an empty stub
 * for builds without NFSv4.2; confirm.
 */
2303 ff_layout_send_layouterror(struct pnfs_layout_segment
*lseg
)
/*
 * Formats the IPv4 address in @sap into @buf (at most @buflen bytes) with
 * the %pI4 format and returns the snprintf() result.
 */
2309 ff_layout_ntop4(const struct sockaddr
*sap
, char *buf
, const size_t buflen
)
2311 const struct sockaddr_in
*sin
= (struct sockaddr_in
*)sap
;
2313 return snprintf(buf
, buflen
, "%pI4", &sin
->sin_addr
);
/*
 * Formats the IPv6 address in @sap into @buf and returns the snprintf()
 * result.  Special cases are tried in this order: the any address as
 * "::", loopback as "::1", a v4-mapped address as "::ffff:" followed by
 * the dotted quad, and everything else with %pI6c.  No scope id is
 * written by any branch.
 */
2317 ff_layout_ntop6_noscopeid(const struct sockaddr
*sap
, char *buf
,
2320 const struct sockaddr_in6
*sin6
= (struct sockaddr_in6
*)sap
;
2321 const struct in6_addr
*addr
= &sin6
->sin6_addr
;
2324 * RFC 4291, Section 2.2.2
2326 * Shorthanded ANY address
2328 if (ipv6_addr_any(addr
))
2329 return snprintf(buf
, buflen
, "::");
2332 * RFC 4291, Section 2.2.2
2334 * Shorthanded loopback address
2336 if (ipv6_addr_loopback(addr
))
2337 return snprintf(buf
, buflen
, "::1");
2340 * RFC 4291, Section 2.2.3
2342 * Special presentation address format for mapped v4
2345 if (ipv6_addr_v4mapped(addr
))
2346 return snprintf(buf
, buflen
, "::ffff:%pI4",
2347 &addr
->s6_addr32
[3]);
2350 * RFC 4291, Section 2.2.1
2352 return snprintf(buf
, buflen
, "%pI6c", addr
);
2355 /* Derived from rpc_sockaddr2uaddr */
/*
 * Encodes the address @da as two XDR opaques: the netid string, then the
 * address string built here.
 *
 * The address string is the textual IPv4/IPv6 address (chosen by
 * sa_family) with ".<hi>.<lo>" appended, where hi and lo are the upper
 * and lower bytes of the port in host byte order.  A zero-length result
 * from either ntop helper is tested for.  len is the strlcat() result.
 */
2357 ff_layout_encode_netaddr(struct xdr_stream
*xdr
, struct nfs4_pnfs_ds_addr
*da
)
2359 struct sockaddr
*sap
= (struct sockaddr
*)&da
->da_addr
;
2360 char portbuf
[RPCBIND_MAXUADDRPLEN
];
2361 char addrbuf
[RPCBIND_MAXUADDRLEN
];
2362 unsigned short port
;
2366 switch (sap
->sa_family
) {
2368 if (ff_layout_ntop4(sap
, addrbuf
, sizeof(addrbuf
)) == 0)
2370 port
= ntohs(((struct sockaddr_in
*)sap
)->sin_port
);
2373 if (ff_layout_ntop6_noscopeid(sap
, addrbuf
, sizeof(addrbuf
)) == 0)
2375 port
= ntohs(((struct sockaddr_in6
*)sap
)->sin6_port
);
2382 snprintf(portbuf
, sizeof(portbuf
), ".%u.%u", port
>> 8, port
& 0xff);
2383 len
= strlcat(addrbuf
, portbuf
, sizeof(addrbuf
));
2385 netid_len
= strlen(da
->da_netid
);
2386 p
= xdr_reserve_space(xdr
, 4 + netid_len
);
2387 xdr_encode_opaque(p
, da
->da_netid
, netid_len
);
2389 p
= xdr_reserve_space(xdr
, 4 + len
);
2390 xdr_encode_opaque(p
, addrbuf
, len
);
/*
 * Encodes the time value t as 12 bytes: it is converted with
 * ktime_to_timespec64(), then tv_sec is written as a 64-bit value and
 * tv_nsec as a 32-bit big-endian value.
 */
2394 ff_layout_encode_nfstime(struct xdr_stream
*xdr
,
2397 struct timespec64 ts
;
2400 p
= xdr_reserve_space(xdr
, 12);
2401 ts
= ktime_to_timespec64(t
);
2402 p
= xdr_encode_hyper(p
, ts
.tv_sec
);
2403 *p
++ = cpu_to_be32(ts
.tv_nsec
);
/*
 * Encodes one I/O latency record from @stat: five 64-bit counters
 * (ops_requested, bytes_requested, ops_completed, bytes_completed,
 * bytes_not_delivered) followed by two time values (total_busy_time and
 * aggregate_completion_time).
 */
2407 ff_layout_encode_io_latency(struct xdr_stream
*xdr
,
2408 struct nfs4_ff_io_stat
*stat
)
2412 p
= xdr_reserve_space(xdr
, 5 * 8);
2413 p
= xdr_encode_hyper(p
, stat
->ops_requested
);
2414 p
= xdr_encode_hyper(p
, stat
->bytes_requested
);
2415 p
= xdr_encode_hyper(p
, stat
->ops_completed
);
2416 p
= xdr_encode_hyper(p
, stat
->bytes_completed
);
2417 p
= xdr_encode_hyper(p
, stat
->bytes_not_delivered
);
2418 ff_layout_encode_nfstime(xdr
, stat
->total_busy_time
);
2419 ff_layout_encode_nfstime(xdr
, stat
->aggregate_completion_time
);
/*
 * Encodes the layoutupdate body for @mirror, in this order:
 *   - the first address on the mirror DS's ds_addrs list,
 *   - the mirror's first file handle (fh_versions[0]) as an opaque,
 *   - the read and then the write latency records, both taken while
 *     holding mirror->lock,
 *   - the elapsed time since mirror->start_time,
 *   - a trailing 32-bit false value.
 * @devinfo is not used in the visible lines.
 */
2423 ff_layout_encode_ff_layoutupdate(struct xdr_stream
*xdr
,
2424 const struct nfs42_layoutstat_devinfo
*devinfo
,
2425 struct nfs4_ff_layout_mirror
*mirror
)
2427 struct nfs4_pnfs_ds_addr
*da
;
2428 struct nfs4_pnfs_ds
*ds
= mirror
->mirror_ds
->ds
;
2429 struct nfs_fh
*fh
= &mirror
->fh_versions
[0];
2432 da
= list_first_entry(&ds
->ds_addrs
, struct nfs4_pnfs_ds_addr
, da_node
);
2433 dprintk("%s: DS %s: encoding address %s\n",
2434 __func__
, ds
->ds_remotestr
, da
->da_remotestr
);
2436 ff_layout_encode_netaddr(xdr
, da
);
2438 p
= xdr_reserve_space(xdr
, 4 + fh
->size
);
2439 xdr_encode_opaque(p
, fh
->data
, fh
->size
);
2440 /* ff_io_latency4 read */
2441 spin_lock(&mirror
->lock
);
2442 ff_layout_encode_io_latency(xdr
, &mirror
->read_stat
.io_stat
);
2443 /* ff_io_latency4 write */
2444 ff_layout_encode_io_latency(xdr
, &mirror
->write_stat
.io_stat
);
2445 spin_unlock(&mirror
->lock
);
2447 ff_layout_encode_nfstime(xdr
, ktime_sub(ktime_get(), mirror
->start_time
));
2449 p
= xdr_reserve_space(xdr
, 4);
2450 *p
= cpu_to_be32(false);
/*
 * Encode callback for LAYOUTSTATS private data.  @opaque is the
 * ld_private member of a devinfo entry, recovered with container_of().
 *
 * A 4-byte length slot is reserved, the layoutupdate body is encoded with
 * opaque->data as the mirror, and the slot is back-filled with the number
 * of bytes written after it: (xdr->p - start - 1) words times 4.
 */
2454 ff_layout_encode_layoutstats(struct xdr_stream
*xdr
, const void *args
,
2455 const struct nfs4_xdr_opaque_data
*opaque
)
2457 struct nfs42_layoutstat_devinfo
*devinfo
= container_of(opaque
,
2458 struct nfs42_layoutstat_devinfo
, ld_private
);
2461 /* layoutupdate length */
2462 start
= xdr_reserve_space(xdr
, 4);
2463 ff_layout_encode_ff_layoutupdate(xdr
, devinfo
, opaque
->data
);
2465 *start
= cpu_to_be32((xdr
->p
- start
- 1) * 4);
/*
 * Free callback for LAYOUTSTATS private data: drops the mirror reference
 * stored in opaque->data.
 */
2469 ff_layout_free_layoutstats(struct nfs4_xdr_opaque_data
*opaque
)
2471 struct nfs4_ff_layout_mirror
*mirror
= opaque
->data
;
2473 ff_layout_put_mirror(mirror
);
/*
 * Opaque-data callbacks attached to each devinfo entry filled in by
 * ff_layout_mirror_prepare_stats().
 */
2476 static const struct nfs4_xdr_opaque_ops layoutstat_ops
= {
2477 .encode
= ff_layout_encode_layoutstats
,
2478 .free
= ff_layout_free_layoutstats
,
/*
 * Fills @devinfo with statistics snapshots taken from the mirrors on the
 * flexfile layout's mirrors list.  Both visible callers invoke it with
 * the inode's i_lock held and store the result as a device count.
 *
 * Per mirror, the visible checks are: mirror_ds must not be an error or
 * NULL; NFS4_FF_MIRROR_STAT_AVAIL is test-and-cleared, combined with a
 * test that @type is not NFS4_FF_OP_LAYOUTRETURN; and a reference on the
 * mirror must be obtainable with refcount_inc_not_zero().
 *
 * For an accepted mirror the entry receives the device id, offset 0 with
 * length NFS4_MAX_UINT64, the completed read/write op and byte counters
 * (copied under mirror->lock), layout type LAYOUT_FLEX_FILES, and
 * ld_private set to layoutstat_ops with the mirror as data.
 */
2482 ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr
*lo
,
2483 struct nfs42_layoutstat_devinfo
*devinfo
,
2484 int dev_limit
, enum nfs4_ff_op_type type
)
2486 struct nfs4_flexfile_layout
*ff_layout
= FF_LAYOUT_FROM_HDR(lo
);
2487 struct nfs4_ff_layout_mirror
*mirror
;
2488 struct nfs4_deviceid_node
*dev
;
2491 list_for_each_entry(mirror
, &ff_layout
->mirrors
, mirrors
) {
2494 if (IS_ERR_OR_NULL(mirror
->mirror_ds
))
2496 if (!test_and_clear_bit(NFS4_FF_MIRROR_STAT_AVAIL
,
2498 type
!= NFS4_FF_OP_LAYOUTRETURN
)
2500 /* mirror refcount put in cleanup_layoutstats */
2501 if (!refcount_inc_not_zero(&mirror
->ref
))
2503 dev
= &mirror
->mirror_ds
->id_node
;
2504 memcpy(&devinfo
->dev_id
, &dev
->deviceid
, NFS4_DEVICEID4_SIZE
);
2505 devinfo
->offset
= 0;
2506 devinfo
->length
= NFS4_MAX_UINT64
;
2507 spin_lock(&mirror
->lock
);
2508 devinfo
->read_count
= mirror
->read_stat
.io_stat
.ops_completed
;
2509 devinfo
->read_bytes
= mirror
->read_stat
.io_stat
.bytes_completed
;
2510 devinfo
->write_count
= mirror
->write_stat
.io_stat
.ops_completed
;
2511 devinfo
->write_bytes
= mirror
->write_stat
.io_stat
.bytes_completed
;
2512 spin_unlock(&mirror
->lock
);
2513 devinfo
->layout_type
= LAYOUT_FLEX_FILES
;
2514 devinfo
->ld_private
.ops
= &layoutstat_ops
;
2515 devinfo
->ld_private
.data
= mirror
;
/*
 * prepare_layoutstats hook: allocates args->devinfo with room for
 * PNFS_LAYOUTSTATS_MAXDEV entries and, under the inode's i_lock, fills it
 * from the inode's layout if that layout exists and is valid
 * (NFS4_FF_OP_LAYOUTSTATS).  When no device was collected the array is
 * freed and args->devinfo reset to NULL.
 */
2523 static int ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args
*args
)
2525 struct pnfs_layout_hdr
*lo
;
2526 struct nfs4_flexfile_layout
*ff_layout
;
2527 const int dev_count
= PNFS_LAYOUTSTATS_MAXDEV
;
2529 /* For now, send at most PNFS_LAYOUTSTATS_MAXDEV statistics */
2530 args
->devinfo
= kmalloc_array(dev_count
, sizeof(*args
->devinfo
),
2535 spin_lock(&args
->inode
->i_lock
);
2536 lo
= NFS_I(args
->inode
)->layout
;
2537 if (lo
&& pnfs_layout_is_valid(lo
)) {
2538 ff_layout
= FF_LAYOUT_FROM_HDR(lo
);
2539 args
->num_dev
= ff_layout_mirror_prepare_stats(
2540 &ff_layout
->generic_hdr
, &args
->devinfo
[0], dev_count
,
2541 NFS4_FF_OP_LAYOUTSTATS
);
2544 spin_unlock(&args
->inode
->i_lock
);
2545 if (!args
->num_dev
) {
2546 kfree(args
->devinfo
);
2547 args
->devinfo
= NULL
;
/*
 * ff_layout_set_layoutdriver - .set_layoutdriver hook of flexfilelayout_type
 * @server: NFS server whose capability mask is updated
 * @dummy: file handle; not referenced in the visible body
 *
 * When CONFIG_NFS_V4_2 is enabled, sets NFS_CAP_LAYOUTSTATS in
 * server->caps.
 */
2555 ff_layout_set_layoutdriver(struct nfs_server
*server
,
2556 const struct nfs_fh
*dummy
)
2558 #if IS_ENABLED(CONFIG_NFS_V4_2)
2559 server
->caps
|= NFS_CAP_LAYOUTSTATS
;
/*
 * Commit operations for the flexfile layout.
 *
 * ff_layout_alloc_layout_hdr() installs this table on every new layout
 * header via ffl->commit_info.ops. The data-server info setup/release hooks
 * and .commit_pagelist point at ff_layout_* functions; the remaining hooks
 * point at pnfs_* functions.
 */
2564 static const struct pnfs_commit_ops ff_layout_commit_ops
= {
2565 .setup_ds_info
= ff_layout_setup_ds_info
,
2566 .release_ds_info
= ff_layout_release_ds_info
,
2567 .mark_request_commit
= pnfs_layout_mark_request_commit
,
2568 .clear_request_commit
= pnfs_generic_clear_request_commit
,
2569 .scan_commit_lists
= pnfs_generic_scan_commit_lists
,
2570 .recover_commit_reqs
= pnfs_generic_recover_commit_reqs
,
2571 .commit_pagelist
= ff_layout_commit_pagelist
,
/*
 * Layout driver descriptor for LAYOUT_FLEX_FILES.
 *
 * Registered with pnfs_register_layoutdriver() in nfs4flexfilelayout_init()
 * and removed with pnfs_unregister_layoutdriver() in
 * nfs4flexfilelayout_exit(). It binds the ff_layout_* implementations in
 * this file (layout header and segment handling, page I/O ops, device id
 * nodes, layoutreturn/layoutstats preparation, I/O cancel) to the driver
 * hooks; .sync uses pnfs_nfs_generic_sync.
 */
2574 static struct pnfs_layoutdriver_type flexfilelayout_type
= {
2575 .id
= LAYOUT_FLEX_FILES
,
2576 .name
= "LAYOUT_FLEX_FILES",
2577 .owner
= THIS_MODULE
,
2578 .flags
= PNFS_LAYOUTGET_ON_OPEN
,
2579 .max_layoutget_response
= 4096, /* 1 page or so... */
2580 .set_layoutdriver
= ff_layout_set_layoutdriver
,
2581 .alloc_layout_hdr
= ff_layout_alloc_layout_hdr
,
2582 .free_layout_hdr
= ff_layout_free_layout_hdr
,
2583 .alloc_lseg
= ff_layout_alloc_lseg
,
2584 .free_lseg
= ff_layout_free_lseg
,
2585 .add_lseg
= ff_layout_add_lseg
,
2586 .pg_read_ops
= &ff_layout_pg_read_ops
,
2587 .pg_write_ops
= &ff_layout_pg_write_ops
,
2588 .get_ds_info
= ff_layout_get_ds_info
,
2589 .free_deviceid_node
= ff_layout_free_deviceid_node
,
2590 .read_pagelist
= ff_layout_read_pagelist
,
2591 .write_pagelist
= ff_layout_write_pagelist
,
2592 .alloc_deviceid_node
= ff_layout_alloc_deviceid_node
,
2593 .prepare_layoutreturn
= ff_layout_prepare_layoutreturn
,
2594 .sync
= pnfs_nfs_generic_sync
,
2595 .prepare_layoutstats
= ff_layout_prepare_layoutstats
,
2596 .cancel_io
= ff_layout_cancel_io
,
/*
 * Module init (see module_init() below): logs a KERN_INFO registration
 * message, then registers flexfilelayout_type through
 * pnfs_register_layoutdriver() and returns that call's result.
 */
2599 static int __init
nfs4flexfilelayout_init(void)
2601 printk(KERN_INFO
"%s: NFSv4 Flexfile Layout Driver Registering...\n",
2603 return pnfs_register_layoutdriver(&flexfilelayout_type
);
/*
 * Module exit (see module_exit() below): logs a KERN_INFO unregistration
 * message, then removes flexfilelayout_type through
 * pnfs_unregister_layoutdriver().
 */
2606 static void __exit
nfs4flexfilelayout_exit(void)
2608 printk(KERN_INFO
"%s: NFSv4 Flexfile Layout Driver Unregistering...\n",
2610 pnfs_unregister_layoutdriver(&flexfilelayout_type
);
/*
 * Module metadata and entry points.
 *
 * NOTE(review): the "nfs-layouttype4-4" alias presumably lets the module be
 * requested by layout type number -- confirm against the pNFS core.
 */
2613 MODULE_ALIAS("nfs-layouttype4-4");
2615 MODULE_LICENSE("GPL");
2616 MODULE_DESCRIPTION("The NFSv4 flexfile layout driver");
2618 module_init(nfs4flexfilelayout_init
);
2619 module_exit(nfs4flexfilelayout_exit
);
/*
 * io_maxretrans (a file-scope unsigned short) is exposed as a ushort module
 * parameter with permissions 0644.
 */
2621 module_param(io_maxretrans
, ushort
, 0644);
2622 MODULE_PARM_DESC(io_maxretrans
, "The number of times the NFSv4.1 client "
2623 "retries an I/O request before returning an error. ");