From: Dai Ngo Date: Fri, 13 Feb 2026 18:36:30 +0000 (-0800) Subject: NFSD: Enforce timeout on layout recall and integrate lease manager fencing X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=f52792f484ba2316853736856dde19b7e7458861;p=thirdparty%2Flinux.git NFSD: Enforce timeout on layout recall and integrate lease manager fencing When a layout conflict triggers a recall, enforcing a timeout is necessary to prevent excessive nfsd threads from being blocked in __break_lease ensuring the server continues servicing incoming requests efficiently. This patch introduces a new function to lease_manager_operations: lm_breaker_timedout: Invoked when a lease recall times out and is about to be disposed of. This function enables the lease manager to inform the caller whether the file_lease should remain on the flc_list or be disposed of. For the NFSD lease manager, this function now handles layout recall timeouts. If the layout type supports fencing and the client has not been fenced, a fence operation is triggered to prevent the client from accessing the block device. While the fencing operation is in progress, the conflicting file_lease remains on the flc_list until fencing is complete. This guarantees that no other clients can access the file, and the client with exclusive access is properly blocked before disposal. Signed-off-by: Dai Ngo Signed-off-by: Chuck Lever --- diff --git a/Documentation/admin-guide/nfs/pnfs-block-server.rst b/Documentation/admin-guide/nfs/pnfs-block-server.rst index 20fe9f5117fe..b4f5997009af 100644 --- a/Documentation/admin-guide/nfs/pnfs-block-server.rst +++ b/Documentation/admin-guide/nfs/pnfs-block-server.rst @@ -40,3 +40,33 @@ how to translate the device into a serial number from SCSI EVPD 0x80:: echo "fencing client ${CLIENT} serial ${EVPD}" >> /var/log/pnfsd-fence.log EOF + +If the nfsd server needs to fence a non-responding client and the +fencing operation fails, the server logs a warning message in the +system log with the following format: + + FENCE failed client[IP_address] clid[#n] device[dev_name] + + Where: + + IP_address: refers to the IP address of the affected client. + #n: indicates the unique client identifier. + dev_name: specifies the name of the block device related + to the fencing attempt. + +The server will repeatedly retry the operation indefinitely. During +this time, access to the affected file is restricted for all other +clients. This is to prevent potential data corruption if multiple +clients access the same file simultaneously. + +To restore access to the affected file for other clients, the admin +needs to take the following actions: + + . shutdown or power off the client being fenced. + . manually expire the client to release all its state on the server: + + echo 'expire' > /proc/fs/nfsd/clients/clid/ctl'. + + Where: + + clid: is the unique client identifier displayed in the system log. diff --git a/Documentation/admin-guide/nfs/pnfs-scsi-server.rst b/Documentation/admin-guide/nfs/pnfs-scsi-server.rst index b2eec2288329..db34afbf67a9 100644 --- a/Documentation/admin-guide/nfs/pnfs-scsi-server.rst +++ b/Documentation/admin-guide/nfs/pnfs-scsi-server.rst @@ -22,3 +22,34 @@ option and the underlying SCSI device support persistent reservations. On the client make sure the kernel has the CONFIG_PNFS_BLOCK option enabled, and the file system is mounted using the NFSv4.1 protocol version (mount -o vers=4.1). + +If the nfsd server needs to fence a non-responding client and the +fencing operation fails, the server logs a warning message in the +system log with the following format: + + FENCE failed client[IP_address] clid[#n] device[dev_name] + + Where: + + IP_address: refers to the IP address of the affected client. + #n: indicates the unique client identifier. + dev_name: specifies the name of the block device related + to the fencing attempt. + +The server will repeatedly retry the operation indefinitely. During +this time, access to the affected file is restricted for all other +clients. This is to prevent potential data corruption if multiple +clients access the same file simultaneously. + +To restore access to the affected file for other clients, the admin +needs to take the following actions: + + . shutdown or power off the client being fenced. + . manually expire the client to release all its state on the server: + + echo 'expire' > /proc/fs/nfsd/clients/clid/ctl'. + + Where: + + clid: is the unique client identifier displayed in the system log. + diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst index 8025df6e6499..8421ea21bd35 100644 --- a/Documentation/filesystems/locking.rst +++ b/Documentation/filesystems/locking.rst @@ -398,6 +398,7 @@ prototypes:: bool (*lm_breaker_owns_lease)(struct file_lock *); bool (*lm_lock_expirable)(struct file_lock *); void (*lm_expire_lock)(void); + bool (*lm_breaker_timedout)(struct file_lease *); locking rules: @@ -412,6 +413,7 @@ lm_breaker_owns_lease: yes no no lm_lock_expirable yes no no lm_expire_lock no no yes lm_open_conflict yes no no +lm_breaker_timedout yes no no ====================== ============= ================= ========= buffer_head diff --git a/fs/locks.c b/fs/locks.c index d13ec930b7bb..8e44b1f6c15a 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1534,6 +1534,7 @@ static void time_out_leases(struct inode *inode, struct list_head *dispose) { struct file_lock_context *ctx = inode->i_flctx; struct file_lease *fl, *tmp; + bool remove; lockdep_assert_held(&ctx->flc_lock); @@ -1541,8 +1542,19 @@ static void time_out_leases(struct inode *inode, struct list_head *dispose) trace_time_out_leases(inode, fl); if (past_time(fl->fl_downgrade_time)) lease_modify(fl, F_RDLCK, dispose); - if (past_time(fl->fl_break_time)) - lease_modify(fl, F_UNLCK, dispose); + + remove = true; + if (past_time(fl->fl_break_time)) { + /* + * Consult the lease manager when a lease break times + * out to determine whether the lease should be disposed + * of. + */ + if (fl->fl_lmops && fl->fl_lmops->lm_breaker_timedout) + remove = fl->fl_lmops->lm_breaker_timedout(fl); + if (remove) + lease_modify(fl, F_UNLCK, dispose); + } } } @@ -1670,9 +1682,13 @@ int __break_lease(struct inode *inode, unsigned int flags) restart: fl = list_first_entry(&ctx->flc_lease, struct file_lease, c.flc_list); break_time = fl->fl_break_time; - if (break_time != 0) - break_time -= jiffies; - if (break_time == 0) + if (break_time != 0) { + if (time_after(jiffies, break_time)) { + fl->fl_break_time = jiffies + lease_break_time * HZ; + break_time = lease_break_time * HZ; + } else + break_time -= jiffies; + } else break_time++; locks_insert_block(&fl->c, &new_fl->c, leases_conflict); trace_break_lease_block(inode, new_fl); diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index 8b987fca1e60..9d829c84f374 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -297,6 +297,7 @@ static inline int nfsd4_scsi_fence_insert(struct nfs4_client *clp, ret = 0; } xa_unlock(xa); + clp->cl_fence_retry_warn = false; return ret; } @@ -443,15 +444,33 @@ nfsd4_scsi_proc_layoutcommit(struct inode *inode, struct svc_rqst *rqstp, return nfsd4_block_commit_blocks(inode, lcp, iomaps, nr_iomaps); } -static void +/* + * Perform the fence operation to prevent the client from accessing the + * block device. If a fence operation is already in progress, wait for + * it to complete before checking the NFSD_MDS_PR_FENCED flag. Once the + * operation is complete, check the flag. If NFSD_MDS_PR_FENCED is set, + * update the layout stateid by setting the ls_fenced flag to indicate + * that the client has been fenced. + * + * The cl_fence_mutex ensures that the fence operation has been fully + * completed, rather than just in progress, when returning from this + * function. + * + * Return true if client was fenced otherwise return false. + */ +static bool nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls, struct nfsd_file *file) { struct nfs4_client *clp = ls->ls_stid.sc_client; struct block_device *bdev = file->nf_file->f_path.mnt->mnt_sb->s_bdev; int status; + bool ret; - if (nfsd4_scsi_fence_set(clp, bdev->bd_dev)) - return; + mutex_lock(&clp->cl_fence_mutex); + if (nfsd4_scsi_fence_set(clp, bdev->bd_dev)) { + mutex_unlock(&clp->cl_fence_mutex); + return true; + } status = bdev->bd_disk->fops->pr_ops->pr_preempt(bdev, NFSD_MDS_PR_KEY, nfsd4_scsi_pr_key(clp), @@ -470,13 +489,22 @@ nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls, struct nfsd_file *file) * PR_STS_RESERVATION_CONFLICT, which would cause an infinite * retry loop. */ - if (status < 0 || - status == PR_STS_PATH_FAILED || - status == PR_STS_PATH_FAST_FAILED || - status == PR_STS_RETRY_PATH_FAILURE) + switch (status) { + case 0: + case PR_STS_IOERR: + case PR_STS_RESERVATION_CONFLICT: + ret = true; + break; + default: + /* retry-able and other errors */ + ret = false; nfsd4_scsi_fence_clear(clp, bdev->bd_dev); + break; + } + mutex_unlock(&clp->cl_fence_mutex); trace_nfsd_pnfs_fence(clp, bdev->bd_disk->disk_name, status); + return ret; } const struct nfsd4_layout_ops scsi_layout_ops = { diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c index ad7af8cfcf1f..69e41105efdd 100644 --- a/fs/nfsd/nfs4layouts.c +++ b/fs/nfsd/nfs4layouts.c @@ -27,6 +27,8 @@ static struct kmem_cache *nfs4_layout_stateid_cache; static const struct nfsd4_callback_ops nfsd4_cb_layout_ops; static const struct lease_manager_operations nfsd4_layouts_lm_ops; +static void nfsd4_layout_fence_worker(struct work_struct *work); + const struct nfsd4_layout_ops *nfsd4_layout_ops[LAYOUT_TYPE_MAX] = { #ifdef CONFIG_NFSD_FLEXFILELAYOUT [LAYOUT_FLEX_FILES] = &ff_layout_ops, @@ -177,6 +179,13 @@ nfsd4_free_layout_stateid(struct nfs4_stid *stid) trace_nfsd_layoutstate_free(&ls->ls_stid.sc_stateid); + spin_lock(&ls->ls_lock); + if (delayed_work_pending(&ls->ls_fence_work)) { + spin_unlock(&ls->ls_lock); + cancel_delayed_work_sync(&ls->ls_fence_work); + } else + spin_unlock(&ls->ls_lock); + spin_lock(&clp->cl_lock); list_del_init(&ls->ls_perclnt); spin_unlock(&clp->cl_lock); @@ -271,6 +280,10 @@ nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate, list_add(&ls->ls_perfile, &fp->fi_lo_states); spin_unlock(&fp->fi_lock); + ls->ls_fenced = false; + ls->ls_fence_delay = 0; + INIT_DELAYED_WORK(&ls->ls_fence_work, nfsd4_layout_fence_worker); + trace_nfsd_layoutstate_alloc(&ls->ls_stid.sc_stateid); return ls; } @@ -747,11 +760,9 @@ static bool nfsd4_layout_lm_break(struct file_lease *fl) { /* - * We don't want the locks code to timeout the lease for us; - * we'll remove it ourself if a layout isn't returned - * in time: + * Enforce break lease timeout to prevent NFSD + * thread from hanging in __break_lease. */ - fl->fl_break_time = 0; nfsd4_recall_file_layout(fl->c.flc_owner); return false; } @@ -782,10 +793,143 @@ nfsd4_layout_lm_open_conflict(struct file *filp, int arg) return 0; } +static void +nfsd4_layout_fence_worker(struct work_struct *work) +{ + struct delayed_work *dwork = to_delayed_work(work); + struct nfs4_layout_stateid *ls = container_of(dwork, + struct nfs4_layout_stateid, ls_fence_work); + struct nfsd_file *nf; + struct block_device *bdev; + struct nfs4_client *clp; + struct nfsd_net *nn; + + /* + * The workqueue clears WORK_STRUCT_PENDING before invoking + * this callback. Re-arm immediately so that + * delayed_work_pending() returns true while the fence + * operation is in progress, preventing + * lm_breaker_timedout() from taking a duplicate reference. + */ + mod_delayed_work(system_dfl_wq, &ls->ls_fence_work, 0); + + spin_lock(&ls->ls_lock); + if (list_empty(&ls->ls_layouts)) { + spin_unlock(&ls->ls_lock); +dispose: + cancel_delayed_work(&ls->ls_fence_work); + /* unlock the lease so that tasks waiting on it can proceed */ + nfsd4_close_layout(ls); + + ls->ls_fenced = true; + nfs4_put_stid(&ls->ls_stid); + return; + } + spin_unlock(&ls->ls_lock); + + rcu_read_lock(); + nf = nfsd_file_get(ls->ls_file); + rcu_read_unlock(); + if (!nf) + goto dispose; + + clp = ls->ls_stid.sc_client; + nn = net_generic(clp->net, nfsd_net_id); + bdev = nf->nf_file->f_path.mnt->mnt_sb->s_bdev; + if (nfsd4_layout_ops[ls->ls_layout_type]->fence_client(ls, nf)) { + /* fenced ok */ + nfsd_file_put(nf); + pr_warn("%s: FENCED client[%pISpc] clid[%d] to device[%s]\n", + __func__, (struct sockaddr *)&clp->cl_addr, + clp->cl_clientid.cl_id - nn->clientid_base, + bdev->bd_disk->disk_name); + goto dispose; + } + /* fence failed */ + nfsd_file_put(nf); + + if (!clp->cl_fence_retry_warn) { + pr_warn("%s: FENCE failed client[%pISpc] clid[%d] device[%s]\n", + __func__, (struct sockaddr *)&clp->cl_addr, + clp->cl_clientid.cl_id - nn->clientid_base, + bdev->bd_disk->disk_name); + clp->cl_fence_retry_warn = true; + } + /* + * The fence worker retries the fencing operation indefinitely to + * prevent data corruption. The admin needs to take the following + * actions to restore access to the file for other clients: + * + * . shutdown or power off the client being fenced. + * . manually expire the client to release all its state on the server; + * echo 'expire' > /proc/fs/nfsd/clients/clid/ctl'. + * + * Where: + * + * clid: is the unique client identifier displayed in + * the warning message above. + */ + if (!ls->ls_fence_delay) + ls->ls_fence_delay = HZ; + else + ls->ls_fence_delay = min(ls->ls_fence_delay << 1, + MAX_FENCE_DELAY); + mod_delayed_work(system_dfl_wq, &ls->ls_fence_work, ls->ls_fence_delay); +} + +/** + * nfsd4_layout_lm_breaker_timedout - The layout recall has timed out. + * @fl: file to check + * + * If the layout type supports a fence operation, schedule a worker to + * fence the client from accessing the block device. + * + * This function runs under the protection of the spin_lock flc_lock. + * At this time, the file_lease associated with the layout stateid is + * on the flc_list. A reference count is incremented on the layout + * stateid to prevent it from being freed while the fence worker is + * executing. Once the fence worker finishes its operation, it releases + * this reference. + * + * The fence worker continues to run until either the client has been + * fenced or the layout becomes invalid. The layout can become invalid + * as a result of a LAYOUTRETURN or when the CB_LAYOUT recall callback + * has completed. + * + * Return true if the file_lease should be disposed of by the caller; + * otherwise, return false. + */ +static bool +nfsd4_layout_lm_breaker_timedout(struct file_lease *fl) +{ + struct nfs4_layout_stateid *ls = fl->c.flc_owner; + + if ((!nfsd4_layout_ops[ls->ls_layout_type]->fence_client) || + ls->ls_fenced) + return true; + if (delayed_work_pending(&ls->ls_fence_work)) + return false; + /* + * Make sure layout has not been returned yet before + * taking a reference count on the layout stateid. + */ + spin_lock(&ls->ls_lock); + if (list_empty(&ls->ls_layouts) || + !refcount_inc_not_zero(&ls->ls_stid.sc_count)) { + spin_unlock(&ls->ls_lock); + return true; + } + spin_unlock(&ls->ls_lock); + + mod_delayed_work(system_dfl_wq, &ls->ls_fence_work, 0); + return false; +} + static const struct lease_manager_operations nfsd4_layouts_lm_ops = { .lm_break = nfsd4_layout_lm_break, .lm_change = nfsd4_layout_lm_change, .lm_open_conflict = nfsd4_layout_lm_open_conflict, + .lm_breaker_timedout = nfsd4_layout_lm_breaker_timedout, }; int diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 1b4c101ff04b..1d31f2bb2162 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2386,6 +2386,7 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name, #endif #ifdef CONFIG_NFSD_SCSILAYOUT xa_init(&clp->cl_dev_fences); + mutex_init(&clp->cl_fence_mutex); #endif INIT_LIST_HEAD(&clp->async_copies); spin_lock_init(&clp->async_lock); diff --git a/fs/nfsd/pnfs.h b/fs/nfsd/pnfs.h index db9af780438b..f7bee4dc5d3d 100644 --- a/fs/nfsd/pnfs.h +++ b/fs/nfsd/pnfs.h @@ -11,6 +11,9 @@ struct xdr_stream; +/* Cap exponential backoff between fence retries at 3 minutes */ +#define MAX_FENCE_DELAY ((unsigned int)(3 * 60 * HZ)) + struct nfsd4_deviceid_map { struct list_head hash; u64 idx; @@ -38,7 +41,7 @@ struct nfsd4_layout_ops { struct svc_rqst *rqstp, struct nfsd4_layoutcommit *lcp); - void (*fence_client)(struct nfs4_layout_stateid *ls, + bool (*fence_client)(struct nfs4_layout_stateid *ls, struct nfsd_file *file); }; diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 99aeaab9cf2b..ec1c5467012e 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -456,6 +456,7 @@ struct nfs4_client { struct list_head cl_lru; /* tail queue */ #ifdef CONFIG_NFSD_PNFS struct list_head cl_lo_states; /* outstanding layout states */ + bool cl_fence_retry_warn; #endif struct xdr_netobj cl_name; /* id generated by client */ nfs4_verifier cl_verifier; /* generated by client */ @@ -529,6 +530,7 @@ struct nfs4_client { time64_t cl_ra_time; #ifdef CONFIG_NFSD_SCSILAYOUT struct xarray cl_dev_fences; + struct mutex cl_fence_mutex; #endif }; @@ -745,6 +747,10 @@ struct nfs4_layout_stateid { stateid_t ls_recall_sid; bool ls_recalled; struct mutex ls_mutex; + + struct delayed_work ls_fence_work; + unsigned int ls_fence_delay; + bool ls_fenced; }; static inline struct nfs4_layout_stateid *layoutstateid(struct nfs4_stid *s) diff --git a/include/linux/filelock.h b/include/linux/filelock.h index d2c9740e26a8..5f0a2fb31450 100644 --- a/include/linux/filelock.h +++ b/include/linux/filelock.h @@ -50,6 +50,7 @@ struct lease_manager_operations { void (*lm_setup)(struct file_lease *, void **); bool (*lm_breaker_owns_lease)(struct file_lease *); int (*lm_open_conflict)(struct file *, int); + bool (*lm_breaker_timedout)(struct file_lease *fl); }; struct lock_manager {