From: Dai Ngo <dai.ngo@oracle.com>
Date: Fri, 13 Feb 2026 18:36:30 +0000 (-0800)
Subject: NFSD: Enforce timeout on layout recall and integrate lease manager fencing
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=f52792f484ba2316853736856dde19b7e7458861;p=thirdparty%2Flinux.git

NFSD: Enforce timeout on layout recall and integrate lease manager fencing

When a layout conflict triggers a recall, enforcing a timeout is
necessary to prevent excessive nfsd threads from being blocked in
__break_lease ensuring the server continues servicing incoming
requests efficiently.

This patch introduces a new function to lease_manager_operations:

lm_breaker_timedout: Invoked when a lease recall times out and is
about to be disposed of. This function enables the lease manager
to inform the caller whether the file_lease should remain on the
flc_list or be disposed of.

For the NFSD lease manager, this function now handles layout recall
timeouts. If the layout type supports fencing and the client has not
been fenced, a fence operation is triggered to prevent the client
from accessing the block device.

While the fencing operation is in progress, the conflicting file_lease
remains on the flc_list until fencing is complete. This guarantees
that no other clients can access the file, and the client with
exclusive access is properly blocked before disposal.

Signed-off-by: Dai Ngo <dai.ngo@oracle.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---

diff --git a/Documentation/admin-guide/nfs/pnfs-block-server.rst b/Documentation/admin-guide/nfs/pnfs-block-server.rst
index 20fe9f5117fe..b4f5997009af 100644
--- a/Documentation/admin-guide/nfs/pnfs-block-server.rst
+++ b/Documentation/admin-guide/nfs/pnfs-block-server.rst
@@ -40,3 +40,33 @@ how to translate the device into a serial number from SCSI EVPD 0x80::
 
 	echo "fencing client ${CLIENT} serial ${EVPD}" >> /var/log/pnfsd-fence.log
 	EOF
+
+If the nfsd server needs to fence a non-responding client and the
+fencing operation fails, the server logs a warning message in the
+system log with the following format:
+
+    FENCE failed client[IP_address] clid[#n] device[dev_name]
+
+    Where:
+
+    IP_address: refers to the IP address of the affected client.
+    #n: indicates the unique client identifier.
+    dev_name: specifies the name of the block device related
+              to the fencing attempt.
+
+The server will repeatedly retry the operation indefinitely. During
+this time, access to the affected file is restricted for all other
+clients. This is to prevent potential data corruption if multiple
+clients access the same file simultaneously.
+
+To restore access to the affected file for other clients, the admin
+needs to take the following actions:
+
+    . shutdown or power off the client being fenced.
+    . manually expire the client to release all its state on the server:
+
+      echo 'expire' > /proc/fs/nfsd/clients/clid/ctl'.
+
+      Where:
+
+      clid: is the unique client identifier displayed in the system log.
diff --git a/Documentation/admin-guide/nfs/pnfs-scsi-server.rst b/Documentation/admin-guide/nfs/pnfs-scsi-server.rst
index b2eec2288329..db34afbf67a9 100644
--- a/Documentation/admin-guide/nfs/pnfs-scsi-server.rst
+++ b/Documentation/admin-guide/nfs/pnfs-scsi-server.rst
@@ -22,3 +22,34 @@ option and the underlying SCSI device support persistent reservations.
 On the client make sure the kernel has the CONFIG_PNFS_BLOCK option
 enabled, and the file system is mounted using the NFSv4.1 protocol
 version (mount -o vers=4.1).
+
+If the nfsd server needs to fence a non-responding client and the
+fencing operation fails, the server logs a warning message in the
+system log with the following format:
+
+    FENCE failed client[IP_address] clid[#n] device[dev_name]
+
+    Where:
+
+    IP_address: refers to the IP address of the affected client.
+    #n: indicates the unique client identifier.
+    dev_name: specifies the name of the block device related
+              to the fencing attempt.
+
+The server will repeatedly retry the operation indefinitely. During
+this time, access to the affected file is restricted for all other
+clients. This is to prevent potential data corruption if multiple
+clients access the same file simultaneously.
+
+To restore access to the affected file for other clients, the admin
+needs to take the following actions:
+
+    . shutdown or power off the client being fenced.
+    . manually expire the client to release all its state on the server:
+
+      echo 'expire' > /proc/fs/nfsd/clients/clid/ctl'.
+
+      Where:
+
+      clid: is the unique client identifier displayed in the system log.
+
diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst
index 8025df6e6499..8421ea21bd35 100644
--- a/Documentation/filesystems/locking.rst
+++ b/Documentation/filesystems/locking.rst
@@ -398,6 +398,7 @@ prototypes::
 	bool (*lm_breaker_owns_lease)(struct file_lock *);
         bool (*lm_lock_expirable)(struct file_lock *);
         void (*lm_expire_lock)(void);
+        bool (*lm_breaker_timedout)(struct file_lease *);
 
 locking rules:
 
@@ -412,6 +413,7 @@ lm_breaker_owns_lease:	yes     	no			no
 lm_lock_expirable	yes		no			no
 lm_expire_lock		no		no			yes
 lm_open_conflict	yes		no			no
+lm_breaker_timedout     yes             no                      no
 ======================	=============	=================	=========
 
 buffer_head
diff --git a/fs/locks.c b/fs/locks.c
index d13ec930b7bb..8e44b1f6c15a 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1534,6 +1534,7 @@ static void time_out_leases(struct inode *inode, struct list_head *dispose)
 {
 	struct file_lock_context *ctx = inode->i_flctx;
 	struct file_lease *fl, *tmp;
+	bool remove;
 
 	lockdep_assert_held(&ctx->flc_lock);
 
@@ -1541,8 +1542,19 @@ static void time_out_leases(struct inode *inode, struct list_head *dispose)
 		trace_time_out_leases(inode, fl);
 		if (past_time(fl->fl_downgrade_time))
 			lease_modify(fl, F_RDLCK, dispose);
-		if (past_time(fl->fl_break_time))
-			lease_modify(fl, F_UNLCK, dispose);
+
+		remove = true;
+		if (past_time(fl->fl_break_time)) {
+			/*
+			 * Consult the lease manager when a lease break times
+			 * out to determine whether the lease should be disposed
+			 * of.
+			 */
+			if (fl->fl_lmops && fl->fl_lmops->lm_breaker_timedout)
+				remove = fl->fl_lmops->lm_breaker_timedout(fl);
+			if (remove)
+				lease_modify(fl, F_UNLCK, dispose);
+		}
 	}
 }
 
@@ -1670,9 +1682,13 @@ int __break_lease(struct inode *inode, unsigned int flags)
 restart:
 	fl = list_first_entry(&ctx->flc_lease, struct file_lease, c.flc_list);
 	break_time = fl->fl_break_time;
-	if (break_time != 0)
-		break_time -= jiffies;
-	if (break_time == 0)
+	if (break_time != 0) {
+		if (time_after(jiffies, break_time)) {
+			fl->fl_break_time = jiffies + lease_break_time * HZ;
+			break_time = lease_break_time * HZ;
+		} else
+			break_time -= jiffies;
+	} else
 		break_time++;
 	locks_insert_block(&fl->c, &new_fl->c, leases_conflict);
 	trace_break_lease_block(inode, new_fl);
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
index 8b987fca1e60..9d829c84f374 100644
--- a/fs/nfsd/blocklayout.c
+++ b/fs/nfsd/blocklayout.c
@@ -297,6 +297,7 @@ static inline int nfsd4_scsi_fence_insert(struct nfs4_client *clp,
 		ret = 0;
 	}
 	xa_unlock(xa);
+	clp->cl_fence_retry_warn = false;
 	return ret;
 }
 
@@ -443,15 +444,33 @@ nfsd4_scsi_proc_layoutcommit(struct inode *inode, struct svc_rqst *rqstp,
 	return nfsd4_block_commit_blocks(inode, lcp, iomaps, nr_iomaps);
 }
 
-static void
+/*
+ * Perform the fence operation to prevent the client from accessing the
+ * block device. If a fence operation is already in progress, wait for
+ * it to complete before checking the NFSD_MDS_PR_FENCED flag. Once the
+ * operation is complete, check the flag. If NFSD_MDS_PR_FENCED is set,
+ * update the layout stateid by setting the ls_fenced flag to indicate
+ * that the client has been fenced.
+ *
+ * The cl_fence_mutex ensures that the fence operation has been fully
+ * completed, rather than just in progress, when returning from this
+ * function.
+ *
+ * Return true if client was fenced otherwise return false.
+ */
+static bool
 nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls, struct nfsd_file *file)
 {
 	struct nfs4_client *clp = ls->ls_stid.sc_client;
 	struct block_device *bdev = file->nf_file->f_path.mnt->mnt_sb->s_bdev;
 	int status;
+	bool ret;
 
-	if (nfsd4_scsi_fence_set(clp, bdev->bd_dev))
-		return;
+	mutex_lock(&clp->cl_fence_mutex);
+	if (nfsd4_scsi_fence_set(clp, bdev->bd_dev)) {
+		mutex_unlock(&clp->cl_fence_mutex);
+		return true;
+	}
 
 	status = bdev->bd_disk->fops->pr_ops->pr_preempt(bdev, NFSD_MDS_PR_KEY,
 			nfsd4_scsi_pr_key(clp),
@@ -470,13 +489,22 @@ nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls, struct nfsd_file *file)
 	 * PR_STS_RESERVATION_CONFLICT, which would cause an infinite
 	 * retry loop.
 	 */
-	if (status < 0 ||
-	    status == PR_STS_PATH_FAILED ||
-	    status == PR_STS_PATH_FAST_FAILED ||
-	    status == PR_STS_RETRY_PATH_FAILURE)
+	switch (status) {
+	case 0:
+	case PR_STS_IOERR:
+	case PR_STS_RESERVATION_CONFLICT:
+		ret = true;
+		break;
+	default:
+		/* retry-able and other errors */
+		ret = false;
 		nfsd4_scsi_fence_clear(clp, bdev->bd_dev);
+		break;
+	}
+	mutex_unlock(&clp->cl_fence_mutex);
 
 	trace_nfsd_pnfs_fence(clp, bdev->bd_disk->disk_name, status);
+	return ret;
 }
 
 const struct nfsd4_layout_ops scsi_layout_ops = {
diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
index ad7af8cfcf1f..69e41105efdd 100644
--- a/fs/nfsd/nfs4layouts.c
+++ b/fs/nfsd/nfs4layouts.c
@@ -27,6 +27,8 @@ static struct kmem_cache *nfs4_layout_stateid_cache;
 static const struct nfsd4_callback_ops nfsd4_cb_layout_ops;
 static const struct lease_manager_operations nfsd4_layouts_lm_ops;
 
+static void nfsd4_layout_fence_worker(struct work_struct *work);
+
 const struct nfsd4_layout_ops *nfsd4_layout_ops[LAYOUT_TYPE_MAX] =  {
 #ifdef CONFIG_NFSD_FLEXFILELAYOUT
 	[LAYOUT_FLEX_FILES]	= &ff_layout_ops,
@@ -177,6 +179,13 @@ nfsd4_free_layout_stateid(struct nfs4_stid *stid)
 
 	trace_nfsd_layoutstate_free(&ls->ls_stid.sc_stateid);
 
+	spin_lock(&ls->ls_lock);
+	if (delayed_work_pending(&ls->ls_fence_work)) {
+		spin_unlock(&ls->ls_lock);
+		cancel_delayed_work_sync(&ls->ls_fence_work);
+	} else
+		spin_unlock(&ls->ls_lock);
+
 	spin_lock(&clp->cl_lock);
 	list_del_init(&ls->ls_perclnt);
 	spin_unlock(&clp->cl_lock);
@@ -271,6 +280,10 @@ nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate,
 	list_add(&ls->ls_perfile, &fp->fi_lo_states);
 	spin_unlock(&fp->fi_lock);
 
+	ls->ls_fenced = false;
+	ls->ls_fence_delay = 0;
+	INIT_DELAYED_WORK(&ls->ls_fence_work, nfsd4_layout_fence_worker);
+
 	trace_nfsd_layoutstate_alloc(&ls->ls_stid.sc_stateid);
 	return ls;
 }
@@ -747,11 +760,9 @@ static bool
 nfsd4_layout_lm_break(struct file_lease *fl)
 {
 	/*
-	 * We don't want the locks code to timeout the lease for us;
-	 * we'll remove it ourself if a layout isn't returned
-	 * in time:
+	 * Enforce break lease timeout to prevent NFSD
+	 * thread from hanging in __break_lease.
 	 */
-	fl->fl_break_time = 0;
 	nfsd4_recall_file_layout(fl->c.flc_owner);
 	return false;
 }
@@ -782,10 +793,143 @@ nfsd4_layout_lm_open_conflict(struct file *filp, int arg)
 	return 0;
 }
 
+static void
+nfsd4_layout_fence_worker(struct work_struct *work)
+{
+	struct delayed_work *dwork = to_delayed_work(work);
+	struct nfs4_layout_stateid *ls = container_of(dwork,
+			struct nfs4_layout_stateid, ls_fence_work);
+	struct nfsd_file *nf;
+	struct block_device *bdev;
+	struct nfs4_client *clp;
+	struct nfsd_net *nn;
+
+	/*
+	 * The workqueue clears WORK_STRUCT_PENDING before invoking
+	 * this callback. Re-arm immediately so that
+	 * delayed_work_pending() returns true while the fence
+	 * operation is in progress, preventing
+	 * lm_breaker_timedout() from taking a duplicate reference.
+	 */
+	mod_delayed_work(system_dfl_wq, &ls->ls_fence_work, 0);
+
+	spin_lock(&ls->ls_lock);
+	if (list_empty(&ls->ls_layouts)) {
+		spin_unlock(&ls->ls_lock);
+dispose:
+		cancel_delayed_work(&ls->ls_fence_work);
+		/* unlock the lease so that tasks waiting on it can proceed */
+		nfsd4_close_layout(ls);
+
+		ls->ls_fenced = true;
+		nfs4_put_stid(&ls->ls_stid);
+		return;
+	}
+	spin_unlock(&ls->ls_lock);
+
+	rcu_read_lock();
+	nf = nfsd_file_get(ls->ls_file);
+	rcu_read_unlock();
+	if (!nf)
+		goto dispose;
+
+	clp = ls->ls_stid.sc_client;
+	nn = net_generic(clp->net, nfsd_net_id);
+	bdev = nf->nf_file->f_path.mnt->mnt_sb->s_bdev;
+	if (nfsd4_layout_ops[ls->ls_layout_type]->fence_client(ls, nf)) {
+		/* fenced ok */
+		nfsd_file_put(nf);
+		pr_warn("%s: FENCED client[%pISpc] clid[%d] to device[%s]\n",
+			__func__, (struct sockaddr *)&clp->cl_addr,
+			clp->cl_clientid.cl_id - nn->clientid_base,
+			bdev->bd_disk->disk_name);
+		goto dispose;
+	}
+	/* fence failed */
+	nfsd_file_put(nf);
+
+	if (!clp->cl_fence_retry_warn) {
+		pr_warn("%s: FENCE failed client[%pISpc] clid[%d] device[%s]\n",
+			__func__, (struct sockaddr *)&clp->cl_addr,
+			clp->cl_clientid.cl_id - nn->clientid_base,
+			bdev->bd_disk->disk_name);
+		clp->cl_fence_retry_warn = true;
+	}
+	/*
+	 * The fence worker retries the fencing operation indefinitely to
+	 * prevent data corruption. The admin needs to take the following
+	 * actions to restore access to the file for other clients:
+	 *
+	 *  . shutdown or power off the client being fenced.
+	 *  . manually expire the client to release all its state on the server;
+	 *    echo 'expire' > /proc/fs/nfsd/clients/clid/ctl'.
+	 *
+	 *    Where:
+	 *
+	 *    clid: is the unique client identifier displayed in
+	 *          the warning message above.
+	 */
+	if (!ls->ls_fence_delay)
+		ls->ls_fence_delay = HZ;
+	else
+		ls->ls_fence_delay = min(ls->ls_fence_delay << 1,
+					 MAX_FENCE_DELAY);
+	mod_delayed_work(system_dfl_wq, &ls->ls_fence_work, ls->ls_fence_delay);
+}
+
+/**
+ * nfsd4_layout_lm_breaker_timedout - The layout recall has timed out.
+ * @fl: file to check
+ *
+ * If the layout type supports a fence operation, schedule a worker to
+ * fence the client from accessing the block device.
+ *
+ * This function runs under the protection of the spin_lock flc_lock.
+ * At this time, the file_lease associated with the layout stateid is
+ * on the flc_list. A reference count is incremented on the layout
+ * stateid to prevent it from being freed while the fence worker is
+ * executing. Once the fence worker finishes its operation, it releases
+ * this reference.
+ *
+ * The fence worker continues to run until either the client has been
+ * fenced or the layout becomes invalid. The layout can become invalid
+ * as a result of a LAYOUTRETURN or when the CB_LAYOUT recall callback
+ * has completed.
+ *
+ * Return true if the file_lease should be disposed of by the caller;
+ * otherwise, return false.
+ */
+static bool
+nfsd4_layout_lm_breaker_timedout(struct file_lease *fl)
+{
+	struct nfs4_layout_stateid *ls = fl->c.flc_owner;
+
+	if ((!nfsd4_layout_ops[ls->ls_layout_type]->fence_client) ||
+			ls->ls_fenced)
+		return true;
+	if (delayed_work_pending(&ls->ls_fence_work))
+		return false;
+	/*
+	 * Make sure layout has not been returned yet before
+	 * taking a reference count on the layout stateid.
+	 */
+	spin_lock(&ls->ls_lock);
+	if (list_empty(&ls->ls_layouts) ||
+			!refcount_inc_not_zero(&ls->ls_stid.sc_count)) {
+		spin_unlock(&ls->ls_lock);
+		return true;
+	}
+	spin_unlock(&ls->ls_lock);
+
+	mod_delayed_work(system_dfl_wq, &ls->ls_fence_work, 0);
+	return false;
+}
+
 static const struct lease_manager_operations nfsd4_layouts_lm_ops = {
 	.lm_break		= nfsd4_layout_lm_break,
 	.lm_change		= nfsd4_layout_lm_change,
 	.lm_open_conflict	= nfsd4_layout_lm_open_conflict,
+	.lm_breaker_timedout	= nfsd4_layout_lm_breaker_timedout,
 };
 
 int
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 1b4c101ff04b..1d31f2bb2162 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -2386,6 +2386,7 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name,
 #endif
 #ifdef CONFIG_NFSD_SCSILAYOUT
 	xa_init(&clp->cl_dev_fences);
+	mutex_init(&clp->cl_fence_mutex);
 #endif
 	INIT_LIST_HEAD(&clp->async_copies);
 	spin_lock_init(&clp->async_lock);
diff --git a/fs/nfsd/pnfs.h b/fs/nfsd/pnfs.h
index db9af780438b..f7bee4dc5d3d 100644
--- a/fs/nfsd/pnfs.h
+++ b/fs/nfsd/pnfs.h
@@ -11,6 +11,9 @@
 
 struct xdr_stream;
 
+/* Cap exponential backoff between fence retries at 3 minutes */
+#define	MAX_FENCE_DELAY		((unsigned int)(3 * 60 * HZ))
+
 struct nfsd4_deviceid_map {
 	struct list_head	hash;
 	u64			idx;
@@ -38,7 +41,7 @@ struct nfsd4_layout_ops {
 			struct svc_rqst *rqstp,
 			struct nfsd4_layoutcommit *lcp);
 
-	void (*fence_client)(struct nfs4_layout_stateid *ls,
+	bool (*fence_client)(struct nfs4_layout_stateid *ls,
 			     struct nfsd_file *file);
 };
 
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 99aeaab9cf2b..ec1c5467012e 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -456,6 +456,7 @@ struct nfs4_client {
 	struct list_head        cl_lru;         /* tail queue */
 #ifdef CONFIG_NFSD_PNFS
 	struct list_head	cl_lo_states;	/* outstanding layout states */
+	bool			cl_fence_retry_warn;
 #endif
 	struct xdr_netobj	cl_name; 	/* id generated by client */
 	nfs4_verifier		cl_verifier; 	/* generated by client */
@@ -529,6 +530,7 @@ struct nfs4_client {
 	time64_t		cl_ra_time;
 #ifdef CONFIG_NFSD_SCSILAYOUT
 	struct xarray		cl_dev_fences;
+	struct mutex		cl_fence_mutex;
 #endif
 };
 
@@ -745,6 +747,10 @@ struct nfs4_layout_stateid {
 	stateid_t			ls_recall_sid;
 	bool				ls_recalled;
 	struct mutex			ls_mutex;
+
+	struct delayed_work		ls_fence_work;
+	unsigned int			ls_fence_delay;
+	bool				ls_fenced;
 };
 
 static inline struct nfs4_layout_stateid *layoutstateid(struct nfs4_stid *s)
diff --git a/include/linux/filelock.h b/include/linux/filelock.h
index d2c9740e26a8..5f0a2fb31450 100644
--- a/include/linux/filelock.h
+++ b/include/linux/filelock.h
@@ -50,6 +50,7 @@ struct lease_manager_operations {
 	void (*lm_setup)(struct file_lease *, void **);
 	bool (*lm_breaker_owns_lease)(struct file_lease *);
 	int (*lm_open_conflict)(struct file *, int);
+	bool (*lm_breaker_timedout)(struct file_lease *fl);
 };
 
 struct lock_manager {