]> git.ipfire.org Git - thirdparty/linux.git/commitdiff
NFSD: Enforce timeout on layout recall and integrate lease manager fencing
author Dai Ngo <dai.ngo@oracle.com>
Fri, 13 Feb 2026 18:36:30 +0000 (10:36 -0800)
committer Chuck Lever <chuck.lever@oracle.com>
Mon, 30 Mar 2026 01:25:09 +0000 (21:25 -0400)
When a layout conflict triggers a recall, enforcing a timeout is
necessary to prevent excessive nfsd threads from being blocked in
__break_lease, ensuring the server continues servicing incoming
requests efficiently.

This patch introduces a new function to lease_manager_operations:

lm_breaker_timedout: Invoked when a lease recall times out and the
lease is about to be disposed of. This function enables the lease
manager to inform the caller whether the file_lease should remain on
the flc_list or be disposed of.

For the NFSD lease manager, this function now handles layout recall
timeouts. If the layout type supports fencing and the client has not
been fenced, a fence operation is triggered to prevent the client
from accessing the block device.

While the fencing operation is in progress, the conflicting file_lease
remains on the flc_list until fencing is complete. This guarantees
that no other clients can access the file, and the client with
exclusive access is properly blocked before disposal.

Signed-off-by: Dai Ngo <dai.ngo@oracle.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Documentation/admin-guide/nfs/pnfs-block-server.rst
Documentation/admin-guide/nfs/pnfs-scsi-server.rst
Documentation/filesystems/locking.rst
fs/locks.c
fs/nfsd/blocklayout.c
fs/nfsd/nfs4layouts.c
fs/nfsd/nfs4state.c
fs/nfsd/pnfs.h
fs/nfsd/state.h
include/linux/filelock.h

index 20fe9f5117fe05dcc7571a3f4cdf1e5760bcd713..b4f5997009af7122a2275e7e3fbc7a6e02ab0f5c 100644 (file)
@@ -40,3 +40,33 @@ how to translate the device into a serial number from SCSI EVPD 0x80::
 
        echo "fencing client ${CLIENT} serial ${EVPD}" >> /var/log/pnfsd-fence.log
        EOF
+
+If the nfsd server needs to fence a non-responding client and the
+fencing operation fails, the server logs a warning message in the
+system log with the following format:
+
+    FENCE failed client[IP_address] clid[#n] device[dev_name]
+
+    Where:
+
+    IP_address: refers to the IP address of the affected client.
+    #n: indicates the unique client identifier.
+    dev_name: specifies the name of the block device related
+              to the fencing attempt.
+
+The server retries the operation indefinitely. During
+this time, access to the affected file is restricted for all other
+clients. This is to prevent potential data corruption if multiple
+clients access the same file simultaneously.
+
+To restore access to the affected file for other clients, the admin
+needs to take the following actions:
+
+    . shut down or power off the client being fenced.
+    . manually expire the client to release all its state on the server:
+
+      echo 'expire' > /proc/fs/nfsd/clients/clid/ctl
+
+      Where:
+
+      clid: is the unique client identifier displayed in the system log.
index b2eec22883291be6d2755152b175ed259085e8f0..db34afbf67a9626ed346b7c12b065faf4b961236 100644 (file)
@@ -22,3 +22,34 @@ option and the underlying SCSI device support persistent reservations.
 On the client make sure the kernel has the CONFIG_PNFS_BLOCK option
 enabled, and the file system is mounted using the NFSv4.1 protocol
 version (mount -o vers=4.1).
+
+If the nfsd server needs to fence a non-responding client and the
+fencing operation fails, the server logs a warning message in the
+system log with the following format:
+
+    FENCE failed client[IP_address] clid[#n] device[dev_name]
+
+    Where:
+
+    IP_address: refers to the IP address of the affected client.
+    #n: indicates the unique client identifier.
+    dev_name: specifies the name of the block device related
+              to the fencing attempt.
+
+The server retries the operation indefinitely. During
+this time, access to the affected file is restricted for all other
+clients. This is to prevent potential data corruption if multiple
+clients access the same file simultaneously.
+
+To restore access to the affected file for other clients, the admin
+needs to take the following actions:
+
+    . shut down or power off the client being fenced.
+    . manually expire the client to release all its state on the server:
+
+      echo 'expire' > /proc/fs/nfsd/clients/clid/ctl
+
+      Where:
+
+      clid: is the unique client identifier displayed in the system log.
+
index 8025df6e64997a9a3f9a407b40056822e810a6a5..8421ea21bd35e3bbe0a07cef8a2c971c88c75da9 100644 (file)
@@ -398,6 +398,7 @@ prototypes::
        bool (*lm_breaker_owns_lease)(struct file_lock *);
         bool (*lm_lock_expirable)(struct file_lock *);
         void (*lm_expire_lock)(void);
+        bool (*lm_breaker_timedout)(struct file_lease *);
 
 locking rules:
 
@@ -412,6 +413,7 @@ lm_breaker_owns_lease:      yes             no                      no
 lm_lock_expirable      yes             no                      no
 lm_expire_lock         no              no                      yes
 lm_open_conflict       yes             no                      no
+lm_breaker_timedout     yes             no                      no
 ====================== =============   =================       =========
 
 buffer_head
index d13ec930b7bb03905df50bdce30b84749edab8d0..8e44b1f6c15a7b532cabf0a42b791a58999297a7 100644 (file)
@@ -1534,6 +1534,7 @@ static void time_out_leases(struct inode *inode, struct list_head *dispose)
 {
        struct file_lock_context *ctx = inode->i_flctx;
        struct file_lease *fl, *tmp;
+       bool remove;
 
        lockdep_assert_held(&ctx->flc_lock);
 
@@ -1541,8 +1542,19 @@ static void time_out_leases(struct inode *inode, struct list_head *dispose)
                trace_time_out_leases(inode, fl);
                if (past_time(fl->fl_downgrade_time))
                        lease_modify(fl, F_RDLCK, dispose);
-               if (past_time(fl->fl_break_time))
-                       lease_modify(fl, F_UNLCK, dispose);
+
+               remove = true;
+               if (past_time(fl->fl_break_time)) {
+                       /*
+                        * Consult the lease manager when a lease break times
+                        * out to determine whether the lease should be disposed
+                        * of.
+                        */
+                       if (fl->fl_lmops && fl->fl_lmops->lm_breaker_timedout)
+                               remove = fl->fl_lmops->lm_breaker_timedout(fl);
+                       if (remove)
+                               lease_modify(fl, F_UNLCK, dispose);
+               }
        }
 }
 
@@ -1670,9 +1682,13 @@ int __break_lease(struct inode *inode, unsigned int flags)
 restart:
        fl = list_first_entry(&ctx->flc_lease, struct file_lease, c.flc_list);
        break_time = fl->fl_break_time;
-       if (break_time != 0)
-               break_time -= jiffies;
-       if (break_time == 0)
+       if (break_time != 0) {
+               if (time_after(jiffies, break_time)) {
+                       fl->fl_break_time = jiffies + lease_break_time * HZ;
+                       break_time = lease_break_time * HZ;
+               } else
+                       break_time -= jiffies;
+       } else
                break_time++;
        locks_insert_block(&fl->c, &new_fl->c, leases_conflict);
        trace_break_lease_block(inode, new_fl);
index 8b987fca1e600f39232a1e84042151404999187d..9d829c84f374ec3b8b862279a0ad59c8675bc7a4 100644 (file)
@@ -297,6 +297,7 @@ static inline int nfsd4_scsi_fence_insert(struct nfs4_client *clp,
                ret = 0;
        }
        xa_unlock(xa);
+       clp->cl_fence_retry_warn = false;
        return ret;
 }
 
@@ -443,15 +444,33 @@ nfsd4_scsi_proc_layoutcommit(struct inode *inode, struct svc_rqst *rqstp,
        return nfsd4_block_commit_blocks(inode, lcp, iomaps, nr_iomaps);
 }
 
-static void
+/*
+ * Perform the fence operation to prevent the client from accessing the
+ * block device. If a fence operation is already in progress, wait for
+ * it to complete before checking the NFSD_MDS_PR_FENCED flag. Once the
+ * operation is complete, check the flag. If NFSD_MDS_PR_FENCED is set,
+ * update the layout stateid by setting the ls_fenced flag to indicate
+ * that the client has been fenced.
+ *
+ * The cl_fence_mutex ensures that the fence operation has been fully
+ * completed, rather than just in progress, when returning from this
+ * function.
+ *
+ * Return true if the client was fenced; otherwise, return false.
+ */
+static bool
 nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls, struct nfsd_file *file)
 {
        struct nfs4_client *clp = ls->ls_stid.sc_client;
        struct block_device *bdev = file->nf_file->f_path.mnt->mnt_sb->s_bdev;
        int status;
+       bool ret;
 
-       if (nfsd4_scsi_fence_set(clp, bdev->bd_dev))
-               return;
+       mutex_lock(&clp->cl_fence_mutex);
+       if (nfsd4_scsi_fence_set(clp, bdev->bd_dev)) {
+               mutex_unlock(&clp->cl_fence_mutex);
+               return true;
+       }
 
        status = bdev->bd_disk->fops->pr_ops->pr_preempt(bdev, NFSD_MDS_PR_KEY,
                        nfsd4_scsi_pr_key(clp),
@@ -470,13 +489,22 @@ nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls, struct nfsd_file *file)
         * PR_STS_RESERVATION_CONFLICT, which would cause an infinite
         * retry loop.
         */
-       if (status < 0 ||
-           status == PR_STS_PATH_FAILED ||
-           status == PR_STS_PATH_FAST_FAILED ||
-           status == PR_STS_RETRY_PATH_FAILURE)
+       switch (status) {
+       case 0:
+       case PR_STS_IOERR:
+       case PR_STS_RESERVATION_CONFLICT:
+               ret = true;
+               break;
+       default:
+               /* retry-able and other errors */
+               ret = false;
                nfsd4_scsi_fence_clear(clp, bdev->bd_dev);
+               break;
+       }
+       mutex_unlock(&clp->cl_fence_mutex);
 
        trace_nfsd_pnfs_fence(clp, bdev->bd_disk->disk_name, status);
+       return ret;
 }
 
 const struct nfsd4_layout_ops scsi_layout_ops = {
index ad7af8cfcf1f9019f290a22214f27c3ceeee33a4..69e41105efdd5b06f5e30dd4a64d7cdfc72fa2e2 100644 (file)
@@ -27,6 +27,8 @@ static struct kmem_cache *nfs4_layout_stateid_cache;
 static const struct nfsd4_callback_ops nfsd4_cb_layout_ops;
 static const struct lease_manager_operations nfsd4_layouts_lm_ops;
 
+static void nfsd4_layout_fence_worker(struct work_struct *work);
+
 const struct nfsd4_layout_ops *nfsd4_layout_ops[LAYOUT_TYPE_MAX] =  {
 #ifdef CONFIG_NFSD_FLEXFILELAYOUT
        [LAYOUT_FLEX_FILES]     = &ff_layout_ops,
@@ -177,6 +179,13 @@ nfsd4_free_layout_stateid(struct nfs4_stid *stid)
 
        trace_nfsd_layoutstate_free(&ls->ls_stid.sc_stateid);
 
+       spin_lock(&ls->ls_lock);
+       if (delayed_work_pending(&ls->ls_fence_work)) {
+               spin_unlock(&ls->ls_lock);
+               cancel_delayed_work_sync(&ls->ls_fence_work);
+       } else
+               spin_unlock(&ls->ls_lock);
+
        spin_lock(&clp->cl_lock);
        list_del_init(&ls->ls_perclnt);
        spin_unlock(&clp->cl_lock);
@@ -271,6 +280,10 @@ nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate,
        list_add(&ls->ls_perfile, &fp->fi_lo_states);
        spin_unlock(&fp->fi_lock);
 
+       ls->ls_fenced = false;
+       ls->ls_fence_delay = 0;
+       INIT_DELAYED_WORK(&ls->ls_fence_work, nfsd4_layout_fence_worker);
+
        trace_nfsd_layoutstate_alloc(&ls->ls_stid.sc_stateid);
        return ls;
 }
@@ -747,11 +760,9 @@ static bool
 nfsd4_layout_lm_break(struct file_lease *fl)
 {
        /*
-        * We don't want the locks code to timeout the lease for us;
-        * we'll remove it ourself if a layout isn't returned
-        * in time:
+        * Enforce break lease timeout to prevent NFSD
+        * thread from hanging in __break_lease.
         */
-       fl->fl_break_time = 0;
        nfsd4_recall_file_layout(fl->c.flc_owner);
        return false;
 }
@@ -782,10 +793,143 @@ nfsd4_layout_lm_open_conflict(struct file *filp, int arg)
        return 0;
 }
 
+static void
+nfsd4_layout_fence_worker(struct work_struct *work)
+{
+       struct delayed_work *dwork = to_delayed_work(work);
+       struct nfs4_layout_stateid *ls = container_of(dwork,
+                       struct nfs4_layout_stateid, ls_fence_work);
+       struct nfsd_file *nf;
+       struct block_device *bdev;
+       struct nfs4_client *clp;
+       struct nfsd_net *nn;
+
+       /*
+        * The workqueue clears WORK_STRUCT_PENDING before invoking
+        * this callback. Re-arm immediately so that
+        * delayed_work_pending() returns true while the fence
+        * operation is in progress, preventing
+        * lm_breaker_timedout() from taking a duplicate reference.
+        */
+       mod_delayed_work(system_dfl_wq, &ls->ls_fence_work, 0);
+
+       spin_lock(&ls->ls_lock);
+       if (list_empty(&ls->ls_layouts)) {
+               spin_unlock(&ls->ls_lock);
+dispose:
+               cancel_delayed_work(&ls->ls_fence_work);
+               /* unlock the lease so that tasks waiting on it can proceed */
+               nfsd4_close_layout(ls);
+
+               ls->ls_fenced = true;
+               nfs4_put_stid(&ls->ls_stid);
+               return;
+       }
+       spin_unlock(&ls->ls_lock);
+
+       rcu_read_lock();
+       nf = nfsd_file_get(ls->ls_file);
+       rcu_read_unlock();
+       if (!nf)
+               goto dispose;
+
+       clp = ls->ls_stid.sc_client;
+       nn = net_generic(clp->net, nfsd_net_id);
+       bdev = nf->nf_file->f_path.mnt->mnt_sb->s_bdev;
+       if (nfsd4_layout_ops[ls->ls_layout_type]->fence_client(ls, nf)) {
+               /* fenced ok */
+               nfsd_file_put(nf);
+               pr_warn("%s: FENCED client[%pISpc] clid[%d] to device[%s]\n",
+                       __func__, (struct sockaddr *)&clp->cl_addr,
+                       clp->cl_clientid.cl_id - nn->clientid_base,
+                       bdev->bd_disk->disk_name);
+               goto dispose;
+       }
+       /* fence failed */
+       nfsd_file_put(nf);
+
+       if (!clp->cl_fence_retry_warn) {
+               pr_warn("%s: FENCE failed client[%pISpc] clid[%d] device[%s]\n",
+                       __func__, (struct sockaddr *)&clp->cl_addr,
+                       clp->cl_clientid.cl_id - nn->clientid_base,
+                       bdev->bd_disk->disk_name);
+               clp->cl_fence_retry_warn = true;
+       }
+       /*
+        * The fence worker retries the fencing operation indefinitely to
+        * prevent data corruption. The admin needs to take the following
+        * actions to restore access to the file for other clients:
+        *
+        *  . shutdown or power off the client being fenced.
+        *  . manually expire the client to release all its state on the server;
+        *    echo 'expire' > /proc/fs/nfsd/clients/clid/ctl
+        *
+        *    Where:
+        *
+        *    clid: is the unique client identifier displayed in
+        *          the warning message above.
+        */
+       if (!ls->ls_fence_delay)
+               ls->ls_fence_delay = HZ;
+       else
+               ls->ls_fence_delay = min(ls->ls_fence_delay << 1,
+                                        MAX_FENCE_DELAY);
+       mod_delayed_work(system_dfl_wq, &ls->ls_fence_work, ls->ls_fence_delay);
+}
+
+/**
+ * nfsd4_layout_lm_breaker_timedout - The layout recall has timed out.
+ * @fl: file_lease whose break has timed out
+ *
+ * If the layout type supports a fence operation, schedule a worker to
+ * fence the client from accessing the block device.
+ *
+ * This function runs under the protection of the spin_lock flc_lock.
+ * At this time, the file_lease associated with the layout stateid is
+ * on the flc_list. A reference count is incremented on the layout
+ * stateid to prevent it from being freed while the fence worker is
+ * executing. Once the fence worker finishes its operation, it releases
+ * this reference.
+ *
+ * The fence worker continues to run until either the client has been
+ * fenced or the layout becomes invalid. The layout can become invalid
+ * as a result of a LAYOUTRETURN or when the CB_LAYOUT recall callback
+ * has completed.
+ *
+ * Return true if the file_lease should be disposed of by the caller;
+ * otherwise, return false.
+ */
+static bool
+nfsd4_layout_lm_breaker_timedout(struct file_lease *fl)
+{
+       struct nfs4_layout_stateid *ls = fl->c.flc_owner;
+
+       if ((!nfsd4_layout_ops[ls->ls_layout_type]->fence_client) ||
+                       ls->ls_fenced)
+               return true;
+       if (delayed_work_pending(&ls->ls_fence_work))
+               return false;
+       /*
+        * Make sure layout has not been returned yet before
+        * taking a reference count on the layout stateid.
+        */
+       spin_lock(&ls->ls_lock);
+       if (list_empty(&ls->ls_layouts) ||
+                       !refcount_inc_not_zero(&ls->ls_stid.sc_count)) {
+               spin_unlock(&ls->ls_lock);
+               return true;
+       }
+       spin_unlock(&ls->ls_lock);
+
+       mod_delayed_work(system_dfl_wq, &ls->ls_fence_work, 0);
+       return false;
+}
+
 static const struct lease_manager_operations nfsd4_layouts_lm_ops = {
        .lm_break               = nfsd4_layout_lm_break,
        .lm_change              = nfsd4_layout_lm_change,
        .lm_open_conflict       = nfsd4_layout_lm_open_conflict,
+       .lm_breaker_timedout    = nfsd4_layout_lm_breaker_timedout,
 };
 
 int
index 1b4c101ff04b9f1dda6ce0d40f9cc0a68c61f6a9..1d31f2bb21622680e04238be861d4040da7d2f01 100644 (file)
@@ -2386,6 +2386,7 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name,
 #endif
 #ifdef CONFIG_NFSD_SCSILAYOUT
        xa_init(&clp->cl_dev_fences);
+       mutex_init(&clp->cl_fence_mutex);
 #endif
        INIT_LIST_HEAD(&clp->async_copies);
        spin_lock_init(&clp->async_lock);
index db9af780438b5c512b8ada7af8fb43b528dcc64c..f7bee4dc5d3d209d810e75798207b6c119c8cd88 100644 (file)
@@ -11,6 +11,9 @@
 
 struct xdr_stream;
 
+/* Cap exponential backoff between fence retries at 3 minutes */
+#define        MAX_FENCE_DELAY         ((unsigned int)(3 * 60 * HZ))
+
 struct nfsd4_deviceid_map {
        struct list_head        hash;
        u64                     idx;
@@ -38,7 +41,7 @@ struct nfsd4_layout_ops {
                        struct svc_rqst *rqstp,
                        struct nfsd4_layoutcommit *lcp);
 
-       void (*fence_client)(struct nfs4_layout_stateid *ls,
+       bool (*fence_client)(struct nfs4_layout_stateid *ls,
                             struct nfsd_file *file);
 };
 
index 99aeaab9cf2b194ad5b62add147bcc2471181f0e..ec1c5467012ecc881e479e1a0441c726c198721a 100644 (file)
@@ -456,6 +456,7 @@ struct nfs4_client {
        struct list_head        cl_lru;         /* tail queue */
 #ifdef CONFIG_NFSD_PNFS
        struct list_head        cl_lo_states;   /* outstanding layout states */
+       bool                    cl_fence_retry_warn;
 #endif
        struct xdr_netobj       cl_name;        /* id generated by client */
        nfs4_verifier           cl_verifier;    /* generated by client */
@@ -529,6 +530,7 @@ struct nfs4_client {
        time64_t                cl_ra_time;
 #ifdef CONFIG_NFSD_SCSILAYOUT
        struct xarray           cl_dev_fences;
+       struct mutex            cl_fence_mutex;
 #endif
 };
 
@@ -745,6 +747,10 @@ struct nfs4_layout_stateid {
        stateid_t                       ls_recall_sid;
        bool                            ls_recalled;
        struct mutex                    ls_mutex;
+
+       struct delayed_work             ls_fence_work;
+       unsigned int                    ls_fence_delay;
+       bool                            ls_fenced;
 };
 
 static inline struct nfs4_layout_stateid *layoutstateid(struct nfs4_stid *s)
index d2c9740e26a8eb71c33d6fa6325fcab3183e2409..5f0a2fb31450607ae23d7f06b6372f4f63c1bf62 100644 (file)
@@ -50,6 +50,7 @@ struct lease_manager_operations {
        void (*lm_setup)(struct file_lease *, void **);
        bool (*lm_breaker_owns_lease)(struct file_lease *);
        int (*lm_open_conflict)(struct file *, int);
+       bool (*lm_breaker_timedout)(struct file_lease *fl);
 };
 
 struct lock_manager {