NFSD: Enforce timeout on layout recall and integrate lease manager fencing

author Dai Ngo <dai.ngo@oracle.com>

Fri, 13 Feb 2026 18:36:30 +0000 (10:36 -0800)

committer Chuck Lever <chuck.lever@oracle.com>

Mon, 30 Mar 2026 01:25:09 +0000 (21:25 -0400)
author Dai Ngo <dai.ngo@oracle.com>
Fri, 13 Feb 2026 18:36:30 +0000 (10:36 -0800)
committer Chuck Lever <chuck.lever@oracle.com>
Mon, 30 Mar 2026 01:25:09 +0000 (21:25 -0400)
diff --git a/Documentation/admin-guide/nfs/pnfs-block-server.rst b/Documentation/admin-guide/nfs/pnfs-block-server.rst

index 20fe9f5117fe05dcc7571a3f4cdf1e5760bcd713..b4f5997009af7122a2275e7e3fbc7a6e02ab0f5c 100644 (file)
--- a/Documentation/admin-guide/nfs/pnfs-block-server.rst
+++ b/Documentation/admin-guide/nfs/pnfs-block-server.rst
@@ -40,3 +40,33 @@ how to translate the device into a serial number from SCSI EVPD 0x80::
  
         echo "fencing client ${CLIENT} serial ${EVPD}" >> /var/log/pnfsd-fence.log
         EOF
+
+If the nfsd server needs to fence a non-responding client and the
+fencing operation fails, the server logs a warning message in the
+system log with the following format:
+
+    FENCE failed client[IP_address] clid[#n] device[dev_name]
+
+    Where:
+
+    IP_address: refers to the IP address of the affected client.
+    #n: indicates the unique client identifier.
+    dev_name: specifies the name of the block device related
+              to the fencing attempt.
+
+The server will retry the fencing operation indefinitely. During
+this time, access to the affected file is restricted for all other
+clients. This is to prevent potential data corruption if multiple
+clients access the same file simultaneously.
+
+To restore access to the affected file for other clients, the admin
+needs to take the following actions:
+
+    . shut down or power off the client being fenced.
+    . manually expire the client to release all its state on the server:
+
+      echo 'expire' > /proc/fs/nfsd/clients/clid/ctl
+
+      Where:
+
+      clid: is the unique client identifier displayed in the system log.
diff --git a/Documentation/admin-guide/nfs/pnfs-scsi-server.rst b/Documentation/admin-guide/nfs/pnfs-scsi-server.rst

index b2eec22883291be6d2755152b175ed259085e8f0..db34afbf67a9626ed346b7c12b065faf4b961236 100644 (file)
--- a/Documentation/admin-guide/nfs/pnfs-scsi-server.rst
+++ b/Documentation/admin-guide/nfs/pnfs-scsi-server.rst
@@ -22,3 +22,34 @@ option and the underlying SCSI device support persistent reservations.
  On the client make sure the kernel has the CONFIG_PNFS_BLOCK option
  enabled, and the file system is mounted using the NFSv4.1 protocol
  version (mount -o vers=4.1).
+
+If the nfsd server needs to fence a non-responding client and the
+fencing operation fails, the server logs a warning message in the
+system log with the following format:
+
+    FENCE failed client[IP_address] clid[#n] device[dev_name]
+
+    Where:
+
+    IP_address: refers to the IP address of the affected client.
+    #n: indicates the unique client identifier.
+    dev_name: specifies the name of the block device related
+              to the fencing attempt.
+
+The server will retry the fencing operation indefinitely. During
+this time, access to the affected file is restricted for all other
+clients. This is to prevent potential data corruption if multiple
+clients access the same file simultaneously.
+
+To restore access to the affected file for other clients, the admin
+needs to take the following actions:
+
+    . shut down or power off the client being fenced.
+    . manually expire the client to release all its state on the server:
+
+      echo 'expire' > /proc/fs/nfsd/clients/clid/ctl
+
+      Where:
+
+      clid: is the unique client identifier displayed in the system log.
+
diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst

index 8025df6e64997a9a3f9a407b40056822e810a6a5..8421ea21bd35e3bbe0a07cef8a2c971c88c75da9 100644 (file)
--- a/Documentation/filesystems/locking.rst
+++ b/Documentation/filesystems/locking.rst
@@ -398,6 +398,7 @@ prototypes::
         bool (*lm_breaker_owns_lease)(struct file_lock *);
          bool (*lm_lock_expirable)(struct file_lock *);
          void (*lm_expire_lock)(void);
+        bool (*lm_breaker_timedout)(struct file_lease *);
  
  locking rules:
  
@@ -412,6 +413,7 @@ lm_breaker_owns_lease:      yes             no                      no
  lm_lock_expirable      yes             no                      no
  lm_expire_lock         no              no                      yes
  lm_open_conflict       yes             no                      no
+lm_breaker_timedout     yes             no                      no
  ====================== =============   =================       =========
  
  buffer_head
diff --git a/fs/locks.c b/fs/locks.c

index d13ec930b7bb03905df50bdce30b84749edab8d0..8e44b1f6c15a7b532cabf0a42b791a58999297a7 100644 (file)
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1534,6 +1534,7 @@ static void time_out_leases(struct inode *inode, struct list_head *dispose)
  {
         struct file_lock_context *ctx = inode->i_flctx;
         struct file_lease *fl, *tmp;
+       bool remove;
  
         lockdep_assert_held(&ctx->flc_lock);
  
@@ -1541,8 +1542,19 @@ static void time_out_leases(struct inode *inode, struct list_head *dispose)
                 trace_time_out_leases(inode, fl);
                 if (past_time(fl->fl_downgrade_time))
                         lease_modify(fl, F_RDLCK, dispose);
-               if (past_time(fl->fl_break_time))
-                       lease_modify(fl, F_UNLCK, dispose);
+
+               remove = true;
+               if (past_time(fl->fl_break_time)) {
+                       /*
+                        * Consult the lease manager when a lease break times
+                        * out to determine whether the lease should be disposed
+                        * of.
+                        */
+                       if (fl->fl_lmops && fl->fl_lmops->lm_breaker_timedout)
+                               remove = fl->fl_lmops->lm_breaker_timedout(fl);
+                       if (remove)
+                               lease_modify(fl, F_UNLCK, dispose);
+               }
         }
  }
  
@@ -1670,9 +1682,13 @@ int __break_lease(struct inode *inode, unsigned int flags)
  restart:
         fl = list_first_entry(&ctx->flc_lease, struct file_lease, c.flc_list);
         break_time = fl->fl_break_time;
-       if (break_time != 0)
-               break_time -= jiffies;
-       if (break_time == 0)
+       if (break_time != 0) {
+               if (time_after(jiffies, break_time)) {
+                       fl->fl_break_time = jiffies + lease_break_time * HZ;
+                       break_time = lease_break_time * HZ;
+               } else
+                       break_time -= jiffies;
+       } else
                 break_time++;
         locks_insert_block(&fl->c, &new_fl->c, leases_conflict);
         trace_break_lease_block(inode, new_fl);
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c

index 8b987fca1e600f39232a1e84042151404999187d..9d829c84f374ec3b8b862279a0ad59c8675bc7a4 100644 (file)
--- a/fs/nfsd/blocklayout.c
+++ b/fs/nfsd/blocklayout.c
@@ -297,6 +297,7 @@ static inline int nfsd4_scsi_fence_insert(struct nfs4_client *clp,
                 ret = 0;
         }
         xa_unlock(xa);
+       clp->cl_fence_retry_warn = false;
         return ret;
  }
  
@@ -443,15 +444,33 @@ nfsd4_scsi_proc_layoutcommit(struct inode *inode, struct svc_rqst *rqstp,
         return nfsd4_block_commit_blocks(inode, lcp, iomaps, nr_iomaps);
  }
  
-static void
+/*
+ * Perform the fence operation to prevent the client from accessing the
+ * block device. If a fence operation is already in progress, wait for
+ * it to complete before checking the NFSD_MDS_PR_FENCED flag. If the
+ * flag is already set, the client has been fenced and no new preempt
+ * is issued. This function only reports the result; the fence worker
+ * records it in the layout stateid by setting the ls_fenced flag.
+ *
+ * The cl_fence_mutex ensures that the fence operation has been fully
+ * completed, rather than just in progress, when returning from this
+ * function.
+ *
+ * Return true if client was fenced otherwise return false.
+ */
+static bool
  nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls, struct nfsd_file *file)
  {
         struct nfs4_client *clp = ls->ls_stid.sc_client;
         struct block_device *bdev = file->nf_file->f_path.mnt->mnt_sb->s_bdev;
         int status;
+       bool ret;
  
-       if (nfsd4_scsi_fence_set(clp, bdev->bd_dev))
-               return;
+       mutex_lock(&clp->cl_fence_mutex);
+       if (nfsd4_scsi_fence_set(clp, bdev->bd_dev)) {
+               mutex_unlock(&clp->cl_fence_mutex);
+               return true;
+       }
  
         status = bdev->bd_disk->fops->pr_ops->pr_preempt(bdev, NFSD_MDS_PR_KEY,
                         nfsd4_scsi_pr_key(clp),
@@ -470,13 +489,22 @@ nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls, struct nfsd_file *file)
          * PR_STS_RESERVATION_CONFLICT, which would cause an infinite
          * retry loop.
          */
-       if (status < 0 ||
-           status == PR_STS_PATH_FAILED ||
-           status == PR_STS_PATH_FAST_FAILED ||
-           status == PR_STS_RETRY_PATH_FAILURE)
+       switch (status) {
+       case 0:
+       case PR_STS_IOERR:
+       case PR_STS_RESERVATION_CONFLICT:
+               ret = true;
+               break;
+       default:
+               /* retry-able and other errors */
+               ret = false;
                 nfsd4_scsi_fence_clear(clp, bdev->bd_dev);
+               break;
+       }
+       mutex_unlock(&clp->cl_fence_mutex);
  
         trace_nfsd_pnfs_fence(clp, bdev->bd_disk->disk_name, status);
+       return ret;
  }
  
  const struct nfsd4_layout_ops scsi_layout_ops = {
diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c

index ad7af8cfcf1f9019f290a22214f27c3ceeee33a4..69e41105efdd5b06f5e30dd4a64d7cdfc72fa2e2 100644 (file)
--- a/fs/nfsd/nfs4layouts.c
+++ b/fs/nfsd/nfs4layouts.c
@@ -27,6 +27,8 @@ static struct kmem_cache *nfs4_layout_stateid_cache;
  static const struct nfsd4_callback_ops nfsd4_cb_layout_ops;
  static const struct lease_manager_operations nfsd4_layouts_lm_ops;
  
+static void nfsd4_layout_fence_worker(struct work_struct *work);
+
  const struct nfsd4_layout_ops *nfsd4_layout_ops[LAYOUT_TYPE_MAX] =  {
  #ifdef CONFIG_NFSD_FLEXFILELAYOUT
         [LAYOUT_FLEX_FILES]     = &ff_layout_ops,
@@ -177,6 +179,13 @@ nfsd4_free_layout_stateid(struct nfs4_stid *stid)
  
         trace_nfsd_layoutstate_free(&ls->ls_stid.sc_stateid);
  
+       spin_lock(&ls->ls_lock);
+       if (delayed_work_pending(&ls->ls_fence_work)) {
+               spin_unlock(&ls->ls_lock);
+               cancel_delayed_work_sync(&ls->ls_fence_work);
+       } else
+               spin_unlock(&ls->ls_lock);
+
         spin_lock(&clp->cl_lock);
         list_del_init(&ls->ls_perclnt);
         spin_unlock(&clp->cl_lock);
@@ -271,6 +280,10 @@ nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate,
         list_add(&ls->ls_perfile, &fp->fi_lo_states);
         spin_unlock(&fp->fi_lock);
  
+       ls->ls_fenced = false;
+       ls->ls_fence_delay = 0;
+       INIT_DELAYED_WORK(&ls->ls_fence_work, nfsd4_layout_fence_worker);
+
         trace_nfsd_layoutstate_alloc(&ls->ls_stid.sc_stateid);
         return ls;
  }
@@ -747,11 +760,9 @@ static bool
  nfsd4_layout_lm_break(struct file_lease *fl)
  {
         /*
-        * We don't want the locks code to timeout the lease for us;
-        * we'll remove it ourself if a layout isn't returned
-        * in time:
+        * Enforce break lease timeout to prevent NFSD
+        * thread from hanging in __break_lease.
          */
-       fl->fl_break_time = 0;
         nfsd4_recall_file_layout(fl->c.flc_owner);
         return false;
  }
@@ -782,10 +793,143 @@ nfsd4_layout_lm_open_conflict(struct file *filp, int arg)
         return 0;
  }
  
+static void
+nfsd4_layout_fence_worker(struct work_struct *work)
+{
+       struct delayed_work *dwork = to_delayed_work(work);
+       struct nfs4_layout_stateid *ls = container_of(dwork,
+                       struct nfs4_layout_stateid, ls_fence_work);
+       struct nfsd_file *nf;
+       struct block_device *bdev;
+       struct nfs4_client *clp;
+       struct nfsd_net *nn;
+
+       /*
+        * The workqueue clears WORK_STRUCT_PENDING before invoking
+        * this callback. Re-arm immediately so that
+        * delayed_work_pending() returns true while the fence
+        * operation is in progress, preventing
+        * lm_breaker_timedout() from taking a duplicate reference.
+        */
+       mod_delayed_work(system_dfl_wq, &ls->ls_fence_work, 0);
+
+       spin_lock(&ls->ls_lock);
+       if (list_empty(&ls->ls_layouts)) {
+               spin_unlock(&ls->ls_lock);
+dispose:
+               cancel_delayed_work(&ls->ls_fence_work);
+               /* Unlock the lease so that tasks waiting on it can proceed. */
+               nfsd4_close_layout(ls);
+
+               ls->ls_fenced = true;
+               nfs4_put_stid(&ls->ls_stid);
+               return;
+       }
+       spin_unlock(&ls->ls_lock);
+
+       rcu_read_lock();
+       nf = nfsd_file_get(ls->ls_file);
+       rcu_read_unlock();
+       if (!nf)
+               goto dispose;
+
+       clp = ls->ls_stid.sc_client;
+       nn = net_generic(clp->net, nfsd_net_id);
+       bdev = nf->nf_file->f_path.mnt->mnt_sb->s_bdev;
+       if (nfsd4_layout_ops[ls->ls_layout_type]->fence_client(ls, nf)) {
+               /* Fencing succeeded: log it and dispose of the layout. */
+               nfsd_file_put(nf);
+               pr_warn("%s: FENCED client[%pISpc] clid[%d] to device[%s]\n",
+                       __func__, (struct sockaddr *)&clp->cl_addr,
+                       clp->cl_clientid.cl_id - nn->clientid_base,
+                       bdev->bd_disk->disk_name);
+               goto dispose;
+       }
+       /* Fencing failed: warn unless already warned, then retry with backoff. */
+       nfsd_file_put(nf);
+
+       if (!clp->cl_fence_retry_warn) {
+               pr_warn("%s: FENCE failed client[%pISpc] clid[%d] device[%s]\n",
+                       __func__, (struct sockaddr *)&clp->cl_addr,
+                       clp->cl_clientid.cl_id - nn->clientid_base,
+                       bdev->bd_disk->disk_name);
+               clp->cl_fence_retry_warn = true;
+       }
+       /*
+        * The fence worker retries the fencing operation indefinitely to
+        * prevent data corruption. The admin needs to take the following
+        * actions to restore access to the file for other clients:
+        *
+        *  . shut down or power off the client being fenced.
+        *  . manually expire the client to release all its state on the server:
+        *    echo 'expire' > /proc/fs/nfsd/clients/clid/ctl
+        *
+        *    Where:
+        *
+        *    clid: is the unique client identifier displayed in
+        *          the warning message above.
+        */
+       if (!ls->ls_fence_delay)
+               ls->ls_fence_delay = HZ;
+       else
+               ls->ls_fence_delay = min(ls->ls_fence_delay << 1,
+                                        MAX_FENCE_DELAY);
+       mod_delayed_work(system_dfl_wq, &ls->ls_fence_work, ls->ls_fence_delay);
+}
+
+/**
+ * nfsd4_layout_lm_breaker_timedout - The layout recall has timed out.
+ * @fl: file lease whose break has timed out
+ *
+ * If the layout type supports a fence operation, schedule a worker to
+ * fence the client from accessing the block device.
+ *
+ * This function runs under the protection of the spin_lock flc_lock.
+ * At this time, the file_lease associated with the layout stateid is
+ * on the flc_list. A reference count is incremented on the layout
+ * stateid to prevent it from being freed while the fence worker is
+ * executing. Once the fence worker finishes its operation, it releases
+ * this reference.
+ *
+ * The fence worker continues to run until either the client has been
+ * fenced or the layout becomes invalid. The layout can become invalid
+ * as a result of a LAYOUTRETURN or when the CB_LAYOUT recall callback
+ * has completed.
+ *
+ * Return true if the file_lease should be disposed of by the caller;
+ * otherwise, return false.
+ */
+static bool
+nfsd4_layout_lm_breaker_timedout(struct file_lease *fl)
+{
+       struct nfs4_layout_stateid *ls = fl->c.flc_owner;
+
+       if ((!nfsd4_layout_ops[ls->ls_layout_type]->fence_client) ||
+                       ls->ls_fenced)
+               return true;
+       if (delayed_work_pending(&ls->ls_fence_work))
+               return false;
+       /*
+        * Make sure the layout has not been returned yet before
+        * taking a reference count on the layout stateid.
+        */
+       spin_lock(&ls->ls_lock);
+       if (list_empty(&ls->ls_layouts) ||
+                       !refcount_inc_not_zero(&ls->ls_stid.sc_count)) {
+               spin_unlock(&ls->ls_lock);
+               return true;
+       }
+       spin_unlock(&ls->ls_lock);
+
+       mod_delayed_work(system_dfl_wq, &ls->ls_fence_work, 0);
+       return false;
+}
+
  static const struct lease_manager_operations nfsd4_layouts_lm_ops = {
         .lm_break               = nfsd4_layout_lm_break,
         .lm_change              = nfsd4_layout_lm_change,
         .lm_open_conflict       = nfsd4_layout_lm_open_conflict,
+       .lm_breaker_timedout    = nfsd4_layout_lm_breaker_timedout,
  };
  
  int
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c

index 1b4c101ff04b9f1dda6ce0d40f9cc0a68c61f6a9..1d31f2bb21622680e04238be861d4040da7d2f01 100644 (file)
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -2386,6 +2386,7 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name,
  #endif
  #ifdef CONFIG_NFSD_SCSILAYOUT
         xa_init(&clp->cl_dev_fences);
+       mutex_init(&clp->cl_fence_mutex);
  #endif
         INIT_LIST_HEAD(&clp->async_copies);
         spin_lock_init(&clp->async_lock);
diff --git a/fs/nfsd/pnfs.h b/fs/nfsd/pnfs.h

index db9af780438b5c512b8ada7af8fb43b528dcc64c..f7bee4dc5d3d209d810e75798207b6c119c8cd88 100644 (file)
--- a/fs/nfsd/pnfs.h
+++ b/fs/nfsd/pnfs.h
@@ -11,6 +11,9 @@
  
  struct xdr_stream;
  
+/* Cap exponential backoff between fence retries at 3 minutes */
+#define        MAX_FENCE_DELAY         ((unsigned int)(3 * 60 * HZ))
+
  struct nfsd4_deviceid_map {
         struct list_head        hash;
         u64                     idx;
@@ -38,7 +41,7 @@ struct nfsd4_layout_ops {
                         struct svc_rqst *rqstp,
                         struct nfsd4_layoutcommit *lcp);
  
-       void (*fence_client)(struct nfs4_layout_stateid *ls,
+       bool (*fence_client)(struct nfs4_layout_stateid *ls,
                              struct nfsd_file *file);
  };
  
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h

index 99aeaab9cf2b194ad5b62add147bcc2471181f0e..ec1c5467012ecc881e479e1a0441c726c198721a 100644 (file)
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -456,6 +456,7 @@ struct nfs4_client {
         struct list_head        cl_lru;         /* tail queue */
  #ifdef CONFIG_NFSD_PNFS
         struct list_head        cl_lo_states;   /* outstanding layout states */
+       bool                    cl_fence_retry_warn;
  #endif
         struct xdr_netobj       cl_name;        /* id generated by client */
         nfs4_verifier           cl_verifier;    /* generated by client */
@@ -529,6 +530,7 @@ struct nfs4_client {
         time64_t                cl_ra_time;
  #ifdef CONFIG_NFSD_SCSILAYOUT
         struct xarray           cl_dev_fences;
+       struct mutex            cl_fence_mutex;
  #endif
  };
  
@@ -745,6 +747,10 @@ struct nfs4_layout_stateid {
         stateid_t                       ls_recall_sid;
         bool                            ls_recalled;
         struct mutex                    ls_mutex;
+
+       struct delayed_work             ls_fence_work;
+       unsigned int                    ls_fence_delay;
+       bool                            ls_fenced;
  };
  
  static inline struct nfs4_layout_stateid *layoutstateid(struct nfs4_stid *s)
diff --git a/include/linux/filelock.h b/include/linux/filelock.h

index d2c9740e26a8eb71c33d6fa6325fcab3183e2409..5f0a2fb31450607ae23d7f06b6372f4f63c1bf62 100644 (file)
--- a/include/linux/filelock.h
+++ b/include/linux/filelock.h
@@ -50,6 +50,7 @@ struct lease_manager_operations {
         void (*lm_setup)(struct file_lease *, void **);
         bool (*lm_breaker_owns_lease)(struct file_lease *);
         int (*lm_open_conflict)(struct file *, int);
+       bool (*lm_breaker_timedout)(struct file_lease *fl);
  };
  
  struct lock_manager {
author	Dai Ngo <dai.ngo@oracle.com>
	Fri, 13 Feb 2026 18:36:30 +0000 (10:36 -0800)
committer	Chuck Lever <chuck.lever@oracle.com>
	Mon, 30 Mar 2026 01:25:09 +0000 (21:25 -0400)
Documentation/admin-guide/nfs/pnfs-block-server.rst		patch \| blob \| blame \| history
Documentation/admin-guide/nfs/pnfs-scsi-server.rst		patch \| blob \| blame \| history
Documentation/filesystems/locking.rst		patch \| blob \| blame \| history
fs/locks.c		patch \| blob \| blame \| history
fs/nfsd/blocklayout.c		patch \| blob \| blame \| history
fs/nfsd/nfs4layouts.c		patch \| blob \| blame \| history
fs/nfsd/nfs4state.c		patch \| blob \| blame \| history
fs/nfsd/pnfs.h		patch \| blob \| blame \| history
fs/nfsd/state.h		patch \| blob \| blame \| history
include/linux/filelock.h		patch \| blob \| blame \| history