git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
NFSv4/pnfs: defer return_range callbacks until after inode unlock
author: Dai Ngo <dai.ngo@oracle.com>
Tue, 26 May 2026 23:29:53 +0000 (16:29 -0700)
committer: Anna Schumaker <anna.schumaker@hammerspace.com>
Mon, 8 Jun 2026 14:21:56 +0000 (10:21 -0400)
Sometimes unmounting an NFS filesystem mounted with pNFS SCSI
layouts triggers the following warning:

     BUG: scheduling while atomic: umount.nfs4/...

     __schedule_bug+0xbd/0x100
     schedule_debug.constprop.0+0x19f/0x220
     __schedule+0x10d/0x10a0
     schedule+0x74/0x190
     schedule_timeout+0xf5/0x220
     io_schedule_timeout+0xd5/0x160
     __wait_for_common+0x186/0x4b0
     blk_execute_rq+0x2ef/0x3a0
     scsi_execute_cmd+0x1ff/0x700
     sd_pr_out_command.isra.0+0x242/0x380 [sd_mod]
     bl_unregister_scsi.constprop.0+0x109/0x3c0 [blocklayoutdriver]
     bl_unregister_dev+0x175/0x1c0 [blocklayoutdriver]
     bl_free_device+0x1f/0x1b0 [blocklayoutdriver]
     bl_free_deviceid_node+0x12/0x30 [blocklayoutdriver]
     nfs4_put_deviceid_node+0x171/0x360 [nfsv4]
     ext_tree_remove+0x11c/0x1d0 [blocklayoutdriver]
     _pnfs_return_layout+0x416/0x900 [nfsv4]
     nfs4_evict_inode+0x108/0x130 [nfsv4]
     evict+0x316/0x750
     dispose_list+0xf1/0x1a0
     evict_inodes+0x33f/0x440
     generic_shutdown_super+0xc9/0x4e0
     kill_anon_super+0x3a/0x90
     nfs_kill_super+0x44/0x60 [nfs]
     deactivate_locked_super+0xb8/0x1b0
     cleanup_mnt+0x25a/0x380
     task_work_run+0x13e/0x210
     exit_to_user_mode_loop+0x169/0x400
     do_syscall_64+0x467/0x1550
     entry_SYSCALL_64_after_hwframe+0x76/0x7e

The warning occurs because the block layout driver unregisters the SCSI
device while the inode spinlock (ino->i_lock) is still held. Device
unregistration issues a SCSI PR command, which may sleep; sleeping with a
spinlock held results in a "scheduling while atomic" warning.

During layout return, the layout driver's return_range callback is
invoked while the inode spinlock (ino->i_lock) is held. For block
layouts, this callback reaches ext_tree_remove(), which eventually calls
bl_unregister_scsi(), which may block in scsi_execute_cmd() while issuing
PR commands to the device.

Fix this by deferring the return_range callbacks in both
initiate_file_draining() and _pnfs_return_layout() until after the inode
spinlock (ino->i_lock) has been released. The layout header reference
count is incremented before invoking return_range(), ensuring that the
layout header remains valid while the layout driver removes extents from
the extent tree.

Fixes: c88953d87f5c8 ("pnfs: add return_range method")
Signed-off-by: Dai Ngo <dai.ngo@oracle.com>
Signed-off-by: Anna Schumaker <anna.schumaker@hammerspace.com>
fs/nfs/callback_proc.c
fs/nfs/pnfs.c

index 4ea9221ded4264b8ea744f1bedd8d215faec1ffe..10f2354ba3048fcdbd9e9a0dd81eaabc2a3ea987 100644 (file)
@@ -257,6 +257,7 @@ static u32 initiate_file_draining(struct nfs_client *clp,
        struct pnfs_layout_hdr *lo;
        u32 rv = NFS4ERR_NOMATCHING_LAYOUT;
        LIST_HEAD(free_me_list);
+       bool return_range = false;
 
        ino = nfs_layout_find_inode(clp, &args->cbl_fh, &args->cbl_stateid);
        if (IS_ERR(ino)) {
@@ -301,13 +302,13 @@ static u32 initiate_file_draining(struct nfs_client *clp,
                /* Embrace your forgetfulness! */
                rv = NFS4ERR_NOMATCHING_LAYOUT;
 
-               if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) {
-                       NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo,
-                               &args->cbl_range);
-               }
+               return_range = true;
        }
 unlock:
        spin_unlock(&ino->i_lock);
+       if (return_range && NFS_SERVER(ino)->pnfs_curr_ld->return_range)
+               NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo,
+                       &args->cbl_range);
        pnfs_free_lseg_list(&free_me_list);
        /* Free all lsegs that are attached to commit buckets */
        nfs_commit_inode(ino, 0);
index cb203821a3971bbf902422b52d0acd72047a9394..7715e2bd5871bbee2207d9d9ad945a0c8604a90a 100644 (file)
@@ -1463,8 +1463,6 @@ _pnfs_return_layout(struct inode *ino)
        pnfs_clear_layoutcommit(ino, &tmp_list);
        pnfs_mark_matching_lsegs_return(lo, &tmp_list, &range, 0);
 
-       if (NFS_SERVER(ino)->pnfs_curr_ld->return_range)
-               NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo, &range);
 
        /* Don't send a LAYOUTRETURN if list was initially empty */
        if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags) ||
@@ -1476,6 +1474,8 @@ _pnfs_return_layout(struct inode *ino)
 
        send = pnfs_prepare_layoutreturn(lo, &stateid, &cred, NULL);
        spin_unlock(&ino->i_lock);
+       if (NFS_SERVER(ino)->pnfs_curr_ld->return_range)
+               NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo, &range);
        if (send)
                status = pnfs_send_layoutreturn(lo, &stateid, &cred, IOMODE_ANY,
                                                0);