git.ipfire.org Git - thirdparty/linux.git/commitdiff
mm/swap, PM: hibernate: fix swapoff race in uswsusp by pinning swap device
author: Youngjun Park <youngjun.park@lge.com>
Mon, 23 Mar 2026 16:08:21 +0000 (01:08 +0900)
committer: Andrew Morton <akpm@linux-foundation.org>
Tue, 9 Jun 2026 01:21:31 +0000 (18:21 -0700)
Patch series "mm/swap, PM: hibernate: fix swapoff race in uswsusp by
pinning swap device", v8.

Currently, in the uswsusp path, only the swap type value is retrieved at
lookup time without holding a reference. If swapoff races after the type
is acquired, subsequent slot allocations operate on a stale swap device.

Additionally, grabbing and releasing the swap device reference on every
slot allocation is inefficient across the entire hibernation swap path.

This patch series addresses these issues:
- Patch 1: Fixes the swapoff race in uswsusp by pinning the swap device
  from the point it is looked up until the session completes.
- Patch 2: Removes the overhead of per-slot reference counting in alloc/free
  paths and cleans up the redundant SWP_WRITEOK check.

This patch (of 2):

Hibernation via uswsusp (/dev/snapshot ioctls) has a race window: after
selecting the resume swap area but before user space is frozen, swapoff
may run and invalidate the selected swap device.

Fix this by pinning the swap device with SWP_HIBERNATION while it is in
use.  The pin is exclusive, which is sufficient since hibernate_acquire()
already prevents concurrent hibernation sessions.

The kernel swsusp path (sysfs-based hibernate/resume) uses
find_hibernation_swap_type() which is not affected by the pin.  It freezes
user space before touching swap, so swapoff cannot race.

Introduce dedicated helpers:
- pin_hibernation_swap_type(): Look up and pin the swap device.
  Used by the uswsusp path.
- find_hibernation_swap_type(): Lookup without pinning.
  Used by the kernel swsusp path.
- unpin_hibernation_swap_type(): Clear the hibernation pin.

While a swap device is pinned, swapoff is prevented from proceeding.

Link: https://lore.kernel.org/20260323160822.1409904-1-youngjun.park@lge.com
Link: https://lore.kernel.org/20260323160822.1409904-2-youngjun.park@lge.com
Signed-off-by: Youngjun Park <youngjun.park@lge.com>
Reviewed-by: Kairui Song <kasong@tencent.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Chris Li <chrisl@kernel.org>
Cc: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: "Rafael J . Wysocki" <rafael@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
include/linux/swap.h
kernel/power/swap.c
kernel/power/user.c
mm/swapfile.c

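diff --git a/include/linux/swap.h b/include/linux/swap.h
index 8c43bc3055c95b294bfb36bd972f2f32c7f6b4f6..8f0f68e245baa347ed2112b56c9afd284e38dd3c 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h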
@@ -213,6 +213,7 @@ enum {
        SWP_PAGE_DISCARD = (1 << 10),   /* freed swap page-cluster discards */
        SWP_STABLE_WRITES = (1 << 11),  /* no overwrite PG_writeback pages */
        SWP_SYNCHRONOUS_IO = (1 << 12), /* synchronous IO is efficient */
+       SWP_HIBERNATION = (1 << 13),    /* pinned for hibernation */
                                        /* add others here before... */
 };
 
@@ -432,7 +433,9 @@ static inline long get_nr_swap_pages(void)
 }
 
 extern void si_swapinfo(struct sysinfo *);
-int swap_type_of(dev_t device, sector_t offset);
+extern int pin_hibernation_swap_type(dev_t device, sector_t offset);
+extern void unpin_hibernation_swap_type(int type);
+extern int find_hibernation_swap_type(dev_t device, sector_t offset);
 int find_first_swap(dev_t *device);
 extern unsigned int count_swap_pages(int, int);
 extern sector_t swapdev_block(int, pgoff_t);
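diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 2e64869bb5a093d7a1d2decfe7be9e7f20092d47..cc4764149e8fce67cc975aa2d1e1b447d4ffe3eb 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c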
@@ -341,7 +341,7 @@ static int swsusp_swap_check(void)
         * This is called before saving the image.
         */
        if (swsusp_resume_device)
-               res = swap_type_of(swsusp_resume_device, swsusp_resume_block);
+               res = find_hibernation_swap_type(swsusp_resume_device, swsusp_resume_block);
        else
                res = find_first_swap(&swsusp_resume_device);
        if (res < 0)
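diff --git a/kernel/power/user.c b/kernel/power/user.c
index be77f3556bd7da592c068458ca2e9d430ad4cc38..d0fcfba7ac235a483f645ccc579779a9e9907912 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c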
@@ -71,7 +71,7 @@ static int snapshot_open(struct inode *inode, struct file *filp)
        memset(&data->handle, 0, sizeof(struct snapshot_handle));
        if ((filp->f_flags & O_ACCMODE) == O_RDONLY) {
                /* Hibernating.  The image device should be accessible. */
-               data->swap = swap_type_of(swsusp_resume_device, 0);
+               data->swap = pin_hibernation_swap_type(swsusp_resume_device, 0);
                data->mode = O_RDONLY;
                data->free_bitmaps = false;
                error = pm_notifier_call_chain_robust(PM_HIBERNATION_PREPARE, PM_POST_HIBERNATION);
@@ -90,8 +90,10 @@ static int snapshot_open(struct inode *inode, struct file *filp)
                        data->free_bitmaps = !error;
                }
        }
-       if (error)
+       if (error) {
+               unpin_hibernation_swap_type(data->swap);
                hibernate_release();
+       }
 
        data->frozen = false;
        data->ready = false;
@@ -115,6 +117,7 @@ static int snapshot_release(struct inode *inode, struct file *filp)
        data = filp->private_data;
        data->dev = 0;
        free_all_swap_pages(data->swap);
+       unpin_hibernation_swap_type(data->swap);
        if (data->frozen) {
                pm_restore_gfp_mask();
                free_basic_memory_bitmaps();
@@ -235,11 +238,17 @@ static int snapshot_set_swap_area(struct snapshot_data *data,
                offset = swap_area.offset;
        }
 
+       /*
+        * Unpin the swap device if a swap area was already
+        * set by SNAPSHOT_SET_SWAP_AREA.
+        */
+       unpin_hibernation_swap_type(data->swap);
+
        /*
         * User space encodes device types as two-byte values,
         * so we need to recode them
         */
-       data->swap = swap_type_of(swdev, offset);
+       data->swap = pin_hibernation_swap_type(swdev, offset);
        if (data->swap < 0)
                return swdev ? -ENODEV : -EINVAL;
        data->dev = swdev;
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 615d908671113f94f7c69f30c4be50116528051b..5e1e605ad9a1e85c78150e91e0bbbc2c3f884312 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -132,7 +132,7 @@ static DEFINE_PER_CPU(struct percpu_swap_cluster, percpu_swap_cluster) = {
 /* May return NULL on invalid type, caller must check for NULL return */
 static struct swap_info_struct *swap_type_to_info(int type)
 {
-       if (type >= MAX_SWAPFILES)
+       if (type < 0 || type >= MAX_SWAPFILES)
                return NULL;
        return READ_ONCE(swap_info[type]); /* rcu_dereference() */
 }
@@ -2199,22 +2199,15 @@ void swap_free_hibernation_slot(swp_entry_t entry)
        put_swap_device(si);
 }
 
-/*
- * Find the swap type that corresponds to given device (if any).
- *
- * @offset - number of the PAGE_SIZE-sized block of the device, starting
- * from 0, in which the swap header is expected to be located.
- *
- * This is needed for the suspend to disk (aka swsusp).
- */
-int swap_type_of(dev_t device, sector_t offset)
+static int __find_hibernation_swap_type(dev_t device, sector_t offset)
 {
        int type;
 
+       lockdep_assert_held(&swap_lock);
+
        if (!device)
-               return -1;
+               return -EINVAL;
 
-       spin_lock(&swap_lock);
        for (type = 0; type < nr_swapfiles; type++) {
                struct swap_info_struct *sis = swap_info[type];
 
@@ -2224,16 +2217,118 @@ int swap_type_of(dev_t device, sector_t offset)
                if (device == sis->bdev->bd_dev) {
                        struct swap_extent *se = first_se(sis);
 
-                       if (se->start_block == offset) {
-                               spin_unlock(&swap_lock);
+                       if (se->start_block == offset)
                                return type;
-                       }
                }
        }
-       spin_unlock(&swap_lock);
        return -ENODEV;
 }
 
+/**
+ * pin_hibernation_swap_type - Pin the swap device for hibernation
+ * @device: Block device containing the resume image
+ * @offset: Offset identifying the swap area
+ *
+ * Locate the swap device for @device/@offset and mark it as pinned
+ * for hibernation. While pinned, swapoff() is prevented.
+ *
+ * Only one uswsusp context may pin a swap device at a time.
+ * If already pinned, this function returns -EBUSY.
+ *
+ * Return:
+ * >= 0 on success (swap type).
+ * -EINVAL if @device is invalid.
+ * -ENODEV if the swap device is not found.
+ * -EBUSY if the device is already pinned for hibernation.
+ */
+int pin_hibernation_swap_type(dev_t device, sector_t offset)
+{
+       int type;
+       struct swap_info_struct *si;
+
+       spin_lock(&swap_lock);
+
+       type = __find_hibernation_swap_type(device, offset);
+       if (type < 0) {
+               spin_unlock(&swap_lock);
+               return type;
+       }
+
+       si = swap_type_to_info(type);
+       if (WARN_ON_ONCE(!si)) {
+               spin_unlock(&swap_lock);
+               return -ENODEV;
+       }
+
+       /*
+        * hibernate_acquire() prevents concurrent hibernation sessions.
+        * This check additionally guards against double-pinning within
+        * the same session.
+        */
+       if (WARN_ON_ONCE(si->flags & SWP_HIBERNATION)) {
+               spin_unlock(&swap_lock);
+               return -EBUSY;
+       }
+
+       si->flags |= SWP_HIBERNATION;
+
+       spin_unlock(&swap_lock);
+       return type;
+}
+
+/**
+ * unpin_hibernation_swap_type - Unpin the swap device for hibernation
+ * @type: Swap type previously returned by pin_hibernation_swap_type()
+ *
+ * Clear the hibernation pin on the given swap device, allowing
+ * swapoff() to proceed normally.
+ *
+ * If @type does not refer to a valid swap device, this function
+ * does nothing.
+ */
+void unpin_hibernation_swap_type(int type)
+{
+       struct swap_info_struct *si;
+
+       spin_lock(&swap_lock);
+       si = swap_type_to_info(type);
+       if (!si) {
+               spin_unlock(&swap_lock);
+               return;
+       }
+       si->flags &= ~SWP_HIBERNATION;
+       spin_unlock(&swap_lock);
+}
+
+/**
+ * find_hibernation_swap_type - Find swap type for hibernation
+ * @device: Block device containing the resume image
+ * @offset: Offset within the device identifying the swap area
+ *
+ * Locate the swap device corresponding to @device and @offset.
+ *
+ * Unlike pin_hibernation_swap_type(), this function only performs a
+ * lookup and does not mark the swap device as pinned for hibernation.
+ *
+ * This is safe in the sysfs-based hibernation path where user space
+ * is already frozen and swapoff() cannot run concurrently.
+ *
+ * Return:
+ * A non-negative swap type on success.
+ * -EINVAL if @device is invalid.
+ * -ENODEV if no matching swap device is found.
+ */
+int find_hibernation_swap_type(dev_t device, sector_t offset)
+{
+       int type;
+
+       spin_lock(&swap_lock);
+       type = __find_hibernation_swap_type(device, offset);
+       spin_unlock(&swap_lock);
+
+       return type;
+}
+
 int find_first_swap(dev_t *device)
 {
        int type;
@@ -2996,6 +3091,14 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
                spin_unlock(&swap_lock);
                goto out_dput;
        }
+
+       /* Refuse swapoff while the device is pinned for hibernation */
+       if (p->flags & SWP_HIBERNATION) {
+               err = -EBUSY;
+               spin_unlock(&swap_lock);
+               goto out_dput;
+       }
+
        if (!security_vm_enough_memory_mm(current->mm, p->pages))
                vm_unacct_memory(p->pages);
        else {