/**
* __swap_cache_add_check - Check if a range is suitable for adding a folio.
- * @ci: The locked swap cluster.
- * @ci_off: Range start offset.
- * @nr: Number of slots to check.
- * @shadow: Returns the shadow value if one exists in the range.
+ * @ci: The locked swap cluster
+ * @targ_entry: The target swap entry to check, will be rounded down by @nr
+ * @nr: Number of slots to check, must be a power of 2
+ * @shadowp: Returns the shadow value of the target slot if it has one, may be NULL.
*
* Check if all slots covered by given range have a swap count >= 1.
* Retrieves the shadow if there is one.
* Return: 0 if success, error code if failed.
*/
static int __swap_cache_add_check(struct swap_cluster_info *ci,
- unsigned int ci_off, unsigned int nr,
- void **shadow)
+ swp_entry_t targ_entry,
+ unsigned long nr, void **shadowp)
{
- unsigned int ci_end = ci_off + nr;
+ unsigned int ci_off, ci_end;
unsigned long old_tb;
lockdep_assert_held(&ci->lock);
- if (WARN_ON_ONCE(ci_off >= SWAPFILE_CLUSTER))
- return -EINVAL;
+ /*
+ * If the target slot is not swapped out or is already cached, return
+ * -ENOENT or -EEXIST respectively. If the batch is not suitable, which
+ * could be due to a race with a concurrent free or cache add, return
+ * -EBUSY.
+ */
if (unlikely(!ci->table))
return -ENOENT;
+ ci_off = swp_cluster_offset(targ_entry);
+ old_tb = __swap_table_get(ci, ci_off);
+ if (swp_tb_is_folio(old_tb))
+ return -EEXIST;
+ if (!__swp_tb_get_count(old_tb))
+ return -ENOENT;
+ if (swp_tb_is_shadow(old_tb) && shadowp)
+ *shadowp = swp_tb_to_shadow(old_tb);
+
+ if (nr == 1)
+ return 0;
+
+ ci_off = round_down(ci_off, nr);
+ ci_end = ci_off + nr;
do {
old_tb = __swap_table_get(ci, ci_off);
- if (unlikely(swp_tb_is_folio(old_tb)))
- return -EEXIST;
- if (unlikely(!__swp_tb_get_count(old_tb)))
- return -ENOENT;
- if (swp_tb_is_shadow(old_tb))
- *shadow = swp_tb_to_shadow(old_tb);
+ if (unlikely(swp_tb_is_folio(old_tb) ||
+ !__swp_tb_get_count(old_tb)))
+ return -EBUSY;
} while (++ci_off < ci_end);
return 0;
{
int err;
void *shadow = NULL;
- unsigned int ci_off;
struct swap_info_struct *si;
struct swap_cluster_info *ci;
unsigned long nr_pages = folio_nr_pages(folio);
si = __swap_entry_to_info(entry);
ci = swap_cluster_lock(si, swp_offset(entry));
- ci_off = swp_cluster_offset(entry);
- err = __swap_cache_add_check(ci, ci_off, nr_pages, &shadow);
+ err = __swap_cache_add_check(ci, entry, nr_pages, &shadow);
if (err) {
swap_cluster_unlock(ci);
return err;
}
}
+/*
+ * Try to allocate a folio of given order in the swap cache.
+ *
+ * This helper resolves the potential races of swap allocation
+ * and prepares a folio to be used for swap IO. On success the folio is
+ * returned locked, in the swap cache and on the LRU. On failure an
+ * ERR_PTR() with one of the following error codes is returned:
+ *
+ * -ENOMEM / -EBUSY: Folio allocation or memcg charge failed, or the range
+ * is in conflict with a sub slot, caller should shrink the order and retry
+ * -ENOENT / -EEXIST: Target swap entry is unavailable or cached, the caller
+ * should abort or try to use the cached folio instead
+ */
+static struct folio *__swap_cache_alloc(struct swap_cluster_info *ci,
+ swp_entry_t targ_entry, gfp_t gfp,
+ unsigned int order, struct vm_fault *vmf,
+ struct mempolicy *mpol, pgoff_t ilx)
+{
+ int err;
+ swp_entry_t entry;
+ struct folio *folio;
+ void *shadow = NULL;
+ unsigned long address, nr_pages = 1UL << order;
+ struct vm_area_struct *vma = vmf ? vmf->vma : NULL;
+
+ VM_WARN_ON_ONCE(nr_pages > SWAPFILE_CLUSTER);
+ /* @entry is @targ_entry rounded down to the start of the nr_pages range */
+ entry.val = round_down(targ_entry.val, nr_pages);
+
+ /* Check if the slot and range are available, skip allocation if not */
+ spin_lock(&ci->lock);
+ err = __swap_cache_add_check(ci, targ_entry, nr_pages, NULL);
+ spin_unlock(&ci->lock);
+ if (unlikely(err))
+ return ERR_PTR(err);
+
+ /*
+ * Limit THP gfp. The limitation is a no-op for typical
+ * GFP_HIGHUSER_MOVABLE but matters for shmem.
+ */
+ if (order)
+ gfp = thp_shmem_limit_gfp_mask(vma_thp_gfp_mask(vma), gfp);
+
+ /*
+ * Use the mempolicy based allocation if a policy is given or there is
+ * no fault info, otherwise allocate for the faulting VMA at the fault
+ * address aligned down to the folio size.
+ */
+ if (mpol || !vmf) {
+ folio = folio_alloc_mpol(gfp, order, mpol, ilx, numa_node_id());
+ } else {
+ address = round_down(vmf->address, PAGE_SIZE << order);
+ folio = vma_alloc_folio(gfp, order, vmf->vma, address);
+ }
+ if (unlikely(!folio))
+ return ERR_PTR(-ENOMEM);
+
+ /*
+ * Double check the range is still not in conflict, the lock was
+ * dropped during the allocation. This time also fetch the shadow.
+ */
+ spin_lock(&ci->lock);
+ err = __swap_cache_add_check(ci, targ_entry, nr_pages, &shadow);
+ if (unlikely(err)) {
+ spin_unlock(&ci->lock);
+ folio_put(folio);
+ return ERR_PTR(err);
+ }
+
+ /* Add the folio under the same lock hold that did the check above */
+ __folio_set_locked(folio);
+ __folio_set_swapbacked(folio);
+ __swap_cache_do_add_folio(ci, folio, entry);
+ spin_unlock(&ci->lock);
+
+ if (mem_cgroup_swapin_charge_folio(folio, vmf ? vmf->vma->vm_mm : NULL,
+ gfp, entry)) {
+ /* Charge failed, remove the folio again and hand back the shadow */
+ spin_lock(&ci->lock);
+ __swap_cache_do_del_folio(ci, folio, entry, shadow);
+ spin_unlock(&ci->lock);
+ folio_unlock(folio);
+ /* nr_pages refs from swap cache, 1 from allocation */
+ folio_put_refs(folio, nr_pages + 1);
+ count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK_CHARGE);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ /* For memsw accounting, swap is uncharged when folio is added to swap cache */
+ memcg1_swapin(entry, 1 << order);
+ if (shadow)
+ workingset_refault(folio, shadow);
+
+ node_stat_mod_folio(folio, NR_FILE_PAGES, nr_pages);
+ lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr_pages);
+
+ /* Caller will initiate read into the locked folio */
+ folio_add_lru(folio);
+ return folio;
+}
+
+/**
+ * swap_cache_alloc_folio - Allocate folio for swapped out slot in swap cache.
+ * @targ_entry: swap entry indicating the target slot
+ * @gfp: memory allocation flags
+ * @orders: bitmask of candidate allocation orders, must be non-zero
+ * @vmf: fault information, may be NULL
+ * @mpol: NUMA memory allocation policy to be applied
+ * @ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE
+ *
+ * Allocate a folio in the swap cache for one swap slot, typically before
+ * doing IO (e.g. swap in or zswap writeback). The swap slot indicated by
+ * @targ_entry must have a non-zero swap count (swapped out).
+ *
+ * The highest order in @orders is tried first. If that fails with -EBUSY
+ * or -ENOMEM, the next order in @orders is tried until none is left. Any
+ * other error, or a failure at order 0, is returned immediately.
+ *
+ * Context: Caller must protect the swap device with reference count or locks.
+ * Return: Returns the folio if allocation succeeded and folio is in the swap
+ * cache. Returns error code if failed due to race, OOM or invalid arguments.
+ */
+struct folio *swap_cache_alloc_folio(swp_entry_t targ_entry, gfp_t gfp,
+ unsigned long orders, struct vm_fault *vmf,
+ struct mempolicy *mpol, pgoff_t ilx)
+{
+ int order, err;
+ struct folio *ret;
+ struct swap_cluster_info *ci;
+
+ ci = __swap_entry_to_cluster(targ_entry);
+ order = highest_order(orders);
+
+ /* orders must be non-zero, and must not exceed cluster size. */
+ if (WARN_ON_ONCE(!orders || (1UL << order) > SWAPFILE_CLUSTER))
+ return ERR_PTR(-EINVAL);
+
+ do {
+ ret = __swap_cache_alloc(ci, targ_entry, gfp, order,
+ vmf, mpol, ilx);
+ if (!IS_ERR(ret))
+ break;
+ err = PTR_ERR(ret);
+ /* Only -EBUSY and -ENOMEM at a non-zero order are retried */
+ if (!order || (err && err != -EBUSY && err != -ENOMEM))
+ break;
+ /* Account the fallback from this order, then move to the next one */
+ count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK);
+ order = next_order(&orders, order);
+ } while (orders);
+
+ return ret;
+}
+
/*
* If we are the only user, then try to free up the swap cache.
*
return ret;
}
-/**
- * swap_cache_alloc_folio - Allocate folio for swapped out slot in swap cache.
- * @entry: the swapped out swap entry to be binded to the folio.
- * @gfp_mask: memory allocation flags
- * @mpol: NUMA memory allocation policy to be applied
- * @ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE
- *
- * Allocate a folio in the swap cache for one swap slot, typically before
- * doing IO (e.g. swap in or zswap writeback). The swap slot indicated by
- * @entry must have a non-zero swap count (swapped out).
- * Currently only supports order 0.
- *
- * Context: Caller must protect the swap device with reference count or locks.
- * Return: Returns the folio if allocation succeeded and folio is added to
- * swap cache. Returns error code if allocation failed due to race or OOM.
- */
-struct folio *swap_cache_alloc_folio(swp_entry_t entry, gfp_t gfp_mask,
- struct mempolicy *mpol, pgoff_t ilx)
-{
- int err;
- struct folio *folio;
-
- /* Allocate a new folio to be added into the swap cache. */
- folio = folio_alloc_mpol(gfp_mask, 0, mpol, ilx, numa_node_id());
- if (!folio)
- return ERR_PTR(-ENOMEM);
-
- /*
- * Try to add the new folio to the swap cache. It returns
- * -EEXIST if the entry is already cached.
- */
- err = __swap_cache_prepare_and_add(entry, folio, gfp_mask, false);
- if (err) {
- folio_put(folio);
- return ERR_PTR(err);
- }
-
- return folio;
-}
-
static struct folio *swap_cache_read_folio(swp_entry_t entry, gfp_t gfp,
struct mempolicy *mpol, pgoff_t ilx,
struct swap_iocb **plug, bool readahead)
{
- struct swap_info_struct *si = __swap_entry_to_info(entry);
struct folio *folio;
- /* Check the swap cache again for readahead path. */
- folio = swap_cache_get_folio(entry);
- if (folio)
- return folio;
-
- /* Skip allocation for unused and bad swap slot for readahead. */
- if (!swap_entry_swapped(si, entry))
- return NULL;
-
do {
folio = swap_cache_get_folio(entry);
if (folio)
return folio;
- folio = swap_cache_alloc_folio(entry, gfp, mpol, ilx);
+ folio = swap_cache_alloc_folio(entry, gfp, BIT(0), NULL, mpol, ilx);
} while (PTR_ERR(folio) == -EEXIST);
if (IS_ERR_OR_NULL(folio))