return VM_FAULT_SIGBUS;
}
-static struct folio *__alloc_swap_folio(struct vm_fault *vmf)
-{
- struct vm_area_struct *vma = vmf->vma;
- struct folio *folio;
- softleaf_t entry;
-
- folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vmf->address);
- if (!folio)
- return NULL;
-
- entry = softleaf_from_pte(vmf->orig_pte);
- if (mem_cgroup_swapin_charge_folio(folio, vma->vm_mm,
- GFP_KERNEL, entry)) {
- folio_put(folio);
- return NULL;
- }
-
- return folio;
-}
-
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
* Check if the PTEs within a range are contiguous swap entries
*/
if (unlikely(swap_zeromap_batch(entry, nr_pages, NULL) != nr_pages))
return false;
- if (unlikely(non_swapcache_batch(entry, nr_pages) != nr_pages))
- return false;
return true;
}
return orders;
}
-static struct folio *alloc_swap_folio(struct vm_fault *vmf)
+static unsigned long thp_swapin_suitable_orders(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
unsigned long orders;
- struct folio *folio;
unsigned long addr;
softleaf_t entry;
spinlock_t *ptl;
pte_t *pte;
- gfp_t gfp;
int order;
/*
* maintain the uffd semantics.
*/
if (unlikely(userfaultfd_armed(vma)))
- goto fallback;
+ return 0;
/*
* A large swapped out folio could be partially or fully in zswap. We
* folio.
*/
if (!zswap_never_enabled())
- goto fallback;
+ return 0;
entry = softleaf_from_pte(vmf->orig_pte);
/*
vmf->address, orders);
if (!orders)
- goto fallback;
+ return 0;
pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
vmf->address & PMD_MASK, &ptl);
if (unlikely(!pte))
- goto fallback;
+ return 0;
/*
* For do_swap_page, find the highest order where the aligned range is
pte_unmap_unlock(pte, ptl);
- /* Try allocating the highest of the remaining orders. */
- gfp = vma_thp_gfp_mask(vma);
- while (orders) {
- addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
- folio = vma_alloc_folio(gfp, order, vma, addr);
- if (folio) {
- if (!mem_cgroup_swapin_charge_folio(folio, vma->vm_mm,
- gfp, entry))
- return folio;
- count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK_CHARGE);
- folio_put(folio);
- }
- count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK);
- order = next_order(&orders, order);
- }
-
-fallback:
- return __alloc_swap_folio(vmf);
+ return orders;
}
#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
-static struct folio *alloc_swap_folio(struct vm_fault *vmf)
+static unsigned long thp_swapin_suitable_orders(struct vm_fault *vmf)
{
- return __alloc_swap_folio(vmf);
+ return 0;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
if (folio)
swap_update_readahead(folio, vma, vmf->address);
if (!folio) {
- if (data_race(si->flags & SWP_SYNCHRONOUS_IO)) {
- folio = alloc_swap_folio(vmf);
- if (folio) {
- /*
- * folio is charged, so swapin can only fail due
- * to raced swapin and return NULL.
- */
- swapcache = swapin_folio(entry, folio);
- if (swapcache != folio)
- folio_put(folio);
- folio = swapcache;
- }
- } else {
+ /* Swapin bypasses readahead for SWP_SYNCHRONOUS_IO devices */
+ if (data_race(si->flags & SWP_SYNCHRONOUS_IO))
+ folio = swapin_sync(entry, GFP_HIGHUSER_MOVABLE,
+ thp_swapin_suitable_orders(vmf) | BIT(0),
+ vmf, NULL, 0);
+ else
folio = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vmf);
- }
- if (!folio) {
+ if (IS_ERR_OR_NULL(folio)) {
/*
* Back out if somebody else faulted in this pte
* while we released the pte lock.
if (likely(vmf->pte &&
pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
ret = VM_FAULT_OOM;
+ folio = NULL;
goto unlock;
}
static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
struct folio **foliop, enum sgp_type sgp, gfp_t gfp,
- struct vm_area_struct *vma, vm_fault_t *fault_type);
+ struct vm_fault *vmf, vm_fault_t *fault_type);
static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
{
}
+/*
+ * Swap in the folio backing @entry through swapin_sync(), which skips
+ * readahead.
+ *
+ * The requested @order is only attempted when userfaultfd is not armed on
+ * the faulting VMA and zswap has never been enabled. If a large order
+ * attempt fails, the swapin is retried once with order 0.
+ *
+ * Returns whatever swapin_sync() returned on the last attempt: the folio,
+ * or an ERR_PTR() once the order 0 attempt has failed as well.
+ */
static struct folio *shmem_swap_alloc_folio(struct inode *inode,
- struct vm_area_struct *vma, pgoff_t index,
+ struct vm_fault *vmf, pgoff_t index,
swp_entry_t entry, int order, gfp_t gfp)
{
+ pgoff_t ilx;
+ struct folio *folio;
+ struct mempolicy *mpol;
struct shmem_inode_info *info = SHMEM_I(inode);
- struct folio *new, *swapcache;
- int nr_pages = 1 << order;
- gfp_t alloc_gfp = gfp;
-
- if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
- if (WARN_ON_ONCE(order))
- return ERR_PTR(-EINVAL);
- } else if (order) {
- /*
- * If uffd is active for the vma, we need per-page fault
- * fidelity to maintain the uffd semantics, then fallback
- * to swapin order-0 folio, as well as for zswap case.
- * Any existing sub folio in the swap cache also blocks
- * mTHP swapin.
- */
- if ((vma && unlikely(userfaultfd_armed(vma))) ||
- !zswap_never_enabled() ||
- non_swapcache_batch(entry, nr_pages) != nr_pages)
- goto fallback;
- alloc_gfp = thp_shmem_limit_gfp_mask(vma_thp_gfp_mask(vma), gfp);
- }
-retry:
- new = shmem_alloc_folio(alloc_gfp, order, info, index);
- if (!new) {
- new = ERR_PTR(-ENOMEM);
- goto fallback;
- }
+ /*
+ * If uffd is active for the vma, we need per-page fault fidelity to
+ * maintain the uffd semantics, so only swap in an order-0 folio. The
+ * same restriction applies once zswap has ever been enabled. @vmf may
+ * be NULL, in which case only the zswap check applies.
+ */
+ if ((vmf && unlikely(userfaultfd_armed(vmf->vma))) ||
+ !zswap_never_enabled())
+ order = 0;
- if (mem_cgroup_swapin_charge_folio(new, vma ? vma->vm_mm : NULL,
- alloc_gfp, entry)) {
- folio_put(new);
- new = ERR_PTR(-ENOMEM);
- goto fallback;
- }
+again:
+ /* The policy lookup takes @order, so it is redone on every attempt. */
+ mpol = shmem_get_pgoff_policy(info, index, order, &ilx);
+ folio = swapin_sync(entry, gfp, BIT(order), vmf, mpol, ilx);
+ mpol_cond_put(mpol);
- swapcache = swapin_folio(entry, new);
- if (swapcache != new) {
- folio_put(new);
- if (!swapcache) {
- /*
- * The new folio is charged already, swapin can
- * only fail due to another raced swapin.
- */
- new = ERR_PTR(-EEXIST);
- goto fallback;
- }
+ if (!IS_ERR(folio))
+ return folio;
+
+ /* Large order swapin failed, fall back to order 0 and try again. */
+ if (order) {
+ order = 0;
+ goto again;
}
- return swapcache;
-fallback:
- /* Order 0 swapin failed, nothing to fallback to, abort */
- if (!order)
- return new;
- entry.val += index - round_down(index, nr_pages);
- alloc_gfp = gfp;
- nr_pages = 1;
- order = 0;
- goto retry;
+
+ /* Order 0 swapin failed, nothing to fall back to: return the error. */
+ return folio;
}
/*
*/
static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
struct folio **foliop, enum sgp_type sgp,
- gfp_t gfp, struct vm_area_struct *vma,
+ gfp_t gfp, struct vm_fault *vmf,
vm_fault_t *fault_type)
{
struct address_space *mapping = inode->i_mapping;
- struct mm_struct *fault_mm = vma ? vma->vm_mm : NULL;
+ struct vm_area_struct *vma = vmf ? vmf->vma : NULL;
+ struct mm_struct *fault_mm = vmf ? vmf->vma->vm_mm : NULL;
struct shmem_inode_info *info = SHMEM_I(inode);
swp_entry_t swap;
softleaf_t index_entry;
if (!folio) {
if (data_race(si->flags & SWP_SYNCHRONOUS_IO)) {
/* Direct swapin skipping swap cache & readahead */
- folio = shmem_swap_alloc_folio(inode, vma, index,
- index_entry, order, gfp);
- if (IS_ERR(folio)) {
- error = PTR_ERR(folio);
- folio = NULL;
- goto failed;
- }
+ folio = shmem_swap_alloc_folio(inode, vmf, index,
+ swap, order, gfp);
} else {
/* Cached swapin only supports order 0 folio */
folio = shmem_swapin_cluster(swap, gfp, info, index);
- if (!folio) {
+ }
+ if (IS_ERR_OR_NULL(folio)) {
+ if (IS_ERR(folio))
+ error = PTR_ERR(folio);
+ else
error = -ENOMEM;
- goto failed;
- }
+ folio = NULL;
+ goto failed;
}
if (fault_type) {
*fault_type |= VM_FAULT_MAJOR;
if (xa_is_value(folio)) {
error = shmem_swapin_folio(inode, index, &folio,
- sgp, gfp, vma, fault_type);
+ sgp, gfp, vmf, fault_type);
if (error == -EEXIST)
goto repeat;
struct mempolicy *mpol, pgoff_t ilx);
struct folio *swapin_readahead(swp_entry_t entry, gfp_t flag,
struct vm_fault *vmf);
-struct folio *swapin_folio(swp_entry_t entry, struct folio *folio);
+struct folio *swapin_sync(swp_entry_t entry, gfp_t flag, unsigned long orders,
+ struct vm_fault *vmf, struct mempolicy *mpol, pgoff_t ilx);
void swap_update_readahead(struct folio *folio, struct vm_area_struct *vma,
unsigned long addr);
return find_next_bit(sis->zeromap, end, start) - start;
}
-static inline int non_swapcache_batch(swp_entry_t entry, int max_nr)
-{
- int i;
-
- /*
- * While allocating a large folio and doing mTHP swapin, we need to
- * ensure all entries are not cached, otherwise, the mTHP folio will
- * be in conflict with the folio in swap cache.
- */
- for (i = 0; i < max_nr; i++) {
- if (swap_cache_has_folio(entry))
- return i;
- entry.val++;
- }
-
- return i;
-}
-
#else /* CONFIG_SWAP */
struct swap_iocb;
static inline struct swap_cluster_info *swap_cluster_lock(
return NULL;
}
-static inline struct folio *swapin_folio(swp_entry_t entry, struct folio *folio)
+static inline struct folio *swapin_sync(
+ swp_entry_t entry, gfp_t flag, unsigned long orders,
+ struct vm_fault *vmf, struct mempolicy *mpol, pgoff_t ilx)
{
return NULL;
}
{
return 0;
}
-
-static inline int non_swapcache_batch(swp_entry_t entry, int max_nr)
-{
- return 0;
-}
#endif /* CONFIG_SWAP */
#endif /* _MM_SWAP_H */
lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr_pages);
}
-/**
- * swap_cache_add_folio - Add a folio into the swap cache.
- * @folio: The folio to be added.
- * @entry: The swap entry corresponding to the folio.
- * @shadowp: If a shadow is found, return the shadow.
- *
- * Add a folio into the swap cache. Will return error if any slot is no
- * longer a valid swapped out slot or already occupied by another folio.
- *
- * Context: Caller must ensure @entry is valid and protect the swap device
- * with reference count or locks.
- */
-static int swap_cache_add_folio(struct folio *folio, swp_entry_t entry,
- void **shadowp)
-{
- int err;
- void *shadow = NULL;
- struct swap_info_struct *si;
- struct swap_cluster_info *ci;
- unsigned long nr_pages = folio_nr_pages(folio);
-
- si = __swap_entry_to_info(entry);
- ci = swap_cluster_lock(si, swp_offset(entry));
- err = __swap_cache_add_check(ci, entry, nr_pages, &shadow);
- if (err) {
- swap_cluster_unlock(ci);
- return err;
- }
-
- __swap_cache_add_folio(ci, folio, entry);
- swap_cluster_unlock(ci);
- if (shadowp)
- *shadowp = shadow;
-
- return 0;
-}
-
static void __swap_cache_do_del_folio(struct swap_cluster_info *ci,
struct folio *folio,
swp_entry_t entry, void *shadow)
}
}
-/**
- * __swap_cache_prepare_and_add - Prepare the folio and add it to swap cache.
- * @entry: swap entry to be bound to the folio.
- * @folio: folio to be added.
- * @gfp: memory allocation flags for charge, can be 0 if @charged if true.
- * @charged: if the folio is already charged.
- *
- * Update the swap_map and add folio as swap cache, typically before swapin.
- * All swap slots covered by the folio must have a non-zero swap count.
- *
- * Context: Caller must protect the swap device with reference count or locks.
- * Return: 0 if success, error code if failed.
- */
-static int __swap_cache_prepare_and_add(swp_entry_t entry,
- struct folio *folio,
- gfp_t gfp, bool charged)
-{
- void *shadow;
- int ret;
-
- __folio_set_locked(folio);
- __folio_set_swapbacked(folio);
-
- if (!charged && mem_cgroup_swapin_charge_folio(folio, NULL, gfp, entry)) {
- ret = -ENOMEM;
- goto failed;
- }
-
- ret = swap_cache_add_folio(folio, entry, &shadow);
- if (ret)
- goto failed;
-
- memcg1_swapin(entry, folio_nr_pages(folio));
- if (shadow)
- workingset_refault(folio, shadow);
-
- /* Caller will initiate read into locked folio */
- folio_add_lru(folio);
- return 0;
-
-failed:
- folio_unlock(folio);
- return ret;
-}
-
static struct folio *swap_cache_read_folio(swp_entry_t entry, gfp_t gfp,
struct mempolicy *mpol, pgoff_t ilx,
struct swap_iocb **plug, bool readahead)
folio = swap_cache_get_folio(entry);
if (folio)
return folio;
-
folio = swap_cache_alloc_folio(entry, gfp, BIT(0), NULL, mpol, ilx);
} while (PTR_ERR(folio) == -EEXIST);
}
/**
- * swapin_folio - swap-in one or multiple entries skipping readahead.
- * @entry: starting swap entry to swap in
- * @folio: a new allocated and charged folio
+ * swapin_sync - swap in one or multiple entries, skipping readahead.
+ * @entry: swap entry indicating the target slot
+ * @gfp: memory allocation flags
+ * @orders: bitmask of folio orders that may be allocated, bit N set allows
+ * order N (e.g. BIT(0) allows only an order-0 folio)
+ * @vmf: fault information, handed to swap_cache_alloc_folio(); callers
+ * without a fault context pass NULL
+ * @mpol: NUMA memory allocation policy to be applied, callers may pass NULL
+ * @ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE
*
- * Reads @entry into @folio, @folio will be added to the swap cache.
- * If @folio is a large folio, the @entry will be rounded down to align
- * with the folio size.
+ * Looks up @entry in the swap cache first and returns the cached folio if
+ * there is one; no read is started by this function in that case. Otherwise
+ * a folio is allocated with swap_cache_alloc_folio() for the given @orders
+ * and the read is started with swap_read_folio(). The lookup is repeated
+ * for as long as the allocation fails with -EEXIST. @entry is rounded down
+ * if @orders allow large allocation.
*
- * Return: returns pointer to @folio on success. If folio is a large folio
- * and this raced with another swapin, NULL will be returned to allow fallback
- * to order 0. Else, if another folio was already added to the swap cache,
- * return that swap cache folio instead.
+ * Context: Caller must ensure @entry is valid and pin the swap device with
+ * refcount.
+ * Return: the folio on success, or the ERR_PTR() encoded error returned by
+ * swap_cache_alloc_folio() on failure.
*/
-struct folio *swapin_folio(swp_entry_t entry, struct folio *folio)
+struct folio *swapin_sync(swp_entry_t entry, gfp_t gfp, unsigned long orders,
+ struct vm_fault *vmf, struct mempolicy *mpol, pgoff_t ilx)
{
- int ret;
- struct folio *swapcache;
- pgoff_t offset = swp_offset(entry);
- unsigned long nr_pages = folio_nr_pages(folio);
-
- entry = swp_entry(swp_type(entry), round_down(offset, nr_pages));
- for (;;) {
- ret = __swap_cache_prepare_and_add(entry, folio, 0, true);
- if (!ret) {
- swap_read_folio(folio, NULL);
- break;
- }
+ struct folio *folio;
- /*
- * Large order allocation needs special handling on
- * race: if a smaller folio exists in cache, swapin needs
- * to fall back to order 0, and doing a swap cache lookup
- * might return a folio that is irrelevant to the faulting
- * entry because @entry is aligned down. Just return NULL.
- */
- if (ret != -EEXIST || nr_pages > 1)
- return NULL;
+ do {
+ folio = swap_cache_get_folio(entry);
+ if (folio)
+ return folio;
+ folio = swap_cache_alloc_folio(entry, gfp, orders, vmf, mpol, ilx);
+ } while (PTR_ERR(folio) == -EEXIST);
- swapcache = swap_cache_get_folio(entry);
- if (swapcache)
- return swapcache;
- }
+ if (IS_ERR(folio))
+ return folio;
+ swap_read_folio(folio, NULL);
return folio;
}
* do_swap_page()
* ... swapoff+swapon
* swap_cache_alloc_folio()
- * swap_cache_add_folio()
- * // check swap_map
+ * // check swap_map
* // verify PTE not changed
*
* In __swap_duplicate(), the swap_map need to be checked before