}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-static DECLARE_WAIT_QUEUE_HEAD(swapcache_wq);
+/* Sanity check that a folio is fully exclusive */
+static void check_swap_exclusive(struct folio *folio, swp_entry_t entry,
+ unsigned int nr_pages)
+{
+ /* Called with the PT lock and folio lock held, so the swap count is stable */
+ do {
+ VM_WARN_ON_ONCE_FOLIO(__swap_count(entry) != 1, folio);
+ entry.val++;
+ } while (--nr_pages);
+}
/*
* We enter with non-exclusive mmap_lock (to exclude vma changes,
vm_fault_t do_swap_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
- struct folio *swapcache, *folio = NULL;
- DECLARE_WAITQUEUE(wait, current);
+ struct folio *swapcache = NULL, *folio;
struct page *page;
struct swap_info_struct *si = NULL;
rmap_t rmap_flags = RMAP_NONE;
- bool need_clear_cache = false;
bool exclusive = false;
softleaf_t entry;
pte_t pte;
vm_fault_t ret = 0;
- void *shadow = NULL;
int nr_pages;
unsigned long page_idx;
unsigned long address;
folio = swap_cache_get_folio(entry);
if (folio)
swap_update_readahead(folio, vma, vmf->address);
- swapcache = folio;
-
if (!folio) {
- if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
- __swap_count(entry) == 1) {
- /* skip swapcache */
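+ /* Synchronous IO device: swap the folio in directly, without readahead */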
+ if (data_race(si->flags & SWP_SYNCHRONOUS_IO)) {
folio = alloc_swap_folio(vmf);
if (folio) {
- __folio_set_locked(folio);
- __folio_set_swapbacked(folio);
-
- nr_pages = folio_nr_pages(folio);
- if (folio_test_large(folio))
- entry.val = ALIGN_DOWN(entry.val, nr_pages);
/*
- * Prevent parallel swapin from proceeding with
- * the cache flag. Otherwise, another thread
- * may finish swapin first, free the entry, and
- * swapout reusing the same entry. It's
- * undetectable as pte_same() returns true due
- * to entry reuse.
+ * The folio is charged, so swapin can only fail due to a
+ * raced swapin, in which case it returns NULL.
*/
- if (swapcache_prepare(entry, nr_pages)) {
- /*
- * Relax a bit to prevent rapid
- * repeated page faults.
- */
- add_wait_queue(&swapcache_wq, &wait);
- schedule_timeout_uninterruptible(1);
- remove_wait_queue(&swapcache_wq, &wait);
- goto out_page;
- }
- need_clear_cache = true;
-
- memcg1_swapin(entry, nr_pages);
-
- shadow = swap_cache_get_shadow(entry);
- if (shadow)
- workingset_refault(folio, shadow);
-
- folio_add_lru(folio);
-
- /* To provide entry to swap_read_folio() */
- folio->swap = entry;
- swap_read_folio(folio, NULL);
- folio->private = NULL;
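+ /*
+ * If we lost the swapin race, drop the folio we allocated
+ * and use whatever swapin_folio() returned instead.
+ */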
+ swapcache = swapin_folio(entry, folio);
+ if (swapcache != folio)
+ folio_put(folio);
+ folio = swapcache;
}
} else {
- folio = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
- vmf);
- swapcache = folio;
+ folio = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vmf);
}
if (!folio) {
count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
}
+ swapcache = folio;
ret |= folio_lock_or_retry(folio, vmf);
if (ret & VM_FAULT_RETRY)
goto out_release;
goto out_nomap;
}
- /* allocated large folios for SWP_SYNCHRONOUS_IO */
- if (folio_test_large(folio) && !folio_test_swapcache(folio)) {
- unsigned long nr = folio_nr_pages(folio);
- unsigned long folio_start = ALIGN_DOWN(vmf->address, nr * PAGE_SIZE);
- unsigned long idx = (vmf->address - folio_start) / PAGE_SIZE;
- pte_t *folio_ptep = vmf->pte - idx;
- pte_t folio_pte = ptep_get(folio_ptep);
-
- if (!pte_same(folio_pte, pte_move_swp_offset(vmf->orig_pte, -idx)) ||
- swap_pte_batch(folio_ptep, nr, folio_pte) != nr)
- goto out_nomap;
-
- page_idx = idx;
- address = folio_start;
- ptep = folio_ptep;
- goto check_folio;
- }
-
nr_pages = 1;
page_idx = 0;
address = vmf->address;
BUG_ON(!folio_test_anon(folio) && folio_test_mappedtodisk(folio));
BUG_ON(folio_test_anon(folio) && PageAnonExclusive(page));
+ /*
+ * If a large folio already belongs to an anon mapping, we can
+ * simply go ahead and map it partially.
+ * If not, the failed large swapin check above means the page table
+ * has changed, so subpages may have been charged to the wrong cgroup,
+ * or the folio may even belong to shmem. We have to free it and fall back.
+ * Nothing should have touched it yet: both anon and shmem check that a
+ * large folio is fully usable before using it.
+ *
+ * This check will be removed once folio allocation is unified in the
+ * swap cache layer, where allocating a folio stabilizes the swap entries.
+ */
+ if (!folio_test_anon(folio) && folio_test_large(folio) &&
+ nr_pages != folio_nr_pages(folio)) {
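+ /* A dirty folio here means something touched it unexpectedly; warn and keep it in the swap cache */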
+ if (!WARN_ON_ONCE(folio_test_dirty(folio)))
+ swap_cache_del_folio(folio);
+ goto out_nomap;
+ }
+
/*
* Check under PT lock (to protect against concurrent fork() sharing
* the swap entry concurrently) for certainly exclusive pages.
*/
if (!folio_test_ksm(folio)) {
+ /*
+ * The can_swapin_thp check above ensures that all PTEs have
+ * the same exclusiveness, so checking just one PTE is enough.
+ */
exclusive = pte_swp_exclusive(vmf->orig_pte);
+ if (exclusive)
+ check_swap_exclusive(folio, entry, nr_pages);
if (folio != swapcache) {
/*
* We have a fresh page that is not exposed to the
vmf->orig_pte = pte_advance_pfn(pte, page_idx);
/* ksm created a completely new copy */
- if (unlikely(folio != swapcache && swapcache)) {
+ if (unlikely(folio != swapcache)) {
folio_add_new_anon_rmap(folio, vma, address, RMAP_EXCLUSIVE);
folio_add_lru_vma(folio, vma);
} else if (!folio_test_anon(folio)) {
/*
- * We currently only expect small !anon folios which are either
- * fully exclusive or fully shared, or new allocated large
- * folios which are fully exclusive. If we ever get large
- * folios within swapcache here, we have to be careful.
+ * We currently only expect !anon folios that are fully
+ * mappable. See the comment after can_swapin_thp above.
*/
- VM_WARN_ON_ONCE(folio_test_large(folio) && folio_test_swapcache(folio));
- VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_nr_pages(folio) != nr_pages, folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_mapped(folio), folio);
folio_add_new_anon_rmap(folio, vma, address, rmap_flags);
} else {
folio_add_anon_rmap_ptes(folio, page, nr_pages, vma, address,
if (vmf->pte)
pte_unmap_unlock(vmf->pte, vmf->ptl);
out:
- /* Clear the swap cache pin for direct swapin after PTL unlock */
- if (need_clear_cache) {
- swapcache_clear(si, entry, nr_pages);
- if (waitqueue_active(&swapcache_wq))
- wake_up(&swapcache_wq);
- }
if (si)
put_swap_device(si);
return ret;
if (vmf->pte)
pte_unmap_unlock(vmf->pte, vmf->ptl);
out_page:
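+ /*
+ * Try to free the swap entries and drop the folio from the
+ * swap cache if nothing else is using them.
+ */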
+ if (folio_test_swapcache(folio))
+ folio_free_swap(folio);
folio_unlock(folio);
out_release:
folio_put(folio);
folio_unlock(swapcache);
folio_put(swapcache);
}
- if (need_clear_cache) {
- swapcache_clear(si, entry, nr_pages);
- if (waitqueue_active(&swapcache_wq))
- wake_up(&swapcache_wq);
- }
if (si)
put_swap_device(si);
return ret;