#include "swap_table.h"
#include "swap.h"
-static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
- unsigned char);
-static void free_swap_count_continuations(struct swap_info_struct *);
static void swap_range_alloc(struct swap_info_struct *si,
unsigned int nr_entries);
-static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr);
-static void swap_put_entry_locked(struct swap_info_struct *si,
- struct swap_cluster_info *ci,
- unsigned long offset);
static bool folio_swapcache_freeable(struct folio *folio);
static void move_cluster(struct swap_info_struct *si,
struct swap_cluster_info *ci, struct list_head *list,
/* Reclaim the swap entry if swap is getting full */
#define TTRS_FULL 0x4
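+/*
+ * Check whether the slots are only pinned by the swap cache: each slot
+ * must be cached by a folio, and none may still hold a swap count.
+ */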
-static bool swap_only_has_cache(struct swap_info_struct *si,
- struct swap_cluster_info *ci,
+static bool swap_only_has_cache(struct swap_cluster_info *ci,
unsigned long offset, int nr_pages)
{
unsigned int ci_off = offset % SWAPFILE_CLUSTER;
- unsigned char *map = si->swap_map + offset;
- unsigned char *map_end = map + nr_pages;
+ unsigned int ci_end = ci_off + nr_pages;
unsigned long swp_tb;
do {
swp_tb = __swap_table_get(ci, ci_off);
VM_WARN_ON_ONCE(!swp_tb_is_folio(swp_tb));
- if (*map)
+ if (swp_tb_get_count(swp_tb))
return false;
- ++ci_off;
- } while (++map < map_end);
+ } while (++ci_off < ci_end);
return true;
}
* reference or pending writeback, and can't be allocated to others.
*/
ci = swap_cluster_lock(si, offset);
- need_reclaim = swap_only_has_cache(si, ci, offset, nr_pages);
+ need_reclaim = swap_only_has_cache(ci, offset, nr_pages);
swap_cluster_unlock(ci);
if (!need_reclaim)
goto out_unlock;
} while (++ci_off < ci_end);
WARN_ON_ONCE(bad_slots != (swapoff ? ci->count : 0));
+ WARN_ON_ONCE(nr == SWAPFILE_CLUSTER && ci->extend_table);
}
static void swap_cluster_free_table(struct swap_cluster_info *ci)
pr_warn("Duplicated bad slot offset %d\n", offset);
ret = -EINVAL;
} else {
+		/* Mark the slot bad in the swap table (assumes a bad_to_swp_tb() helper) */
+		__swap_table_set(ci, offset % SWAPFILE_CLUSTER, bad_to_swp_tb());
ci->count++;
}
spin_unlock(&ci->lock);
{
unsigned int nr_pages = 1 << order;
unsigned long offset = start, end = start + nr_pages;
- unsigned char *map = si->swap_map;
unsigned long swp_tb;
spin_unlock(&ci->lock);
do {
- if (READ_ONCE(map[offset]))
- break;
swp_tb = swap_table_get(ci, offset % SWAPFILE_CLUSTER);
- if (swp_tb_is_folio(swp_tb)) {
+ if (swp_tb_get_count(swp_tb))
+ break;
+		if (swp_tb_is_folio(swp_tb)) {
			if (__try_to_reclaim_swap(si, offset, TTRS_ANYWAY) < 0)
				break;
		}
} while (++offset < end);
spin_lock(&ci->lock);
*/
for (offset = start; offset < end; offset++) {
swp_tb = __swap_table_get(ci, offset % SWAPFILE_CLUSTER);
- if (map[offset] || !swp_tb_is_null(swp_tb))
+ if (!swp_tb_is_null(swp_tb))
return false;
}
unsigned long offset, unsigned int nr_pages,
bool *need_reclaim)
{
- unsigned long end = offset + nr_pages;
- unsigned char *map = si->swap_map;
+ unsigned int ci_off = offset % SWAPFILE_CLUSTER;
+ unsigned int ci_end = ci_off + nr_pages;
unsigned long swp_tb;
- if (cluster_is_empty(ci))
- return true;
-
do {
- if (map[offset])
- return false;
- swp_tb = __swap_table_get(ci, offset % SWAPFILE_CLUSTER);
- if (swp_tb_is_folio(swp_tb)) {
+ swp_tb = __swap_table_get(ci, ci_off);
+ if (swp_tb_is_null(swp_tb))
+ continue;
+ if (swp_tb_is_folio(swp_tb) && !__swp_tb_get_count(swp_tb)) {
if (!vm_swap_full())
return false;
*need_reclaim = true;
- } else {
- /* A entry with no count and no cache must be null */
- VM_WARN_ON_ONCE(!swp_tb_is_null(swp_tb));
+ continue;
}
- } while (++offset < end);
+		/* A slot with zero count can only be NULL or a folio */
+ VM_WARN_ON(!swp_tb_get_count(swp_tb));
+ return false;
+ } while (++ci_off < ci_end);
return true;
}
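+/*
+ * Allocate a range of slots in one cluster: bind the slots to @folio in
+ * the swap cache, or set up one slot with a fake shadow count for
+ * hibernation if @folio is NULL.
+ */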
-static bool cluster_alloc_range(struct swap_info_struct *si,
- struct swap_cluster_info *ci,
- struct folio *folio,
- unsigned int offset)
+static bool __swap_cluster_alloc_entries(struct swap_info_struct *si,
+ struct swap_cluster_info *ci,
+ struct folio *folio,
+ unsigned int ci_off)
{
- unsigned long nr_pages;
unsigned int order;
+ unsigned long nr_pages;
lockdep_assert_held(&ci->lock);
if (likely(folio)) {
order = folio_order(folio);
nr_pages = 1 << order;
- swap_cluster_assert_empty(ci, offset % SWAPFILE_CLUSTER, nr_pages, false);
- __swap_cache_add_folio(ci, folio, swp_entry(si->type, offset));
+ swap_cluster_assert_empty(ci, ci_off, nr_pages, false);
+ __swap_cache_add_folio(ci, folio, swp_entry(si->type,
+ ci_off + cluster_offset(si, ci)));
} else if (IS_ENABLED(CONFIG_HIBERNATION)) {
order = 0;
nr_pages = 1;
- WARN_ON_ONCE(si->swap_map[offset]);
- si->swap_map[offset] = 1;
- swap_cluster_assert_empty(ci, offset % SWAPFILE_CLUSTER, 1, false);
+ swap_cluster_assert_empty(ci, ci_off, 1, false);
+		/* Set a fake shadow as a placeholder */
+ __swap_table_set(ci, ci_off, shadow_to_swp_tb(NULL, 1));
} else {
/* Allocation without folio is only possible with hibernation */
WARN_ON_ONCE(1);
if (!ret)
continue;
}
- if (!cluster_alloc_range(si, ci, folio, offset))
+ if (!__swap_cluster_alloc_entries(si, ci, folio, offset % SWAPFILE_CLUSTER))
break;
found = offset;
offset += nr_pages;
long to_scan = 1;
unsigned long offset, end;
struct swap_cluster_info *ci;
- unsigned char *map = si->swap_map;
+ unsigned long swp_tb;
int nr_reclaim;
if (force)
to_scan--;
while (offset < end) {
- if (!READ_ONCE(map[offset]) &&
- swp_tb_is_folio(swap_table_get(ci, offset % SWAPFILE_CLUSTER))) {
+ swp_tb = swap_table_get(ci, offset % SWAPFILE_CLUSTER);
+ if (swp_tb_is_folio(swp_tb) && !__swp_tb_get_count(swp_tb)) {
spin_unlock(&ci->lock);
nr_reclaim = __try_to_reclaim_swap(si, offset,
TTRS_ANYWAY);
return false;
}
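+/*
+ * Allocate the overflow count table of a cluster. Allocations may race:
+ * only the first table is installed and the loser's copy is freed.
+ */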
+static int swap_extend_table_alloc(struct swap_info_struct *si,
+ struct swap_cluster_info *ci, gfp_t gfp)
+{
+ void *table;
+
+	table = kcalloc(SWAPFILE_CLUSTER, sizeof(ci->extend_table[0]), gfp);
+ if (!table)
+ return -ENOMEM;
+
+ spin_lock(&ci->lock);
+ if (!ci->extend_table)
+ ci->extend_table = table;
+ else
+ kfree(table);
+ spin_unlock(&ci->lock);
+ return 0;
+}
+
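+/*
+ * Allocate the extend table of the cluster containing @entry, typically
+ * with a less restrictive gfp mask after swap_dup_entry_direct() failed
+ * with -ENOMEM. Returns 0 if the device is gone: a raced swapoff is
+ * acceptable here.
+ */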
+int swap_retry_table_alloc(swp_entry_t entry, gfp_t gfp)
+{
+ int ret;
+ struct swap_info_struct *si;
+ struct swap_cluster_info *ci;
+ unsigned long offset = swp_offset(entry);
+
+ si = get_swap_device(entry);
+ if (!si)
+ return 0;
+
+ ci = __swap_offset_to_cluster(si, offset);
+ ret = swap_extend_table_alloc(si, ci, gfp);
+
+ put_swap_device(si);
+ return ret;
+}
+
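+/* Free a cluster's overflow table once no slot holds an overflowed count */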
+static void swap_extend_table_try_free(struct swap_cluster_info *ci)
+{
+ unsigned long i;
+ bool can_free = true;
+
+ if (!ci->extend_table)
+ return;
+
+ for (i = 0; i < SWAPFILE_CLUSTER; i++) {
+ if (ci->extend_table[i])
+ can_free = false;
+ }
+
+ if (can_free) {
+ kfree(ci->extend_table);
+ ci->extend_table = NULL;
+ }
+}
+
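+/*
+ * Slot swap counts live in the swap table. Once a count reaches
+ * SWP_TB_COUNT_MAX it overflows into ci->extend_table: the swap table
+ * keeps reporting SWP_TB_COUNT_MAX while extend_table[ci_off] holds the
+ * real count, starting at SWP_TB_COUNT_MAX.
+ */
+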
+/* Decrease the swap count of one slot, without freeing it */
+static void __swap_cluster_put_entry(struct swap_cluster_info *ci,
+ unsigned int ci_off)
+{
+ int count;
+ unsigned long swp_tb;
+
+ lockdep_assert_held(&ci->lock);
+ swp_tb = __swap_table_get(ci, ci_off);
+ count = __swp_tb_get_count(swp_tb);
+
+ VM_WARN_ON_ONCE(count <= 0);
+ VM_WARN_ON_ONCE(count > SWP_TB_COUNT_MAX);
+
+ if (count == SWP_TB_COUNT_MAX) {
+ count = ci->extend_table[ci_off];
+		/* The overflowed count starts at SWP_TB_COUNT_MAX */
+ VM_WARN_ON_ONCE(count < SWP_TB_COUNT_MAX);
+ count--;
+ if (count == (SWP_TB_COUNT_MAX - 1)) {
+ ci->extend_table[ci_off] = 0;
+ __swap_table_set(ci, ci_off, __swp_tb_mk_count(swp_tb, count));
+ swap_extend_table_try_free(ci);
+ } else {
+ ci->extend_table[ci_off] = count;
+ }
+ } else {
+ __swap_table_set(ci, ci_off, __swp_tb_mk_count(swp_tb, --count));
+ }
+}
+
/**
- * swap_put_entries_cluster - Decrease the swap count of a set of slots.
+ * swap_put_entries_cluster - Decrease the swap count of slots within one cluster
* @si: The swap device.
- * @start: start offset of slots.
+ * @offset: start offset of slots.
* @nr: number of slots.
- * @reclaim_cache: if true, also reclaim the swap cache.
+ * @reclaim_cache: if true, also reclaim the swap cache if slots are freed.
*
* This helper decreases the swap count of a set of slots and tries to
* batch free them. Also reclaims the swap cache if @reclaim_cache is true.
- * Context: The caller must ensure that all slots belong to the same
- * cluster and their swap count doesn't go underflow.
+ *
+ * Context: The specified slots must be pinned by an existing swap count or
+ * a swap cache reference, so they won't be released until this helper returns.
*/
static void swap_put_entries_cluster(struct swap_info_struct *si,
- unsigned long start, int nr,
+ pgoff_t offset, int nr,
bool reclaim_cache)
{
- unsigned long offset = start, end = start + nr;
- unsigned long batch_start = SWAP_ENTRY_INVALID;
struct swap_cluster_info *ci;
+ unsigned int ci_off, ci_end;
+ pgoff_t end = offset + nr;
bool need_reclaim = false;
unsigned int nr_reclaimed;
unsigned long swp_tb;
- unsigned int count;
+ int ci_batch = -1;
ci = swap_cluster_lock(si, offset);
+ ci_off = offset % SWAPFILE_CLUSTER;
+ ci_end = ci_off + nr;
do {
- swp_tb = __swap_table_get(ci, offset % SWAPFILE_CLUSTER);
- count = si->swap_map[offset];
- VM_WARN_ON(count < 1 || count == SWAP_MAP_BAD);
- if (count == 1) {
+ swp_tb = __swap_table_get(ci, ci_off);
+ if (swp_tb_get_count(swp_tb) == 1) {
/* count == 1 and non-cached slots will be batch freed. */
if (!swp_tb_is_folio(swp_tb)) {
- if (!batch_start)
- batch_start = offset;
+ if (ci_batch == -1)
+ ci_batch = ci_off;
continue;
}
/* count will be 0 after put, slot can be reclaimed */
* slots will be freed when folio is removed from swap cache
* (__swap_cache_del_folio).
*/
- swap_put_entry_locked(si, ci, offset);
- if (batch_start) {
- swap_entries_free(si, ci, batch_start, offset - batch_start);
- batch_start = SWAP_ENTRY_INVALID;
+ __swap_cluster_put_entry(ci, ci_off);
+ if (ci_batch != -1) {
+ __swap_cluster_free_entries(si, ci, ci_batch, ci_off - ci_batch);
+ ci_batch = -1;
}
- } while (++offset < end);
+ } while (++ci_off < ci_end);
- if (batch_start)
- swap_entries_free(si, ci, batch_start, offset - batch_start);
+ if (ci_batch != -1)
+ __swap_cluster_free_entries(si, ci, ci_batch, ci_off - ci_batch);
swap_cluster_unlock(ci);
if (!need_reclaim || !reclaim_cache)
return;
- offset = start;
do {
nr_reclaimed = __try_to_reclaim_swap(si, offset,
TTRS_UNMAPPED | TTRS_FULL);
} while (offset < end);
}
+/* Increase the swap count of one slot. */
+static int __swap_cluster_dup_entry(struct swap_cluster_info *ci,
+ unsigned int ci_off)
+{
+ int count;
+ unsigned long swp_tb;
+
+ lockdep_assert_held(&ci->lock);
+ swp_tb = __swap_table_get(ci, ci_off);
+ /* Bad or special slots can't be handled */
+ if (WARN_ON_ONCE(swp_tb_is_bad(swp_tb)))
+ return -EINVAL;
+ count = __swp_tb_get_count(swp_tb);
+ /* Must be either cached or have a count already */
+ if (WARN_ON_ONCE(!count && !swp_tb_is_folio(swp_tb)))
+ return -ENOENT;
+
+ if (likely(count < (SWP_TB_COUNT_MAX - 1))) {
+ __swap_table_set(ci, ci_off, __swp_tb_mk_count(swp_tb, count + 1));
+ VM_WARN_ON_ONCE(ci->extend_table && ci->extend_table[ci_off]);
+ } else if (count == (SWP_TB_COUNT_MAX - 1)) {
+ if (ci->extend_table) {
+ VM_WARN_ON_ONCE(ci->extend_table[ci_off]);
+ ci->extend_table[ci_off] = SWP_TB_COUNT_MAX;
+ __swap_table_set(ci, ci_off, __swp_tb_mk_count(swp_tb, SWP_TB_COUNT_MAX));
+ } else {
+ return -ENOMEM;
+ }
+ } else if (count == SWP_TB_COUNT_MAX) {
+ VM_WARN_ON_ONCE(ci->extend_table[ci_off] >=
+ type_max(typeof(ci->extend_table[0])));
+ ++ci->extend_table[ci_off];
+ } else {
+ /* Never happens unless counting went wrong */
+ WARN_ON_ONCE(1);
+ }
+
+ return 0;
+}
+
+/**
+ * swap_dup_entries_cluster - Increase the swap count of slots within one cluster.
+ * @si: The swap device.
+ * @offset: start offset of slots.
+ * @nr: number of slots.
+ *
+ * Context: The specified slots must be pinned by an existing swap count or
+ * a swap cache reference, so they won't be released until this helper returns.
+ * Return: 0 on success. -ENOMEM if the swap count maxed out (SWP_TB_COUNT_MAX)
+ * and the extend table could not be allocated, -EINVAL if any entry is a bad
+ * entry, -ENOENT if any slot is not currently in use.
+ */
+static int swap_dup_entries_cluster(struct swap_info_struct *si,
+ pgoff_t offset, int nr)
+{
+ int err;
+ struct swap_cluster_info *ci;
+ unsigned int ci_start, ci_off, ci_end;
+
+ ci_start = offset % SWAPFILE_CLUSTER;
+ ci_end = ci_start + nr;
+ ci_off = ci_start;
+ ci = swap_cluster_lock(si, offset);
+restart:
+ do {
+ err = __swap_cluster_dup_entry(ci, ci_off);
+ if (unlikely(err)) {
+ if (err == -ENOMEM) {
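+				/*
+				 * The count overflowed but the cluster has no
+				 * extend table yet: drop the lock, allocate one
+				 * atomically, then retry the failed slot.
+				 */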
+ spin_unlock(&ci->lock);
+ err = swap_extend_table_alloc(si, ci, GFP_ATOMIC);
+ spin_lock(&ci->lock);
+ if (!err)
+ goto restart;
+ }
+ goto failed;
+ }
+ } while (++ci_off < ci_end);
+ swap_cluster_unlock(ci);
+ return 0;
+failed:
+ while (ci_off-- > ci_start)
+ __swap_cluster_put_entry(ci, ci_off);
+ swap_extend_table_try_free(ci);
+ swap_cluster_unlock(ci);
+ return err;
+}
+
/**
* folio_alloc_swap - allocate swap space for a folio
* @folio: folio we want to move to swap
* Context: Caller must ensure the folio is locked and in the swap cache.
* NOTE: The caller also has to ensure there is no raced call to
* swap_put_entries_direct on its swap entry before this helper returns, or
- * the swap map may underflow. Currently, we only accept @subpage == NULL
- * for shmem due to the limitation of swap continuation: shmem always
- * duplicates the swap entry only once, so there is no such issue for it.
+ * the swap count may underflow.
*/
int folio_dup_swap(struct folio *folio, struct page *subpage)
{
- int err = 0;
swp_entry_t entry = folio->swap;
unsigned long nr_pages = folio_nr_pages(folio);
nr_pages = 1;
}
- while (!err && __swap_duplicate(entry, 1, nr_pages) == -ENOMEM)
- err = add_swap_count_continuation(entry, GFP_ATOMIC);
-
- return err;
+ return swap_dup_entries_cluster(swap_entry_to_info(entry),
+ swp_offset(entry), nr_pages);
}
/**
swap_put_entries_cluster(si, swp_offset(entry), nr_pages, false);
}
-static void swap_put_entry_locked(struct swap_info_struct *si,
- struct swap_cluster_info *ci,
- unsigned long offset)
-{
- unsigned char count;
-
- count = si->swap_map[offset];
- if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) {
- if (count == COUNT_CONTINUED) {
- if (swap_count_continued(si, offset, count))
- count = SWAP_MAP_MAX | COUNT_CONTINUED;
- else
- count = SWAP_MAP_MAX;
- } else
- count--;
- }
-
- WRITE_ONCE(si->swap_map[offset], count);
- if (!count && !swp_tb_is_folio(__swap_table_get(ci, offset % SWAPFILE_CLUSTER)))
- swap_entries_free(si, ci, offset, 1);
-}
-
/*
* When we get a swap entry, if there aren't some other ways to
* prevent swapoff, such as the folio in swap cache is locked, RCU
}
/*
- * Drop the last ref of swap entries, caller have to ensure all entries
- * belong to the same cgroup and cluster.
+ * Free a set of swap slots whose swap count has dropped to zero, or will
+ * drop to zero once the last ref is put (saving one __swap_cluster_put_entry
+ * call).
*/
-void swap_entries_free(struct swap_info_struct *si,
- struct swap_cluster_info *ci,
- unsigned long offset, unsigned int nr_pages)
+void __swap_cluster_free_entries(struct swap_info_struct *si,
+ struct swap_cluster_info *ci,
+ unsigned int ci_start, unsigned int nr_pages)
{
- swp_entry_t entry = swp_entry(si->type, offset);
- unsigned char *map = si->swap_map + offset;
- unsigned char *map_end = map + nr_pages;
+ unsigned long old_tb;
+ unsigned int ci_off = ci_start, ci_end = ci_start + nr_pages;
+ unsigned long offset = cluster_offset(si, ci) + ci_start;
- /* It should never free entries across different clusters */
- VM_BUG_ON(ci != __swap_offset_to_cluster(si, offset + nr_pages - 1));
- VM_BUG_ON(cluster_is_empty(ci));
- VM_BUG_ON(ci->count < nr_pages);
+ VM_WARN_ON(ci->count < nr_pages);
ci->count -= nr_pages;
do {
- VM_WARN_ON(*map > 1);
- *map = 0;
- } while (++map < map_end);
+ old_tb = __swap_table_get(ci, ci_off);
+		/* Freed with the last ref held, or after the swap cache is dropped */
+ VM_WARN_ON(!swp_tb_is_shadow(old_tb) || __swp_tb_get_count(old_tb) > 1);
+ __swap_table_set(ci, ci_off, null_to_swp_tb());
+ } while (++ci_off < ci_end);
- mem_cgroup_uncharge_swap(entry, nr_pages);
+ mem_cgroup_uncharge_swap(swp_entry(si->type, offset), nr_pages);
swap_range_free(si, offset, nr_pages);
- swap_cluster_assert_empty(ci, offset % SWAPFILE_CLUSTER, nr_pages, false);
+ swap_cluster_assert_empty(ci, ci_start, nr_pages, false);
if (!ci->count)
free_cluster(si, ci);
int __swap_count(swp_entry_t entry)
{
- struct swap_info_struct *si = __swap_entry_to_info(entry);
- pgoff_t offset = swp_offset(entry);
+ struct swap_cluster_info *ci = __swap_entry_to_cluster(entry);
+ unsigned int ci_off = swp_cluster_offset(entry);
- return si->swap_map[offset];
+ return swp_tb_get_count(__swap_table_get(ci, ci_off));
}
/**
{
pgoff_t offset = swp_offset(entry);
struct swap_cluster_info *ci;
- int count;
+ unsigned long swp_tb;
ci = swap_cluster_lock(si, offset);
- count = si->swap_map[offset];
+ swp_tb = swap_table_get(ci, offset % SWAPFILE_CLUSTER);
swap_cluster_unlock(ci);
- return count && count != SWAP_MAP_BAD;
+ return swp_tb_get_count(swp_tb) > 0;
}
/*
* How many references to @entry are currently swapped out?
- * This considers COUNT_CONTINUED so it returns exact answer.
+ * This reads the extend table when needed, so it returns the exact count.
*/
int swp_swapcount(swp_entry_t entry)
{
- int count, tmp_count, n;
struct swap_info_struct *si;
struct swap_cluster_info *ci;
- struct page *page;
- pgoff_t offset;
- unsigned char *map;
+ unsigned long swp_tb;
+ int count;
si = get_swap_device(entry);
if (!si)
return 0;
- offset = swp_offset(entry);
-
- ci = swap_cluster_lock(si, offset);
-
- count = si->swap_map[offset];
- if (!(count & COUNT_CONTINUED))
- goto out;
-
- count &= ~COUNT_CONTINUED;
- n = SWAP_MAP_MAX + 1;
-
- page = vmalloc_to_page(si->swap_map + offset);
- offset &= ~PAGE_MASK;
- VM_BUG_ON(page_private(page) != SWP_CONTINUED);
-
- do {
- page = list_next_entry(page, lru);
- map = kmap_local_page(page);
- tmp_count = map[offset];
- kunmap_local(map);
-
- count += (tmp_count & ~COUNT_CONTINUED) * n;
- n *= (SWAP_CONT_MAX + 1);
- } while (tmp_count & COUNT_CONTINUED);
-out:
+ ci = swap_cluster_lock(si, swp_offset(entry));
+ swp_tb = __swap_table_get(ci, swp_cluster_offset(entry));
+ count = swp_tb_get_count(swp_tb);
+ if (count == SWP_TB_COUNT_MAX)
+ count = ci->extend_table[swp_cluster_offset(entry)];
swap_cluster_unlock(ci);
put_swap_device(si);
- return count;
+
+	/* A bad slot reports a negative count, treat it as zero */
+	return count < 0 ? 0 : count;
}
static bool swap_page_trans_huge_swapped(struct swap_info_struct *si,
swp_entry_t entry, int order)
{
struct swap_cluster_info *ci;
- unsigned char *map = si->swap_map;
unsigned int nr_pages = 1 << order;
unsigned long roffset = swp_offset(entry);
unsigned long offset = round_down(roffset, nr_pages);
+ unsigned int ci_off;
int i;
bool ret = false;
ci = swap_cluster_lock(si, offset);
if (nr_pages == 1) {
- if (map[roffset])
+ ci_off = roffset % SWAPFILE_CLUSTER;
+ if (swp_tb_get_count(__swap_table_get(ci, ci_off)))
ret = true;
goto unlock_out;
}
for (i = 0; i < nr_pages; i++) {
- if (map[offset + i]) {
+ ci_off = (offset + i) % SWAPFILE_CLUSTER;
+ if (swp_tb_get_count(__swap_table_get(ci, ci_off))) {
ret = true;
break;
}
return;
ci = swap_cluster_lock(si, offset);
- swap_put_entry_locked(si, ci, offset);
+ __swap_cluster_put_entry(ci, offset % SWAPFILE_CLUSTER);
+ __swap_cluster_free_entries(si, ci, offset % SWAPFILE_CLUSTER, 1);
swap_cluster_unlock(ci);
/* In theory readahead might add it to the swap cache by accident */
unsigned int type)
{
pte_t *pte = NULL;
- struct swap_info_struct *si;
- si = swap_info[type];
do {
struct folio *folio;
- unsigned long offset;
- unsigned char swp_count;
+ unsigned long swp_tb;
softleaf_t entry;
int ret;
pte_t ptent;
if (swp_type(entry) != type)
continue;
- offset = swp_offset(entry);
pte_unmap(pte);
pte = NULL;
&vmf);
}
if (!folio) {
- swp_count = READ_ONCE(si->swap_map[offset]);
- if (swp_count == 0 || swp_count == SWAP_MAP_BAD)
+ swp_tb = swap_table_get(__swap_entry_to_cluster(entry),
+ swp_cluster_offset(entry));
+ if (swp_tb_get_count(swp_tb) <= 0)
continue;
return -ENOMEM;
}
}
/*
- * Scan swap_map from current position to next entry still in use.
+ * Scan the swap table from the current position to the next entry still in use.
* Return 0 if there are no inuse entries after prev till end of
* the map.
*/
{
unsigned int i;
unsigned long swp_tb;
- unsigned char count;
/*
* No need for swap_lock here: we're just looking
* allocations from this area (while holding swap_lock).
*/
for (i = prev + 1; i < si->max; i++) {
- count = READ_ONCE(si->swap_map[i]);
swp_tb = swap_table_get(__swap_offset_to_cluster(si, i),
i % SWAPFILE_CLUSTER);
- if (count == SWAP_MAP_BAD)
- continue;
- if (count || swp_tb_is_folio(swp_tb))
+ if (!swp_tb_is_null(swp_tb) && !swp_tb_is_bad(swp_tb))
break;
if ((i % LATENCY_LIMIT) == 0)
cond_resched();
SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
{
struct swap_info_struct *p = NULL;
- unsigned char *swap_map;
unsigned long *zeromap;
struct swap_cluster_info *cluster_info;
struct file *swap_file, *victim;
flush_percpu_swap_cluster(p);
destroy_swap_extents(p, p->swap_file);
- if (p->flags & SWP_CONTINUED)
- free_swap_count_continuations(p);
if (!(p->flags & SWP_SOLIDSTATE))
atomic_dec(&nr_rotate_swap);
swap_file = p->swap_file;
p->swap_file = NULL;
- swap_map = p->swap_map;
- p->swap_map = NULL;
zeromap = p->zeromap;
p->zeromap = NULL;
maxpages = p->max;
mutex_unlock(&swapon_mutex);
kfree(p->global_cluster);
p->global_cluster = NULL;
- vfree(swap_map);
kvfree(zeromap);
free_swap_cluster_info(cluster_info, maxpages);
/* Destroy swap account information */
kvfree(defer);
}
spin_lock_init(&p->lock);
- spin_lock_init(&p->cont_lock);
atomic_long_set(&p->inuse_pages, SWAP_USAGE_OFFLIST_BIT);
init_completion(&p->comp);
return maxpages;
}
-static int setup_swap_map(struct swap_info_struct *si,
- union swap_header *swap_header,
- unsigned long maxpages)
-{
- unsigned char *swap_map;
-
- swap_map = vzalloc(maxpages);
- si->swap_map = swap_map;
- if (!swap_map)
- return -ENOMEM;
- return 0;
-}
-
static int setup_swap_clusters_info(struct swap_info_struct *si,
union swap_header *swap_header,
unsigned long maxpages)
maxpages = si->max;
- /* Setup the swap map and apply bad block */
- error = setup_swap_map(si, swap_header, maxpages);
- if (error)
- goto bad_swap_unlock_inode;
-
/* Set up the swap cluster info */
error = setup_swap_clusters_info(si, swap_header, maxpages);
if (error)
inode = NULL;
destroy_swap_extents(si, swap_file);
swap_cgroup_swapoff(si->type);
- vfree(si->swap_map);
- si->swap_map = NULL;
free_swap_cluster_info(si->cluster_info, si->max);
si->cluster_info = NULL;
kvfree(si->zeromap);
spin_unlock(&swap_lock);
}
-/*
- * Verify that nr swap entries are valid and increment their swap map counts.
- *
- * Returns error code in following case.
- * - success -> 0
- * - swp_entry is invalid -> EINVAL
- * - swap-mapped reference is requested but the entry is not used. -> ENOENT
- * - swap-mapped reference requested but needs continued swap count. -> ENOMEM
- */
-static int swap_dup_entries(struct swap_info_struct *si,
- struct swap_cluster_info *ci,
- unsigned long offset,
- unsigned char usage, int nr)
-{
- int i;
- unsigned char count;
-
- for (i = 0; i < nr; i++) {
- count = si->swap_map[offset + i];
- /*
- * For swapin out, allocator never allocates bad slots. for
- * swapin, readahead is guarded by swap_entry_swapped.
- */
- if (WARN_ON(count == SWAP_MAP_BAD))
- return -ENOENT;
- /*
- * Swap count duplication must be guarded by either swap cache folio (from
- * folio_dup_swap) or external lock of existing entry (from swap_dup_entry_direct).
- */
- if (WARN_ON(!count &&
- !swp_tb_is_folio(__swap_table_get(ci, offset % SWAPFILE_CLUSTER))))
- return -ENOENT;
- if (WARN_ON((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX))
- return -EINVAL;
- }
-
- for (i = 0; i < nr; i++) {
- count = si->swap_map[offset + i];
- if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX)
- count += usage;
- else if (swap_count_continued(si, offset + i, count))
- count = COUNT_CONTINUED;
- else {
- /*
- * Don't need to rollback changes, because if
- * usage == 1, there must be nr == 1.
- */
- return -ENOMEM;
- }
-
- WRITE_ONCE(si->swap_map[offset + i], count);
- }
-
- return 0;
-}
-
-static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr)
-{
- int err;
- struct swap_info_struct *si;
- struct swap_cluster_info *ci;
- unsigned long offset = swp_offset(entry);
-
- si = swap_entry_to_info(entry);
- if (WARN_ON_ONCE(!si)) {
- pr_err("%s%08lx\n", Bad_file, entry.val);
- return -EINVAL;
- }
-
- VM_WARN_ON(nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER);
- ci = swap_cluster_lock(si, offset);
- err = swap_dup_entries(si, ci, offset, usage, nr);
- swap_cluster_unlock(ci);
- return err;
-}
-
/*
* swap_dup_entry_direct() - Increase reference count of a swap entry by one.
* @entry: first swap entry from which we want to increase the refcount.
*
- * Returns 0 for success, or -ENOMEM if a swap_count_continuation is required
- * but could not be atomically allocated. Returns 0, just as if it succeeded,
- * if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which
- * might occur if a page table entry has got corrupted.
+ * Returns 0 for success, or -ENOMEM if the extend table is required
+ * but could not be atomically allocated. Returns -EINVAL if the swap
+ * entry is invalid, or -ENOENT if it is not in use, which might occur
+ * if a page table entry has got corrupted.
*
* Context: Caller must ensure there is no race condition on the reference
* owner. e.g., locking the PTL of a PTE containing the entry being increased.
*/
int swap_dup_entry_direct(swp_entry_t entry)
-{
- int err = 0;
- while (!err && __swap_duplicate(entry, 1, 1) == -ENOMEM)
- err = add_swap_count_continuation(entry, GFP_ATOMIC);
- return err;
-}
-
-/*
- * add_swap_count_continuation - called when a swap count is duplicated
- * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's
- * page of the original vmalloc'ed swap_map, to hold the continuation count
- * (for that entry and for its neighbouring PAGE_SIZE swap entries). Called
- * again when count is duplicated beyond SWAP_MAP_MAX * SWAP_CONT_MAX, etc.
- *
- * These continuation pages are seldom referenced: the common paths all work
- * on the original swap_map, only referring to a continuation page when the
- * low "digit" of a count is incremented or decremented through SWAP_MAP_MAX.
- *
- * add_swap_count_continuation(, GFP_ATOMIC) can be called while holding
- * page table locks; if it fails, add_swap_count_continuation(, GFP_KERNEL)
- * can be called after dropping locks.
- */
-int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
{
struct swap_info_struct *si;
- struct swap_cluster_info *ci;
- struct page *head;
- struct page *page;
- struct page *list_page;
- pgoff_t offset;
- unsigned char count;
- int ret = 0;
-
- /*
- * When debugging, it's easier to use __GFP_ZERO here; but it's better
- * for latency not to zero a page while GFP_ATOMIC and holding locks.
- */
- page = alloc_page(gfp_mask | __GFP_HIGHMEM);
-
- si = get_swap_device(entry);
- if (!si) {
- /*
- * An acceptable race has occurred since the failing
- * __swap_duplicate(): the swap device may be swapoff
- */
- goto outer;
- }
-
- offset = swp_offset(entry);
-
- ci = swap_cluster_lock(si, offset);
-
- count = si->swap_map[offset];
-
- if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) {
- /*
- * The higher the swap count, the more likely it is that tasks
- * will race to add swap count continuation: we need to avoid
- * over-provisioning.
- */
- goto out;
- }
-
- if (!page) {
- ret = -ENOMEM;
- goto out;
- }
-
- head = vmalloc_to_page(si->swap_map + offset);
- offset &= ~PAGE_MASK;
-
- spin_lock(&si->cont_lock);
- /*
- * Page allocation does not initialize the page's lru field,
- * but it does always reset its private field.
- */
- if (!page_private(head)) {
- BUG_ON(count & COUNT_CONTINUED);
- INIT_LIST_HEAD(&head->lru);
- set_page_private(head, SWP_CONTINUED);
- si->flags |= SWP_CONTINUED;
- }
-
- list_for_each_entry(list_page, &head->lru, lru) {
- unsigned char *map;
-
- /*
- * If the previous map said no continuation, but we've found
- * a continuation page, free our allocation and use this one.
- */
- if (!(count & COUNT_CONTINUED))
- goto out_unlock_cont;
-
- map = kmap_local_page(list_page) + offset;
- count = *map;
- kunmap_local(map);
-
- /*
- * If this continuation count now has some space in it,
- * free our allocation and use this one.
- */
- if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX)
- goto out_unlock_cont;
- }
- list_add_tail(&page->lru, &head->lru);
- page = NULL; /* now it's attached, don't free it */
-out_unlock_cont:
- spin_unlock(&si->cont_lock);
-out:
- swap_cluster_unlock(ci);
- put_swap_device(si);
-outer:
- if (page)
- __free_page(page);
- return ret;
-}
-
-/*
- * swap_count_continued - when the original swap_map count is incremented
- * from SWAP_MAP_MAX, check if there is already a continuation page to carry
- * into, carry if so, or else fail until a new continuation page is allocated;
- * when the original swap_map count is decremented from 0 with continuation,
- * borrow from the continuation and report whether it still holds more.
- * Called while __swap_duplicate() or caller of swap_put_entry_locked()
- * holds cluster lock.
- */
-static bool swap_count_continued(struct swap_info_struct *si,
- pgoff_t offset, unsigned char count)
-{
- struct page *head;
- struct page *page;
- unsigned char *map;
- bool ret;
-
- head = vmalloc_to_page(si->swap_map + offset);
- if (page_private(head) != SWP_CONTINUED) {
- BUG_ON(count & COUNT_CONTINUED);
- return false; /* need to add count continuation */
- }
-
- spin_lock(&si->cont_lock);
- offset &= ~PAGE_MASK;
- page = list_next_entry(head, lru);
- map = kmap_local_page(page) + offset;
-
- if (count == SWAP_MAP_MAX) /* initial increment from swap_map */
- goto init_map; /* jump over SWAP_CONT_MAX checks */
-
- if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) { /* incrementing */
- /*
- * Think of how you add 1 to 999
- */
- while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) {
- kunmap_local(map);
- page = list_next_entry(page, lru);
- BUG_ON(page == head);
- map = kmap_local_page(page) + offset;
- }
- if (*map == SWAP_CONT_MAX) {
- kunmap_local(map);
- page = list_next_entry(page, lru);
- if (page == head) {
- ret = false; /* add count continuation */
- goto out;
- }
- map = kmap_local_page(page) + offset;
-init_map: *map = 0; /* we didn't zero the page */
- }
- *map += 1;
- kunmap_local(map);
- while ((page = list_prev_entry(page, lru)) != head) {
- map = kmap_local_page(page) + offset;
- *map = COUNT_CONTINUED;
- kunmap_local(map);
- }
- ret = true; /* incremented */
-
- } else { /* decrementing */
- /*
- * Think of how you subtract 1 from 1000
- */
- BUG_ON(count != COUNT_CONTINUED);
- while (*map == COUNT_CONTINUED) {
- kunmap_local(map);
- page = list_next_entry(page, lru);
- BUG_ON(page == head);
- map = kmap_local_page(page) + offset;
- }
- BUG_ON(*map == 0);
- *map -= 1;
- if (*map == 0)
- count = 0;
- kunmap_local(map);
- while ((page = list_prev_entry(page, lru)) != head) {
- map = kmap_local_page(page) + offset;
- *map = SWAP_CONT_MAX | count;
- count = COUNT_CONTINUED;
- kunmap_local(map);
- }
- ret = count == COUNT_CONTINUED;
+
+	si = swap_entry_to_info(entry);
+ if (WARN_ON_ONCE(!si)) {
+ pr_err("%s%08lx\n", Bad_file, entry.val);
+ return -EINVAL;
}
-out:
- spin_unlock(&si->cont_lock);
- return ret;
-}
-/*
- * free_swap_count_continuations - swapoff free all the continuation pages
- * appended to the swap_map, after swap_map is quiesced, before vfree'ing it.
- */
-static void free_swap_count_continuations(struct swap_info_struct *si)
-{
- pgoff_t offset;
-
- for (offset = 0; offset < si->max; offset += PAGE_SIZE) {
- struct page *head;
- head = vmalloc_to_page(si->swap_map + offset);
- if (page_private(head)) {
- struct page *page, *next;
-
- list_for_each_entry_safe(page, next, &head->lru, lru) {
- list_del(&page->lru);
- __free_page(page);
- }
- }
- }
+ return swap_dup_entries_cluster(si, swp_offset(entry), 1);
}
#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)