/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SWAP_H
#define _LINUX_SWAP_H

#include <linux/spinlock.h>
#include <linux/linkage.h>
#include <linux/mmzone.h>
#include <linux/list.h>
#include <linux/memcontrol.h>
#include <linux/sched.h>
#include <linux/node.h>
#include <linux/pagemap.h>
#include <linux/atomic.h>
#include <linux/page-flags.h>
#include <uapi/linux/mempolicy.h>

struct notifier_block;

#define SWAP_FLAG_PREFER	0x8000	/* set if swap priority specified */
#define SWAP_FLAG_PRIO_MASK	0x7fff
#define SWAP_FLAG_DISCARD	0x10000 /* enable discard for swap */
#define SWAP_FLAG_DISCARD_ONCE	0x20000 /* discard swap area at swapon-time */
#define SWAP_FLAG_DISCARD_PAGES 0x40000 /* discard page-clusters after use */

#define SWAP_FLAGS_VALID	(SWAP_FLAG_PRIO_MASK | SWAP_FLAG_PREFER | \
				 SWAP_FLAG_DISCARD | SWAP_FLAG_DISCARD_ONCE | \
				 SWAP_FLAG_DISCARD_PAGES)

static inline int current_is_kswapd(void)
{
	return current->flags & PF_KSWAPD;
}

/*
 * MAX_SWAPFILES defines the maximum number of swap types: things which can
 * be swapped to. The swap type and the offset into that swap type are
 * encoded into pte's and into pgoff_t's in the swapcache. Using five bits
 * for the type means that the maximum number of swapcache pages is 27 bits
 * on 32-bit-pgoff_t architectures. And that assumes that the architecture
 * packs the type/offset into the pte as 5/27 as well.
 */
#define MAX_SWAPFILES_SHIFT	5
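
/*
 * Illustrative arithmetic (not from the original header): with
 * MAX_SWAPFILES_SHIFT == 5 and a 32-bit pgoff_t, a swap entry packed as
 * 5 type bits + 27 offset bits can address 2^27 pages per swap type,
 * i.e. up to 512 GiB of swap per type with 4 KiB pages:
 *
 *	2^27 pages * 4 KiB/page = 2^27 * 2^12 bytes = 2^39 bytes = 512 GiB
 */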

/*
 * Use some of the swap file numbers for other purposes. This
 * is a convenient way to hook into the VM to trigger special
 * actions for the special entry types defined below.
 */

/*
 * PTE markers are used to persist information onto PTEs that otherwise
 * should be a none pte. As its name "PTE" hints, it should only be
 * applied to the leaves of pgtables.
 */
#define SWP_PTE_MARKER_NUM 1
#define SWP_PTE_MARKER  (MAX_SWAPFILES + SWP_HWPOISON_NUM + \
			 SWP_MIGRATION_NUM + SWP_DEVICE_NUM)

/*
 * Unaddressable device memory support. See include/linux/hmm.h and
 * Documentation/mm/hmm.rst. Short description is we need struct pages for
 * device memory that is unaddressable (inaccessible) by CPU, so that we can
 * migrate part of a process memory to device memory.
 *
 * When a page is migrated from CPU to device, we set the CPU page table entry
 * to a special SWP_DEVICE_{READ|WRITE} entry.
 *
 * When a page is mapped by the device for exclusive access we set the CPU page
 * table entries to a special SWP_DEVICE_EXCLUSIVE entry.
 */
#ifdef CONFIG_DEVICE_PRIVATE
#define SWP_DEVICE_NUM 3
#define SWP_DEVICE_WRITE (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM)
#define SWP_DEVICE_READ (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+1)
#define SWP_DEVICE_EXCLUSIVE (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+2)
#else
#define SWP_DEVICE_NUM 0
#endif

/*
 * Page migration support.
 *
 * SWP_MIGRATION_READ_EXCLUSIVE is only applicable to anonymous pages and
 * indicates that the referenced (part of an) anonymous page is exclusive to
 * a single process. For SWP_MIGRATION_WRITE, that information is implicit:
 * (part of an) anonymous page that is mapped writable is exclusive to a
 * single process.
 */
#ifdef CONFIG_MIGRATION
#define SWP_MIGRATION_NUM 3
#define SWP_MIGRATION_READ	(MAX_SWAPFILES + SWP_HWPOISON_NUM)
#define SWP_MIGRATION_READ_EXCLUSIVE (MAX_SWAPFILES + SWP_HWPOISON_NUM + 1)
#define SWP_MIGRATION_WRITE	(MAX_SWAPFILES + SWP_HWPOISON_NUM + 2)
#else
#define SWP_MIGRATION_NUM 0
#endif

/*
 * Handling of hardware poisoned pages with memory corruption.
 */
#ifdef CONFIG_MEMORY_FAILURE
#define SWP_HWPOISON_NUM 1
#define SWP_HWPOISON		MAX_SWAPFILES
#else
#define SWP_HWPOISON_NUM 0
#endif

#define MAX_SWAPFILES \
	((1 << MAX_SWAPFILES_SHIFT) - SWP_DEVICE_NUM - \
	SWP_MIGRATION_NUM - SWP_HWPOISON_NUM - \
	SWP_PTE_MARKER_NUM)
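
/*
 * Illustrative example (not from the original header): with all of the
 * features above enabled (CONFIG_DEVICE_PRIVATE, CONFIG_MIGRATION and
 * CONFIG_MEMORY_FAILURE), the 32 possible swap types split up as
 *
 *	MAX_SWAPFILES = 32 - 3 (device) - 3 (migration)
 *			   - 1 (hwpoison) - 1 (pte marker) = 24
 *
 * so types 0..23 name real swap areas and the remaining types encode the
 * special entries defined above, stacked upward from MAX_SWAPFILES.
 */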

/*
 * Magic header for a swap area. The first part of the union is
 * what the swap magic looks like for the old (limited to 128MB)
 * swap area format, the second part of the union adds - in the
 * old reserved area - some extra information. Note that the first
 * kilobyte is reserved for boot loader or disk label stuff...
 *
 * Having the magic at the end of the PAGE_SIZE makes detecting swap
 * areas somewhat tricky on machines that support multiple page sizes.
 * For 2.5 we'll probably want to move the magic to just beyond the
 * bootbits...
 */
union swap_header {
	struct {
		char reserved[PAGE_SIZE - 10];
		char magic[10];			/* SWAP-SPACE or SWAPSPACE2 */
	} magic;
	struct {
		char		bootbits[1024];	/* Space for disklabel etc. */
		__u32		version;
		__u32		last_page;
		__u32		nr_badpages;
		unsigned char	sws_uuid[16];
		unsigned char	sws_volume[16];
		__u32		padding[117];
		__u32		badpages[1];
	} info;
};

/*
 * current->reclaim_state points to one of these when a task is running
 * memory reclaim.
 */
struct reclaim_state {
	/* pages reclaimed outside of LRU-based reclaim */
	unsigned long reclaimed;

#ifdef CONFIG_LRU_GEN
	/* per-thread mm walk data */
	struct lru_gen_mm_walk *mm_walk;
#endif
};

/*
 * mm_account_reclaimed_pages(): account reclaimed pages outside of LRU-based
 * reclaim.
 * @pages: number of pages reclaimed
 *
 * If the current process is undergoing a reclaim operation, increment the
 * number of reclaimed pages by @pages.
 */
static inline void mm_account_reclaimed_pages(unsigned long pages)
{
	if (current->reclaim_state)
		current->reclaim_state->reclaimed += pages;
}
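
/*
 * Illustrative usage sketch (not part of the original header). A shrinker
 * or other non-LRU reclaimer that frees pages directly could report them
 * like this; all my_cache_* names below are hypothetical:
 *
 *	static unsigned long my_cache_shrink_scan(struct shrinker *s,
 *						  struct shrink_control *sc)
 *	{
 *		unsigned long freed = my_cache_trim(sc->nr_to_scan);
 *
 *		mm_account_reclaimed_pages(freed);
 *		return freed;
 *	}
 */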

struct address_space;
struct writeback_control;

/*
 * A swap extent maps a range of a swapfile's PAGE_SIZE pages onto a range of
 * disk blocks. A rbtree of swap extents maps the entire swapfile (where the
 * term `swapfile' refers to either a blockdevice or an IS_REG file). Apart
 * from setup, they're handled identically.
 *
 * We always assume that blocks are of size PAGE_SIZE.
 */
struct swap_extent {
	struct rb_node rb_node;
	pgoff_t start_page;
	pgoff_t nr_pages;
	sector_t start_block;
};

/*
 * Max bad pages in the new format..
 */
#define MAX_SWAP_BADPAGES \
	((offsetof(union swap_header, magic.magic) - \
	  offsetof(union swap_header, info.badpages)) / sizeof(int))
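
/*
 * Illustrative arithmetic (not from the original header), assuming 4 KiB
 * pages and the info layout above: magic.magic starts at PAGE_SIZE - 10 =
 * 4086 and info.badpages starts at 1024 + 3*4 + 16 + 16 + 117*4 = 1536, so
 *
 *	MAX_SWAP_BADPAGES = (4086 - 1536) / sizeof(int) = 637
 *
 * bad-page entries fit in the header.
 */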

enum {
	SWP_USED	= (1 << 0),	/* is slot in swap_info[] used? */
	SWP_WRITEOK	= (1 << 1),	/* ok to write to this swap?	*/
	SWP_DISCARDABLE = (1 << 2),	/* blkdev supports discard */
	SWP_DISCARDING	= (1 << 3),	/* now discarding a free cluster */
	SWP_SOLIDSTATE	= (1 << 4),	/* blkdev seeks are cheap */
	SWP_CONTINUED	= (1 << 5),	/* swap_map has count continuation */
	SWP_BLKDEV	= (1 << 6),	/* it's a block device */
	SWP_ACTIVATED	= (1 << 7),	/* set after swap_activate success */
	SWP_FS_OPS	= (1 << 8),	/* swapfile operations go through fs */
	SWP_AREA_DISCARD = (1 << 9),	/* single-time swap area discards */
	SWP_PAGE_DISCARD = (1 << 10),	/* freed swap page-cluster discards */
	SWP_STABLE_WRITES = (1 << 11),	/* no overwrite PG_writeback pages */
	SWP_SYNCHRONOUS_IO = (1 << 12),	/* synchronous IO is efficient */
	/* add others here before... */
};

#define SWAP_CLUSTER_MAX	32UL
#define SWAP_CLUSTER_MAX_SKIPPED (SWAP_CLUSTER_MAX << 10)
#define COMPACT_CLUSTER_MAX	SWAP_CLUSTER_MAX

/* Bit flag in swap_map */
#define SWAP_HAS_CACHE	0x40	/* Flag page is cached, in first swap_map */
#define COUNT_CONTINUED	0x80	/* Flag swap_map continuation for full count */

/* Special value in first swap_map */
#define SWAP_MAP_MAX	0x3e	/* Max count */
#define SWAP_MAP_BAD	0x3f	/* Note page is bad */
#define SWAP_MAP_SHMEM	0xbf	/* Owned by shmem/tmpfs */

/* Special value in each swap_map continuation */
#define SWAP_CONT_MAX	0x7f	/* Max count */
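
/*
 * Illustrative example (not from the original header): each swap_map byte
 * packs a reference count in its low bits plus the flag bits above. For
 * instance a value of (SWAP_HAS_CACHE | 3) means the entry has three map
 * references and a page in the swap cache, while a count that would exceed
 * SWAP_MAP_MAX sets COUNT_CONTINUED and spills into a continuation page
 * (see add_swap_count_continuation() below).
 */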

/*
 * We use this to track usage of a cluster. A cluster is a block of swap disk
 * space with SWAPFILE_CLUSTER pages long and naturally aligns in disk. All
 * free clusters are organized into a list. We fetch an entry from the list to
 * get a free cluster.
 *
 * The flags field determines if a cluster is free. This is
 * protected by cluster lock.
 */
struct swap_cluster_info {
	spinlock_t lock;	/*
				 * Protect swap_cluster_info fields
				 * other than list, and swap_info_struct->swap_map
				 * elements corresponding to the swap cluster.
				 */
	u16 count;
	u8 flags;
	u8 order;
	struct list_head list;
};

/* All on-list clusters must have a non-zero flag. */
enum swap_cluster_flags {
	CLUSTER_FLAG_NONE = 0, /* For temporary off-list cluster */
	CLUSTER_FLAG_FREE,
	CLUSTER_FLAG_NONFULL,
	CLUSTER_FLAG_FRAG,
	/* Clusters with flags above are allocatable */
	CLUSTER_FLAG_USABLE = CLUSTER_FLAG_FRAG,
	CLUSTER_FLAG_FULL,
	CLUSTER_FLAG_DISCARD,
	CLUSTER_FLAG_MAX,
};

/*
 * The first page in the swap file is the swap header, which is always marked
 * bad to prevent it from being allocated as an entry. This also prevents the
 * cluster to which it belongs from being marked free. Therefore 0 is safe to
 * use as a sentinel to indicate an invalid entry.
 */
#define SWAP_ENTRY_INVALID	0

#ifdef CONFIG_THP_SWAP
#define SWAP_NR_ORDERS		(PMD_ORDER + 1)
#else
#define SWAP_NR_ORDERS		1
#endif

/*
 * We keep using the same cluster for a rotational device so that I/O will be
 * sequential. The purpose is to optimize swap throughput on such devices.
 */
struct swap_sequential_cluster {
	unsigned int next[SWAP_NR_ORDERS]; /* Likely next allocation offset */
};

/*
 * The in-memory structure used to track swap areas.
 */
struct swap_info_struct {
	struct percpu_ref users;	/* indicate and keep swap device valid. */
	unsigned long	flags;		/* SWP_USED etc: see above */
	signed short	prio;		/* swap priority of this type */
	struct plist_node list;		/* entry in swap_active_head */
	signed char	type;		/* strange name for an index */
	unsigned int	max;		/* extent of the swap_map */
	unsigned char *swap_map;	/* vmalloc'ed array of usage counts */
	unsigned long *zeromap;		/* kvmalloc'ed bitmap to track zero pages */
	struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */
	struct list_head free_clusters; /* free clusters list */
	struct list_head full_clusters; /* full clusters list */
	struct list_head nonfull_clusters[SWAP_NR_ORDERS];
					/* list of clusters that contain at least one free slot */
	struct list_head frag_clusters[SWAP_NR_ORDERS];
					/* list of clusters that are fragmented or contended */
	atomic_long_t frag_cluster_nr[SWAP_NR_ORDERS];
	unsigned int pages;		/* total of usable pages of swap */
	atomic_long_t inuse_pages;	/* number of those currently in use */
	struct swap_sequential_cluster *global_cluster; /* Use one global cluster for rotating device */
	spinlock_t global_cluster_lock;	/* Serialize usage of global cluster */
	struct rb_root swap_extent_root;/* root of the swap extent rbtree */
	struct block_device *bdev;	/* swap device or bdev of swap file */
	struct file *swap_file;		/* seldom referenced */
	struct completion comp;		/* seldom referenced */
	spinlock_t lock;		/*
					 * protect map scan related fields like
					 * swap_map, lowest_bit, highest_bit,
					 * inuse_pages, cluster_next,
					 * cluster_nr, lowest_alloc,
					 * highest_alloc, free/discard cluster
					 * list. other fields are only changed
					 * at swapon/swapoff, so are protected
					 * by swap_lock. changing flags need
					 * hold this lock and swap_lock. If
					 * both locks need hold, hold swap_lock
					 * first.
					 */
	spinlock_t cont_lock;		/*
					 * protect swap count continuation page
					 * list.
					 */
	struct work_struct discard_work; /* discard worker */
	struct work_struct reclaim_work; /* reclaim worker */
	struct list_head discard_clusters; /* discard clusters list */
	struct plist_node avail_lists[]; /*
					  * entries in swap_avail_heads, one
					  * entry per node.
					  * Must be last as the number of the
					  * array is nr_node_ids, which is not
					  * a fixed value so have to allocate
					  * dynamically.
					  * And it has to be an array so that
					  * plist_for_each_* can work.
					  */
};

static inline swp_entry_t page_swap_entry(struct page *page)
{
	struct folio *folio = page_folio(page);
	swp_entry_t entry = folio->swap;

	entry.val += folio_page_idx(folio, page);
	return entry;
}
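
/*
 * Illustrative example (not part of the original header): for a large
 * (e.g. order-2, four page) folio in the swap cache, folio->swap holds the
 * entry of its first page, so page_swap_entry() on the third subpage
 * returns that entry with its offset advanced by 2, since folio_page_idx()
 * reports the page's index within the folio.
 */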

/* linux/mm/workingset.c */
bool workingset_test_recent(void *shadow, bool file, bool *workingset,
				bool flush);
void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages);
void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg);
void workingset_refault(struct folio *folio, void *shadow);
void workingset_activation(struct folio *folio);

/* linux/mm/page_alloc.c */
extern unsigned long totalreserve_pages;

/* Definition of global_zone_page_state not available yet */
#define nr_free_pages() global_zone_page_state(NR_FREE_PAGES)

/* linux/mm/swap.c */
void lru_note_cost(struct lruvec *lruvec, bool file,
		   unsigned int nr_io, unsigned int nr_rotated);
void lru_note_cost_refault(struct folio *);
void folio_add_lru(struct folio *);
void folio_add_lru_vma(struct folio *, struct vm_area_struct *);
void mark_page_accessed(struct page *);
void folio_mark_accessed(struct folio *);

extern atomic_t lru_disable_count;

static inline bool lru_cache_disabled(void)
{
	return atomic_read(&lru_disable_count);
}

static inline void lru_cache_enable(void)
{
	atomic_dec(&lru_disable_count);
}
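
/*
 * Illustrative usage sketch (not part of the original header): callers that
 * must keep isolated pages off the per-CPU LRU caches, e.g. around a
 * migration pass, bracket the work with the disable/enable pair. The
 * migrate_my_range() name below is hypothetical:
 *
 *	lru_cache_disable();
 *	err = migrate_my_range(start, end);
 *	lru_cache_enable();
 */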

extern void lru_cache_disable(void);
extern void lru_add_drain(void);
extern void lru_add_drain_cpu(int cpu);
extern void lru_add_drain_cpu_zone(struct zone *zone);
extern void lru_add_drain_all(void);
void folio_deactivate(struct folio *folio);
void folio_mark_lazyfree(struct folio *folio);
extern void swap_setup(void);

/* linux/mm/vmscan.c */
extern unsigned long zone_reclaimable_pages(struct zone *zone);
extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
					gfp_t gfp_mask, nodemask_t *mask);

#define MEMCG_RECLAIM_MAY_SWAP (1 << 1)
#define MEMCG_RECLAIM_PROACTIVE (1 << 2)
#define MIN_SWAPPINESS 0
#define MAX_SWAPPINESS 200

/* Just reclaim from anon folios in proactive memory reclaim */
#define SWAPPINESS_ANON_ONLY (MAX_SWAPPINESS + 1)
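
/*
 * Illustrative note (not part of the original header): user-visible
 * swappiness values stay within MIN_SWAPPINESS..MAX_SWAPPINESS (0..200);
 * SWAPPINESS_ANON_ONLY (201) is an out-of-band value that proactive reclaim
 * can pass so only anonymous folios are considered.
 */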

extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
						  unsigned long nr_pages,
						  gfp_t gfp_mask,
						  unsigned int reclaim_options,
						  int *swappiness);
extern unsigned long mem_cgroup_shrink_node(struct mem_cgroup *mem,
						gfp_t gfp_mask, bool noswap,
						pg_data_t *pgdat,
						unsigned long *nr_scanned);
extern unsigned long shrink_all_memory(unsigned long nr_pages);
extern int vm_swappiness;
long remove_mapping(struct address_space *mapping, struct folio *folio);

extern int sysctl_min_unmapped_ratio;
extern int sysctl_min_slab_ratio;

void check_move_unevictable_folios(struct folio_batch *fbatch);

extern void __meminit kswapd_run(int nid);
extern void __meminit kswapd_stop(int nid);

#ifdef CONFIG_SWAP

int add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
		unsigned long nr_pages, sector_t start_block);
int generic_swapfile_activate(struct swap_info_struct *, struct file *,
		sector_t *);

static inline unsigned long total_swapcache_pages(void)
{
	return global_node_page_state(NR_SWAPCACHE);
}

void free_swap_cache(struct folio *folio);
void free_folio_and_swap_cache(struct folio *folio);
void free_pages_and_swap_cache(struct encoded_page **, int);

/* linux/mm/swapfile.c */
extern atomic_long_t nr_swap_pages;
extern long total_swap_pages;
extern atomic_t nr_rotate_swap;

/* Swap 50% full? Release swapcache more aggressively.. */
static inline bool vm_swap_full(void)
{
	return atomic_long_read(&nr_swap_pages) * 2 < total_swap_pages;
}

static inline long get_nr_swap_pages(void)
{
	return atomic_long_read(&nr_swap_pages);
}

extern void si_swapinfo(struct sysinfo *);
int folio_alloc_swap(struct folio *folio, gfp_t gfp_mask);
bool folio_free_swap(struct folio *folio);
void put_swap_folio(struct folio *folio, swp_entry_t entry);
extern swp_entry_t get_swap_page_of_type(int);
extern int add_swap_count_continuation(swp_entry_t, gfp_t);
extern void swap_shmem_alloc(swp_entry_t, int);
extern int swap_duplicate(swp_entry_t);
extern int swapcache_prepare(swp_entry_t entry, int nr);
extern void swap_free_nr(swp_entry_t entry, int nr_pages);
extern void free_swap_and_cache_nr(swp_entry_t entry, int nr);
int swap_type_of(dev_t device, sector_t offset);
int find_first_swap(dev_t *device);
extern unsigned int count_swap_pages(int, int);
extern sector_t swapdev_block(int, pgoff_t);
extern int __swap_count(swp_entry_t entry);
extern bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t entry);
extern int swp_swapcount(swp_entry_t entry);
struct swap_info_struct *swp_swap_info(swp_entry_t entry);
struct backing_dev_info;
extern int init_swap_address_space(unsigned int type, unsigned long nr_pages);
extern void exit_swap_address_space(unsigned int type);
extern struct swap_info_struct *get_swap_device(swp_entry_t entry);
sector_t swap_folio_sector(struct folio *folio);

static inline void put_swap_device(struct swap_info_struct *si)
{
	percpu_ref_put(&si->users);
}
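
/*
 * Illustrative usage sketch (not part of the original header): a reader
 * that needs the swap device behind an entry to stay valid pins it with
 * get_swap_device() and drops the reference when done:
 *
 *	si = get_swap_device(entry);
 *	if (!si)
 *		return;		(raced with swapoff, entry is stale)
 *	... examine si->swap_map, the swap cache, etc. ...
 *	put_swap_device(si);
 */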

#else /* CONFIG_SWAP */

static inline struct swap_info_struct *swp_swap_info(swp_entry_t entry)
{
	return NULL;
}

static inline struct swap_info_struct *get_swap_device(swp_entry_t entry)
{
	return NULL;
}

static inline void put_swap_device(struct swap_info_struct *si)
{
}

#define get_nr_swap_pages()			0L
#define total_swap_pages			0L
#define total_swapcache_pages()			0UL
#define vm_swap_full()				0

#define si_swapinfo(val) \
	do { (val)->freeswap = (val)->totalswap = 0; } while (0)
#define free_folio_and_swap_cache(folio) \
	folio_put(folio)
#define free_pages_and_swap_cache(pages, nr) \
	release_pages((pages), (nr));

static inline void free_swap_and_cache_nr(swp_entry_t entry, int nr)
{
}

static inline void free_swap_cache(struct folio *folio)
{
}

static inline int add_swap_count_continuation(swp_entry_t swp, gfp_t gfp_mask)
{
	return 0;
}

static inline void swap_shmem_alloc(swp_entry_t swp, int nr)
{
}

static inline int swap_duplicate(swp_entry_t swp)
{
	return 0;
}

static inline int swapcache_prepare(swp_entry_t swp, int nr)
{
	return 0;
}

static inline void swap_free_nr(swp_entry_t entry, int nr_pages)
{
}

static inline void put_swap_folio(struct folio *folio, swp_entry_t swp)
{
}

static inline int __swap_count(swp_entry_t entry)
{
	return 0;
}

static inline bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t entry)
{
	return false;
}

static inline int swp_swapcount(swp_entry_t entry)
{
	return 0;
}

static inline int folio_alloc_swap(struct folio *folio, gfp_t gfp_mask)
{
	return -EINVAL;
}

static inline bool folio_free_swap(struct folio *folio)
{
	return false;
}

static inline int add_swap_extent(struct swap_info_struct *sis,
				  unsigned long start_page,
				  unsigned long nr_pages, sector_t start_block)
{
	return -EINVAL;
}

#endif /* CONFIG_SWAP */

static inline void free_swap_and_cache(swp_entry_t entry)
{
	free_swap_and_cache_nr(entry, 1);
}

static inline void swap_free(swp_entry_t entry)
{
	swap_free_nr(entry, 1);
}

#ifdef CONFIG_MEMCG
static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg)
{
	/* Cgroup2 doesn't have per-cgroup swappiness */
	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
		return READ_ONCE(vm_swappiness);

	if (mem_cgroup_disabled() || mem_cgroup_is_root(memcg))
		return READ_ONCE(vm_swappiness);

	return READ_ONCE(memcg->swappiness);
}
#else
static inline int mem_cgroup_swappiness(struct mem_cgroup *mem)
{
	return READ_ONCE(vm_swappiness);
}
#endif

#if defined(CONFIG_SWAP) && defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp);
static inline void folio_throttle_swaprate(struct folio *folio, gfp_t gfp)
{
	if (mem_cgroup_disabled())
		return;
	__folio_throttle_swaprate(folio, gfp);
}
#else
static inline void folio_throttle_swaprate(struct folio *folio, gfp_t gfp)
{
}
#endif

#if defined(CONFIG_MEMCG) && defined(CONFIG_SWAP)
int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry);
static inline int mem_cgroup_try_charge_swap(struct folio *folio,
		swp_entry_t entry)
{
	if (mem_cgroup_disabled())
		return 0;
	return __mem_cgroup_try_charge_swap(folio, entry);
}

extern void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages);
static inline void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
{
	if (mem_cgroup_disabled())
		return;
	__mem_cgroup_uncharge_swap(entry, nr_pages);
}

extern long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg);
extern bool mem_cgroup_swap_full(struct folio *folio);
#else
static inline int mem_cgroup_try_charge_swap(struct folio *folio,
					     swp_entry_t entry)
{
	return 0;
}

static inline void mem_cgroup_uncharge_swap(swp_entry_t entry,
					    unsigned int nr_pages)
{
}

static inline long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
{
	return get_nr_swap_pages();
}

static inline bool mem_cgroup_swap_full(struct folio *folio)
{
	return vm_swap_full();
}
#endif

#endif /* __KERNEL__*/
#endif /* _LINUX_SWAP_H */