+++ /dev/null
-===========================================
-Automatically bind swap device to numa node
-===========================================
-
-If the system has more than one swap device and each swap device carries
-node information, we can use that information in get_swap_pages() to
-decide which swap device to allocate from, for better performance.
-
-
-How to use this feature
-=======================
-
-Each swap device has a priority, which decides the order in which devices
-are used. To take advantage of automatic binding, there is no need to
-manipulate priority settings for swap devices. E.g. on a 2-node machine,
-assume 2 swap devices, swapA and swapB, with swapA attached to node 0 and
-swapB attached to node 1, are going to be swapped on. Simply swap them on
-by doing::
-
- # swapon /dev/swapA
- # swapon /dev/swapB
-
-Then node 0 will use the two swap devices in the order swapA then swapB,
-and node 1 will use them in the order swapB then swapA. Note that the
-order in which they are swapped on doesn't matter.
-
-Here is a more complex example on a 4-node machine. Assume 6 swap devices
-are going to be swapped on: swapA and swapB are attached to node 0, swapC
-is attached to node 1, swapD and swapE are attached to node 2, and swapF
-is attached to node 3.
-The way to swap them on is the same as above::
-
- # swapon /dev/swapA
- # swapon /dev/swapB
- # swapon /dev/swapC
- # swapon /dev/swapD
- # swapon /dev/swapE
- # swapon /dev/swapF
-
-Then node 0 will use them in the order of::
-
- swapA/swapB -> swapC -> swapD -> swapE -> swapF
-
-swapA and swapB will be used in a round robin mode before any other swap device.
-
-node 1 will use them in the order of::
-
- swapC -> swapA -> swapB -> swapD -> swapE -> swapF
-
-node 2 will use them in the order of::
-
- swapD/swapE -> swapA -> swapB -> swapC -> swapF
-
-Similarly, swapD and swapE will be used in a round robin mode before any
-other swap devices.
-
-node 3 will use them in the order of::
-
- swapF -> swapA -> swapB -> swapC -> swapD -> swapE
-
-
-Implementation details
-======================
-
-The current code uses a priority based list, swap_avail_list, to decide
-which swap device to use, and if multiple swap devices share the same
-priority, they are used round robin. This change replaces the single
-global swap_avail_list with a per-numa-node list, i.e. each numa node
-sees its own priority based list of available swap devices. A swap
-device's priority can be promoted on its matching node's swap_avail_list.
-
-A swap device's priority is set as follows: the user can set a value >= 0,
-or the system will pick one starting from -1 then downwards. The priority
-value in the swap_avail_list is the negated value of the swap device's
-priority, since plists are sorted from low to high. The new policy doesn't
-change the semantics for the priority >= 0 cases; the automatic assignment
-that previously started from -1 then downwards now starts from -2 then
-downwards, and -1 is reserved as the promoted value. So if multiple swap
-devices are attached to the same node, they will all be promoted to
-priority -1 on that node's plist and will be used round robin before any
-other swap devices.
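
To make the old scheme concrete, here is a minimal userspace C sketch of
the per-node key computation described above; the struct, the plist_key()
helper, and all device names and priorities are illustrative, not kernel
code::

  #include <stdio.h>

  struct dev {
  	const char *name;
  	int prio;	/* swap priority: >= 0 user-set, <= -2 auto-assigned */
  	int node;	/* NUMA node the device is attached to */
  };

  /*
   * Effective plist key of a device on a given node: the negated swap
   * priority, except that an auto-prioritized device is promoted to -1
   * on its matching node (plists sort low to high, so -1 sorts first).
   */
  static int plist_key(const struct dev *d, int node)
  {
  	if (d->prio < 0 && d->node == node)
  		return -1;	/* promoted on its matching node */
  	return -d->prio;	/* plain negated priority everywhere else */
  }

  int main(void)
  {
  	struct dev devs[] = {
  		{ "swapA", -2, 0 }, { "swapB", -3, 0 },
  		{ "swapC", -4, 1 }, { "swapD", -5, 2 },
  	};
  	int node, i;

  	for (node = 0; node < 3; node++) {
  		printf("node %d:", node);
  		for (i = 0; i < 4; i++)
  			printf("  %s=%d", devs[i].name,
  			       plist_key(&devs[i], node));
  		printf("\n");
  	}
  	return 0;
  }

On node 0 this prints swapA and swapB tied at key -1, ahead of swapC and
swapD, which is exactly why the documentation above shows those two being
used round robin first.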
EXPORT_SYMBOL_GPL(nr_swap_pages);
/* protected with swap_lock. reading in vm_swap_full() doesn't need lock */
long total_swap_pages;
-static int least_priority = -1;
+static int least_priority;
unsigned long swapfile_maximum_size;
#ifdef CONFIG_MIGRATION
bool swap_migration_ad_supported;
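
With promotion gone, -1 no longer needs to be reserved, so automatic
assignment can start from -1 again. Assuming the usual
si->prio = --least_priority assignment in setup_swap_info(), the first two
automatically assigned priorities now come out as::

  swapon A  ->  si->prio = --least_priority = -1
  swapon B  ->  si->prio = --least_priority = -2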
* is held and the locking order requires swap_lock to be taken
* before any swap_info_struct->lock.
*/
-static struct plist_head *swap_avail_heads;
+static PLIST_HEAD(swap_avail_head);
static DEFINE_SPINLOCK(swap_avail_lock);
struct swap_info_struct *swap_info[MAX_SWAPFILES];
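
PLIST_HEAD() defines and statically initializes a priority-list head,
which is what allows the dynamic nr_node_ids-sized allocation in
swapfile_init() to be dropped at the end of this patch. As a reminder, the
handful of plist operations this file relies on look like this (a
kernel-style sketch, not code from this patch)::

  #include <linux/plist.h>

  static PLIST_HEAD(example_head);	/* statically initialized head */

  struct item {
  	struct plist_node node;
  };

  static void example(struct item *it)
  {
  	plist_node_init(&it->node, -1);		/* set the node's priority */
  	plist_add(&it->node, &example_head);	/* sorted insert, low to high */
  	plist_requeue(&it->node, &example_head);/* move behind equal-prio peers */
  	plist_del(&it->node, &example_head);
  }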
/* SWAP_USAGE_OFFLIST_BIT can only be set by this helper. */
static void del_from_avail_list(struct swap_info_struct *si, bool swapoff)
{
- int nid;
unsigned long pages;
spin_lock(&swap_avail_lock);
goto skip;
}
- for_each_node(nid)
- plist_del(&si->avail_lists[nid], &swap_avail_heads[nid]);
+ plist_del(&si->avail_list, &swap_avail_head);
skip:
spin_unlock(&swap_avail_lock);
/* SWAP_USAGE_OFFLIST_BIT can only be cleared by this helper. */
static void add_to_avail_list(struct swap_info_struct *si, bool swapon)
{
- int nid;
long val;
unsigned long pages;
goto skip;
}
- for_each_node(nid)
- plist_add(&si->avail_lists[nid], &swap_avail_heads[nid]);
+ plist_add(&si->avail_list, &swap_avail_head);
skip:
spin_unlock(&swap_avail_lock);
static bool swap_alloc_slow(swp_entry_t *entry,
int order)
{
- int node;
unsigned long offset;
struct swap_info_struct *si, *next;
- node = numa_node_id();
spin_lock(&swap_avail_lock);
start_over:
- plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) {
+ plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) {
/* Rotate the device and switch to a new cluster */
- plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
+ plist_requeue(&si->avail_list, &swap_avail_head);
spin_unlock(&swap_avail_lock);
if (get_swap_device_info(si)) {
offset = cluster_alloc_swap_entry(si, order, SWAP_HAS_CACHE);
* still in the swap_avail_head list then try it, otherwise
* start over if we have not gotten any slots.
*/
- if (plist_node_empty(&next->avail_lists[node]))
+ if (plist_node_empty(&next->avail_list))
goto start_over;
}
spin_unlock(&swap_avail_lock);
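
Pieced together, the slow-path loop now reads roughly as follows; this is
a sketch assembled from the hunks above to show the drop-lock/revalidate
pattern, not the verbatim function::

  spin_lock(&swap_avail_lock);
  start_over:
  plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) {
  	/* Rotate past equal-priority devices so they are used round robin. */
  	plist_requeue(&si->avail_list, &swap_avail_head);
  	spin_unlock(&swap_avail_lock);
  	/* ... try cluster_alloc_swap_entry(si, ...) without the list lock ... */
  	spin_lock(&swap_avail_lock);
  	/*
  	 * The list may have changed while unlocked; if the saved @next was
  	 * removed from the list, restart the scan from the beginning.
  	 */
  	if (plist_node_empty(&next->avail_list))
  		goto start_over;
  }
  spin_unlock(&swap_avail_lock);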
static bool swap_sync_discard(void)
{
bool ret = false;
- int nid = numa_node_id();
struct swap_info_struct *si, *next;
spin_lock(&swap_avail_lock);
- plist_for_each_entry_safe(si, next, &swap_avail_heads[nid], avail_lists[nid]) {
+ plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) {
spin_unlock(&swap_avail_lock);
if (get_swap_device_info(si)) {
if (si->flags & SWP_PAGE_DISCARD)
return generic_swapfile_activate(sis, swap_file, span);
}
-static int swap_node(struct swap_info_struct *si)
-{
- struct block_device *bdev;
-
- if (si->bdev)
- bdev = si->bdev;
- else
- bdev = si->swap_file->f_inode->i_sb->s_bdev;
-
- return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE;
-}
-
static void setup_swap_info(struct swap_info_struct *si, int prio,
unsigned char *swap_map,
struct swap_cluster_info *cluster_info,
unsigned long *zeromap)
{
- int i;
-
if (prio >= 0)
si->prio = prio;
else
* low-to-high, while swap ordering is high-to-low
*/
si->list.prio = -si->prio;
- for_each_node(i) {
- if (si->prio >= 0)
- si->avail_lists[i].prio = -si->prio;
- else {
- if (swap_node(si) == i)
- si->avail_lists[i].prio = 1;
- else
- si->avail_lists[i].prio = -si->prio;
- }
- }
+ si->avail_list.prio = -si->prio;
si->swap_map = swap_map;
si->cluster_info = cluster_info;
si->zeromap = zeromap;
del_from_avail_list(p, true);
if (p->prio < 0) {
struct swap_info_struct *si = p;
- int nid;
plist_for_each_entry_continue(si, &swap_active_head, list) {
si->prio++;
si->list.prio--;
- for_each_node(nid) {
- if (si->avail_lists[nid].prio != 1)
- si->avail_lists[nid].prio--;
- }
+ si->avail_list.prio--;
}
least_priority++;
}
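
To make the renumbering concrete: if three auto-prioritized devices hold
priorities -1, -2 and -3 (so least_priority is -3) and the -2 device is
swapped off, the loop above walks the remaining lower-priority entries,
bumping each one up, and least_priority is restored accordingly
(hypothetical devices A, B, C)::

  before swapoff(B):  A = -1, B = -2, C = -3   least_priority = -3
  after  swapoff(B):  A = -1,         C = -2   least_priority = -2

The matching avail_list.prio values move in step (si->avail_list.prio--,
i.e. 3 becomes 2 for C), keeping the negated plist ordering consistent.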
struct swap_info_struct *p;
struct swap_info_struct *defer = NULL;
unsigned int type;
- int i;
- p = kvzalloc(struct_size(p, avail_lists, nr_node_ids), GFP_KERNEL);
+ p = kvzalloc(sizeof(*p), GFP_KERNEL);
if (!p)
return ERR_PTR(-ENOMEM);
}
p->swap_extent_root = RB_ROOT;
plist_node_init(&p->list, 0);
- for_each_node(i)
- plist_node_init(&p->avail_lists[i], 0);
+ plist_node_init(&p->avail_list, 0);
p->flags = SWP_USED;
spin_unlock(&swap_lock);
if (defer) {
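
The allocation change mirrors the struct layout: swap_info_struct
previously ended in a flexible array with one plist node per possible NUMA
node, which is why it was sized with struct_size(); with a single embedded
node, a plain sizeof suffices. Roughly (hypothetical _before/_after names,
other members elided)::

  #include <linux/plist.h>

  /* Before: per-node membership via a flexible array member at the end. */
  struct swap_info_struct_before {
  	int prio;				/* other members elided */
  	struct plist_node avail_lists[];	/* one entry per NUMA node */
  };
  /* p = kvzalloc(struct_size(p, avail_lists, nr_node_ids), GFP_KERNEL); */

  /* After: membership in the single global list, fixed size. */
  struct swap_info_struct_after {
  	int prio;				/* other members elided */
  	struct plist_node avail_list;
  };
  /* p = kvzalloc(sizeof(*p), GFP_KERNEL); */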
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!swap_avail_heads)
- return -ENOMEM;
-
si = alloc_swap_info();
if (IS_ERR(si))
return PTR_ERR(si);
void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp)
{
struct swap_info_struct *si, *next;
- int nid = folio_nid(folio);
if (!(gfp & __GFP_IO))
return;
return;
spin_lock(&swap_avail_lock);
- plist_for_each_entry_safe(si, next, &swap_avail_heads[nid],
- avail_lists[nid]) {
+ plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) {
if (si->bdev) {
blkcg_schedule_throttle(si->bdev->bd_disk, true);
break;
static int __init swapfile_init(void)
{
- int nid;
-
- swap_avail_heads = kmalloc_array(nr_node_ids, sizeof(struct plist_head),
- GFP_KERNEL);
- if (!swap_avail_heads) {
- pr_emerg("Not enough memory for swap heads, swap is disabled\n");
- return -ENOMEM;
- }
-
- for_each_node(nid)
- plist_head_init(&swap_avail_heads[nid]);
-
swapfile_maximum_size = arch_max_swapfile_size();
/*