[thirdparty/linux.git] / mm / page_io.c

// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/mm/page_io.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  Swap reorganised 29.12.95, 
 *  Asynchronous swapping added 30.12.95. Stephen Tweedie
 *  Removed race in async swapping. 14.4.1996. Bruno Haible
 *  Add swap of shared pages through the page cache. 20.2.1998. Stephen Tweedie
 *  Always use brw_page, life becomes simpler. 12 May 1998 Eric Biederman
 */

#include <linux/mm.h>
#include <linux/kernel_stat.h>
#include <linux/gfp.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/bio.h>
#include <linux/swapops.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/psi.h>
#include <linux/uio.h>
#include <linux/sched/task.h>
#include <linux/delayacct.h>
#include <linux/zswap.h>
#include "swap.h"

/*
 * Common completion work for a swap write bio: report and recover from a
 * failed write, then end writeback on the bio's first folio.  The bio
 * itself is not released here.
 */
static void __end_swap_bio_write(struct bio *bio)
{
	struct folio *folio = bio_first_folio_all(bio);

	if (!bio->bi_status)
		goto end_writeback;

	/*
	 * The write to swap-space failed.  Re-dirty the folio so that it
	 * is not reclaimed, and print a (rate-limited) dire warning that
	 * things will go BAD (tm) very quickly.
	 *
	 * PG_reclaim is cleared as well to avoid
	 * folio_rotate_reclaimable().
	 */
	folio_mark_dirty(folio);
	pr_alert_ratelimited("Write-error on swap-device (%u:%u:%llu)\n",
			     MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)),
			     (unsigned long long)bio->bi_iter.bi_sector);
	folio_clear_reclaim(folio);

end_writeback:
	folio_end_writeback(folio);
}

/*
 * Completion callback for swap write bios (installed as ->bi_end_io by
 * swap_writepage_bdev_async()): run the shared write-completion handling,
 * then drop a reference on the bio with bio_put().
 */
static void end_swap_bio_write(struct bio *bio)
{
	__end_swap_bio_write(bio);
	bio_put(bio);
}

/*
 * Common completion work for a swap read bio: mark the bio's first folio
 * uptodate when the read succeeded, log a rate-limited alert otherwise,
 * and unlock the folio in both cases.  The bio itself is not released
 * here.
 */
static void __end_swap_bio_read(struct bio *bio)
{
	struct folio *folio = bio_first_folio_all(bio);

	if (!bio->bi_status) {
		folio_mark_uptodate(folio);
	} else {
		/* Leave the folio !uptodate so the failure stays visible. */
		pr_alert_ratelimited("Read-error on swap-device (%u:%u:%llu)\n",
				     MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)),
				     (unsigned long long)bio->bi_iter.bi_sector);
	}

	folio_unlock(folio);
}

/*
 * Completion callback for swap read bios (installed as ->bi_end_io by
 * swap_readpage_bdev_async()): run the shared read-completion handling,
 * then drop a reference on the bio with bio_put().
 */
static void end_swap_bio_read(struct bio *bio)
{
	__end_swap_bio_read(bio);
	bio_put(bio);
}

/*
 * Build the swap extent list for @swap_file by probing its block mapping
 * with bmap(), one PAGE_SIZE worth of filesystem blocks at a time.
 *
 * Only runs of blocks that are PAGE_SIZE-aligned on disk and contiguous
 * are accepted; each accepted run is added as a one-page extent through
 * add_swap_extent().  When the run starting at probe_block is misaligned
 * or discontiguous, probing restarts one block further on.  A bmap()
 * error or a block that maps to zero is treated as a hole and fails the
 * activation.
 *
 * On success, sis->max, sis->pages and sis->highest_bit are set from the
 * number of pages found, *@span receives the range of page-sized on-disk
 * blocks covered (the header page is excluded from that range), and the
 * accumulated add_swap_extent() return values are returned.
 *
 * On failure a negative value is returned: -EINVAL when a hole is found,
 * otherwise whatever negative value add_swap_extent() returned.
 */
int generic_swapfile_activate(struct swap_info_struct *sis,
				struct file *swap_file,
				sector_t *span)
{
	struct address_space *mapping = swap_file->f_mapping;
	struct inode *inode = mapping->host;
	unsigned blocks_per_page;
	unsigned long page_no;
	unsigned blkbits;
	sector_t probe_block;
	sector_t last_block;
	sector_t lowest_block = -1;
	sector_t highest_block = 0;
	int nr_extents = 0;
	int ret;

	blkbits = inode->i_blkbits;
	blocks_per_page = PAGE_SIZE >> blkbits;

	/*
	 * Map all the blocks into the extent tree.  This code doesn't try
	 * to be very smart.
	 */
	probe_block = 0;
	page_no = 0;
	last_block = i_size_read(inode) >> blkbits;
	/* Stop when a full page no longer fits in the file or sis->max is hit. */
	while ((probe_block + blocks_per_page) <= last_block &&
			page_no < sis->max) {
		unsigned block_in_page;
		sector_t first_block;

		cond_resched();

		/* bmap() replaces the file-relative block with the mapped one. */
		first_block = probe_block;
		ret = bmap(inode, &first_block);
		if (ret || !first_block)
			goto bad_bmap;

		/*
		 * It must be PAGE_SIZE aligned on-disk
		 */
		if (first_block & (blocks_per_page - 1)) {
			probe_block++;
			goto reprobe;
		}

		/* Check that the rest of the page follows first_block on disk. */
		for (block_in_page = 1; block_in_page < blocks_per_page;
					block_in_page++) {
			sector_t block;

			block = probe_block + block_in_page;
			ret = bmap(inode, &block);
			if (ret || !block)
				goto bad_bmap;

			if (block != first_block + block_in_page) {
				/* Discontiguity */
				probe_block++;
				goto reprobe;
			}
		}

		/* Convert from filesystem-block units to page-sized units. */
		first_block >>= (PAGE_SHIFT - blkbits);
		if (page_no) {	/* exclude the header page */
			if (first_block < lowest_block)
				lowest_block = first_block;
			if (first_block > highest_block)
				highest_block = first_block;
		}

		/*
		 * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks
		 */
		ret = add_swap_extent(sis, page_no, 1, first_block);
		if (ret < 0)
			goto out;
		nr_extents += ret;
		page_no++;
		probe_block += blocks_per_page;
reprobe:
		continue;
	}
	/* Success: report the extent count and publish the sizes found. */
	ret = nr_extents;
	*span = 1 + highest_block - lowest_block;
	if (page_no == 0)
		page_no = 1;	/* force Empty message */
	sis->max = page_no;
	sis->pages = page_no - 1;
	sis->highest_bit = page_no - 1;
out:
	return ret;
bad_bmap:
	pr_err("swapon: swapfile has holes\n");
	ret = -EINVAL;
	goto out;
}

/*
 * Write a swap-cache page out to its swap slot.
 *
 * There may be stale swap cache pages in memory: those are noticed here
 * (folio_free_swap()) and the unnecessary final write is skipped.
 *
 * The folio is unlocked on every path taken directly in this function.
 * Returns 0, or the error from arch_prepare_to_swap(), in which case the
 * folio has been re-dirtied.
 */
int swap_writepage(struct page *page, struct writeback_control *wbc)
{
	struct folio *folio = page_folio(page);
	int err = 0;

	if (folio_free_swap(folio))
		goto out_unlock;

	/*
	 * Arch code may have to preserve more data than just the page
	 * contents, e.g. memory tags.
	 */
	err = arch_prepare_to_swap(&folio->page);
	if (err) {
		folio_mark_dirty(folio);
		goto out_unlock;
	}

	if (!zswap_store(folio)) {
		/* Not taken by zswap: go through the regular write path. */
		__swap_writepage(&folio->page, wbc);
		return 0;
	}

	/*
	 * zswap_store() returned true: skip the device write and just
	 * cycle the folio through the writeback state.
	 */
	folio_start_writeback(folio);
	folio_unlock(folio);
	folio_end_writeback(folio);
	return 0;

out_unlock:
	folio_unlock(folio);
	return err;
}

/*
 * Account a swap-out of @folio: bump THP_SWPOUT when the folio is
 * PMD-mappable (only built with CONFIG_TRANSPARENT_HUGEPAGE), and add the
 * folio's page count to PSWPOUT.
 */
static inline void count_swpout_vm_event(struct folio *folio)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	if (unlikely(folio_test_pmd_mappable(folio)))
		count_vm_event(THP_SWPOUT);
#endif
	/* PSWPOUT is counted in pages, not folios. */
	count_vm_events(PSWPOUT, folio_nr_pages(folio));
}

#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
/*
 * Associate @bio with the io-controller css of the cgroup that @folio's
 * memory cgroup belongs to.  Does nothing when the folio has no memcg.
 * The css lookup and the association are done under rcu_read_lock().
 *
 * Without both CONFIG_MEMCG and CONFIG_BLK_CGROUP this compiles to an
 * empty statement.
 */
static void bio_associate_blkg_from_page(struct bio *bio, struct folio *folio)
{
	struct mem_cgroup *memcg = folio_memcg(folio);
	struct cgroup_subsys_state *css;

	if (memcg) {
		rcu_read_lock();
		css = cgroup_e_css(memcg->css.cgroup, &io_cgrp_subsys);
		bio_associate_blkg_from_css(bio, css);
		rcu_read_unlock();
	}
}
#else
#define bio_associate_blkg_from_page(bio, folio)		do { } while (0)
#endif /* CONFIG_MEMCG && CONFIG_BLK_CGROUP */

/*
 * A batch of swap pages submitted as a single kiocb through the swap
 * file's ->swap_rw() address-space operation.  Batches are built by
 * swap_writepage_fs() / swap_readpage_fs() and hold pages at consecutive
 * file offsets.
 */
struct swap_iocb {
	struct kiocb		iocb;	/* kiocb passed to ->swap_rw() */
	struct bio_vec		bvec[SWAP_CLUSTER_MAX];	/* one entry per queued page */
	int			pages;	/* number of bvec[] entries in use */
	int			len;	/* total bytes queued in bvec[] */
};
/* Pool of struct swap_iocb; created on first use by sio_pool_init(). */
static mempool_t *sio_pool;

int sio_pool_init(void)
{
	if (!sio_pool) {
		mempool_t *pool = mempool_create_kmalloc_pool(
			SWAP_CLUSTER_MAX, sizeof(struct swap_iocb));
		if (cmpxchg(&sio_pool, NULL, pool))
			mempool_destroy(pool);
	}
	if (!sio_pool)
		return -ENOMEM;
	return 0;
}

/*
 * Completion handler for a batched swap write issued through ->swap_rw().
 *
 * @iocb: the kiocb embedded in the struct swap_iocb that was submitted
 * @ret:  result of the write; anything other than the full batch length
 *        is treated as a failure of the whole batch
 *
 * Ends writeback on every page of the batch and returns the swap_iocb to
 * the mempool.
 */
static void sio_write_complete(struct kiocb *iocb, long ret)
{
	struct swap_iocb *sio = container_of(iocb, struct swap_iocb, iocb);
	int i;

	if (ret == sio->len) {
		for (i = 0; i < sio->pages; i++)
			count_swpout_vm_event(page_folio(sio->bvec[i].bv_page));
	} else {
		/*
		 * In the case of swap-over-nfs, this can be a temporary
		 * failure if the system has limited memory for allocating
		 * transmit buffers.  Re-dirty every page and keep it away
		 * from folio_rotate_reclaimable().  The message is
		 * rate-limited, and PageError is not flagged as in the
		 * normal direct-to-bio case because the failure could be
		 * temporary.
		 */
		pr_err_ratelimited("Write error %ld on dio swapfile (%llu)\n",
				   ret, page_file_offset(sio->bvec[0].bv_page));
		for (i = 0; i < sio->pages; i++) {
			struct page *page = sio->bvec[i].bv_page;

			set_page_dirty(page);
			ClearPageReclaim(page);
		}
	}

	for (i = 0; i < sio->pages; i++)
		end_page_writeback(sio->bvec[i].bv_page);

	mempool_free(sio, sio_pool);
}

/*
 * Queue @page for writing through the swap file's ->swap_rw() operation.
 *
 * The page is put under writeback and unlocked, then appended to the
 * batch found in *wbc->swap_plug when that batch targets the same file
 * and ends exactly at this page's file offset; an incompatible batch is
 * submitted first and a new one is started.  The batch is submitted
 * straight away when it is full or when the caller supplied no plug;
 * otherwise it is left in *wbc->swap_plug for the next call.
 */
static void swap_writepage_fs(struct page *page, struct writeback_control *wbc)
{
	struct swap_info_struct *sis = page_swap_info(page);
	struct file *swap_file = sis->swap_file;
	loff_t pos = page_file_offset(page);
	struct swap_iocb *sio = NULL;

	set_page_writeback(page);
	unlock_page(page);

	if (wbc->swap_plug)
		sio = *wbc->swap_plug;

	/* Flush a pending batch that this page cannot extend. */
	if (sio && (sio->iocb.ki_filp != swap_file ||
		    sio->iocb.ki_pos + sio->len != pos)) {
		swap_write_unplug(sio);
		sio = NULL;
	}

	/* Start a new batch at this page's offset. */
	if (!sio) {
		sio = mempool_alloc(sio_pool, GFP_NOIO);
		init_sync_kiocb(&sio->iocb, swap_file);
		sio->iocb.ki_pos = pos;
		sio->iocb.ki_complete = sio_write_complete;
		sio->len = 0;
		sio->pages = 0;
	}

	bvec_set_page(&sio->bvec[sio->pages], page, thp_size(page), 0);
	sio->pages++;
	sio->len += thp_size(page);

	if (!wbc->swap_plug || sio->pages == ARRAY_SIZE(sio->bvec)) {
		swap_write_unplug(sio);
		sio = NULL;
	}

	/* Hand the (possibly NULL) batch back to the caller's plug. */
	if (wbc->swap_plug)
		*wbc->swap_plug = sio;
}

static void swap_writepage_bdev_sync(struct page *page,
		struct writeback_control *wbc, struct swap_info_struct *sis)
{
	struct bio_vec bv;
	struct bio bio;
	struct folio *folio = page_folio(page);

	bio_init(&bio, sis->bdev, &bv, 1,
		 REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc));
	bio.bi_iter.bi_sector = swap_page_sector(page);
	__bio_add_page(&bio, page, thp_size(page), 0);

	bio_associate_blkg_from_page(&bio, folio);
	count_swpout_vm_event(folio);

	folio_start_writeback(folio);
	folio_unlock(folio);

	submit_bio_wait(&bio);
	__end_swap_bio_write(&bio);
}

/*
 * Write @page to the swap block device without waiting for completion.
 *
 * A single-vector bio is allocated with GFP_NOIO and submitted with
 * end_swap_bio_write() as its completion callback.
 */
static void swap_writepage_bdev_async(struct page *page,
		struct writeback_control *wbc, struct swap_info_struct *sis)
{
	struct folio *folio = page_folio(page);
	struct bio *wbio;

	wbio = bio_alloc(sis->bdev, 1,
			 REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc),
			 GFP_NOIO);
	wbio->bi_iter.bi_sector = swap_page_sector(page);
	wbio->bi_end_io = end_swap_bio_write;
	__bio_add_page(wbio, page, thp_size(page), 0);

	bio_associate_blkg_from_page(wbio, folio);
	count_swpout_vm_event(folio);

	/* Writeback is started before the folio lock is dropped. */
	folio_start_writeback(folio);
	folio_unlock(folio);

	submit_bio(wbio);
}

/*
 * Dispatch the write of swap-cache @page to the backend selected by the
 * swap device's flags: ->swap_rw() for SWP_FS_OPS, a synchronous bio for
 * SWP_SYNCHRONOUS_IO, and an asynchronous bio otherwise.
 */
void __swap_writepage(struct page *page, struct writeback_control *wbc)
{
	struct swap_info_struct *sis = page_swap_info(page);

	VM_BUG_ON_PAGE(!PageSwapCache(page), page);

	/*
	 * ->flags can be updated non-atomically (scan_swap_map_slots),
	 * but that will never affect SWP_FS_OPS, so the data_race
	 * is safe.
	 */
	if (data_race(sis->flags & SWP_FS_OPS)) {
		swap_writepage_fs(page, wbc);
		return;
	}

	if (sis->flags & SWP_SYNCHRONOUS_IO) {
		swap_writepage_bdev_sync(page, wbc, sis);
		return;
	}

	swap_writepage_bdev_async(page, wbc, sis);
}

void swap_write_unplug(struct swap_iocb *sio)
{
	struct iov_iter from;
	struct address_space *mapping = sio->iocb.ki_filp->f_mapping;
	int ret;

	iov_iter_bvec(&from, ITER_SOURCE, sio->bvec, sio->pages, sio->len);
	ret = mapping->a_ops->swap_rw(&sio->iocb, &from);
	if (ret != -EIOCBQUEUED)
		sio_write_complete(&sio->iocb, ret);
}

/*
 * Completion handler for a batched swap read issued through ->swap_rw().
 *
 * @iocb: the kiocb embedded in the struct swap_iocb that was submitted
 * @ret:  result of the read; only a value equal to the full batch length
 *        counts as success
 *
 * Every folio of the batch is unlocked; on success each one is marked
 * uptodate first and PSWPIN is bumped by the number of batch entries.
 * The swap_iocb is returned to the mempool.
 */
static void sio_read_complete(struct kiocb *iocb, long ret)
{
	struct swap_iocb *sio = container_of(iocb, struct swap_iocb, iocb);
	const int success = (ret == sio->len);
	int i;

	for (i = 0; i < sio->pages; i++) {
		struct folio *folio = page_folio(sio->bvec[i].bv_page);

		if (success)
			folio_mark_uptodate(folio);
		folio_unlock(folio);
	}

	if (success)
		count_vm_events(PSWPIN, sio->pages);
	else
		pr_alert_ratelimited("Read-error on swap-device\n");

	mempool_free(sio, sio_pool);
}

/*
 * Queue @page for reading through the swap file's ->swap_rw() operation.
 *
 * The page is appended to the batch found in *@plug when that batch
 * targets the same file and ends exactly at this page's file offset; an
 * incompatible batch is submitted first and a new one is started.  The
 * batch is submitted straight away when it is full or when @plug is
 * NULL; otherwise it is left in *@plug for the next call.
 */
static void swap_readpage_fs(struct page *page,
			     struct swap_iocb **plug)
{
	struct swap_info_struct *sis = page_swap_info(page);
	loff_t pos = page_file_offset(page);
	struct swap_iocb *sio = NULL;

	if (plug)
		sio = *plug;

	/* Flush a pending batch that this page cannot extend. */
	if (sio && (sio->iocb.ki_filp != sis->swap_file ||
		    sio->iocb.ki_pos + sio->len != pos)) {
		swap_read_unplug(sio);
		sio = NULL;
	}

	/* Start a new batch at this page's offset. */
	if (!sio) {
		sio = mempool_alloc(sio_pool, GFP_KERNEL);
		init_sync_kiocb(&sio->iocb, sis->swap_file);
		sio->iocb.ki_complete = sio_read_complete;
		sio->iocb.ki_pos = pos;
		sio->len = 0;
		sio->pages = 0;
	}

	bvec_set_page(&sio->bvec[sio->pages], page, thp_size(page), 0);
	sio->pages++;
	sio->len += thp_size(page);

	if (!plug || sio->pages == ARRAY_SIZE(sio->bvec)) {
		swap_read_unplug(sio);
		sio = NULL;
	}

	/* Hand the (possibly NULL) batch back to the caller's plug. */
	if (plug)
		*plug = sio;
}

static void swap_readpage_bdev_sync(struct page *page,
		struct swap_info_struct *sis)
{
	struct bio_vec bv;
	struct bio bio;

	bio_init(&bio, sis->bdev, &bv, 1, REQ_OP_READ);
	bio.bi_iter.bi_sector = swap_page_sector(page);
	__bio_add_page(&bio, page, thp_size(page), 0);
	/*
	 * Keep this task valid during swap readpage because the oom killer may
	 * attempt to access it in the page fault retry time check.
	 */
	get_task_struct(current);
	count_vm_event(PSWPIN);
	submit_bio_wait(&bio);
	__end_swap_bio_read(&bio);
	put_task_struct(current);
}

/*
 * Read @page from the swap block device without waiting for completion.
 *
 * A single-vector bio is allocated with GFP_KERNEL and submitted with
 * end_swap_bio_read() as its completion callback.
 */
static void swap_readpage_bdev_async(struct page *page,
		struct swap_info_struct *sis)
{
	struct bio *rbio = bio_alloc(sis->bdev, 1, REQ_OP_READ, GFP_KERNEL);

	rbio->bi_iter.bi_sector = swap_page_sector(page);
	rbio->bi_end_io = end_swap_bio_read;
	__bio_add_page(rbio, page, thp_size(page), 0);

	count_vm_event(PSWPIN);
	submit_bio(rbio);
}

/*
 * Start reading the swapped-out contents of @page back in.
 *
 * @page:        page to fill; its folio must be locked and must not be
 *               uptodate yet (both are asserted below)
 * @synchronous: when the read goes to a block device, use the waiting
 *               path regardless of SWP_SYNCHRONOUS_IO; a folio that is
 *               not in the swap cache is only accepted when this is set
 * @plug:        batch pointer handed to swap_readpage_fs() on the
 *               SWP_FS_OPS path; may be NULL
 *
 * zswap_load() is tried first; if it returns true the folio is marked
 * uptodate and unlocked here.  Otherwise the read is dispatched to
 * ->swap_rw(), a synchronous bio or an asynchronous bio depending on the
 * swap device's flags and @synchronous.
 */
void swap_readpage(struct page *page, bool synchronous, struct swap_iocb **plug)
{
	struct folio *folio = page_folio(page);
	struct swap_info_struct *sis = page_swap_info(page);
	/* Sampled once up front so the start/end accounting calls pair up. */
	bool workingset = folio_test_workingset(folio);
	unsigned long pflags;
	bool in_thrashing;

	VM_BUG_ON_FOLIO(!folio_test_swapcache(folio) && !synchronous, folio);
	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
	VM_BUG_ON_FOLIO(folio_test_uptodate(folio), folio);

	/*
	 * Count submission time as memory stall and delay. When the device
	 * is congested, or the submitting cgroup IO-throttled, submission
	 * can be a significant part of overall IO time.
	 */
	if (workingset) {
		delayacct_thrashing_start(&in_thrashing);
		psi_memstall_enter(&pflags);
	}
	delayacct_swapin_start();

	if (zswap_load(folio)) {
		folio_mark_uptodate(folio);
		folio_unlock(folio);
	} else if (data_race(sis->flags & SWP_FS_OPS)) {
		swap_readpage_fs(page, plug);
	} else if (synchronous || (sis->flags & SWP_SYNCHRONOUS_IO)) {
		swap_readpage_bdev_sync(page, sis);
	} else {
		swap_readpage_bdev_async(page, sis);
	}

	/* Close the accounting sections opened above. */
	if (workingset) {
		delayacct_thrashing_end(&in_thrashing);
		psi_memstall_leave(&pflags);
	}
	delayacct_swapin_end();
}

void __swap_read_unplug(struct swap_iocb *sio)
{
	struct iov_iter from;
	struct address_space *mapping = sio->iocb.ki_filp->f_mapping;
	int ret;

	iov_iter_bvec(&from, ITER_DEST, sio->bvec, sio->pages, sio->len);
	ret = mapping->a_ops->swap_rw(&sio->iocb, &from);
	if (ret != -EIOCBQUEUED)
		sio_read_complete(&sio->iocb, ret);
}
Commit	Line	Data
b2441318	1	// SPDX-License-Identifier: GPL-2.0
1da177e4 LT	2	/*
	3	* linux/mm/page_io.c
	4	*
	5	* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
	6	*
	7	* Swap reorganised 29.12.95,
	8	* Asynchronous swapping added 30.12.95. Stephen Tweedie
	9	* Removed race in async swapping. 14.4.1996. Bruno Haible
	10	* Add swap of shared pages through the page cache. 20.2.1998. Stephen Tweedie
	11	* Always use brw_page, life becomes simpler. 12 May 1998 Eric Biederman
	12	*/
	13
	14	#include <linux/mm.h>
	15	#include <linux/kernel_stat.h>
5a0e3ad6	16	#include <linux/gfp.h>
1da177e4 LT	17	#include <linux/pagemap.h>
	18	#include <linux/swap.h>
	19	#include <linux/bio.h>
	20	#include <linux/swapops.h>
	21	#include <linux/writeback.h>
b430e9d1	22	#include <linux/blkdev.h>
93779069	23	#include <linux/psi.h>
e2e40f2c	24	#include <linux/uio.h>
b0ba2d0f	25	#include <linux/sched/task.h>
a3d5dc90	26	#include <linux/delayacct.h>
42c06a0e	27	#include <linux/zswap.h>
014bb1de	28	#include "swap.h"
1da177e4	29
3222d8c2	30	static void __end_swap_bio_write(struct bio *bio)
1da177e4	31	{
a3ed1e9b	32	struct folio *folio = bio_first_folio_all(bio);
1da177e4	33
4e4cbee9	34	if (bio->bi_status) {
6ddab3b9 PZ	35	/*
	36	* We failed to write the page out to swap-space.
	37	* Re-dirty the page in order to avoid it being reclaimed.
	38	* Also print a dire warning that things will go BAD (tm)
	39	* very quickly.
	40	*
575ced1c	41	* Also clear PG_reclaim to avoid folio_rotate_reclaimable()
6ddab3b9	42	*/
a3ed1e9b	43	folio_mark_dirty(folio);
25eaab43 GD	44	pr_alert_ratelimited("Write-error on swap-device (%u:%u:%llu)\n",
	45	MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)),
	46	(unsigned long long)bio->bi_iter.bi_sector);
a3ed1e9b	47	folio_clear_reclaim(folio);
6ddab3b9	48	}
a3ed1e9b	49	folio_end_writeback(folio);
3222d8c2 CH	50	}
	51
	52	static void end_swap_bio_write(struct bio *bio)
	53	{
	54	__end_swap_bio_write(bio);
1da177e4	55	bio_put(bio);
1da177e4 LT	56	}
1da177e4 LT	57
9b4e30bd	58	static void __end_swap_bio_read(struct bio *bio)
1da177e4	59	{
bc74b53f	60	struct folio *folio = bio_first_folio_all(bio);
1da177e4	61
4e4cbee9	62	if (bio->bi_status) {
25eaab43 GD	63	pr_alert_ratelimited("Read-error on swap-device (%u:%u:%llu)\n",
	64	MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)),
	65	(unsigned long long)bio->bi_iter.bi_sector);
9b4e30bd	66	} else {
bc74b53f	67	folio_mark_uptodate(folio);
1da177e4	68	}
bc74b53f	69	folio_unlock(folio);
9b4e30bd CH	70	}
	71
	72	static void end_swap_bio_read(struct bio *bio)
	73	{
	74	__end_swap_bio_read(bio);
1da177e4	75	bio_put(bio);
1da177e4 LT	76	}
1da177e4 LT	77
a509bc1a MG	78	int generic_swapfile_activate(struct swap_info_struct *sis,
	79	struct file *swap_file,
	80	sector_t *span)
	81	{
	82	struct address_space *mapping = swap_file->f_mapping;
	83	struct inode *inode = mapping->host;
	84	unsigned blocks_per_page;
	85	unsigned long page_no;
	86	unsigned blkbits;
	87	sector_t probe_block;
	88	sector_t last_block;
	89	sector_t lowest_block = -1;
	90	sector_t highest_block = 0;
	91	int nr_extents = 0;
	92	int ret;
	93
	94	blkbits = inode->i_blkbits;
	95	blocks_per_page = PAGE_SIZE >> blkbits;
	96
	97	/*
4efaceb1	98	* Map all the blocks into the extent tree. This code doesn't try
a509bc1a MG	99	* to be very smart.
	100	*/
	101	probe_block = 0;
	102	page_no = 0;
	103	last_block = i_size_read(inode) >> blkbits;
	104	while ((probe_block + blocks_per_page) <= last_block &&
	105	page_no < sis->max) {
	106	unsigned block_in_page;
	107	sector_t first_block;
	108
7e4411bf MP	109	cond_resched();
7e4411bf MP	110
30460e1e CM	111	first_block = probe_block;
	112	ret = bmap(inode, &first_block);
	113	if (ret \|\| !first_block)
a509bc1a MG	114	goto bad_bmap;
	115
	116	/*
	117	* It must be PAGE_SIZE aligned on-disk
	118	*/
	119	if (first_block & (blocks_per_page - 1)) {
	120	probe_block++;
	121	goto reprobe;
	122	}
	123
	124	for (block_in_page = 1; block_in_page < blocks_per_page;
	125	block_in_page++) {
	126	sector_t block;
	127
30460e1e CM	128	block = probe_block + block_in_page;
	129	ret = bmap(inode, &block);
	130	if (ret \|\| !block)
a509bc1a	131	goto bad_bmap;
30460e1e	132
a509bc1a MG	133	if (block != first_block + block_in_page) {
	134	/* Discontiguity */
	135	probe_block++;
	136	goto reprobe;
	137	}
	138	}
	139
	140	first_block >>= (PAGE_SHIFT - blkbits);
	141	if (page_no) { /* exclude the header page */
	142	if (first_block < lowest_block)
	143	lowest_block = first_block;
	144	if (first_block > highest_block)
	145	highest_block = first_block;
	146	}
	147
	148	/*
	149	* We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks
	150	*/
	151	ret = add_swap_extent(sis, page_no, 1, first_block);
	152	if (ret < 0)
	153	goto out;
	154	nr_extents += ret;
	155	page_no++;
	156	probe_block += blocks_per_page;
	157	reprobe:
	158	continue;
	159	}
	160	ret = nr_extents;
	161	*span = 1 + highest_block - lowest_block;
	162	if (page_no == 0)
	163	page_no = 1; /* force Empty message */
	164	sis->max = page_no;
	165	sis->pages = page_no - 1;
	166	sis->highest_bit = page_no - 1;
	167	out:
	168	return ret;
	169	bad_bmap:
1170532b	170	pr_err("swapon: swapfile has holes\n");
a509bc1a MG	171	ret = -EINVAL;
	172	goto out;
	173	}
	174
1da177e4 LT	175	/*
	176	* We may have stale swap cache pages in memory: notice
	177	* them here and get rid of the unnecessary final write.
	178	*/
	179	int swap_writepage(struct page page, struct writeback_control wbc)
	180	{
71fa1a53	181	struct folio *folio = page_folio(page);
e3e2762b	182	int ret;
1da177e4	183
71fa1a53 MWO	184	if (folio_free_swap(folio)) {
71fa1a53 MWO	185	folio_unlock(folio);
e3e2762b	186	return 0;
1da177e4	187	}
8a84802e SP	188	/*
	189	* Arch code may have to preserve more data than just the page
	190	* contents, e.g. memory tags.
	191	*/
71fa1a53	192	ret = arch_prepare_to_swap(&folio->page);
8a84802e	193	if (ret) {
71fa1a53 MWO	194	folio_mark_dirty(folio);
71fa1a53 MWO	195	folio_unlock(folio);
e3e2762b	196	return ret;
8a84802e	197	}
34f4c198	198	if (zswap_store(folio)) {
71fa1a53 MWO	199	folio_start_writeback(folio);
	200	folio_unlock(folio);
	201	folio_end_writeback(folio);
e3e2762b	202	return 0;
38b5faf4	203	}
e3e2762b CH	204	__swap_writepage(&folio->page, wbc);
e3e2762b CH	205	return 0;
2f772e6c SJ	206	}
2f772e6c SJ	207
9b72b134	208	static inline void count_swpout_vm_event(struct folio *folio)
225311a4 HY	209	{
225311a4 HY	210	#ifdef CONFIG_TRANSPARENT_HUGEPAGE
9b72b134	211	if (unlikely(folio_test_pmd_mappable(folio)))
225311a4 HY	212	count_vm_event(THP_SWPOUT);
225311a4 HY	213	#endif
9b72b134	214	count_vm_events(PSWPOUT, folio_nr_pages(folio));
225311a4 HY	215	}
225311a4 HY	216
a18b9b15	217	#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
98630cfd	218	static void bio_associate_blkg_from_page(struct bio bio, struct folio folio)
a18b9b15 CH	219	{
a18b9b15 CH	220	struct cgroup_subsys_state *css;
bcfe06bf	221	struct mem_cgroup *memcg;
a18b9b15	222
98630cfd	223	memcg = folio_memcg(folio);
bcfe06bf	224	if (!memcg)
a18b9b15 CH	225	return;
	226
	227	rcu_read_lock();
bcfe06bf	228	css = cgroup_e_css(memcg->css.cgroup, &io_cgrp_subsys);
a18b9b15 CH	229	bio_associate_blkg_from_css(bio, css);
	230	rcu_read_unlock();
	231	}
	232	#else
98630cfd	233	#define bio_associate_blkg_from_page(bio, folio) do { } while (0)
a18b9b15 CH	234	#endif /* CONFIG_MEMCG && CONFIG_BLK_CGROUP */
a18b9b15 CH	235
e1209d3a N	236	struct swap_iocb {
e1209d3a N	237	struct kiocb iocb;
5169b844 N	238	struct bio_vec bvec[SWAP_CLUSTER_MAX];
5169b844 N	239	int pages;
a1a0dfd5	240	int len;
e1209d3a N	241	};
	242	static mempool_t *sio_pool;
	243
	244	int sio_pool_init(void)
2f772e6c	245	{
e1209d3a N	246	if (!sio_pool) {
	247	mempool_t *pool = mempool_create_kmalloc_pool(
	248	SWAP_CLUSTER_MAX, sizeof(struct swap_iocb));
	249	if (cmpxchg(&sio_pool, NULL, pool))
	250	mempool_destroy(pool);
	251	}
	252	if (!sio_pool)
	253	return -ENOMEM;
	254	return 0;
	255	}
62c230bc	256
7eadabc0 N	257	static void sio_write_complete(struct kiocb *iocb, long ret)
	258	{
	259	struct swap_iocb *sio = container_of(iocb, struct swap_iocb, iocb);
5169b844	260	struct page *page = sio->bvec[0].bv_page;
2282679f	261	int p;
62c230bc	262
a1a0dfd5	263	if (ret != sio->len) {
7eadabc0 N	264	/*
	265	* In the case of swap-over-nfs, this can be a
	266	* temporary failure if the system has limited
	267	* memory for allocating transmit buffers.
	268	* Mark the page dirty and avoid
	269	* folio_rotate_reclaimable but rate-limit the
	270	* messages but do not flag PageError like
	271	* the normal direct-to-bio case as it could
	272	* be temporary.
	273	*/
7eadabc0 N	274	pr_err_ratelimited("Write error %ld on dio swapfile (%llu)\n",
7eadabc0 N	275	ret, page_file_offset(page));
2282679f N	276	for (p = 0; p < sio->pages; p++) {
2282679f N	277	page = sio->bvec[p].bv_page;
2d30d31e	278	set_page_dirty(page);
0cdc444a	279	ClearPageReclaim(page);
62c230bc	280	}
6341a446 N	281	} else {
6341a446 N	282	for (p = 0; p < sio->pages; p++)
9b72b134	283	count_swpout_vm_event(page_folio(sio->bvec[p].bv_page));
62c230bc MG	284	}
62c230bc MG	285
2282679f N	286	for (p = 0; p < sio->pages; p++)
	287	end_page_writeback(sio->bvec[p].bv_page);
	288
7eadabc0 N	289	mempool_free(sio, sio_pool);
	290	}
	291
e3e2762b	292	static void swap_writepage_fs(struct page page, struct writeback_control wbc)
7eadabc0	293	{
2282679f	294	struct swap_iocb *sio = NULL;
7eadabc0 N	295	struct swap_info_struct *sis = page_swap_info(page);
7eadabc0 N	296	struct file *swap_file = sis->swap_file;
2282679f	297	loff_t pos = page_file_offset(page);
7eadabc0 N	298
	299	set_page_writeback(page);
	300	unlock_page(page);
2282679f N	301	if (wbc->swap_plug)
	302	sio = *wbc->swap_plug;
	303	if (sio) {
	304	if (sio->iocb.ki_filp != swap_file \|\|
a1a0dfd5	305	sio->iocb.ki_pos + sio->len != pos) {
2282679f N	306	swap_write_unplug(sio);
	307	sio = NULL;
	308	}
	309	}
	310	if (!sio) {
	311	sio = mempool_alloc(sio_pool, GFP_NOIO);
	312	init_sync_kiocb(&sio->iocb, swap_file);
	313	sio->iocb.ki_complete = sio_write_complete;
	314	sio->iocb.ki_pos = pos;
	315	sio->pages = 0;
a1a0dfd5	316	sio->len = 0;
2282679f	317	}
8976fa6d	318	bvec_set_page(&sio->bvec[sio->pages], page, thp_size(page), 0);
a1a0dfd5	319	sio->len += thp_size(page);
2282679f N	320	sio->pages += 1;
	321	if (sio->pages == ARRAY_SIZE(sio->bvec) \|\| !wbc->swap_plug) {
	322	swap_write_unplug(sio);
	323	sio = NULL;
	324	}
	325	if (wbc->swap_plug)
	326	*wbc->swap_plug = sio;
7eadabc0 N	327	}
7eadabc0 N	328
3222d8c2	329	static void swap_writepage_bdev_sync(struct page *page,
05cda97e	330	struct writeback_control wbc, struct swap_info_struct sis)
2f772e6c	331	{
3222d8c2 CH	332	struct bio_vec bv;
3222d8c2 CH	333	struct bio bio;
f54fcaab	334	struct folio *folio = page_folio(page);
62c230bc	335
3222d8c2 CH	336	bio_init(&bio, sis->bdev, &bv, 1,
	337	REQ_OP_WRITE \| REQ_SWAP \| wbc_to_write_flags(wbc));
	338	bio.bi_iter.bi_sector = swap_page_sector(page);
cb58bf91	339	__bio_add_page(&bio, page, thp_size(page), 0);
62c230bc	340
98630cfd	341	bio_associate_blkg_from_page(&bio, folio);
9b72b134	342	count_swpout_vm_event(folio);
3222d8c2	343
f54fcaab Z	344	folio_start_writeback(folio);
f54fcaab Z	345	folio_unlock(folio);
3222d8c2 CH	346
	347	submit_bio_wait(&bio);
	348	__end_swap_bio_write(&bio);
	349	}
	350
	351	static void swap_writepage_bdev_async(struct page *page,
	352	struct writeback_control wbc, struct swap_info_struct sis)
	353	{
	354	struct bio *bio;
2675251d	355	struct folio *folio = page_folio(page);
dd6bd0d9	356
07888c66 CH	357	bio = bio_alloc(sis->bdev, 1,
	358	REQ_OP_WRITE \| REQ_SWAP \| wbc_to_write_flags(wbc),
	359	GFP_NOIO);
48d15436	360	bio->bi_iter.bi_sector = swap_page_sector(page);
cf1e3fe4	361	bio->bi_end_io = end_swap_bio_write;
cb58bf91	362	__bio_add_page(bio, page, thp_size(page), 0);
48d15436	363
98630cfd	364	bio_associate_blkg_from_page(bio, folio);
9b72b134	365	count_swpout_vm_event(folio);
2675251d Z	366	folio_start_writeback(folio);
2675251d Z	367	folio_unlock(folio);
4e49ea4a	368	submit_bio(bio);
1da177e4	369	}
548d9782	370
05cda97e CH	371	void __swap_writepage(struct page page, struct writeback_control wbc)
	372	{
	373	struct swap_info_struct *sis = page_swap_info(page);
	374
	375	VM_BUG_ON_PAGE(!PageSwapCache(page), page);
	376	/*
	377	* ->flags can be updated non-atomicially (scan_swap_map_slots),
	378	* but that will never affect SWP_FS_OPS, so the data_race
	379	* is safe.
	380	*/
	381	if (data_race(sis->flags & SWP_FS_OPS))
	382	swap_writepage_fs(page, wbc);
3222d8c2 CH	383	else if (sis->flags & SWP_SYNCHRONOUS_IO)
3222d8c2 CH	384	swap_writepage_bdev_sync(page, wbc, sis);
05cda97e	385	else
3222d8c2	386	swap_writepage_bdev_async(page, wbc, sis);
1da177e4 LT	387	}
1da177e4 LT	388
2282679f N	389	void swap_write_unplug(struct swap_iocb *sio)
	390	{
	391	struct iov_iter from;
	392	struct address_space *mapping = sio->iocb.ki_filp->f_mapping;
	393	int ret;
	394
de4eda9d	395	iov_iter_bvec(&from, ITER_SOURCE, sio->bvec, sio->pages, sio->len);
2282679f N	396	ret = mapping->a_ops->swap_rw(&sio->iocb, &from);
	397	if (ret != -EIOCBQUEUED)
	398	sio_write_complete(&sio->iocb, ret);
	399	}
	400
e1209d3a N	401	static void sio_read_complete(struct kiocb *iocb, long ret)
	402	{
	403	struct swap_iocb *sio = container_of(iocb, struct swap_iocb, iocb);
5169b844	404	int p;
e1209d3a	405
a1a0dfd5	406	if (ret == sio->len) {
5169b844	407	for (p = 0; p < sio->pages; p++) {
6a8c0687	408	struct folio *folio = page_folio(sio->bvec[p].bv_page);
5169b844	409
6a8c0687 Z	410	folio_mark_uptodate(folio);
6a8c0687 Z	411	folio_unlock(folio);
5169b844 N	412	}
5169b844 N	413	count_vm_events(PSWPIN, sio->pages);
e1209d3a	414	} else {
5169b844	415	for (p = 0; p < sio->pages; p++) {
6a8c0687	416	struct folio *folio = page_folio(sio->bvec[p].bv_page);
5169b844	417
6a8c0687	418	folio_unlock(folio);
5169b844 N	419	}
5169b844 N	420	pr_alert_ratelimited("Read-error on swap-device\n");
e1209d3a	421	}
e1209d3a N	422	mempool_free(sio, sio_pool);
	423	}
	424
5169b844 N	425	static void swap_readpage_fs(struct page *page,
5169b844 N	426	struct swap_iocb **plug)
e1209d3a N	427	{
e1209d3a N	428	struct swap_info_struct *sis = page_swap_info(page);
5169b844	429	struct swap_iocb *sio = NULL;
e1209d3a	430	loff_t pos = page_file_offset(page);
e1209d3a	431
5169b844 N	432	if (plug)
	433	sio = *plug;
	434	if (sio) {
	435	if (sio->iocb.ki_filp != sis->swap_file \|\|
a1a0dfd5	436	sio->iocb.ki_pos + sio->len != pos) {
5169b844 N	437	swap_read_unplug(sio);
	438	sio = NULL;
	439	}
	440	}
	441	if (!sio) {
	442	sio = mempool_alloc(sio_pool, GFP_KERNEL);
	443	init_sync_kiocb(&sio->iocb, sis->swap_file);
	444	sio->iocb.ki_pos = pos;
	445	sio->iocb.ki_complete = sio_read_complete;
	446	sio->pages = 0;
a1a0dfd5	447	sio->len = 0;
5169b844	448	}
8976fa6d	449	bvec_set_page(&sio->bvec[sio->pages], page, thp_size(page), 0);
a1a0dfd5	450	sio->len += thp_size(page);
5169b844 N	451	sio->pages += 1;
	452	if (sio->pages == ARRAY_SIZE(sio->bvec) \|\| !plug) {
	453	swap_read_unplug(sio);
	454	sio = NULL;
	455	}
	456	if (plug)
	457	*plug = sio;
e1209d3a N	458	}
e1209d3a N	459
9b4e30bd	460	static void swap_readpage_bdev_sync(struct page *page,
14bd75f5	461	struct swap_info_struct *sis)
1da177e4	462	{
9b4e30bd CH	463	struct bio_vec bv;
9b4e30bd CH	464	struct bio bio;
62c230bc	465
9b4e30bd CH	466	bio_init(&bio, sis->bdev, &bv, 1, REQ_OP_READ);
9b4e30bd CH	467	bio.bi_iter.bi_sector = swap_page_sector(page);
cb58bf91	468	__bio_add_page(&bio, page, thp_size(page), 0);
b0ba2d0f TH	469	/*
	470	* Keep this task valid during swap readpage because the oom killer may
	471	* attempt to access it in the page fault retry time check.
	472	*/
9b4e30bd	473	get_task_struct(current);
f8891e5e	474	count_vm_event(PSWPIN);
9b4e30bd CH	475	submit_bio_wait(&bio);
	476	__end_swap_bio_read(&bio);
	477	put_task_struct(current);
	478	}
	479
	480	static void swap_readpage_bdev_async(struct page *page,
	481	struct swap_info_struct *sis)
1da177e4 LT	482	{
1da177e4 LT	483	struct bio *bio;
23955622	484
9b4e30bd CH	485	bio = bio_alloc(sis->bdev, 1, REQ_OP_READ, GFP_KERNEL);
	486	bio->bi_iter.bi_sector = swap_page_sector(page);
	487	bio->bi_end_io = end_swap_bio_read;
cb58bf91	488	__bio_add_page(bio, page, thp_size(page), 0);
9b4e30bd CH	489	count_vm_event(PSWPIN);
9b4e30bd CH	490	submit_bio(bio);
14bd75f5 CH	491	}
	492
	493	void swap_readpage(struct page page, bool synchronous, struct swap_iocb *plug)
	494	{
fbcec6a3	495	struct folio *folio = page_folio(page);
62c230bc	496	struct swap_info_struct *sis = page_swap_info(page);
fbcec6a3	497	bool workingset = folio_test_workingset(folio);
93779069	498	unsigned long pflags;
3a9bb7b1	499	bool in_thrashing;
1da177e4	500
fbcec6a3 MWO	501	VM_BUG_ON_FOLIO(!folio_test_swapcache(folio) && !synchronous, folio);
	502	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
	503	VM_BUG_ON_FOLIO(folio_test_uptodate(folio), folio);
93779069 MK	504
93779069 MK	505	/*
3a9bb7b1 YY	506	* Count submission time as memory stall and delay. When the device
	507	* is congested, or the submitting cgroup IO-throttled, submission
	508	* can be a significant part of overall IO time.
93779069	509	*/
3a9bb7b1 YY	510	if (workingset) {
3a9bb7b1 YY	511	delayacct_thrashing_start(&in_thrashing);
d8c47cc7	512	psi_memstall_enter(&pflags);
3a9bb7b1	513	}
a3d5dc90	514	delayacct_swapin_start();
93779069	515
ca54f6d8	516	if (zswap_load(folio)) {
fbcec6a3 MWO	517	folio_mark_uptodate(folio);
fbcec6a3 MWO	518	folio_unlock(folio);
14bd75f5	519	} else if (data_race(sis->flags & SWP_FS_OPS)) {
5169b844	520	swap_readpage_fs(page, plug);
3222d8c2	521	} else if (synchronous \|\| (sis->flags & SWP_SYNCHRONOUS_IO)) {
9b4e30bd	522	swap_readpage_bdev_sync(page, sis);
14bd75f5	523	} else {
9b4e30bd	524	swap_readpage_bdev_async(page, sis);
23955622	525	}
23955622	526
3a9bb7b1 YY	527	if (workingset) {
3a9bb7b1 YY	528	delayacct_thrashing_end(&in_thrashing);
d8c47cc7	529	psi_memstall_leave(&pflags);
3a9bb7b1	530	}
a3d5dc90	531	delayacct_swapin_end();
1da177e4	532	}
62c230bc	533
5169b844	534	void __swap_read_unplug(struct swap_iocb *sio)
62c230bc	535	{
5169b844 N	536	struct iov_iter from;
	537	struct address_space *mapping = sio->iocb.ki_filp->f_mapping;
	538	int ret;
cc30c5d6	539
de4eda9d	540	iov_iter_bvec(&from, ITER_DEST, sio->bvec, sio->pages, sio->len);
5169b844 N	541	ret = mapping->a_ops->swap_rw(&sio->iocb, &from);
	542	if (ret != -EIOCBQUEUED)
	543	sio_read_complete(&sio->iocb, ret);
62c230bc	544	}