// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * Copyright (c) 2016-2018 Christoph Hellwig.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_inode_item.h"
#include "xfs_alloc.h"
#include "xfs_error.h"
#include "xfs_iomap.h"
#include "xfs_trace.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
#include "xfs_bmap_btree.h"
#include "xfs_reflink.h"
#include <linux/writeback.h>

/*
 * structure owned by writepages passed to individual writepage calls
 */
struct xfs_writepage_ctx {
	struct xfs_bmbt_irec	imap;
	unsigned int		io_type;
	unsigned int		data_seq;
	unsigned int		cow_seq;
	struct xfs_ioend	*ioend;
};

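/*
 * Return the block device backing this inode: the realtime device for
 * realtime inodes, otherwise the data device.
 */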
struct block_device *
xfs_find_bdev_for_inode(
	struct inode		*inode)
{
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;

	if (XFS_IS_REALTIME_INODE(ip))
		return mp->m_rtdev_targp->bt_bdev;
	else
		return mp->m_ddev_targp->bt_bdev;
}

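/*
 * Likewise, return the DAX device backing this inode: the realtime device
 * for realtime inodes, otherwise the data device.
 */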
struct dax_device *
xfs_find_daxdev_for_inode(
	struct inode		*inode)
{
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;

	if (XFS_IS_REALTIME_INODE(ip))
		return mp->m_rtdev_targp->bt_daxdev;
	else
		return mp->m_ddev_targp->bt_daxdev;
}

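/*
 * Per-page writeback completion: record any I/O error against the page and
 * the mapping, and end writeback on the page once all outstanding block
 * writes against it have finished.
 */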
static void
xfs_finish_page_writeback(
	struct inode		*inode,
	struct bio_vec		*bvec,
	int			error)
{
	struct iomap_page	*iop = to_iomap_page(bvec->bv_page);

	if (error) {
		SetPageError(bvec->bv_page);
		mapping_set_error(inode->i_mapping, -EIO);
	}

	ASSERT(iop || i_blocksize(inode) == PAGE_SIZE);
	ASSERT(!iop || atomic_read(&iop->write_count) > 0);

	if (!iop || atomic_dec_and_test(&iop->write_count))
		end_page_writeback(bvec->bv_page);
}

/*
 * We're now finished for good with this ioend structure. Update the page
 * state, release holds on bios, and finally free up memory. Do not use the
 * ioend after this.
 */
STATIC void
xfs_destroy_ioend(
	struct xfs_ioend	*ioend,
	int			error)
{
	struct inode		*inode = ioend->io_inode;
	struct bio		*bio = &ioend->io_inline_bio;
	struct bio		*last = ioend->io_bio, *next;
	u64			start = bio->bi_iter.bi_sector;
	bool			quiet = bio_flagged(bio, BIO_QUIET);

	for (bio = &ioend->io_inline_bio; bio; bio = next) {
		struct bio_vec	*bvec;
		int		i;

		/*
		 * For the last bio, bi_private points to the ioend, so we
		 * need to explicitly end the iteration here.
		 */
		if (bio == last)
			next = NULL;
		else
			next = bio->bi_private;

		/* walk each page on bio, ending page IO on them */
		bio_for_each_segment_all(bvec, bio, i)
			xfs_finish_page_writeback(inode, bvec, error);
		bio_put(bio);
	}

	if (unlikely(error && !quiet)) {
		xfs_err_ratelimited(XFS_I(inode)->i_mount,
			"writeback error on sector %llu", start);
	}
}

/*
 * Fast and loose check if this write could update the on-disk inode size.
 */
static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend)
{
	return ioend->io_offset + ioend->io_size >
		XFS_I(ioend->io_inode)->i_d.di_size;
}

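/*
 * Allocate the transaction that will later be used to update the on-disk
 * inode size at I/O completion, and attach it to the ioend.
 */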
STATIC int
xfs_setfilesize_trans_alloc(
	struct xfs_ioend	*ioend)
{
	struct xfs_mount	*mp = XFS_I(ioend->io_inode)->i_mount;
	struct xfs_trans	*tp;
	int			error;

	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0,
				XFS_TRANS_NOFS, &tp);
	if (error)
		return error;

	ioend->io_append_trans = tp;

	/*
	 * We may pass freeze protection with a transaction.  So tell lockdep
	 * we released it.
	 */
	__sb_writers_release(ioend->io_inode->i_sb, SB_FREEZE_FS);
	/*
	 * We hand off the transaction to the completion thread now, so
	 * clear the flag here.
	 */
	current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
	return 0;
}

/*
 * Update on-disk file size now that data has been written to disk.
 */
STATIC int
__xfs_setfilesize(
	struct xfs_inode	*ip,
	struct xfs_trans	*tp,
	xfs_off_t		offset,
	size_t			size)
{
	xfs_fsize_t		isize;

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	isize = xfs_new_eof(ip, offset + size);
	if (!isize) {
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
		xfs_trans_cancel(tp);
		return 0;
	}

	trace_xfs_setfilesize(ip, offset, size);

	ip->i_d.di_size = isize;
	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

	return xfs_trans_commit(tp);
}

int
xfs_setfilesize(
	struct xfs_inode	*ip,
	xfs_off_t		offset,
	size_t			size)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_trans	*tp;
	int			error;

	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
	if (error)
		return error;

	return __xfs_setfilesize(ip, tp, offset, size);
}

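/*
 * Update the on-disk file size using the transaction that was allocated at
 * submission time, or cancel that transaction if the I/O failed.
 */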
STATIC int
xfs_setfilesize_ioend(
	struct xfs_ioend	*ioend,
	int			error)
{
	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
	struct xfs_trans	*tp = ioend->io_append_trans;

	/*
	 * The transaction may have been allocated in the I/O submission thread,
	 * thus we need to mark ourselves as being in a transaction manually.
	 * Similarly for freeze protection.
	 */
	current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
	__sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS);

	/* we abort the update if there was an IO error */
	if (error) {
		xfs_trans_cancel(tp);
		return error;
	}

	return __xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size);
}

/*
 * IO write completion.
 */
STATIC void
xfs_end_io(
	struct work_struct *work)
{
	struct xfs_ioend	*ioend =
		container_of(work, struct xfs_ioend, io_work);
	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
	xfs_off_t		offset = ioend->io_offset;
	size_t			size = ioend->io_size;
	int			error;

	/*
	 * Just clean up the in-memory structures if the fs has been shut down.
	 */
	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
		error = -EIO;
		goto done;
	}

	/*
	 * Clean up any COW blocks on an I/O error.
	 */
	error = blk_status_to_errno(ioend->io_bio->bi_status);
	if (unlikely(error)) {
		switch (ioend->io_type) {
		case XFS_IO_COW:
			xfs_reflink_cancel_cow_range(ip, offset, size, true);
			break;
		}

		goto done;
	}

	/*
	 * Success: commit the COW or unwritten blocks if needed.
	 */
	switch (ioend->io_type) {
	case XFS_IO_COW:
		error = xfs_reflink_end_cow(ip, offset, size);
		break;
	case XFS_IO_UNWRITTEN:
		/* writeback should never update isize */
		error = xfs_iomap_write_unwritten(ip, offset, size, false);
		break;
	default:
		ASSERT(!xfs_ioend_is_append(ioend) || ioend->io_append_trans);
		break;
	}

done:
	if (ioend->io_append_trans)
		error = xfs_setfilesize_ioend(ioend, error);
	xfs_destroy_ioend(ioend, error);
}

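/*
 * Completion handler for the last bio of an ioend: punt unwritten extent
 * conversion, COW remapping and on-disk size updates to a workqueue, and
 * finish everything else directly in bio completion context.
 */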
STATIC void
xfs_end_bio(
	struct bio		*bio)
{
	struct xfs_ioend	*ioend = bio->bi_private;
	struct xfs_mount	*mp = XFS_I(ioend->io_inode)->i_mount;

	if (ioend->io_type == XFS_IO_UNWRITTEN || ioend->io_type == XFS_IO_COW)
		queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
	else if (ioend->io_append_trans)
		queue_work(mp->m_data_workqueue, &ioend->io_work);
	else
		xfs_destroy_ioend(ioend, blk_status_to_errno(bio->bi_status));
}

/*
 * Fast revalidation of the cached writeback mapping. Return true if the current
 * mapping is valid, false otherwise.
 */
static bool
xfs_imap_valid(
	struct xfs_writepage_ctx	*wpc,
	struct xfs_inode		*ip,
	xfs_fileoff_t			offset_fsb)
{
	if (offset_fsb < wpc->imap.br_startoff ||
	    offset_fsb >= wpc->imap.br_startoff + wpc->imap.br_blockcount)
		return false;
	/*
	 * If this is a COW mapping, it is sufficient to check that the mapping
	 * covers the offset. Be careful to check this first because the caller
	 * can revalidate a COW mapping without updating the data seqno.
	 */
	if (wpc->io_type == XFS_IO_COW)
		return true;

	/*
	 * This is not a COW mapping. Check the sequence number of the data fork
	 * because concurrent changes could have invalidated the extent. Check
	 * the COW fork because concurrent changes since the last time we
	 * checked (and found nothing at this offset) could have added
	 * overlapping blocks.
	 */
	if (wpc->data_seq != READ_ONCE(ip->i_df.if_seq))
		return false;
	if (xfs_inode_has_cow_data(ip) &&
	    wpc->cow_seq != READ_ONCE(ip->i_cowfp->if_seq))
		return false;
	return true;
}

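/*
 * Look up (or allocate) the extent backing this file offset for writeback,
 * preferring a COW fork mapping over the data fork, and cache the result in
 * the writepage context.
 */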
STATIC int
xfs_map_blocks(
	struct xfs_writepage_ctx *wpc,
	struct inode		*inode,
	loff_t			offset)
{
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	ssize_t			count = i_blocksize(inode);
	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset), end_fsb;
	xfs_fileoff_t		cow_fsb = NULLFILEOFF;
	struct xfs_bmbt_irec	imap;
	int			whichfork = XFS_DATA_FORK;
	struct xfs_iext_cursor	icur;
	int			error = 0;

	if (XFS_FORCED_SHUTDOWN(mp))
		return -EIO;

	/*
	 * COW fork blocks can overlap data fork blocks even if the blocks
	 * aren't shared.  COW I/O always takes precedent, so we must always
	 * check for overlap on reflink inodes unless the mapping is already a
	 * COW one, or the COW fork hasn't changed from the last time we looked
	 * at it.
	 *
	 * It's safe to check the COW fork if_seq here without the ILOCK because
	 * we've indirectly protected against concurrent updates: writeback has
	 * the page locked, which prevents concurrent invalidations by reflink
	 * and directio and prevents concurrent buffered writes to the same
	 * page.  Changes to if_seq always happen under i_lock, which protects
	 * against concurrent updates and provides a memory barrier on the way
	 * out that ensures that we always see the current value.
	 */
	if (xfs_imap_valid(wpc, ip, offset_fsb))
		return 0;

	/*
	 * If we don't have a valid map, now it's time to get a new one for this
	 * offset.  This will convert delayed allocations (including COW ones)
	 * into real extents.  If we return without a valid map, it means we
	 * landed in a hole and we skip the block.
	 */
	xfs_ilock(ip, XFS_ILOCK_SHARED);
	ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
	       (ip->i_df.if_flags & XFS_IFEXTENTS));
	ASSERT(offset <= mp->m_super->s_maxbytes);

	if (offset > mp->m_super->s_maxbytes - count)
		count = mp->m_super->s_maxbytes - offset;
	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);

	/*
	 * Check if this offset is covered by a COW extent; if so, use it
	 * directly instead of looking up anything in the data fork.
	 */
	if (xfs_inode_has_cow_data(ip) &&
	    xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &imap))
		cow_fsb = imap.br_startoff;
	if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) {
		wpc->cow_seq = READ_ONCE(ip->i_cowfp->if_seq);
		xfs_iunlock(ip, XFS_ILOCK_SHARED);
		/*
		 * Truncate can race with writeback since writeback doesn't
		 * take the iolock and truncate decreases the file size before
		 * it starts truncating the pages between new_size and old_size.
		 * Therefore, we can end up in the situation where writeback
		 * gets a CoW fork mapping but the truncate makes the mapping
		 * invalid and we end up in here trying to get a new mapping.
		 * bail out here so that we simply never get a valid mapping
		 * and so we drop the write altogether.  The page truncation
		 * will kill the contents anyway.
		 */
		if (offset > i_size_read(inode)) {
			wpc->io_type = XFS_IO_HOLE;
			return 0;
		}
		whichfork = XFS_COW_FORK;
		wpc->io_type = XFS_IO_COW;
		goto allocate_blocks;
	}

	/*
	 * No COW extent overlap. Revalidate now that we may have updated
	 * ->cow_seq. If the data mapping is still valid, we're done.
	 */
	if (xfs_imap_valid(wpc, ip, offset_fsb)) {
		xfs_iunlock(ip, XFS_ILOCK_SHARED);
		return 0;
	}

	/*
	 * If we don't have a valid map, now it's time to get a new one for this
	 * offset.  This will convert delayed allocations (including COW ones)
	 * into real extents.
	 */
	if (!xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap))
		imap.br_startoff = end_fsb;	/* fake a hole past EOF */
	wpc->data_seq = READ_ONCE(ip->i_df.if_seq);
	xfs_iunlock(ip, XFS_ILOCK_SHARED);

	if (imap.br_startoff > offset_fsb) {
		/* landed in a hole or beyond EOF */
		imap.br_blockcount = imap.br_startoff - offset_fsb;
		imap.br_startoff = offset_fsb;
		imap.br_startblock = HOLESTARTBLOCK;
		wpc->io_type = XFS_IO_HOLE;
	} else {
		/*
		 * Truncate to the next COW extent if there is one.  This is the
		 * only opportunity to do this because we can skip COW fork
		 * lookups for the subsequent blocks in the mapping; however,
		 * the requirement to treat the COW range separately remains.
		 */
		if (cow_fsb != NULLFILEOFF &&
		    cow_fsb < imap.br_startoff + imap.br_blockcount)
			imap.br_blockcount = cow_fsb - imap.br_startoff;

		if (isnullstartblock(imap.br_startblock)) {
			/* got a delalloc extent */
			wpc->io_type = XFS_IO_DELALLOC;
			goto allocate_blocks;
		}

		if (imap.br_state == XFS_EXT_UNWRITTEN)
			wpc->io_type = XFS_IO_UNWRITTEN;
		else
			wpc->io_type = XFS_IO_OVERWRITE;
	}

	wpc->imap = imap;
	trace_xfs_map_blocks_found(ip, offset, count, wpc->io_type, &imap);
	return 0;
allocate_blocks:
	error = xfs_iomap_write_allocate(ip, whichfork, offset, &imap,
			whichfork == XFS_COW_FORK ?
					&wpc->cow_seq : &wpc->data_seq);
	if (error)
		return error;
	ASSERT(whichfork == XFS_COW_FORK || cow_fsb == NULLFILEOFF ||
	       imap.br_startoff + imap.br_blockcount <= cow_fsb);
	wpc->imap = imap;
	trace_xfs_map_blocks_alloc(ip, offset, count, wpc->io_type, &imap);
	return 0;
}

/*
 * Submit the bio for an ioend. We are passed an ioend with a bio attached to
 * it, and we submit that bio. The ioend may be used for multiple bio
 * submissions, so we only want to allocate an append transaction for the ioend
 * once. In the case of multiple bio submission, each bio will take an IO
 * reference to the ioend to ensure that the ioend completion is only done once
 * all bios have been submitted and the ioend is really done.
 *
 * If @status is non-zero, it means that we have a situation where some part of
 * the submission process has failed after we have marked pages for writeback
 * and unlocked them. In this situation, we need to fail the bio and ioend
 * rather than submit it to IO. This typically only happens on a filesystem
 * shutdown.
 */
STATIC int
xfs_submit_ioend(
	struct writeback_control *wbc,
	struct xfs_ioend	*ioend,
	int			status)
{
	/* Convert CoW extents to regular */
	if (!status && ioend->io_type == XFS_IO_COW) {
		/*
		 * Yuk. This can do memory allocation, but is not a
		 * transactional operation so everything is done in GFP_KERNEL
		 * context. That can deadlock, because we hold pages in
		 * writeback state and GFP_KERNEL allocations can block on them.
		 * Hence we must operate in nofs conditions here.
		 */
		unsigned nofs_flag;

		nofs_flag = memalloc_nofs_save();
		status = xfs_reflink_convert_cow(XFS_I(ioend->io_inode),
				ioend->io_offset, ioend->io_size);
		memalloc_nofs_restore(nofs_flag);
	}

	/* Reserve log space if we might write beyond the on-disk inode size. */
	if (!status &&
	    ioend->io_type != XFS_IO_UNWRITTEN &&
	    xfs_ioend_is_append(ioend) &&
	    !ioend->io_append_trans)
		status = xfs_setfilesize_trans_alloc(ioend);

	ioend->io_bio->bi_private = ioend;
	ioend->io_bio->bi_end_io = xfs_end_bio;
	ioend->io_bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);

	/*
	 * If we are failing the IO now, just mark the ioend with an
	 * error and finish it. This will run IO completion immediately
	 * as there is only one reference to the ioend at this point in
	 * time.
	 */
	if (status) {
		ioend->io_bio->bi_status = errno_to_blk_status(status);
		bio_endio(ioend->io_bio);
		return status;
	}

	ioend->io_bio->bi_write_hint = ioend->io_inode->i_write_hint;
	submit_bio(ioend->io_bio);
	return 0;
}

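/*
 * Allocate a new ioend, with its embedded initial bio, for a write starting
 * at @offset against the given block device and sector.
 */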
static struct xfs_ioend *
xfs_alloc_ioend(
	struct inode		*inode,
	unsigned int		type,
	xfs_off_t		offset,
	struct block_device	*bdev,
	sector_t		sector)
{
	struct xfs_ioend	*ioend;
	struct bio		*bio;

	bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &xfs_ioend_bioset);
	bio_set_dev(bio, bdev);
	bio->bi_iter.bi_sector = sector;

	ioend = container_of(bio, struct xfs_ioend, io_inline_bio);
	INIT_LIST_HEAD(&ioend->io_list);
	ioend->io_type = type;
	ioend->io_inode = inode;
	ioend->io_size = 0;
	ioend->io_offset = offset;
	INIT_WORK(&ioend->io_work, xfs_end_io);
	ioend->io_append_trans = NULL;
	ioend->io_bio = bio;
	return ioend;
}

/*
 * Allocate a new bio, and chain the old bio to the new one.
 *
 * Note that we have to perform the chaining in this unintuitive order
 * so that the bi_private linkage is set up in the right direction for the
 * traversal in xfs_destroy_ioend().
 */
static void
xfs_chain_bio(
	struct xfs_ioend	*ioend,
	struct writeback_control *wbc,
	struct block_device	*bdev,
	sector_t		sector)
{
	struct bio *new;

	new = bio_alloc(GFP_NOFS, BIO_MAX_PAGES);
	bio_set_dev(new, bdev);
	new->bi_iter.bi_sector = sector;
	bio_chain(ioend->io_bio, new);
	bio_get(ioend->io_bio);		/* for xfs_destroy_ioend */
	ioend->io_bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
	ioend->io_bio->bi_write_hint = ioend->io_inode->i_write_hint;
	submit_bio(ioend->io_bio);
	ioend->io_bio = new;
}

/*
 * Test to see if we have an existing ioend structure that we could append to
 * first, otherwise finish off the current ioend and start another.
 */
STATIC void
xfs_add_to_ioend(
	struct inode		*inode,
	xfs_off_t		offset,
	struct page		*page,
	struct iomap_page	*iop,
	struct xfs_writepage_ctx *wpc,
	struct writeback_control *wbc,
	struct list_head	*iolist)
{
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	struct block_device	*bdev = xfs_find_bdev_for_inode(inode);
	unsigned		len = i_blocksize(inode);
	unsigned		poff = offset & (PAGE_SIZE - 1);
	sector_t		sector;

	sector = xfs_fsb_to_db(ip, wpc->imap.br_startblock) +
		((offset - XFS_FSB_TO_B(mp, wpc->imap.br_startoff)) >> 9);

	if (!wpc->ioend || wpc->io_type != wpc->ioend->io_type ||
	    sector != bio_end_sector(wpc->ioend->io_bio) ||
	    offset != wpc->ioend->io_offset + wpc->ioend->io_size) {
		if (wpc->ioend)
			list_add(&wpc->ioend->io_list, iolist);
		wpc->ioend = xfs_alloc_ioend(inode, wpc->io_type, offset,
				bdev, sector);
	}

	if (!__bio_try_merge_page(wpc->ioend->io_bio, page, len, poff)) {
		if (iop)
			atomic_inc(&iop->write_count);
		if (bio_full(wpc->ioend->io_bio))
			xfs_chain_bio(wpc->ioend, wbc, bdev, sector);
		__bio_add_page(wpc->ioend->io_bio, page, len, poff);
	}

	wpc->ioend->io_size += len;
}

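/*
 * Invalidate part or all of a page: trace the call and defer to the generic
 * iomap helper.
 */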
STATIC void
xfs_vm_invalidatepage(
	struct page		*page,
	unsigned int		offset,
	unsigned int		length)
{
	trace_xfs_invalidatepage(page->mapping->host, page, offset, length);
	iomap_invalidatepage(page, offset, length);
}

/*
 * If the page has delalloc blocks on it, we need to punch them out before we
 * invalidate the page.  If we don't, we leave a stale delalloc mapping on the
 * inode that can trip up a later direct I/O read operation on the same region.
 *
 * We prevent this by truncating away the delalloc regions on the page.  Because
 * they are delalloc, we can do this without needing a transaction. Indeed - if
 * we get ENOSPC errors, we have to be able to do this truncation without a
 * transaction as there is no space left for block reservation (typically why we
 * see an ENOSPC in writeback).
 */
STATIC void
xfs_aops_discard_page(
	struct page		*page)
{
	struct inode		*inode = page->mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	loff_t			offset = page_offset(page);
	xfs_fileoff_t		start_fsb = XFS_B_TO_FSBT(mp, offset);
	int			error;

	if (XFS_FORCED_SHUTDOWN(mp))
		goto out_invalidate;

	xfs_alert(mp,
		"page discard on page "PTR_FMT", inode 0x%llx, offset %llu.",
			page, ip->i_ino, offset);

	error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
			PAGE_SIZE / i_blocksize(inode));
	if (error && !XFS_FORCED_SHUTDOWN(mp))
		xfs_alert(mp, "page discard unable to remove delalloc mapping.");
out_invalidate:
	xfs_vm_invalidatepage(page, 0, PAGE_SIZE);
}

/*
 * We implement an immediate ioend submission policy here to avoid needing to
 * chain multiple ioends and hence nest mempool allocations which can violate
 * forward progress guarantees we need to provide. The current ioend we are
 * adding blocks to is cached on the writepage context, and if the new block
 * does not append to the cached ioend it will create a new ioend and cache that
 * instead.
 *
 * If a new ioend is created and cached, the old ioend is returned and queued
 * locally for submission once the entire page is processed or an error has been
 * detected. While ioends are submitted immediately after they are completed,
 * batching optimisations are provided by higher level block plugging.
 *
 * At the end of a writeback pass, there will be a cached ioend remaining on the
 * writepage context that the caller will need to submit.
 */
static int
xfs_writepage_map(
	struct xfs_writepage_ctx *wpc,
	struct writeback_control *wbc,
	struct inode		*inode,
	struct page		*page,
	uint64_t		end_offset)
{
	LIST_HEAD(submit_list);
	struct iomap_page	*iop = to_iomap_page(page);
	unsigned		len = i_blocksize(inode);
	struct xfs_ioend	*ioend, *next;
	uint64_t		file_offset;	/* file offset of page */
	int			error = 0, count = 0, i;

	ASSERT(iop || i_blocksize(inode) == PAGE_SIZE);
	ASSERT(!iop || atomic_read(&iop->write_count) == 0);

	/*
	 * Walk through the page to find areas to write back. If we run off the
	 * end of the current map or find the current map invalid, grab a new
	 * one.
	 */
	for (i = 0, file_offset = page_offset(page);
	     i < (PAGE_SIZE >> inode->i_blkbits) && file_offset < end_offset;
	     i++, file_offset += len) {
		if (iop && !test_bit(i, iop->uptodate))
			continue;

		error = xfs_map_blocks(wpc, inode, file_offset);
		if (error)
			break;
		if (wpc->io_type == XFS_IO_HOLE)
			continue;
		xfs_add_to_ioend(inode, file_offset, page, iop, wpc, wbc,
				 &submit_list);
		count++;
	}

	ASSERT(wpc->ioend || list_empty(&submit_list));
	ASSERT(PageLocked(page));
	ASSERT(!PageWriteback(page));

	/*
	 * On error, we have to fail the ioend here because we may have set
	 * pages under writeback, we have to make sure we run IO completion to
	 * mark the error state of the IO appropriately, so we can't cancel the
	 * ioend directly here. That means we have to mark this page as under
	 * writeback if we included any blocks from it in the ioend chain so
	 * that completion treats it correctly.
	 *
	 * If we didn't include the page in the ioend, then on error we can
	 * simply discard and unlock it as there are no other users of the page
	 * now.  The caller will still need to trigger submission of outstanding
	 * ioends on the writepage context so they are treated correctly on
	 * error.
	 */
	if (unlikely(error)) {
		if (!count) {
			xfs_aops_discard_page(page);
			ClearPageUptodate(page);
			unlock_page(page);
			goto done;
		}

		/*
		 * If the page was not fully cleaned, we need to ensure that the
		 * higher layers come back to it correctly.  That means we need
		 * to keep the page dirty, and for WB_SYNC_ALL writeback we need
		 * to ensure the PAGECACHE_TAG_TOWRITE index mark is not removed
		 * so another attempt to write this page in this writeback sweep
		 * will be made.
		 */
		set_page_writeback_keepwrite(page);
	} else {
		clear_page_dirty_for_io(page);
		set_page_writeback(page);
	}

	unlock_page(page);

	/*
	 * Preserve the original error if there was one, otherwise catch
	 * submission errors here and propagate into subsequent ioend
	 * submissions.
	 */
	list_for_each_entry_safe(ioend, next, &submit_list, io_list) {
		int error2;

		list_del_init(&ioend->io_list);
		error2 = xfs_submit_ioend(wbc, ioend, error);
		if (error2 && !error)
			error = error2;
	}

	/*
	 * We can end up here with no error and nothing to write only if we race
	 * with a partial page truncate on a sub-page block sized filesystem.
	 */
	if (!count)
		end_page_writeback(page);
done:
	mapping_set_error(page->mapping, error);
	return error;
}

/*
 * Write out a dirty page.
 *
 * For delalloc space on the page we need to allocate space and flush it.
 * For unwritten space on the page we need to start the conversion to
 * regular allocated space.
 */
STATIC int
xfs_do_writepage(
	struct page		*page,
	struct writeback_control *wbc,
	void			*data)
{
	struct xfs_writepage_ctx *wpc = data;
	struct inode		*inode = page->mapping->host;
	loff_t			offset;
	uint64_t		end_offset;
	pgoff_t			end_index;

	trace_xfs_writepage(inode, page, 0, 0);

	/*
	 * Refuse to write the page out if we are called from reclaim context.
	 *
	 * This avoids stack overflows when called from deeply used stacks in
	 * random callers for direct reclaim or memcg reclaim.  We explicitly
	 * allow reclaim from kswapd as the stack usage there is relatively low.
	 *
	 * This should never happen except in the case of a VM regression so
	 * warn about it.
	 */
	if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) ==
			PF_MEMALLOC))
		goto redirty;

	/*
	 * Given that we do not allow direct reclaim to call us, we should
	 * never be called while in a filesystem transaction.
	 */
	if (WARN_ON_ONCE(current->flags & PF_MEMALLOC_NOFS))
		goto redirty;

	/*
	 * Is this page beyond the end of the file?
	 *
	 * If the page index is less than the end_index, adjust the end_offset
	 * to the highest offset that this page should represent.
	 * -----------------------------------------------------
	 * |                 file mapping          |  <EOF>    |
	 * -----------------------------------------------------
	 * | Page ... | Page N-2 | Page N-1 |  Page N  |       |
	 * ^--------------------------------^----------|--------
	 * |     desired writeback range    |      see else    |
	 * ---------------------------------^------------------|
	 */
	offset = i_size_read(inode);
	end_index = offset >> PAGE_SHIFT;
	if (page->index < end_index)
		end_offset = (xfs_off_t)(page->index + 1) << PAGE_SHIFT;
	else {
		/*
		 * Check whether the page to write out is beyond or straddles
		 * i_size or not.
		 * -------------------------------------------------------
		 * |                 file mapping             |  <EOF>  |
		 * -------------------------------------------------------
		 * | Page ... | Page N-2 | Page N-1 |  Page N   | Beyond |
		 * ^--------------------------------^-----------|---------
		 * |                                |   Straddles       |
		 * ---------------------------------^-----------|--------|
		 */
		unsigned offset_into_page = offset & (PAGE_SIZE - 1);

		/*
		 * Skip the page if it is fully outside i_size, e.g. due to a
		 * truncate operation that is in progress. We must redirty the
		 * page so that reclaim stops reclaiming it. Otherwise
		 * xfs_vm_releasepage() is called on it and gets confused.
		 *
		 * Note that the end_index is unsigned long, it would overflow
		 * if the given offset is greater than 16TB on a 32-bit system
		 * and if we do check the page is fully outside i_size or not
		 * via "if (page->index >= end_index + 1)" as "end_index + 1"
		 * will be evaluated to 0.  Hence this page will be redirtied
		 * and written out repeatedly, which would result in an
		 * infinite loop; the user program that performs this operation
		 * will hang.  Instead, we can verify this situation by checking
		 * if the page to write is totally beyond the i_size or if its
		 * offset is just equal to the EOF.
		 */
		if (page->index > end_index ||
		    (page->index == end_index && offset_into_page == 0))
			goto redirty;

		/*
		 * The page straddles i_size.  It must be zeroed out on each
		 * and every writepage invocation because it may be mmapped.
		 * "A file is mapped in multiples of the page size.  For a file
		 * that is not a multiple of the page size, the remaining
		 * memory is zeroed when mapped, and writes to that region are
		 * not written out to the file."
		 */
		zero_user_segment(page, offset_into_page, PAGE_SIZE);

		/* Adjust the end_offset to the end of file */
		end_offset = offset;
	}

	return xfs_writepage_map(wpc, wbc, inode, page, end_offset);

redirty:
	redirty_page_for_writepage(wbc, page);
	unlock_page(page);
	return 0;
}

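/*
 * ->writepage entry point: write a single dirty page using a local writepage
 * context and submit any ioend it leaves cached.
 */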
STATIC int
xfs_vm_writepage(
	struct page		*page,
	struct writeback_control *wbc)
{
	struct xfs_writepage_ctx wpc = {
		.io_type = XFS_IO_HOLE,
	};
	int			ret;

	ret = xfs_do_writepage(page, wbc, &wpc);
	if (wpc.ioend)
		ret = xfs_submit_ioend(wbc, wpc.ioend, ret);
	return ret;
}

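/*
 * ->writepages entry point: write back a range of dirty pages, sharing one
 * writepage context across all of them so contiguous blocks can be merged
 * into larger ioends.
 */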
STATIC int
xfs_vm_writepages(
	struct address_space	*mapping,
	struct writeback_control *wbc)
{
	struct xfs_writepage_ctx wpc = {
		.io_type = XFS_IO_HOLE,
	};
	int			ret;

	xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
	ret = write_cache_pages(mapping, wbc, xfs_do_writepage, &wpc);
	if (wpc.ioend)
		ret = xfs_submit_ioend(wbc, wpc.ioend, ret);
	return ret;
}

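/*
 * For DAX inodes there are no pagecache pages to build I/O for; hand the
 * range straight to the DAX writeback code instead.
 */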
STATIC int
xfs_dax_writepages(
	struct address_space	*mapping,
	struct writeback_control *wbc)
{
	xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
	return dax_writeback_mapping_range(mapping,
			xfs_find_bdev_for_inode(mapping->host), wbc);
}

STATIC int
xfs_vm_releasepage(
	struct page		*page,
	gfp_t			gfp_mask)
{
	trace_xfs_releasepage(page->mapping->host, page, 0, 0);
	return iomap_releasepage(page, gfp_mask);
}

STATIC sector_t
xfs_vm_bmap(
	struct address_space	*mapping,
	sector_t		block)
{
	struct xfs_inode	*ip = XFS_I(mapping->host);

	trace_xfs_vm_bmap(ip);

	/*
	 * The swap code (ab-)uses ->bmap to get a block mapping and then
	 * bypasses the file system for actual I/O.  We really can't allow
	 * that on reflink inodes, so we have to skip out here.  And yes,
	 * 0 is the magic code for a bmap error.
	 *
	 * Since we don't pass back blockdev info, we can't return bmap
	 * information for rt files either.
	 */
	if (xfs_is_reflink_inode(ip) || XFS_IS_REALTIME_INODE(ip))
		return 0;
	return iomap_bmap(mapping, block, &xfs_iomap_ops);
}

STATIC int
xfs_vm_readpage(
	struct file		*unused,
	struct page		*page)
{
	trace_xfs_vm_readpage(page->mapping->host, 1);
	return iomap_readpage(page, &xfs_iomap_ops);
}

STATIC int
xfs_vm_readpages(
	struct file		*unused,
	struct address_space	*mapping,
	struct list_head	*pages,
	unsigned		nr_pages)
{
	trace_xfs_vm_readpages(mapping->host, nr_pages);
	return iomap_readpages(mapping, pages, nr_pages, &xfs_iomap_ops);
}

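/*
 * Set up swap-over-XFS: record the backing block device and let the generic
 * iomap code build the swap extent map.
 */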
static int
xfs_iomap_swapfile_activate(
	struct swap_info_struct		*sis,
	struct file			*swap_file,
	sector_t			*span)
{
	sis->bdev = xfs_find_bdev_for_inode(file_inode(swap_file));
	return iomap_swapfile_activate(sis, swap_file, span, &xfs_iomap_ops);
}

const struct address_space_operations xfs_address_space_operations = {
	.readpage		= xfs_vm_readpage,
	.readpages		= xfs_vm_readpages,
	.writepage		= xfs_vm_writepage,
	.writepages		= xfs_vm_writepages,
	.set_page_dirty		= iomap_set_page_dirty,
	.releasepage		= xfs_vm_releasepage,
	.invalidatepage		= xfs_vm_invalidatepage,
	.bmap			= xfs_vm_bmap,
	.direct_IO		= noop_direct_IO,
	.migratepage		= iomap_migrate_page,
	.is_partially_uptodate	= iomap_is_partially_uptodate,
	.error_remove_page	= generic_error_remove_page,
	.swap_activate		= xfs_iomap_swapfile_activate,
};

const struct address_space_operations xfs_dax_aops = {
	.writepages		= xfs_dax_writepages,
	.direct_IO		= noop_direct_IO,
	.set_page_dirty		= noop_set_page_dirty,
	.invalidatepage		= noop_invalidatepage,
	.swap_activate		= xfs_iomap_swapfile_activate,
};