--- /dev/null
+From foo@baz Fri Apr 29 11:07:48 AM CEST 2022
+From: Anand Jain <anand.jain@oracle.com>
+Date: Fri, 15 Apr 2022 06:28:55 +0800
+Subject: btrfs: fallback to blocking mode when doing async dio over multiple extents
+To: stable@vger.kernel.org
+Cc: linux-btrfs@vger.kernel.org, Filipe Manana <fdmanana@suse.com>, Josef Bacik <josef@toxicpanda.com>, David Sterba <dsterba@suse.com>, Anand Jain <anand.jain@oracle.com>
+Message-ID: <9127cbbcd2bf2f8efd46298d8799e36282e1a311.1649951733.git.anand.jain@oracle.com>
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit ca93e44bfb5fd7996b76f0f544999171f647f93b upstream
+
+Some users recently reported that MariaDB was getting a read corruption
+when using io_uring on top of btrfs. This started to happen in 5.16,
+after commit 51bd9563b6783d ("btrfs: fix deadlock due to page faults
+during direct IO reads and writes"). That changed btrfs to use the new
+iomap flag IOMAP_DIO_PARTIAL and to disable page faults before calling
+iomap_dio_rw(). This was necessary to fix deadlocks when the iovector
+corresponds to a memory mapped file region. That type of scenario is
+exercised by test case generic/647 from fstests.
+
+For this MariaDB scenario, we attempt to read 16K from file offset X
+using IOCB_NOWAIT and io_uring. In that range we have 4 extents, each
+with a size of 4K, and what happens is the following:
+
+1) btrfs_direct_read() disables page faults and calls iomap_dio_rw();
+
+2) iomap creates a struct iomap_dio object, its reference count is
+ initialized to 1 and its ->size field is initialized to 0;
+
+3) iomap calls btrfs_dio_iomap_begin() with file offset X, which finds
+ the first 4K extent, and sets up an iomap for this extent consisting
+ of a single page;
+
+4) At iomap_dio_bio_iter(), we are able to access the first page of the
+ buffer (struct iov_iter) with bio_iov_iter_get_pages() without
+ triggering a page fault;
+
+5) iomap submits a bio for this 4K extent
+ (iomap_dio_submit_bio() -> btrfs_submit_direct()) and increments
+ the refcount on the struct iomap_dio object to 2; The ->size field
+ of the struct iomap_dio object is incremented to 4K;
+
+6) iomap calls btrfs_dio_iomap_begin() again, this time with a file
+ offset of X + 4K. There we set up an iomap for the next extent
+ that also has a size of 4K;
+
+7) Then at iomap_dio_bio_iter() we call bio_iov_iter_get_pages(),
+ which tries to access the next page (2nd page) of the buffer.
+ This triggers a page fault and returns -EFAULT;
+
+8) At __iomap_dio_rw() we see the -EFAULT, but we reset the error
+ to 0 because we passed the flag IOMAP_DIO_PARTIAL to iomap and
+ the struct iomap_dio object has a ->size value of 4K (we submitted
+ a bio for an extent already). The 'wait_for_completion' variable
+ is not set to true, because our iocb has IOCB_NOWAIT set;
+
+9) At the bottom of __iomap_dio_rw(), we decrement the reference count
+ of the struct iomap_dio object from 2 to 1. Because we were not
+ the only ones holding a reference on it and 'wait_for_completion' is
+ set to false, -EIOCBQUEUED is returned to btrfs_direct_read(), which
+ just returns it up the callchain, up to io_uring;
+
+10) The bio submitted for the first extent (step 5) completes and its
+ bio endio function, iomap_dio_bio_end_io(), decrements the last
+ reference on the struct iomap_dio object, resulting in calling
+ iomap_dio_complete_work() -> iomap_dio_complete().
+
+11) At iomap_dio_complete() we adjust the iocb->ki_pos from X to X + 4K
+ and return 4K (the amount of io done) to iomap_dio_complete_work();
+
+12) iomap_dio_complete_work() calls the iocb completion callback,
+ iocb->ki_complete() with a second argument value of 4K (total io
+ done) and the iocb with the adjusted ki_pos of X + 4K. This results
+ in completing the read request for io_uring, leaving it with a
+ result of 4K bytes read, and only the first page of the buffer
+ filled in, while the remaining 3 pages, corresponding to the other
+ 3 extents, were not filled;
+
+13) For the application, the result is unexpected because if it asks
+ to read N bytes, it expects to get N bytes back, as long as those
+ N bytes don't cross the EOF (i_size).
+
+MariaDB reports this as an error, as it's not expecting a short read,
+since it knows it's asking for read operations fully within the i_size
+boundary. This is typical in many applications, though it is also
+questionable whether they should react to such short reads by issuing
+more read calls to get the remaining data. Nevertheless, the short read
+happened due to a change in btrfs regarding how it deals with page
+faults while in the middle of a read operation, and there's no reason
+why btrfs can't have the previous behaviour of returning the whole data
+that was requested by the application.
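+
+Purely for illustration, the "issuing more read calls" reaction
+mentioned above amounts to wrapping the read in a retry loop. A
+minimal userspace sketch of that idea follows; the helper is
+hypothetical and applications are not required to do this:
+
+  #include <sys/types.h>
+  #include <unistd.h>
+
+  /* Hypothetical helper: keep reading until 'count' bytes or EOF. */
+  static ssize_t read_full(int fd, void *buf, size_t count, off_t offset)
+  {
+          size_t done = 0;
+
+          while (done < count) {
+                  ssize_t ret = pread(fd, (char *)buf + done,
+                                      count - done, offset + done);
+
+                  if (ret < 0)
+                          return ret;     /* hard error */
+                  if (ret == 0)
+                          break;          /* hit EOF */
+                  done += ret;
+          }
+          return done;
+  }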
+
+The problem can also be triggered with the following simple program:
+
+ /* Get O_DIRECT */
+ #ifndef _GNU_SOURCE
+ #define _GNU_SOURCE
+ #endif
+
+ #include <stdio.h>
+ #include <stdlib.h>
+ #include <unistd.h>
+ #include <fcntl.h>
+ #include <errno.h>
+ #include <string.h>
+ #include <liburing.h>
+
+ int main(int argc, char *argv[])
+ {
+ char *foo_path;
+ struct io_uring ring;
+ struct io_uring_sqe *sqe;
+ struct io_uring_cqe *cqe;
+ struct iovec iovec;
+ int fd;
+ long pagesize;
+ void *write_buf;
+ void *read_buf;
+ ssize_t ret;
+ int i;
+
+ if (argc != 2) {
+ fprintf(stderr, "Use: %s <directory>\n", argv[0]);
+ return 1;
+ }
+
+ foo_path = malloc(strlen(argv[1]) + 5);
+ if (!foo_path) {
+ fprintf(stderr, "Failed to allocate memory for file path\n");
+ return 1;
+ }
+ strcpy(foo_path, argv[1]);
+ strcat(foo_path, "/foo");
+
+ /*
+ * Create file foo with 2 extents, each with a size matching
+ * the page size. Then allocate a buffer to read both extents
+ * with io_uring, using O_DIRECT and IOCB_NOWAIT. Before doing
+ * the read with io_uring, access the first page of the buffer
+ * to fault it in, so that during the read we only trigger a
+ * page fault when accessing the second page of the buffer.
+ */
+ fd = open(foo_path, O_CREAT | O_TRUNC | O_WRONLY |
+ O_DIRECT, 0666);
+ if (fd == -1) {
+ fprintf(stderr,
+ "Failed to create file 'foo': %s (errno %d)",
+ strerror(errno), errno);
+ return 1;
+ }
+
+ pagesize = sysconf(_SC_PAGE_SIZE);
+ ret = posix_memalign(&write_buf, pagesize, 2 * pagesize);
+ if (ret) {
+ fprintf(stderr, "Failed to allocate write buffer\n");
+ return 1;
+ }
+
+ memset(write_buf, 0xab, pagesize);
+ memset(write_buf + pagesize, 0xcd, pagesize);
+
+ /* Create 2 extents, each with a size matching page size. */
+ for (i = 0; i < 2; i++) {
+ ret = pwrite(fd, write_buf + i * pagesize, pagesize,
+ i * pagesize);
+ if (ret != pagesize) {
+ fprintf(stderr,
+ "Failed to write to file, ret = %ld errno %d (%s)\n",
+ ret, errno, strerror(errno));
+ return 1;
+ }
+ ret = fsync(fd);
+ if (ret != 0) {
+ fprintf(stderr, "Failed to fsync file\n");
+ return 1;
+ }
+ }
+
+ close(fd);
+ fd = open(foo_path, O_RDONLY | O_DIRECT);
+ if (fd == -1) {
+ fprintf(stderr,
+ "Failed to open file 'foo': %s (errno %d)",
+ strerror(errno), errno);
+ return 1;
+ }
+
+ ret = posix_memalign(&read_buf, pagesize, 2 * pagesize);
+ if (ret) {
+ fprintf(stderr, "Failed to allocate read buffer\n");
+ return 1;
+ }
+
+ /*
+ * Fault in only the first page of the read buffer.
+ * We want to trigger a page fault for the 2nd page of the
+ * read buffer during the read operation with io_uring
+ * (O_DIRECT and IOCB_NOWAIT).
+ */
+ memset(read_buf, 0, 1);
+
+ ret = io_uring_queue_init(1, &ring, 0);
+ if (ret != 0) {
+ fprintf(stderr, "Failed to create io_uring queue\n");
+ return 1;
+ }
+
+ sqe = io_uring_get_sqe(&ring);
+ if (!sqe) {
+ fprintf(stderr, "Failed to get io_uring sqe\n");
+ return 1;
+ }
+
+ iovec.iov_base = read_buf;
+ iovec.iov_len = 2 * pagesize;
+ io_uring_prep_readv(sqe, fd, &iovec, 1, 0);
+
+ ret = io_uring_submit_and_wait(&ring, 1);
+ if (ret != 1) {
+ fprintf(stderr,
+ "Failed at io_uring_submit_and_wait()\n");
+ return 1;
+ }
+
+ ret = io_uring_wait_cqe(&ring, &cqe);
+ if (ret < 0) {
+ fprintf(stderr, "Failed at io_uring_wait_cqe()\n");
+ return 1;
+ }
+
+ printf("io_uring read result for file foo:\n\n");
+ printf(" cqe->res == %d (expected %d)\n", cqe->res, 2 * pagesize);
+ printf(" memcmp(read_buf, write_buf) == %d (expected 0)\n",
+ memcmp(read_buf, write_buf, 2 * pagesize));
+
+ io_uring_cqe_seen(&ring, cqe);
+ io_uring_queue_exit(&ring);
+
+ return 0;
+ }
+
+When running it on an unpatched kernel:
+
+ $ gcc io_uring_test.c -luring
+ $ mkfs.btrfs -f /dev/sda
+ $ mount /dev/sda /mnt/sda
+ $ ./a.out /mnt/sda
+ io_uring read result for file foo:
+
+ cqe->res == 4096 (expected 8192)
+ memcmp(read_buf, write_buf) == -205 (expected 0)
+
+After this patch, the read always returns 8192 bytes, with the buffer
+filled with the correct data. Although that reproducer always triggers
+the bug in my test VMs, it's possible that it will not be so reliable
+in other environments, as that can happen if the bio for the first
+extent completes and decrements the reference on the struct iomap_dio
+object before we do the atomic_dec_and_test() on the reference at
+__iomap_dio_rw().
+
+Fix this in btrfs by having btrfs_dio_iomap_begin() return -EAGAIN
+whenever we try to satisfy a non-blocking IO request (IOMAP_NOWAIT flag
+set) over a range that spans multiple extents (or a mix of extents and
+holes). This avoids returning success to the caller when we only did
+partial IO, which is not optimal for writes, and for reads it's actually
+incorrect, as the caller doesn't expect to get fewer bytes read than it
+has requested (unless EOF is crossed), as previously mentioned. This is also
+the type of behaviour that xfs follows (xfs_direct_write_iomap_begin()),
+even though it doesn't use IOMAP_DIO_PARTIAL.
+
+A test case for fstests will follow soon.
+
+Link: https://lore.kernel.org/linux-btrfs/CABVffEM0eEWho+206m470rtM0d9J8ue85TtR-A_oVTuGLWFicA@mail.gmail.com/
+Link: https://lore.kernel.org/linux-btrfs/CAHF2GV6U32gmqSjLe=XKgfcZAmLCiH26cJ2OnHGp5x=VAH4OHQ@mail.gmail.com/
+CC: stable@vger.kernel.org # 5.16+
+Reviewed-by: Josef Bacik <josef@toxicpanda.com>
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Anand Jain <anand.jain@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/inode.c | 28 ++++++++++++++++++++++++++++
+ 1 file changed, 28 insertions(+)
+
+--- a/fs/btrfs/inode.c
++++ b/fs/btrfs/inode.c
+@@ -7961,6 +7961,34 @@ static int btrfs_dio_iomap_begin(struct
+ }
+
+ len = min(len, em->len - (start - em->start));
++
++ /*
++ * If we have a NOWAIT request and the range contains multiple extents
++ * (or a mix of extents and holes), then we return -EAGAIN to make the
++ * caller fallback to a context where it can do a blocking (without
++ * NOWAIT) request. This way we avoid doing partial IO and returning
++ * success to the caller, which is not optimal for writes and for reads
++ * it can result in unexpected behaviour for an application.
++ *
++ * When doing a read, because we use IOMAP_DIO_PARTIAL when calling
++ * iomap_dio_rw(), we can end up returning less data then what the caller
++ * asked for, resulting in an unexpected, and incorrect, short read.
++ * That is, the caller asked to read N bytes and we return less than that,
++ * which is wrong unless we are crossing EOF. This happens if we get a
++ * page fault error when trying to fault in pages for the buffer that is
++ * associated to the struct iov_iter passed to iomap_dio_rw(), and we
++ * have previously submitted bios for other extents in the range, in
++ * which case iomap_dio_rw() may return us EIOCBQUEUED if not all of
++ * those bios have completed by the time we get the page fault error,
++ * which we return back to our caller - we should only return EIOCBQUEUED
++ * after we have submitted bios for all the extents in the range.
++ */
++ if ((flags & IOMAP_NOWAIT) && len < length) {
++ free_extent_map(em);
++ ret = -EAGAIN;
++ goto unlock_err;
++ }
++
+ if (write) {
+ ret = btrfs_get_blocks_direct_write(&em, inode, dio_data,
+ start, len);
--- /dev/null
+From foo@baz Fri Apr 29 11:07:48 AM CEST 2022
+From: Anand Jain <anand.jain@oracle.com>
+Date: Fri, 15 Apr 2022 06:28:54 +0800
+Subject: btrfs: fix deadlock due to page faults during direct IO reads and writes
+To: stable@vger.kernel.org
+Cc: linux-btrfs@vger.kernel.org, Filipe Manana <fdmanana@suse.com>, Josef Bacik <josef@toxicpanda.com>, David Sterba <dsterba@suse.com>, Anand Jain <anand.jain@oracle.com>
+Message-ID: <b3ed77a21e8c9b82b32a044aac971feaa0a893e0.1649951733.git.anand.jain@oracle.com>
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit 51bd9563b6783de8315f38f7baed949e77c42311 upstream
+
+If we do a direct IO read or write when the buffer given by the user is
+memory mapped to the file range we are going to do IO, we end up
+in a deadlock. This is triggered by the new test case generic/647 from
+fstests.
+
+For a direct IO read we get a trace like this:
+
+ [967.872718] INFO: task mmap-rw-fault:12176 blocked for more than 120 seconds.
+ [967.874161] Not tainted 5.14.0-rc7-btrfs-next-95 #1
+ [967.874909] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
+ [967.875983] task:mmap-rw-fault state:D stack: 0 pid:12176 ppid: 11884 flags:0x00000000
+ [967.875992] Call Trace:
+ [967.875999] __schedule+0x3ca/0xe10
+ [967.876015] schedule+0x43/0xe0
+ [967.876020] wait_extent_bit.constprop.0+0x1eb/0x260 [btrfs]
+ [967.876109] ? do_wait_intr_irq+0xb0/0xb0
+ [967.876118] lock_extent_bits+0x37/0x90 [btrfs]
+ [967.876150] btrfs_lock_and_flush_ordered_range+0xa9/0x120 [btrfs]
+ [967.876184] ? extent_readahead+0xa7/0x530 [btrfs]
+ [967.876214] extent_readahead+0x32d/0x530 [btrfs]
+ [967.876253] ? lru_cache_add+0x104/0x220
+ [967.876255] ? kvm_sched_clock_read+0x14/0x40
+ [967.876258] ? sched_clock_cpu+0xd/0x110
+ [967.876263] ? lock_release+0x155/0x4a0
+ [967.876271] read_pages+0x86/0x270
+ [967.876274] ? lru_cache_add+0x125/0x220
+ [967.876281] page_cache_ra_unbounded+0x1a3/0x220
+ [967.876291] filemap_fault+0x626/0xa20
+ [967.876303] __do_fault+0x36/0xf0
+ [967.876308] __handle_mm_fault+0x83f/0x15f0
+ [967.876322] handle_mm_fault+0x9e/0x260
+ [967.876327] __get_user_pages+0x204/0x620
+ [967.876332] ? get_user_pages_unlocked+0x69/0x340
+ [967.876340] get_user_pages_unlocked+0xd3/0x340
+ [967.876349] internal_get_user_pages_fast+0xbca/0xdc0
+ [967.876366] iov_iter_get_pages+0x8d/0x3a0
+ [967.876374] bio_iov_iter_get_pages+0x82/0x4a0
+ [967.876379] ? lock_release+0x155/0x4a0
+ [967.876387] iomap_dio_bio_actor+0x232/0x410
+ [967.876396] iomap_apply+0x12a/0x4a0
+ [967.876398] ? iomap_dio_rw+0x30/0x30
+ [967.876414] __iomap_dio_rw+0x29f/0x5e0
+ [967.876415] ? iomap_dio_rw+0x30/0x30
+ [967.876420] ? lock_acquired+0xf3/0x420
+ [967.876429] iomap_dio_rw+0xa/0x30
+ [967.876431] btrfs_file_read_iter+0x10b/0x140 [btrfs]
+ [967.876460] new_sync_read+0x118/0x1a0
+ [967.876472] vfs_read+0x128/0x1b0
+ [967.876477] __x64_sys_pread64+0x90/0xc0
+ [967.876483] do_syscall_64+0x3b/0xc0
+ [967.876487] entry_SYSCALL_64_after_hwframe+0x44/0xae
+ [967.876490] RIP: 0033:0x7fb6f2c038d6
+ [967.876493] RSP: 002b:00007fffddf586b8 EFLAGS: 00000246 ORIG_RAX: 0000000000000011
+ [967.876496] RAX: ffffffffffffffda RBX: 0000000000001000 RCX: 00007fb6f2c038d6
+ [967.876498] RDX: 0000000000001000 RSI: 00007fb6f2c17000 RDI: 0000000000000003
+ [967.876499] RBP: 0000000000001000 R08: 0000000000000003 R09: 0000000000000000
+ [967.876501] R10: 0000000000001000 R11: 0000000000000246 R12: 0000000000000003
+ [967.876502] R13: 0000000000000000 R14: 00007fb6f2c17000 R15: 0000000000000000
+
+This happens because at btrfs_dio_iomap_begin() we lock the extent range
+and return with it locked - we only unlock in the endio callback, at
+end_bio_extent_readpage() -> endio_readpage_release_extent(). Then after
+iomap has called the btrfs_dio_iomap_begin() callback, it triggers the
+page faults, which result in reading the pages through the readahead
+callback btrfs_readahead(), and there we end up attempting to lock the
+same extent range again (or a subrange of what we locked before),
+resulting in the deadlock.
+
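+Reduced to its essence, the read scenario described above is a direct
+IO read whose destination buffer is a mapping of the same file range.
+A simplified, hypothetical sketch (not the actual generic/647 test; it
+assumes /mnt/test/foo already exists with at least one page of data,
+and omits error handling):
+
+  /* Get O_DIRECT */
+  #ifndef _GNU_SOURCE
+  #define _GNU_SOURCE
+  #endif
+
+  #include <fcntl.h>
+  #include <sys/mman.h>
+  #include <unistd.h>
+
+  int main(void)
+  {
+          long psz = sysconf(_SC_PAGE_SIZE);
+          int fd = open("/mnt/test/foo", O_RDWR | O_DIRECT);
+          /* Map the first page of the file, shared and writable... */
+          void *buf = mmap(NULL, psz, PROT_READ | PROT_WRITE,
+                           MAP_SHARED, fd, 0);
+
+          /*
+           * ...and use that mapping as the destination of a direct IO
+           * read of the very same range. Faulting the buffer in during
+           * the read goes through btrfs_readahead(), which tries to
+           * lock an extent range that btrfs_dio_iomap_begin() has
+           * already locked, so without the fix the task hangs as in
+           * the trace above.
+           */
+          pread(fd, buf, psz, 0);
+
+          munmap(buf, psz);
+          close(fd);
+          return 0;
+  }
+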
+For a direct IO write, the scenario is a bit different, and it results in
+a trace like this:
+
+ [1132.442520] run fstests generic/647 at 2021-08-31 18:53:35
+ [1330.349355] INFO: task mmap-rw-fault:184017 blocked for more than 120 seconds.
+ [1330.350540] Not tainted 5.14.0-rc7-btrfs-next-95 #1
+ [1330.351158] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
+ [1330.351900] task:mmap-rw-fault state:D stack: 0 pid:184017 ppid:183725 flags:0x00000000
+ [1330.351906] Call Trace:
+ [1330.351913] __schedule+0x3ca/0xe10
+ [1330.351930] schedule+0x43/0xe0
+ [1330.351935] btrfs_start_ordered_extent+0x108/0x1c0 [btrfs]
+ [1330.352020] ? do_wait_intr_irq+0xb0/0xb0
+ [1330.352028] btrfs_lock_and_flush_ordered_range+0x8c/0x120 [btrfs]
+ [1330.352064] ? extent_readahead+0xa7/0x530 [btrfs]
+ [1330.352094] extent_readahead+0x32d/0x530 [btrfs]
+ [1330.352133] ? lru_cache_add+0x104/0x220
+ [1330.352135] ? kvm_sched_clock_read+0x14/0x40
+ [1330.352138] ? sched_clock_cpu+0xd/0x110
+ [1330.352143] ? lock_release+0x155/0x4a0
+ [1330.352151] read_pages+0x86/0x270
+ [1330.352155] ? lru_cache_add+0x125/0x220
+ [1330.352162] page_cache_ra_unbounded+0x1a3/0x220
+ [1330.352172] filemap_fault+0x626/0xa20
+ [1330.352176] ? filemap_map_pages+0x18b/0x660
+ [1330.352184] __do_fault+0x36/0xf0
+ [1330.352189] __handle_mm_fault+0x1253/0x15f0
+ [1330.352203] handle_mm_fault+0x9e/0x260
+ [1330.352208] __get_user_pages+0x204/0x620
+ [1330.352212] ? get_user_pages_unlocked+0x69/0x340
+ [1330.352220] get_user_pages_unlocked+0xd3/0x340
+ [1330.352229] internal_get_user_pages_fast+0xbca/0xdc0
+ [1330.352246] iov_iter_get_pages+0x8d/0x3a0
+ [1330.352254] bio_iov_iter_get_pages+0x82/0x4a0
+ [1330.352259] ? lock_release+0x155/0x4a0
+ [1330.352266] iomap_dio_bio_actor+0x232/0x410
+ [1330.352275] iomap_apply+0x12a/0x4a0
+ [1330.352278] ? iomap_dio_rw+0x30/0x30
+ [1330.352292] __iomap_dio_rw+0x29f/0x5e0
+ [1330.352294] ? iomap_dio_rw+0x30/0x30
+ [1330.352306] btrfs_file_write_iter+0x238/0x480 [btrfs]
+ [1330.352339] new_sync_write+0x11f/0x1b0
+ [1330.352344] ? NF_HOOK_LIST.constprop.0.cold+0x31/0x3e
+ [1330.352354] vfs_write+0x292/0x3c0
+ [1330.352359] __x64_sys_pwrite64+0x90/0xc0
+ [1330.352365] do_syscall_64+0x3b/0xc0
+ [1330.352369] entry_SYSCALL_64_after_hwframe+0x44/0xae
+ [1330.352372] RIP: 0033:0x7f4b0a580986
+ [1330.352379] RSP: 002b:00007ffd34d75418 EFLAGS: 00000246 ORIG_RAX: 0000000000000012
+ [1330.352382] RAX: ffffffffffffffda RBX: 0000000000001000 RCX: 00007f4b0a580986
+ [1330.352383] RDX: 0000000000001000 RSI: 00007f4b0a3a4000 RDI: 0000000000000003
+ [1330.352385] RBP: 00007f4b0a3a4000 R08: 0000000000000003 R09: 0000000000000000
+ [1330.352386] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000003
+ [1330.352387] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000
+
+Unlike for reads, at btrfs_dio_iomap_begin() we return with the extent
+range unlocked, but later when the page faults are triggered and we try
+to read the extents, we end up at btrfs_lock_and_flush_ordered_range(), where
+we find the ordered extent for our write, created by the iomap callback
+btrfs_dio_iomap_begin(), and we wait for it to complete, which makes us
+deadlock since we can't complete the ordered extent without reading the
+pages (the iomap code only submits the bio after the pages are faulted
+in).
+
+Fix this by setting the nofault attribute of the given iov_iter and
+retrying the direct IO read/write if we get an -EFAULT error returned
+from iomap. For reads, we also disable page faults completely; this is
+because when we read from a hole or a prealloc extent, we can still
+trigger page faults due to the call to iov_iter_zero() done by iomap -
+at the moment, it is oblivious to the value of the ->nofault attribute
+of an iov_iter.
+We also need to keep track of the number of bytes written or read, and
+pass it to iomap_dio_rw(), as well as use the new flag IOMAP_DIO_PARTIAL.
+
+This depends on the iov_iter and iomap changes introduced in commit
+c03098d4b9ad ("Merge tag 'gfs2-v5.15-rc5-mmap-fault' of
+git://git.kernel.org/pub/scm/linux/kernel/git/gfs2/linux-gfs2").
+
+Reviewed-by: Josef Bacik <josef@toxicpanda.com>
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Anand Jain <anand.jain@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/file.c | 139 +++++++++++++++++++++++++++++++++++++++++++++++++-------
+ 1 file changed, 123 insertions(+), 16 deletions(-)
+
+--- a/fs/btrfs/file.c
++++ b/fs/btrfs/file.c
+@@ -1903,16 +1903,17 @@ static ssize_t check_direct_IO(struct bt
+
+ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
+ {
++ const bool is_sync_write = (iocb->ki_flags & IOCB_DSYNC);
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file_inode(file);
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ loff_t pos;
+ ssize_t written = 0;
+ ssize_t written_buffered;
++ size_t prev_left = 0;
+ loff_t endbyte;
+ ssize_t err;
+ unsigned int ilock_flags = 0;
+- struct iomap_dio *dio = NULL;
+
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ ilock_flags |= BTRFS_ILOCK_TRY;
+@@ -1955,23 +1956,80 @@ relock:
+ goto buffered;
+ }
+
+- dio = __iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
+- 0, 0);
++ /*
++ * We remove IOCB_DSYNC so that we don't deadlock when iomap_dio_rw()
++ * calls generic_write_sync() (through iomap_dio_complete()), because
++ * that results in calling fsync (btrfs_sync_file()) which will try to
++ * lock the inode in exclusive/write mode.
++ */
++ if (is_sync_write)
++ iocb->ki_flags &= ~IOCB_DSYNC;
++
++ /*
++ * The iov_iter can be mapped to the same file range we are writing to.
++ * If that's the case, then we will deadlock in the iomap code, because
++ * it first calls our callback btrfs_dio_iomap_begin(), which will create
++ * an ordered extent, and after that it will fault in the pages that the
++ * iov_iter refers to. During the fault in we end up in the readahead
++ * pages code (starting at btrfs_readahead()), which will lock the range,
++ * find that ordered extent and then wait for it to complete (at
++ * btrfs_lock_and_flush_ordered_range()), resulting in a deadlock since
++ * obviously the ordered extent can never complete as we didn't submit
++ * yet the respective bio(s). This always happens when the buffer is
++ * memory mapped to the same file range, since the iomap DIO code always
++ * invalidates pages in the target file range (after starting and waiting
++ * for any writeback).
++ *
++ * So here we disable page faults in the iov_iter and then retry if we
++ * got -EFAULT, faulting in the pages before the retry.
++ */
++again:
++ from->nofault = true;
++ err = iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
++ IOMAP_DIO_PARTIAL, written);
++ from->nofault = false;
++
++ /* No increment (+=) because iomap returns a cumulative value. */
++ if (err > 0)
++ written = err;
++
++ if (iov_iter_count(from) > 0 && (err == -EFAULT || err > 0)) {
++ const size_t left = iov_iter_count(from);
++ /*
++ * We have more data left to write. Try to fault in as many as
++ * possible of the remainder pages and retry. We do this without
++ * releasing and locking again the inode, to prevent races with
++ * truncate.
++ *
++ * Also, in case the iov refers to pages in the file range of the
++ * file we want to write to (due to a mmap), we could enter an
++ * infinite loop if we retry after faulting the pages in, since
++ * iomap will invalidate any pages in the range early on, before
++ * it tries to fault in the pages of the iov. So we keep track of
++ * how much was left of iov in the previous EFAULT and fallback
++ * to buffered IO in case we haven't made any progress.
++ */
++ if (left == prev_left) {
++ err = -ENOTBLK;
++ } else {
++ fault_in_iov_iter_readable(from, left);
++ prev_left = left;
++ goto again;
++ }
++ }
+
+ btrfs_inode_unlock(inode, ilock_flags);
+
+- if (IS_ERR_OR_NULL(dio)) {
+- err = PTR_ERR_OR_ZERO(dio);
+- if (err < 0 && err != -ENOTBLK)
+- goto out;
+- } else {
+- written = iomap_dio_complete(dio);
+- }
++ /*
++ * Add back IOCB_DSYNC. Our caller, btrfs_file_write_iter(), will do
++ * the fsync (call generic_write_sync()).
++ */
++ if (is_sync_write)
++ iocb->ki_flags |= IOCB_DSYNC;
+
+- if (written < 0 || !iov_iter_count(from)) {
+- err = written;
++ /* If 'err' is -ENOTBLK then it means we must fallback to buffered IO. */
++ if ((err < 0 && err != -ENOTBLK) || !iov_iter_count(from))
+ goto out;
+- }
+
+ buffered:
+ pos = iocb->ki_pos;
+@@ -1996,7 +2054,7 @@ buffered:
+ invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT,
+ endbyte >> PAGE_SHIFT);
+ out:
+- return written ? written : err;
++ return err < 0 ? err : written;
+ }
+
+ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
+@@ -3659,6 +3717,8 @@ static int check_direct_read(struct btrf
+ static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
+ {
+ struct inode *inode = file_inode(iocb->ki_filp);
++ size_t prev_left = 0;
++ ssize_t read = 0;
+ ssize_t ret;
+
+ if (fsverity_active(inode))
+@@ -3668,10 +3728,57 @@ static ssize_t btrfs_direct_read(struct
+ return 0;
+
+ btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
++again:
++ /*
++ * This is similar to what we do for direct IO writes, see the comment
++ * at btrfs_direct_write(), but we also disable page faults in addition
++ * to disabling them only at the iov_iter level. This is because when
++ * reading from a hole or prealloc extent, iomap calls iov_iter_zero(),
++ * which can still trigger page fault ins despite having set ->nofault
++ * to true of our 'to' iov_iter.
++ *
++ * The difference to direct IO writes is that we deadlock when trying
++ * to lock the extent range in the inode's tree during he page reads
++ * triggered by the fault in (while for writes it is due to waiting for
++ * our own ordered extent). This is because for direct IO reads,
++ * btrfs_dio_iomap_begin() returns with the extent range locked, which
++ * is only unlocked in the endio callback (end_bio_extent_readpage()).
++ */
++ pagefault_disable();
++ to->nofault = true;
+ ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
+- 0, 0);
++ IOMAP_DIO_PARTIAL, read);
++ to->nofault = false;
++ pagefault_enable();
++
++ /* No increment (+=) because iomap returns a cumulative value. */
++ if (ret > 0)
++ read = ret;
++
++ if (iov_iter_count(to) > 0 && (ret == -EFAULT || ret > 0)) {
++ const size_t left = iov_iter_count(to);
++
++ if (left == prev_left) {
++ /*
++ * We didn't make any progress since the last attempt,
++ * fallback to a buffered read for the remainder of the
++ * range. This is just to avoid any possibility of looping
++ * for too long.
++ */
++ ret = read;
++ } else {
++ /*
++ * We made some progress since the last retry or this is
++ * the first time we are retrying. Fault in as many pages
++ * as possible and retry.
++ */
++ fault_in_iov_iter_writeable(to, left);
++ prev_left = left;
++ goto again;
++ }
++ }
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
+- return ret;
++ return ret < 0 ? ret : read;
+ }
+
+ static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
--- /dev/null
+From foo@baz Fri Apr 29 11:07:48 AM CEST 2022
+From: Anand Jain <anand.jain@oracle.com>
+Date: Fri, 15 Apr 2022 06:28:42 +0800
+Subject: gfs2: Add wrapper for iomap_file_buffered_write
+To: stable@vger.kernel.org
+Cc: linux-btrfs@vger.kernel.org, Andreas Gruenbacher <agruenba@redhat.com>, Anand Jain <anand.jain@oracle.com>
+Message-ID: <c6935195c043510ac0c69085b8e22a906a8acc6d.1649951733.git.anand.jain@oracle.com>
+
+From: Andreas Gruenbacher <agruenba@redhat.com>
+
+commit 2eb7509a05443048fb4df60b782de3f03c6c298b upstream
+
+Add a wrapper around iomap_file_buffered_write. We'll add code here
+later for when the operation needs to be retried.
+
+Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
+Signed-off-by: Anand Jain <anand.jain@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/gfs2/file.c | 27 +++++++++++++++++----------
+ 1 file changed, 17 insertions(+), 10 deletions(-)
+
+--- a/fs/gfs2/file.c
++++ b/fs/gfs2/file.c
+@@ -877,6 +877,20 @@ out_uninit:
+ return written ? written : ret;
+ }
+
++static ssize_t gfs2_file_buffered_write(struct kiocb *iocb, struct iov_iter *from)
++{
++ struct file *file = iocb->ki_filp;
++ struct inode *inode = file_inode(file);
++ ssize_t ret;
++
++ current->backing_dev_info = inode_to_bdi(inode);
++ ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops);
++ current->backing_dev_info = NULL;
++ if (ret > 0)
++ iocb->ki_pos += ret;
++ return ret;
++}
++
+ /**
+ * gfs2_file_write_iter - Perform a write to a file
+ * @iocb: The io context
+@@ -928,9 +942,7 @@ static ssize_t gfs2_file_write_iter(stru
+ goto out_unlock;
+
+ iocb->ki_flags |= IOCB_DSYNC;
+- current->backing_dev_info = inode_to_bdi(inode);
+- buffered = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops);
+- current->backing_dev_info = NULL;
++ buffered = gfs2_file_buffered_write(iocb, from);
+ if (unlikely(buffered <= 0)) {
+ if (!ret)
+ ret = buffered;
+@@ -944,7 +956,6 @@ static ssize_t gfs2_file_write_iter(stru
+ * the direct I/O range as we don't know if the buffered pages
+ * made it to disk.
+ */
+- iocb->ki_pos += buffered;
+ ret2 = generic_write_sync(iocb, buffered);
+ invalidate_mapping_pages(mapping,
+ (iocb->ki_pos - buffered) >> PAGE_SHIFT,
+@@ -952,13 +963,9 @@ static ssize_t gfs2_file_write_iter(stru
+ if (!ret || ret2 > 0)
+ ret += ret2;
+ } else {
+- current->backing_dev_info = inode_to_bdi(inode);
+- ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops);
+- current->backing_dev_info = NULL;
+- if (likely(ret > 0)) {
+- iocb->ki_pos += ret;
++ ret = gfs2_file_buffered_write(iocb, from);
++ if (likely(ret > 0))
+ ret = generic_write_sync(iocb, ret);
+- }
+ }
+
+ out_unlock:
--- /dev/null
+From foo@baz Fri Apr 29 11:07:48 AM CEST 2022
+From: Anand Jain <anand.jain@oracle.com>
+Date: Fri, 15 Apr 2022 06:28:43 +0800
+Subject: gfs2: Clean up function may_grant
+To: stable@vger.kernel.org
+Cc: linux-btrfs@vger.kernel.org, Andreas Gruenbacher <agruenba@redhat.com>, Anand Jain <anand.jain@oracle.com>
+Message-ID: <16061e1d0b15ee024905913510b9569e0c5011b4.1649951733.git.anand.jain@oracle.com>
+
+From: Andreas Gruenbacher <agruenba@redhat.com>
+
+commit 6144464937fe1e6135b13a30502a339d549bf093 upstream
+
+Pass the first current glock holder into function may_grant and
+deobfuscate the logic there.
+
+While at it, switch from BUG_ON to GLOCK_BUG_ON in may_grant. To make
+that build cleanly, de-constify the may_grant arguments.
+
+We're now using function find_first_holder in do_promote, so move the
+function's definition above do_promote.
+
+Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
+Signed-off-by: Anand Jain <anand.jain@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/gfs2/glock.c | 119 ++++++++++++++++++++++++++++++++------------------------
+ 1 file changed, 69 insertions(+), 50 deletions(-)
+
+--- a/fs/gfs2/glock.c
++++ b/fs/gfs2/glock.c
+@@ -301,46 +301,59 @@ void gfs2_glock_put(struct gfs2_glock *g
+ }
+
+ /**
+- * may_grant - check if its ok to grant a new lock
++ * may_grant - check if it's ok to grant a new lock
+ * @gl: The glock
++ * @current_gh: One of the current holders of @gl
+ * @gh: The lock request which we wish to grant
+ *
+- * Returns: true if its ok to grant the lock
++ * With our current compatibility rules, if a glock has one or more active
++ * holders (HIF_HOLDER flag set), any of those holders can be passed in as
++ * @current_gh; they are all the same as far as compatibility with the new @gh
++ * goes.
++ *
++ * Returns true if it's ok to grant the lock.
+ */
+
+-static inline int may_grant(const struct gfs2_glock *gl, const struct gfs2_holder *gh)
+-{
+- const struct gfs2_holder *gh_head = list_first_entry(&gl->gl_holders, const struct gfs2_holder, gh_list);
++static inline bool may_grant(struct gfs2_glock *gl,
++ struct gfs2_holder *current_gh,
++ struct gfs2_holder *gh)
++{
++ if (current_gh) {
++ GLOCK_BUG_ON(gl, !test_bit(HIF_HOLDER, ¤t_gh->gh_iflags));
++
++ switch(current_gh->gh_state) {
++ case LM_ST_EXCLUSIVE:
++ /*
++ * Here we make a special exception to grant holders
++ * who agree to share the EX lock with other holders
++ * who also have the bit set. If the original holder
++ * has the LM_FLAG_NODE_SCOPE bit set, we grant more
++ * holders with the bit set.
++ */
++ return gh->gh_state == LM_ST_EXCLUSIVE &&
++ (current_gh->gh_flags & LM_FLAG_NODE_SCOPE) &&
++ (gh->gh_flags & LM_FLAG_NODE_SCOPE);
+
+- if (gh != gh_head) {
+- /**
+- * Here we make a special exception to grant holders who agree
+- * to share the EX lock with other holders who also have the
+- * bit set. If the original holder has the LM_FLAG_NODE_SCOPE bit
+- * is set, we grant more holders with the bit set.
+- */
+- if (gh_head->gh_state == LM_ST_EXCLUSIVE &&
+- (gh_head->gh_flags & LM_FLAG_NODE_SCOPE) &&
+- gh->gh_state == LM_ST_EXCLUSIVE &&
+- (gh->gh_flags & LM_FLAG_NODE_SCOPE))
+- return 1;
+- if ((gh->gh_state == LM_ST_EXCLUSIVE ||
+- gh_head->gh_state == LM_ST_EXCLUSIVE))
+- return 0;
++ case LM_ST_SHARED:
++ case LM_ST_DEFERRED:
++ return gh->gh_state == current_gh->gh_state;
++
++ default:
++ return false;
++ }
+ }
++
+ if (gl->gl_state == gh->gh_state)
+- return 1;
++ return true;
+ if (gh->gh_flags & GL_EXACT)
+- return 0;
++ return false;
+ if (gl->gl_state == LM_ST_EXCLUSIVE) {
+- if (gh->gh_state == LM_ST_SHARED && gh_head->gh_state == LM_ST_SHARED)
+- return 1;
+- if (gh->gh_state == LM_ST_DEFERRED && gh_head->gh_state == LM_ST_DEFERRED)
+- return 1;
++ return gh->gh_state == LM_ST_SHARED ||
++ gh->gh_state == LM_ST_DEFERRED;
+ }
+- if (gl->gl_state != LM_ST_UNLOCKED && (gh->gh_flags & LM_FLAG_ANY))
+- return 1;
+- return 0;
++ if (gh->gh_flags & LM_FLAG_ANY)
++ return gl->gl_state != LM_ST_UNLOCKED;
++ return false;
+ }
+
+ static void gfs2_holder_wake(struct gfs2_holder *gh)
+@@ -381,6 +394,24 @@ static void do_error(struct gfs2_glock *
+ }
+
+ /**
++ * find_first_holder - find the first "holder" gh
++ * @gl: the glock
++ */
++
++static inline struct gfs2_holder *find_first_holder(const struct gfs2_glock *gl)
++{
++ struct gfs2_holder *gh;
++
++ if (!list_empty(&gl->gl_holders)) {
++ gh = list_first_entry(&gl->gl_holders, struct gfs2_holder,
++ gh_list);
++ if (test_bit(HIF_HOLDER, &gh->gh_iflags))
++ return gh;
++ }
++ return NULL;
++}
++
++/**
+ * do_promote - promote as many requests as possible on the current queue
+ * @gl: The glock
+ *
+@@ -393,14 +424,15 @@ __releases(&gl->gl_lockref.lock)
+ __acquires(&gl->gl_lockref.lock)
+ {
+ const struct gfs2_glock_operations *glops = gl->gl_ops;
+- struct gfs2_holder *gh, *tmp;
++ struct gfs2_holder *gh, *tmp, *first_gh;
+ int ret;
+
+ restart:
++ first_gh = find_first_holder(gl);
+ list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) {
+ if (test_bit(HIF_HOLDER, &gh->gh_iflags))
+ continue;
+- if (may_grant(gl, gh)) {
++ if (may_grant(gl, first_gh, gh)) {
+ if (gh->gh_list.prev == &gl->gl_holders &&
+ glops->go_lock) {
+ spin_unlock(&gl->gl_lockref.lock);
+@@ -723,23 +755,6 @@ out:
+ }
+
+ /**
+- * find_first_holder - find the first "holder" gh
+- * @gl: the glock
+- */
+-
+-static inline struct gfs2_holder *find_first_holder(const struct gfs2_glock *gl)
+-{
+- struct gfs2_holder *gh;
+-
+- if (!list_empty(&gl->gl_holders)) {
+- gh = list_first_entry(&gl->gl_holders, struct gfs2_holder, gh_list);
+- if (test_bit(HIF_HOLDER, &gh->gh_iflags))
+- return gh;
+- }
+- return NULL;
+-}
+-
+-/**
+ * run_queue - do all outstanding tasks related to a glock
+ * @gl: The glock in question
+ * @nonblock: True if we must not block in run_queue
+@@ -1354,8 +1369,12 @@ __acquires(&gl->gl_lockref.lock)
+ GLOCK_BUG_ON(gl, true);
+
+ if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) {
+- if (test_bit(GLF_LOCK, &gl->gl_flags))
+- try_futile = !may_grant(gl, gh);
++ if (test_bit(GLF_LOCK, &gl->gl_flags)) {
++ struct gfs2_holder *first_gh;
++
++ first_gh = find_first_holder(gl);
++ try_futile = !may_grant(gl, first_gh, gh);
++ }
+ if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags))
+ goto fail;
+ }
--- /dev/null
+From foo@baz Fri Apr 29 11:07:48 AM CEST 2022
+From: Anand Jain <anand.jain@oracle.com>
+Date: Fri, 15 Apr 2022 06:28:46 +0800
+Subject: gfs2: Eliminate ip->i_gh
+To: stable@vger.kernel.org
+Cc: linux-btrfs@vger.kernel.org, Andreas Gruenbacher <agruenba@redhat.com>, Anand Jain <anand.jain@oracle.com>
+Message-ID: <844b20e15b0e730c43faa93347d7a65ac4e7b465.1649951733.git.anand.jain@oracle.com>
+
+From: Andreas Gruenbacher <agruenba@redhat.com>
+
+commit 1b223f7065bc7d89c4677c27381817cc95b117a8 upstream
+
+Now that gfs2_file_buffered_write is the only remaining user of
+ip->i_gh, we can move the glock holder to the stack (or rather, use the
+one we already have on the stack); there is no need for keeping the
+holder in the inode anymore.
+
+This is slightly complicated by the fact that we're using ip->i_gh for
+the statfs inode in gfs2_file_buffered_write as well. Writing to the
+statfs inode isn't very common, so allocate the statfs holder
+dynamically when needed.
+
+Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
+Signed-off-by: Anand Jain <anand.jain@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/gfs2/file.c | 34 +++++++++++++++++++++-------------
+ fs/gfs2/incore.h | 3 +--
+ 2 files changed, 22 insertions(+), 15 deletions(-)
+
+--- a/fs/gfs2/file.c
++++ b/fs/gfs2/file.c
+@@ -877,16 +877,25 @@ out_uninit:
+ return written ? written : ret;
+ }
+
+-static ssize_t gfs2_file_buffered_write(struct kiocb *iocb, struct iov_iter *from)
++static ssize_t gfs2_file_buffered_write(struct kiocb *iocb,
++ struct iov_iter *from,
++ struct gfs2_holder *gh)
+ {
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file_inode(file);
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_sbd *sdp = GFS2_SB(inode);
++ struct gfs2_holder *statfs_gh = NULL;
+ ssize_t ret;
+
+- gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
+- ret = gfs2_glock_nq(&ip->i_gh);
++ if (inode == sdp->sd_rindex) {
++ statfs_gh = kmalloc(sizeof(*statfs_gh), GFP_NOFS);
++ if (!statfs_gh)
++ return -ENOMEM;
++ }
++
++ gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, gh);
++ ret = gfs2_glock_nq(gh);
+ if (ret)
+ goto out_uninit;
+
+@@ -894,7 +903,7 @@ static ssize_t gfs2_file_buffered_write(
+ struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
+
+ ret = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE,
+- GL_NOCACHE, &m_ip->i_gh);
++ GL_NOCACHE, statfs_gh);
+ if (ret)
+ goto out_unlock;
+ }
+@@ -905,16 +914,15 @@ static ssize_t gfs2_file_buffered_write(
+ if (ret > 0)
+ iocb->ki_pos += ret;
+
+- if (inode == sdp->sd_rindex) {
+- struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
+-
+- gfs2_glock_dq_uninit(&m_ip->i_gh);
+- }
++ if (inode == sdp->sd_rindex)
++ gfs2_glock_dq_uninit(statfs_gh);
+
+ out_unlock:
+- gfs2_glock_dq(&ip->i_gh);
++ gfs2_glock_dq(gh);
+ out_uninit:
+- gfs2_holder_uninit(&ip->i_gh);
++ gfs2_holder_uninit(gh);
++ if (statfs_gh)
++ kfree(statfs_gh);
+ return ret;
+ }
+
+@@ -969,7 +977,7 @@ static ssize_t gfs2_file_write_iter(stru
+ goto out_unlock;
+
+ iocb->ki_flags |= IOCB_DSYNC;
+- buffered = gfs2_file_buffered_write(iocb, from);
++ buffered = gfs2_file_buffered_write(iocb, from, &gh);
+ if (unlikely(buffered <= 0)) {
+ if (!ret)
+ ret = buffered;
+@@ -990,7 +998,7 @@ static ssize_t gfs2_file_write_iter(stru
+ if (!ret || ret2 > 0)
+ ret += ret2;
+ } else {
+- ret = gfs2_file_buffered_write(iocb, from);
++ ret = gfs2_file_buffered_write(iocb, from, &gh);
+ if (likely(ret > 0))
+ ret = generic_write_sync(iocb, ret);
+ }
+--- a/fs/gfs2/incore.h
++++ b/fs/gfs2/incore.h
+@@ -387,9 +387,8 @@ struct gfs2_inode {
+ u64 i_generation;
+ u64 i_eattr;
+ unsigned long i_flags; /* GIF_... */
+- struct gfs2_glock *i_gl; /* Move into i_gh? */
++ struct gfs2_glock *i_gl;
+ struct gfs2_holder i_iopen_gh;
+- struct gfs2_holder i_gh; /* for prepare/commit_write only */
+ struct gfs2_qadata *i_qadata; /* quota allocation data */
+ struct gfs2_holder i_rgd_gh;
+ struct gfs2_blkreserv i_res; /* rgrp multi-block reservation */
--- /dev/null
+From foo@baz Fri Apr 29 11:07:48 AM CEST 2022
+From: Anand Jain <anand.jain@oracle.com>
+Date: Fri, 15 Apr 2022 06:28:47 +0800
+Subject: gfs2: Fix mmap + page fault deadlocks for buffered I/O
+To: stable@vger.kernel.org
+Cc: linux-btrfs@vger.kernel.org, Andreas Gruenbacher <agruenba@redhat.com>, Anand Jain <anand.jain@oracle.com>
+Message-ID: <087a752bc8848ad8814bee4648d8b9d855c8438c.1649951733.git.anand.jain@oracle.com>
+
+From: Andreas Gruenbacher <agruenba@redhat.com>
+
+commit 00bfe02f479688a67a29019d1228f1470e26f014 upstream
+
+In the .read_iter and .write_iter file operations, we're accessing
+user-space memory while holding the inode glock. There is a possibility
+that the memory is mapped to the same file, in which case we'd recurse
+on the same glock.
+
+We could detect and work around this simple case of recursive locking,
+but more complex scenarios exist that involve multiple glocks,
+processes, and cluster nodes, and working around all of those cases
+isn't practical or even possible.
+
+Avoid these kinds of problems by disabling page faults while holding the
+inode glock. If a page fault would occur, we either end up with a
+partial read or write or with -EFAULT if nothing could be read or
+written. In either case, we know that we're not done with the
+operation, so we indicate that we're willing to give up the inode glock
+and then we fault in the missing pages. If that made us lose the inode
+glock, we return a partial read or write. Otherwise, we resume the
+operation.
+
+This locking problem was originally reported by Jan Kara. Linus came up
+with the idea of disabling page faults. Many thanks to Al Viro and
+Matthew Wilcox for their feedback.
+
+Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
+Signed-off-by: Anand Jain <anand.jain@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/gfs2/file.c | 99 ++++++++++++++++++++++++++++++++++++++++++++++++++++++---
+ 1 file changed, 94 insertions(+), 5 deletions(-)
+
+--- a/fs/gfs2/file.c
++++ b/fs/gfs2/file.c
+@@ -777,6 +777,36 @@ static int gfs2_fsync(struct file *file,
+ return ret ? ret : ret1;
+ }
+
++static inline bool should_fault_in_pages(ssize_t ret, struct iov_iter *i,
++ size_t *prev_count,
++ size_t *window_size)
++{
++ char __user *p = i->iov[0].iov_base + i->iov_offset;
++ size_t count = iov_iter_count(i);
++ int pages = 1;
++
++ if (likely(!count))
++ return false;
++ if (ret <= 0 && ret != -EFAULT)
++ return false;
++ if (!iter_is_iovec(i))
++ return false;
++
++ if (*prev_count != count || !*window_size) {
++ int pages, nr_dirtied;
++
++ pages = min_t(int, BIO_MAX_VECS,
++ DIV_ROUND_UP(iov_iter_count(i), PAGE_SIZE));
++ nr_dirtied = max(current->nr_dirtied_pause -
++ current->nr_dirtied, 1);
++ pages = min(pages, nr_dirtied);
++ }
++
++ *prev_count = count;
++ *window_size = (size_t)PAGE_SIZE * pages - offset_in_page(p);
++ return true;
++}
++
+ static ssize_t gfs2_file_direct_read(struct kiocb *iocb, struct iov_iter *to,
+ struct gfs2_holder *gh)
+ {
+@@ -841,9 +871,17 @@ static ssize_t gfs2_file_read_iter(struc
+ {
+ struct gfs2_inode *ip;
+ struct gfs2_holder gh;
++ size_t prev_count = 0, window_size = 0;
+ size_t written = 0;
+ ssize_t ret;
+
++ /*
++ * In this function, we disable page faults when we're holding the
++ * inode glock while doing I/O. If a page fault occurs, we indicate
++ * that the inode glock may be dropped, fault in the pages manually,
++ * and retry.
++ */
++
+ if (iocb->ki_flags & IOCB_DIRECT) {
+ ret = gfs2_file_direct_read(iocb, to, &gh);
+ if (likely(ret != -ENOTBLK))
+@@ -865,13 +903,34 @@ static ssize_t gfs2_file_read_iter(struc
+ }
+ ip = GFS2_I(iocb->ki_filp->f_mapping->host);
+ gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
++retry:
+ ret = gfs2_glock_nq(&gh);
+ if (ret)
+ goto out_uninit;
++retry_under_glock:
++ pagefault_disable();
+ ret = generic_file_read_iter(iocb, to);
++ pagefault_enable();
+ if (ret > 0)
+ written += ret;
+- gfs2_glock_dq(&gh);
++
++ if (should_fault_in_pages(ret, to, &prev_count, &window_size)) {
++ size_t leftover;
++
++ gfs2_holder_allow_demote(&gh);
++ leftover = fault_in_iov_iter_writeable(to, window_size);
++ gfs2_holder_disallow_demote(&gh);
++ if (leftover != window_size) {
++ if (!gfs2_holder_queued(&gh)) {
++ if (written)
++ goto out_uninit;
++ goto retry;
++ }
++ goto retry_under_glock;
++ }
++ }
++ if (gfs2_holder_queued(&gh))
++ gfs2_glock_dq(&gh);
+ out_uninit:
+ gfs2_holder_uninit(&gh);
+ return written ? written : ret;
+@@ -886,8 +945,17 @@ static ssize_t gfs2_file_buffered_write(
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_sbd *sdp = GFS2_SB(inode);
+ struct gfs2_holder *statfs_gh = NULL;
++ size_t prev_count = 0, window_size = 0;
++ size_t read = 0;
+ ssize_t ret;
+
++ /*
++ * In this function, we disable page faults when we're holding the
++ * inode glock while doing I/O. If a page fault occurs, we indicate
++ * that the inode glock may be dropped, fault in the pages manually,
++ * and retry.
++ */
++
+ if (inode == sdp->sd_rindex) {
+ statfs_gh = kmalloc(sizeof(*statfs_gh), GFP_NOFS);
+ if (!statfs_gh)
+@@ -895,10 +963,11 @@ static ssize_t gfs2_file_buffered_write(
+ }
+
+ gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, gh);
++retry:
+ ret = gfs2_glock_nq(gh);
+ if (ret)
+ goto out_uninit;
+-
++retry_under_glock:
+ if (inode == sdp->sd_rindex) {
+ struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
+
+@@ -909,21 +978,41 @@ static ssize_t gfs2_file_buffered_write(
+ }
+
+ current->backing_dev_info = inode_to_bdi(inode);
++ pagefault_disable();
+ ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops);
++ pagefault_enable();
+ current->backing_dev_info = NULL;
+- if (ret > 0)
++ if (ret > 0) {
+ iocb->ki_pos += ret;
++ read += ret;
++ }
+
+ if (inode == sdp->sd_rindex)
+ gfs2_glock_dq_uninit(statfs_gh);
+
++ if (should_fault_in_pages(ret, from, &prev_count, &window_size)) {
++ size_t leftover;
++
++ gfs2_holder_allow_demote(gh);
++ leftover = fault_in_iov_iter_readable(from, window_size);
++ gfs2_holder_disallow_demote(gh);
++ if (leftover != window_size) {
++ if (!gfs2_holder_queued(gh)) {
++ if (read)
++ goto out_uninit;
++ goto retry;
++ }
++ goto retry_under_glock;
++ }
++ }
+ out_unlock:
+- gfs2_glock_dq(gh);
++ if (gfs2_holder_queued(gh))
++ gfs2_glock_dq(gh);
+ out_uninit:
+ gfs2_holder_uninit(gh);
+ if (statfs_gh)
+ kfree(statfs_gh);
+- return ret;
++ return read ? read : ret;
+ }
+
+ /**
--- /dev/null
+From foo@baz Fri Apr 29 11:07:48 AM CEST 2022
+From: Anand Jain <anand.jain@oracle.com>
+Date: Fri, 15 Apr 2022 06:28:53 +0800
+Subject: gfs2: Fix mmap + page fault deadlocks for direct I/O
+To: stable@vger.kernel.org
+Cc: linux-btrfs@vger.kernel.org, Andreas Gruenbacher <agruenba@redhat.com>, Anand Jain <anand.jain@oracle.com>
+Message-ID: <02aca00403b19d316add3a4c835d40436a615103.1649951733.git.anand.jain@oracle.com>
+
+From: Andreas Gruenbacher <agruenba@redhat.com>
+
+commit b01b2d72da25c000aeb124bc78daf3fb998be2b6 upstream
+
+Also disable page faults during direct I/O requests and implement a
+similar kind of retry logic as in the buffered I/O case.
+
+The retry logic in the direct I/O case differs from the buffered I/O
+case in the following way: direct I/O doesn't provide the kinds of
+consistency guarantees between concurrent reads and writes that buffered
+I/O provides, so once we lose the inode glock while faulting in user
+pages, we always resume the operation. We never need to return a
+partial read or write.
+
+This locking problem was originally reported by Jan Kara. Linus came up
+with the idea of disabling page faults. Many thanks to Al Viro and
+Matthew Wilcox for their feedback.
+
+Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
+Signed-off-by: Anand Jain <anand.jain@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/gfs2/file.c | 101 +++++++++++++++++++++++++++++++++++++++++++++++++--------
+ 1 file changed, 88 insertions(+), 13 deletions(-)
+
+--- a/fs/gfs2/file.c
++++ b/fs/gfs2/file.c
+@@ -812,22 +812,64 @@ static ssize_t gfs2_file_direct_read(str
+ {
+ struct file *file = iocb->ki_filp;
+ struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
+- size_t count = iov_iter_count(to);
++ size_t prev_count = 0, window_size = 0;
++ size_t written = 0;
+ ssize_t ret;
+
+- if (!count)
++ /*
++ * In this function, we disable page faults when we're holding the
++ * inode glock while doing I/O. If a page fault occurs, we indicate
++ * that the inode glock may be dropped, fault in the pages manually,
++ * and retry.
++ *
++ * Unlike generic_file_read_iter, for reads, iomap_dio_rw can trigger
++ * physical as well as manual page faults, and we need to disable both
++ * kinds.
++ *
++ * For direct I/O, gfs2 takes the inode glock in deferred mode. This
++ * locking mode is compatible with other deferred holders, so multiple
++ * processes and nodes can do direct I/O to a file at the same time.
++ * There's no guarantee that reads or writes will be atomic. Any
++ * coordination among readers and writers needs to happen externally.
++ */
++
++ if (!iov_iter_count(to))
+ return 0; /* skip atime */
+
+ gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, gh);
++retry:
+ ret = gfs2_glock_nq(gh);
+ if (ret)
+ goto out_uninit;
+-
+- ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL, 0, 0);
+- gfs2_glock_dq(gh);
++retry_under_glock:
++ pagefault_disable();
++ to->nofault = true;
++ ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL,
++ IOMAP_DIO_PARTIAL, written);
++ to->nofault = false;
++ pagefault_enable();
++ if (ret > 0)
++ written = ret;
++
++ if (should_fault_in_pages(ret, to, &prev_count, &window_size)) {
++ size_t leftover;
++
++ gfs2_holder_allow_demote(gh);
++ leftover = fault_in_iov_iter_writeable(to, window_size);
++ gfs2_holder_disallow_demote(gh);
++ if (leftover != window_size) {
++ if (!gfs2_holder_queued(gh))
++ goto retry;
++ goto retry_under_glock;
++ }
++ }
++ if (gfs2_holder_queued(gh))
++ gfs2_glock_dq(gh);
+ out_uninit:
+ gfs2_holder_uninit(gh);
+- return ret;
++ if (ret < 0)
++ return ret;
++ return written;
+ }
+
+ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from,
+@@ -836,11 +878,21 @@ static ssize_t gfs2_file_direct_write(st
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file->f_mapping->host;
+ struct gfs2_inode *ip = GFS2_I(inode);
+- size_t len = iov_iter_count(from);
+- loff_t offset = iocb->ki_pos;
++ size_t prev_count = 0, window_size = 0;
++ size_t read = 0;
+ ssize_t ret;
+
+ /*
++ * In this function, we disable page faults when we're holding the
++ * inode glock while doing I/O. If a page fault occurs, we indicate
++ * that the inode glock may be dropped, fault in the pages manually,
++ * and retry.
++ *
++ * For writes, iomap_dio_rw only triggers manual page faults, so we
++ * don't need to disable physical ones.
++ */
++
++ /*
+ * Deferred lock, even if its a write, since we do no allocation on
+ * this path. All we need to change is the atime, and this lock mode
+ * ensures that other nodes have flushed their buffered read caches
+@@ -849,22 +901,45 @@ static ssize_t gfs2_file_direct_write(st
+ * VFS does.
+ */
+ gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, gh);
++retry:
+ ret = gfs2_glock_nq(gh);
+ if (ret)
+ goto out_uninit;
+-
++retry_under_glock:
+ /* Silently fall back to buffered I/O when writing beyond EOF */
+- if (offset + len > i_size_read(&ip->i_inode))
++ if (iocb->ki_pos + iov_iter_count(from) > i_size_read(&ip->i_inode))
+ goto out;
+
+- ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL, 0, 0);
++ from->nofault = true;
++ ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL,
++ IOMAP_DIO_PARTIAL, read);
++ from->nofault = false;
++
+ if (ret == -ENOTBLK)
+ ret = 0;
++ if (ret > 0)
++ read = ret;
++
++ if (should_fault_in_pages(ret, from, &prev_count, &window_size)) {
++ size_t leftover;
++
++ gfs2_holder_allow_demote(gh);
++ leftover = fault_in_iov_iter_readable(from, window_size);
++ gfs2_holder_disallow_demote(gh);
++ if (leftover != window_size) {
++ if (!gfs2_holder_queued(gh))
++ goto retry;
++ goto retry_under_glock;
++ }
++ }
+ out:
+- gfs2_glock_dq(gh);
++ if (gfs2_holder_queued(gh))
++ gfs2_glock_dq(gh);
+ out_uninit:
+ gfs2_holder_uninit(gh);
+- return ret;
++ if (ret < 0)
++ return ret;
++ return read;
+ }
+
+ static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
--- /dev/null
+From foo@baz Fri Apr 29 11:07:48 AM CEST 2022
+From: Anand Jain <anand.jain@oracle.com>
+Date: Fri, 15 Apr 2022 06:28:44 +0800
+Subject: gfs2: Introduce flag for glock holder auto-demotion
+To: stable@vger.kernel.org
+Cc: linux-btrfs@vger.kernel.org, Bob Peterson <rpeterso@redhat.com>, Andreas Gruenbacher <agruenba@redhat.com>, Anand Jain <anand.jain@oracle.com>
+Message-ID: <51a4309baa83be7f31064db7fad3b9d3649d239d.1649951733.git.anand.jain@oracle.com>
+
+From: Bob Peterson <rpeterso@redhat.com>
+
+commit dc732906c2450939c319fec6e258aa89ecb5a632 upstream
+
+This patch introduces a new HIF_MAY_DEMOTE flag and infrastructure that
+will allow glocks to be demoted automatically on locking conflicts.
+When a locking request comes in that isn't compatible with the locking
+state of an active holder and that holder has the HIF_MAY_DEMOTE flag
+set, the holder will be demoted before the incoming locking request is
+granted.
+
+Note that this mechanism demotes active holders (with the HIF_HOLDER
+flag set), while before we were only demoting glocks without any active
+holders. This allows processes to keep hold of locks that may form a
+cyclic locking dependency; the core glock logic will then break those
+dependencies in case a conflicting locking request occurs. We'll use
+this to avoid giving up the inode glock proactively before faulting in
+pages.
+
+Processes that allow a glock holder to be taken away indicate this by
+calling gfs2_holder_allow_demote(), which sets the HIF_MAY_DEMOTE flag.
+Later, they call gfs2_holder_disallow_demote() to clear the flag again,
+and then they check if their holder is still queued: if it is, they are
+still holding the glock; if it isn't, they can re-acquire the glock (or
+abort).
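+
+Schematically, a user of this mechanism (such as the gfs2 read and
+write paths converted later in this series) follows the pattern below.
+This is only an outline of the calling convention, not a complete
+function:
+
+  gfs2_holder_allow_demote(gh);   /* the glock may now be taken away */
+  /* ... fault in user pages, or do other work that may block ... */
+  gfs2_holder_disallow_demote(gh);
+
+  if (!gfs2_holder_queued(gh)) {
+          /* Lost the glock while it was demotable: re-acquire or bail. */
+          goto retry;
+  }
+  /* Still an active holder: continue under the glock. */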
+
+Signed-off-by: Bob Peterson <rpeterso@redhat.com>
+Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
+Signed-off-by: Anand Jain <anand.jain@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/gfs2/glock.c | 215 +++++++++++++++++++++++++++++++++++++++++++++----------
+ fs/gfs2/glock.h | 20 +++++
+ fs/gfs2/incore.h | 1
+ 3 files changed, 200 insertions(+), 36 deletions(-)
+
+--- a/fs/gfs2/glock.c
++++ b/fs/gfs2/glock.c
+@@ -58,6 +58,7 @@ struct gfs2_glock_iter {
+ typedef void (*glock_examiner) (struct gfs2_glock * gl);
+
+ static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target);
++static void __gfs2_glock_dq(struct gfs2_holder *gh);
+
+ static struct dentry *gfs2_root;
+ static struct workqueue_struct *glock_workqueue;
+@@ -197,6 +198,12 @@ static int demote_ok(const struct gfs2_g
+
+ if (gl->gl_state == LM_ST_UNLOCKED)
+ return 0;
++ /*
++ * Note that demote_ok is used for the lru process of disposing of
++ * glocks. For this purpose, we don't care if the glock's holders
++ * have the HIF_MAY_DEMOTE flag set or not. If someone is using
++ * them, don't demote.
++ */
+ if (!list_empty(&gl->gl_holders))
+ return 0;
+ if (glops->go_demote_ok)
+@@ -379,7 +386,7 @@ static void do_error(struct gfs2_glock *
+ struct gfs2_holder *gh, *tmp;
+
+ list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) {
+- if (test_bit(HIF_HOLDER, &gh->gh_iflags))
++ if (!test_bit(HIF_WAIT, &gh->gh_iflags))
+ continue;
+ if (ret & LM_OUT_ERROR)
+ gh->gh_error = -EIO;
+@@ -394,6 +401,40 @@ static void do_error(struct gfs2_glock *
+ }
+
+ /**
++ * demote_incompat_holders - demote incompatible demoteable holders
++ * @gl: the glock we want to promote
++ * @new_gh: the new holder to be promoted
++ */
++static void demote_incompat_holders(struct gfs2_glock *gl,
++ struct gfs2_holder *new_gh)
++{
++ struct gfs2_holder *gh;
++
++ /*
++ * Demote incompatible holders before we make ourselves eligible.
++ * (This holder may or may not allow auto-demoting, but we don't want
++ * to demote the new holder before it's even granted.)
++ */
++ list_for_each_entry(gh, &gl->gl_holders, gh_list) {
++ /*
++ * Since holders are at the front of the list, we stop when we
++ * find the first non-holder.
++ */
++ if (!test_bit(HIF_HOLDER, &gh->gh_iflags))
++ return;
++ if (test_bit(HIF_MAY_DEMOTE, &gh->gh_iflags) &&
++ !may_grant(gl, new_gh, gh)) {
++ /*
++ * We should not recurse into do_promote because
++ * __gfs2_glock_dq only calls handle_callback,
++ * gfs2_glock_add_to_lru and __gfs2_glock_queue_work.
++ */
++ __gfs2_glock_dq(gh);
++ }
++ }
++}
++
++/**
+ * find_first_holder - find the first "holder" gh
+ * @gl: the glock
+ */
+@@ -412,6 +453,26 @@ static inline struct gfs2_holder *find_f
+ }
+
+ /**
++ * find_first_strong_holder - find the first non-demoteable holder
++ * @gl: the glock
++ *
++ * Find the first holder that doesn't have the HIF_MAY_DEMOTE flag set.
++ */
++static inline struct gfs2_holder *
++find_first_strong_holder(struct gfs2_glock *gl)
++{
++ struct gfs2_holder *gh;
++
++ list_for_each_entry(gh, &gl->gl_holders, gh_list) {
++ if (!test_bit(HIF_HOLDER, &gh->gh_iflags))
++ return NULL;
++ if (!test_bit(HIF_MAY_DEMOTE, &gh->gh_iflags))
++ return gh;
++ }
++ return NULL;
++}
++
++/**
+ * do_promote - promote as many requests as possible on the current queue
+ * @gl: The glock
+ *
+@@ -425,14 +486,20 @@ __acquires(&gl->gl_lockref.lock)
+ {
+ const struct gfs2_glock_operations *glops = gl->gl_ops;
+ struct gfs2_holder *gh, *tmp, *first_gh;
++ bool incompat_holders_demoted = false;
+ int ret;
+
+ restart:
+- first_gh = find_first_holder(gl);
++ first_gh = find_first_strong_holder(gl);
+ list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) {
+- if (test_bit(HIF_HOLDER, &gh->gh_iflags))
++ if (!test_bit(HIF_WAIT, &gh->gh_iflags))
+ continue;
+ if (may_grant(gl, first_gh, gh)) {
++ if (!incompat_holders_demoted) {
++ demote_incompat_holders(gl, first_gh);
++ incompat_holders_demoted = true;
++ first_gh = gh;
++ }
+ if (gh->gh_list.prev == &gl->gl_holders &&
+ glops->go_lock) {
+ spin_unlock(&gl->gl_lockref.lock);
+@@ -458,6 +525,11 @@ restart:
+ gfs2_holder_wake(gh);
+ continue;
+ }
++ /*
++ * If we get here, it means we may not grant this holder for
++ * some reason. If this holder is the head of the list, it
++ * means we have a blocked holder at the head, so return 1.
++ */
+ if (gh->gh_list.prev == &gl->gl_holders)
+ return 1;
+ do_error(gl, 0);
+@@ -1372,7 +1444,7 @@ __acquires(&gl->gl_lockref.lock)
+ if (test_bit(GLF_LOCK, &gl->gl_flags)) {
+ struct gfs2_holder *first_gh;
+
+- first_gh = find_first_holder(gl);
++ first_gh = find_first_strong_holder(gl);
+ try_futile = !may_grant(gl, first_gh, gh);
+ }
+ if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags))
+@@ -1381,7 +1453,8 @@ __acquires(&gl->gl_lockref.lock)
+
+ list_for_each_entry(gh2, &gl->gl_holders, gh_list) {
+ if (unlikely(gh2->gh_owner_pid == gh->gh_owner_pid &&
+- (gh->gh_gl->gl_ops->go_type != LM_TYPE_FLOCK)))
++ (gh->gh_gl->gl_ops->go_type != LM_TYPE_FLOCK) &&
++ !test_bit(HIF_MAY_DEMOTE, &gh2->gh_iflags)))
+ goto trap_recursive;
+ if (try_futile &&
+ !(gh2->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) {
+@@ -1477,51 +1550,83 @@ int gfs2_glock_poll(struct gfs2_holder *
+ return test_bit(HIF_WAIT, &gh->gh_iflags) ? 0 : 1;
+ }
+
+-/**
+- * gfs2_glock_dq - dequeue a struct gfs2_holder from a glock (release a glock)
+- * @gh: the glock holder
+- *
+- */
++static inline bool needs_demote(struct gfs2_glock *gl)
++{
++ return (test_bit(GLF_DEMOTE, &gl->gl_flags) ||
++ test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags));
++}
+
+-void gfs2_glock_dq(struct gfs2_holder *gh)
++static void __gfs2_glock_dq(struct gfs2_holder *gh)
+ {
+ struct gfs2_glock *gl = gh->gh_gl;
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
+ unsigned delay = 0;
+ int fast_path = 0;
+
+- spin_lock(&gl->gl_lockref.lock);
+ /*
+- * If we're in the process of file system withdraw, we cannot just
+- * dequeue any glocks until our journal is recovered, lest we
+- * introduce file system corruption. We need two exceptions to this
+- * rule: We need to allow unlocking of nondisk glocks and the glock
+- * for our own journal that needs recovery.
++ * This while loop is similar to function demote_incompat_holders:
++ * If the glock is due to be demoted (which may be from another node
++ * or even if this holder is GL_NOCACHE), the weak holders are
++ * demoted as well, allowing the glock to be demoted.
+ */
+- if (test_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags) &&
+- glock_blocked_by_withdraw(gl) &&
+- gh->gh_gl != sdp->sd_jinode_gl) {
+- sdp->sd_glock_dqs_held++;
+- spin_unlock(&gl->gl_lockref.lock);
+- might_sleep();
+- wait_on_bit(&sdp->sd_flags, SDF_WITHDRAW_RECOVERY,
+- TASK_UNINTERRUPTIBLE);
+- spin_lock(&gl->gl_lockref.lock);
+- }
+- if (gh->gh_flags & GL_NOCACHE)
+- handle_callback(gl, LM_ST_UNLOCKED, 0, false);
++ while (gh) {
++ /*
++ * If we're in the process of file system withdraw, we cannot
++ * just dequeue any glocks until our journal is recovered, lest
++ * we introduce file system corruption. We need two exceptions
++ * to this rule: We need to allow unlocking of nondisk glocks
++ * and the glock for our own journal that needs recovery.
++ */
++ if (test_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags) &&
++ glock_blocked_by_withdraw(gl) &&
++ gh->gh_gl != sdp->sd_jinode_gl) {
++ sdp->sd_glock_dqs_held++;
++ spin_unlock(&gl->gl_lockref.lock);
++ might_sleep();
++ wait_on_bit(&sdp->sd_flags, SDF_WITHDRAW_RECOVERY,
++ TASK_UNINTERRUPTIBLE);
++ spin_lock(&gl->gl_lockref.lock);
++ }
+
+- list_del_init(&gh->gh_list);
+- clear_bit(HIF_HOLDER, &gh->gh_iflags);
+- if (list_empty(&gl->gl_holders) &&
+- !test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
+- !test_bit(GLF_DEMOTE, &gl->gl_flags))
+- fast_path = 1;
++ /*
++ * This holder should not be cached, so mark it for demote.
++ * Note: this should be done before the check for needs_demote
++ * below.
++ */
++ if (gh->gh_flags & GL_NOCACHE)
++ handle_callback(gl, LM_ST_UNLOCKED, 0, false);
++
++ list_del_init(&gh->gh_list);
++ clear_bit(HIF_HOLDER, &gh->gh_iflags);
++ trace_gfs2_glock_queue(gh, 0);
++
++ /*
++ * If there hasn't been a demote request we are done.
++ * (Let the remaining holders, if any, keep holding it.)
++ */
++ if (!needs_demote(gl)) {
++ if (list_empty(&gl->gl_holders))
++ fast_path = 1;
++ break;
++ }
++ /*
++ * If we have another strong holder (we cannot auto-demote)
++ * we are done. It keeps holding it until it is done.
++ */
++ if (find_first_strong_holder(gl))
++ break;
++
++ /*
++ * If we have a weak holder at the head of the list, it
++ * (and all others like it) must be auto-demoted. If there
++ * are no more weak holders, we exit the while loop.
++ */
++ gh = find_first_holder(gl);
++ }
+
+ if (!test_bit(GLF_LFLUSH, &gl->gl_flags) && demote_ok(gl))
+ gfs2_glock_add_to_lru(gl);
+
+- trace_gfs2_glock_queue(gh, 0);
+ if (unlikely(!fast_path)) {
+ gl->gl_lockref.count++;
+ if (test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
+@@ -1530,6 +1635,19 @@ void gfs2_glock_dq(struct gfs2_holder *g
+ delay = gl->gl_hold_time;
+ __gfs2_glock_queue_work(gl, delay);
+ }
++}
++
++/**
++ * gfs2_glock_dq - dequeue a struct gfs2_holder from a glock (release a glock)
++ * @gh: the glock holder
++ *
++ */
++void gfs2_glock_dq(struct gfs2_holder *gh)
++{
++ struct gfs2_glock *gl = gh->gh_gl;
++
++ spin_lock(&gl->gl_lockref.lock);
++ __gfs2_glock_dq(gh);
+ spin_unlock(&gl->gl_lockref.lock);
+ }
+
+@@ -1692,6 +1810,7 @@ void gfs2_glock_dq_m(unsigned int num_gh
+
+ void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state)
+ {
++ struct gfs2_holder mock_gh = { .gh_gl = gl, .gh_state = state, };
+ unsigned long delay = 0;
+ unsigned long holdtime;
+ unsigned long now = jiffies;
+@@ -1706,6 +1825,28 @@ void gfs2_glock_cb(struct gfs2_glock *gl
+ if (test_bit(GLF_REPLY_PENDING, &gl->gl_flags))
+ delay = gl->gl_hold_time;
+ }
++ /*
++ * Note 1: We cannot call demote_incompat_holders from handle_callback
++ * or gfs2_set_demote due to recursion problems like: gfs2_glock_dq ->
++ * handle_callback -> demote_incompat_holders -> gfs2_glock_dq
++ * Plus, we only want to demote the holders if the request comes from
++ * a remote cluster node because local holder conflicts are resolved
++ * elsewhere.
++ *
++ * Note 2: if a remote node wants this glock in EX mode, lock_dlm will
++ * request that we set our state to UNLOCKED. Here we mock up a holder
++ * to make it look like someone wants the lock EX locally. Any SH
++ * and DF requests should be able to share the lock without demoting.
++ *
++ * Note 3: We only want to demote the demoteable holders when there
++ * are no more strong holders. The demoteable holders might as well
++ * keep the glock until the last strong holder is done with it.
++ */
++ if (!find_first_strong_holder(gl)) {
++ if (state == LM_ST_UNLOCKED)
++ mock_gh.gh_state = LM_ST_EXCLUSIVE;
++ demote_incompat_holders(gl, &mock_gh);
++ }
+ handle_callback(gl, state, delay, true);
+ __gfs2_glock_queue_work(gl, delay);
+ spin_unlock(&gl->gl_lockref.lock);
+@@ -2097,6 +2238,8 @@ static const char *hflags2str(char *buf,
+ *p++ = 'H';
+ if (test_bit(HIF_WAIT, &iflags))
+ *p++ = 'W';
++ if (test_bit(HIF_MAY_DEMOTE, &iflags))
++ *p++ = 'D';
+ *p = 0;
+ return buf;
+ }
+--- a/fs/gfs2/glock.h
++++ b/fs/gfs2/glock.h
+@@ -150,6 +150,8 @@ static inline struct gfs2_holder *gfs2_g
+ list_for_each_entry(gh, &gl->gl_holders, gh_list) {
+ if (!test_bit(HIF_HOLDER, &gh->gh_iflags))
+ break;
++ if (test_bit(HIF_MAY_DEMOTE, &gh->gh_iflags))
++ continue;
+ if (gh->gh_owner_pid == pid)
+ goto out;
+ }
+@@ -325,6 +327,24 @@ static inline void glock_clear_object(st
+ spin_unlock(&gl->gl_lockref.lock);
+ }
+
++static inline void gfs2_holder_allow_demote(struct gfs2_holder *gh)
++{
++ struct gfs2_glock *gl = gh->gh_gl;
++
++ spin_lock(&gl->gl_lockref.lock);
++ set_bit(HIF_MAY_DEMOTE, &gh->gh_iflags);
++ spin_unlock(&gl->gl_lockref.lock);
++}
++
++static inline void gfs2_holder_disallow_demote(struct gfs2_holder *gh)
++{
++ struct gfs2_glock *gl = gh->gh_gl;
++
++ spin_lock(&gl->gl_lockref.lock);
++ clear_bit(HIF_MAY_DEMOTE, &gh->gh_iflags);
++ spin_unlock(&gl->gl_lockref.lock);
++}
++
+ extern void gfs2_inode_remember_delete(struct gfs2_glock *gl, u64 generation);
+ extern bool gfs2_inode_already_deleted(struct gfs2_glock *gl, u64 generation);
+
+--- a/fs/gfs2/incore.h
++++ b/fs/gfs2/incore.h
+@@ -252,6 +252,7 @@ struct gfs2_lkstats {
+
+ enum {
+ /* States */
++ HIF_MAY_DEMOTE = 1,
+ HIF_HOLDER = 6, /* Set for gh that "holds" the glock */
+ HIF_WAIT = 10,
+ };
--- /dev/null
+From foo@baz Fri Apr 29 11:07:48 AM CEST 2022
+From: Anand Jain <anand.jain@oracle.com>
+Date: Fri, 15 Apr 2022 06:28:45 +0800
+Subject: gfs2: Move the inode glock locking to gfs2_file_buffered_write
+To: stable@vger.kernel.org
+Cc: linux-btrfs@vger.kernel.org, Andreas Gruenbacher <agruenba@redhat.com>, Anand Jain <anand.jain@oracle.com>
+Message-ID: <cc3db66fcbea7329e3cc7246cd329b719f76f323.1649951733.git.anand.jain@oracle.com>
+
+From: Andreas Gruenbacher <agruenba@redhat.com>
+
+commit b924bdab7445946e2ed364a0e6e249d36f1f1158 upstream
+
+So far, for buffered writes, we were taking the inode glock in
+gfs2_iomap_begin and dropping it in gfs2_iomap_end with the intention of
+not holding the inode glock while iomap_write_actor faults in user
+pages. It turns out that iomap_write_actor is called inside iomap_begin
+... iomap_end, so the user pages were still faulted in while holding the
+inode glock and the locking code in iomap_begin / iomap_end was
+inode glock, and the locking code in iomap_begin / iomap_end was
+completely pointless.
+
+Move the locking into gfs2_file_buffered_write instead. We'll take care
+of the potential deadlocks due to faulting in user pages while holding a
+glock in a subsequent patch.
+
+Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
+Signed-off-by: Anand Jain <anand.jain@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/gfs2/bmap.c | 60 ---------------------------------------------------------
+ fs/gfs2/file.c | 27 +++++++++++++++++++++++++
+ 2 files changed, 28 insertions(+), 59 deletions(-)
+
+--- a/fs/gfs2/bmap.c
++++ b/fs/gfs2/bmap.c
+@@ -961,46 +961,6 @@ hole_found:
+ goto out;
+ }
+
+-static int gfs2_write_lock(struct inode *inode)
+-{
+- struct gfs2_inode *ip = GFS2_I(inode);
+- struct gfs2_sbd *sdp = GFS2_SB(inode);
+- int error;
+-
+- gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
+- error = gfs2_glock_nq(&ip->i_gh);
+- if (error)
+- goto out_uninit;
+- if (&ip->i_inode == sdp->sd_rindex) {
+- struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
+-
+- error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE,
+- GL_NOCACHE, &m_ip->i_gh);
+- if (error)
+- goto out_unlock;
+- }
+- return 0;
+-
+-out_unlock:
+- gfs2_glock_dq(&ip->i_gh);
+-out_uninit:
+- gfs2_holder_uninit(&ip->i_gh);
+- return error;
+-}
+-
+-static void gfs2_write_unlock(struct inode *inode)
+-{
+- struct gfs2_inode *ip = GFS2_I(inode);
+- struct gfs2_sbd *sdp = GFS2_SB(inode);
+-
+- if (&ip->i_inode == sdp->sd_rindex) {
+- struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
+-
+- gfs2_glock_dq_uninit(&m_ip->i_gh);
+- }
+- gfs2_glock_dq_uninit(&ip->i_gh);
+-}
+-
+ static int gfs2_iomap_page_prepare(struct inode *inode, loff_t pos,
+ unsigned len)
+ {
+@@ -1118,11 +1078,6 @@ out_qunlock:
+ return ret;
+ }
+
+-static inline bool gfs2_iomap_need_write_lock(unsigned flags)
+-{
+- return (flags & IOMAP_WRITE) && !(flags & IOMAP_DIRECT);
+-}
+-
+ static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
+ unsigned flags, struct iomap *iomap,
+ struct iomap *srcmap)
+@@ -1135,12 +1090,6 @@ static int gfs2_iomap_begin(struct inode
+ iomap->flags |= IOMAP_F_BUFFER_HEAD;
+
+ trace_gfs2_iomap_start(ip, pos, length, flags);
+- if (gfs2_iomap_need_write_lock(flags)) {
+- ret = gfs2_write_lock(inode);
+- if (ret)
+- goto out;
+- }
+-
+ ret = __gfs2_iomap_get(inode, pos, length, flags, iomap, &mp);
+ if (ret)
+ goto out_unlock;
+@@ -1168,10 +1117,7 @@ static int gfs2_iomap_begin(struct inode
+ ret = gfs2_iomap_begin_write(inode, pos, length, flags, iomap, &mp);
+
+ out_unlock:
+- if (ret && gfs2_iomap_need_write_lock(flags))
+- gfs2_write_unlock(inode);
+ release_metapath(&mp);
+-out:
+ trace_gfs2_iomap_end(ip, iomap, ret);
+ return ret;
+ }
+@@ -1219,15 +1165,11 @@ static int gfs2_iomap_end(struct inode *
+ }
+
+ if (unlikely(!written))
+- goto out_unlock;
++ return 0;
+
+ if (iomap->flags & IOMAP_F_SIZE_CHANGED)
+ mark_inode_dirty(inode);
+ set_bit(GLF_DIRTY, &ip->i_gl->gl_flags);
+-
+-out_unlock:
+- if (gfs2_iomap_need_write_lock(flags))
+- gfs2_write_unlock(inode);
+ return 0;
+ }
+
+--- a/fs/gfs2/file.c
++++ b/fs/gfs2/file.c
+@@ -881,13 +881,40 @@ static ssize_t gfs2_file_buffered_write(
+ {
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file_inode(file);
++ struct gfs2_inode *ip = GFS2_I(inode);
++ struct gfs2_sbd *sdp = GFS2_SB(inode);
+ ssize_t ret;
+
++ gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
++ ret = gfs2_glock_nq(&ip->i_gh);
++ if (ret)
++ goto out_uninit;
++
++ if (inode == sdp->sd_rindex) {
++ struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
++
++ ret = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE,
++ GL_NOCACHE, &m_ip->i_gh);
++ if (ret)
++ goto out_unlock;
++ }
++
+ current->backing_dev_info = inode_to_bdi(inode);
+ ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops);
+ current->backing_dev_info = NULL;
+ if (ret > 0)
+ iocb->ki_pos += ret;
++
++ if (inode == sdp->sd_rindex) {
++ struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
++
++ gfs2_glock_dq_uninit(&m_ip->i_gh);
++ }
++
++out_unlock:
++ gfs2_glock_dq(&ip->i_gh);
++out_uninit:
++ gfs2_holder_uninit(&ip->i_gh);
+ return ret;
+ }
+
--- /dev/null
+From foo@baz Fri Apr 29 11:07:48 AM CEST 2022
+From: Anand Jain <anand.jain@oracle.com>
+Date: Fri, 15 Apr 2022 06:28:51 +0800
+Subject: gup: Introduce FOLL_NOFAULT flag to disable page faults
+To: stable@vger.kernel.org
+Cc: linux-btrfs@vger.kernel.org, Andreas Gruenbacher <agruenba@redhat.com>, Anand Jain <anand.jain@oracle.com>
+Message-ID: <2ee1e383ae1cca975426b54ab251257f6d4e12c0.1649951733.git.anand.jain@oracle.com>
+
+From: Andreas Gruenbacher <agruenba@redhat.com>
+
+commit 55b8fe703bc51200d4698596c90813453b35ae63 upstream
+
+Introduce a new FOLL_NOFAULT flag that causes get_user_pages to return
+-EFAULT when it would otherwise trigger a page fault. This is roughly
+similar to FOLL_FAST_ONLY but available on all architectures, and less
+fragile.
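+
+As a hedged illustration (not part of this patch; "addr" is a placeholder
+user address), a get_user_pages_fast() caller could pass the new flag to
+probe for resident pages without ever sleeping on a fault:
+
+	struct page *pages[8];
+	int n;
+
+	/* Fail with -EFAULT instead of faulting missing pages in. */
+	n = get_user_pages_fast(addr, ARRAY_SIZE(pages),
+				FOLL_WRITE | FOLL_NOFAULT, pages);
+	if (n <= 0) {
+		/* Pages not resident (or an error): fault them in
+		 * separately, or fall back to a path that may sleep. */
+	}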
+
+Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
+Signed-off-by: Anand Jain <anand.jain@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/mm.h | 3 ++-
+ mm/gup.c | 4 +++-
+ 2 files changed, 5 insertions(+), 2 deletions(-)
+
+--- a/include/linux/mm.h
++++ b/include/linux/mm.h
+@@ -2858,7 +2858,8 @@ struct page *follow_page(struct vm_area_
+ #define FOLL_FORCE 0x10 /* get_user_pages read/write w/o permission */
+ #define FOLL_NOWAIT 0x20 /* if a disk transfer is needed, start the IO
+ * and return without waiting upon it */
+-#define FOLL_POPULATE 0x40 /* fault in page */
++#define FOLL_POPULATE 0x40 /* fault in pages (with FOLL_MLOCK) */
++#define FOLL_NOFAULT 0x80 /* do not fault in pages */
+ #define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */
+ #define FOLL_NUMA 0x200 /* force NUMA hinting page fault */
+ #define FOLL_MIGRATION 0x400 /* wait for page to replace migration entry */
+--- a/mm/gup.c
++++ b/mm/gup.c
+@@ -943,6 +943,8 @@ static int faultin_page(struct vm_area_s
+ /* mlock all present pages, but do not fault in new pages */
+ if ((*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK)
+ return -ENOENT;
++ if (*flags & FOLL_NOFAULT)
++ return -EFAULT;
+ if (*flags & FOLL_WRITE)
+ fault_flags |= FAULT_FLAG_WRITE;
+ if (*flags & FOLL_REMOTE)
+@@ -2868,7 +2870,7 @@ static int internal_get_user_pages_fast(
+
+ if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM |
+ FOLL_FORCE | FOLL_PIN | FOLL_GET |
+- FOLL_FAST_ONLY)))
++ FOLL_FAST_ONLY | FOLL_NOFAULT)))
+ return -EINVAL;
+
+ if (gup_flags & FOLL_PIN)
--- /dev/null
+From foo@baz Fri Apr 29 11:07:48 AM CEST 2022
+From: Anand Jain <anand.jain@oracle.com>
+Date: Fri, 15 Apr 2022 06:28:39 +0800
+Subject: gup: Turn fault_in_pages_{readable,writeable} into fault_in_{readable,writeable}
+To: stable@vger.kernel.org
+Cc: linux-btrfs@vger.kernel.org, Andreas Gruenbacher <agruenba@redhat.com>, Anand Jain <anand.jain@oracle.com>
+Message-ID: <92b6e65e73dd2764bef59e0e20b65143ab28914a.1649951733.git.anand.jain@oracle.com>
+
+From: Andreas Gruenbacher <agruenba@redhat.com>
+
+commit bb523b406c849eef8f265a07cd7f320f1f177743 upstream
+
+Turn fault_in_pages_{readable,writeable} into versions that return the
+number of bytes not faulted in, similar to copy_to_user, instead of
+returning a non-zero value when any of the requested pages couldn't be
+faulted in. This supports the existing users that require all pages to
+be faulted in as well as new users that are happy if any pages can be
+faulted in.
+
+Rename the functions to fault_in_{readable,writeable} to make sure
+this change doesn't silently break things.
+
+Neither of these functions is entirely trivial and it doesn't seem
+useful to inline them, so move them to mm/gup.c.
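+
+For illustration only (not part of this patch; "uaddr" and "len" are
+placeholders), the new return convention supports both kinds of callers:
+
+	size_t not_faulted;
+
+	/* Existing style: require the whole range to be faulted in. */
+	if (fault_in_readable(uaddr, len))
+		return -EFAULT;
+
+	/* New style: proceed as long as part of the range is accessible. */
+	not_faulted = fault_in_writeable(uaddr, len);
+	if (not_faulted == len)
+		return -EFAULT;		/* nothing could be faulted in */
+	len -= not_faulted;		/* operate on the accessible prefix */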
+
+Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
+Signed-off-by: Anand Jain <anand.jain@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/powerpc/kernel/kvm.c | 3 +
+ arch/powerpc/kernel/signal_32.c | 4 +-
+ arch/powerpc/kernel/signal_64.c | 2 -
+ arch/x86/kernel/fpu/signal.c | 7 +--
+ drivers/gpu/drm/armada/armada_gem.c | 7 +--
+ fs/btrfs/ioctl.c | 5 +-
+ include/linux/pagemap.h | 57 +---------------------------
+ lib/iov_iter.c | 10 ++---
+ mm/filemap.c | 2 -
+ mm/gup.c | 72 ++++++++++++++++++++++++++++++++++++
+ 10 files changed, 93 insertions(+), 76 deletions(-)
+
+--- a/arch/powerpc/kernel/kvm.c
++++ b/arch/powerpc/kernel/kvm.c
+@@ -669,7 +669,8 @@ static void __init kvm_use_magic_page(vo
+ on_each_cpu(kvm_map_magic_page, &features, 1);
+
+ /* Quick self-test to see if the mapping works */
+- if (fault_in_pages_readable((const char *)KVM_MAGIC_PAGE, sizeof(u32))) {
++ if (fault_in_readable((const char __user *)KVM_MAGIC_PAGE,
++ sizeof(u32))) {
+ kvm_patching_worked = false;
+ return;
+ }
+--- a/arch/powerpc/kernel/signal_32.c
++++ b/arch/powerpc/kernel/signal_32.c
+@@ -1048,7 +1048,7 @@ SYSCALL_DEFINE3(swapcontext, struct ucon
+ if (new_ctx == NULL)
+ return 0;
+ if (!access_ok(new_ctx, ctx_size) ||
+- fault_in_pages_readable((u8 __user *)new_ctx, ctx_size))
++ fault_in_readable((char __user *)new_ctx, ctx_size))
+ return -EFAULT;
+
+ /*
+@@ -1239,7 +1239,7 @@ SYSCALL_DEFINE3(debug_setcontext, struct
+ #endif
+
+ if (!access_ok(ctx, sizeof(*ctx)) ||
+- fault_in_pages_readable((u8 __user *)ctx, sizeof(*ctx)))
++ fault_in_readable((char __user *)ctx, sizeof(*ctx)))
+ return -EFAULT;
+
+ /*
+--- a/arch/powerpc/kernel/signal_64.c
++++ b/arch/powerpc/kernel/signal_64.c
+@@ -688,7 +688,7 @@ SYSCALL_DEFINE3(swapcontext, struct ucon
+ if (new_ctx == NULL)
+ return 0;
+ if (!access_ok(new_ctx, ctx_size) ||
+- fault_in_pages_readable((u8 __user *)new_ctx, ctx_size))
++ fault_in_readable((char __user *)new_ctx, ctx_size))
+ return -EFAULT;
+
+ /*
+--- a/arch/x86/kernel/fpu/signal.c
++++ b/arch/x86/kernel/fpu/signal.c
+@@ -205,7 +205,7 @@ retry:
+ fpregs_unlock();
+
+ if (ret) {
+- if (!fault_in_pages_writeable(buf_fx, fpu_user_xstate_size))
++ if (!fault_in_writeable(buf_fx, fpu_user_xstate_size))
+ goto retry;
+ return -EFAULT;
+ }
+@@ -278,10 +278,9 @@ retry:
+ if (ret != -EFAULT)
+ return -EINVAL;
+
+- ret = fault_in_pages_readable(buf, size);
+- if (!ret)
++ if (!fault_in_readable(buf, size))
+ goto retry;
+- return ret;
++ return -EFAULT;
+ }
+
+ /*
+--- a/drivers/gpu/drm/armada/armada_gem.c
++++ b/drivers/gpu/drm/armada/armada_gem.c
+@@ -336,7 +336,7 @@ int armada_gem_pwrite_ioctl(struct drm_d
+ struct drm_armada_gem_pwrite *args = data;
+ struct armada_gem_object *dobj;
+ char __user *ptr;
+- int ret;
++ int ret = 0;
+
+ DRM_DEBUG_DRIVER("handle %u off %u size %u ptr 0x%llx\n",
+ args->handle, args->offset, args->size, args->ptr);
+@@ -349,9 +349,8 @@ int armada_gem_pwrite_ioctl(struct drm_d
+ if (!access_ok(ptr, args->size))
+ return -EFAULT;
+
+- ret = fault_in_pages_readable(ptr, args->size);
+- if (ret)
+- return ret;
++ if (fault_in_readable(ptr, args->size))
++ return -EFAULT;
+
+ dobj = armada_gem_object_lookup(file, args->handle);
+ if (dobj == NULL)
+--- a/fs/btrfs/ioctl.c
++++ b/fs/btrfs/ioctl.c
+@@ -2258,9 +2258,8 @@ static noinline int search_ioctl(struct
+ key.offset = sk->min_offset;
+
+ while (1) {
+- ret = fault_in_pages_writeable(ubuf + sk_offset,
+- *buf_size - sk_offset);
+- if (ret)
++ ret = -EFAULT;
++ if (fault_in_writeable(ubuf + sk_offset, *buf_size - sk_offset))
+ break;
+
+ ret = btrfs_search_forward(root, &key, path, sk->min_transid);
+--- a/include/linux/pagemap.h
++++ b/include/linux/pagemap.h
+@@ -733,61 +733,10 @@ int wait_on_page_private_2_killable(stru
+ extern void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter);
+
+ /*
+- * Fault everything in given userspace address range in.
++ * Fault in userspace address range.
+ */
+-static inline int fault_in_pages_writeable(char __user *uaddr, size_t size)
+-{
+- char __user *end = uaddr + size - 1;
+-
+- if (unlikely(size == 0))
+- return 0;
+-
+- if (unlikely(uaddr > end))
+- return -EFAULT;
+- /*
+- * Writing zeroes into userspace here is OK, because we know that if
+- * the zero gets there, we'll be overwriting it.
+- */
+- do {
+- if (unlikely(__put_user(0, uaddr) != 0))
+- return -EFAULT;
+- uaddr += PAGE_SIZE;
+- } while (uaddr <= end);
+-
+- /* Check whether the range spilled into the next page. */
+- if (((unsigned long)uaddr & PAGE_MASK) ==
+- ((unsigned long)end & PAGE_MASK))
+- return __put_user(0, end);
+-
+- return 0;
+-}
+-
+-static inline int fault_in_pages_readable(const char __user *uaddr, size_t size)
+-{
+- volatile char c;
+- const char __user *end = uaddr + size - 1;
+-
+- if (unlikely(size == 0))
+- return 0;
+-
+- if (unlikely(uaddr > end))
+- return -EFAULT;
+-
+- do {
+- if (unlikely(__get_user(c, uaddr) != 0))
+- return -EFAULT;
+- uaddr += PAGE_SIZE;
+- } while (uaddr <= end);
+-
+- /* Check whether the range spilled into the next page. */
+- if (((unsigned long)uaddr & PAGE_MASK) ==
+- ((unsigned long)end & PAGE_MASK)) {
+- return __get_user(c, end);
+- }
+-
+- (void)c;
+- return 0;
+-}
++size_t fault_in_writeable(char __user *uaddr, size_t size);
++size_t fault_in_readable(const char __user *uaddr, size_t size);
+
+ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
+ pgoff_t index, gfp_t gfp_mask);
+--- a/lib/iov_iter.c
++++ b/lib/iov_iter.c
+@@ -191,7 +191,7 @@ static size_t copy_page_to_iter_iovec(st
+ buf = iov->iov_base + skip;
+ copy = min(bytes, iov->iov_len - skip);
+
+- if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_pages_writeable(buf, copy)) {
++ if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_writeable(buf, copy)) {
+ kaddr = kmap_atomic(page);
+ from = kaddr + offset;
+
+@@ -275,7 +275,7 @@ static size_t copy_page_from_iter_iovec(
+ buf = iov->iov_base + skip;
+ copy = min(bytes, iov->iov_len - skip);
+
+- if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_pages_readable(buf, copy)) {
++ if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_readable(buf, copy)) {
+ kaddr = kmap_atomic(page);
+ to = kaddr + offset;
+
+@@ -447,13 +447,11 @@ int iov_iter_fault_in_readable(const str
+ bytes = i->count;
+ for (p = i->iov, skip = i->iov_offset; bytes; p++, skip = 0) {
+ size_t len = min(bytes, p->iov_len - skip);
+- int err;
+
+ if (unlikely(!len))
+ continue;
+- err = fault_in_pages_readable(p->iov_base + skip, len);
+- if (unlikely(err))
+- return err;
++ if (fault_in_readable(p->iov_base + skip, len))
++ return -EFAULT;
+ bytes -= len;
+ }
+ }
+--- a/mm/filemap.c
++++ b/mm/filemap.c
+@@ -90,7 +90,7 @@
+ * ->lock_page (filemap_fault, access_process_vm)
+ *
+ * ->i_rwsem (generic_perform_write)
+- * ->mmap_lock (fault_in_pages_readable->do_page_fault)
++ * ->mmap_lock (fault_in_readable->do_page_fault)
+ *
+ * bdi->wb.list_lock
+ * sb_lock (fs/fs-writeback.c)
+--- a/mm/gup.c
++++ b/mm/gup.c
+@@ -1682,6 +1682,78 @@ finish_or_fault:
+ #endif /* !CONFIG_MMU */
+
+ /**
++ * fault_in_writeable - fault in userspace address range for writing
++ * @uaddr: start of address range
++ * @size: size of address range
++ *
++ * Returns the number of bytes not faulted in (like copy_to_user() and
++ * copy_from_user()).
++ */
++size_t fault_in_writeable(char __user *uaddr, size_t size)
++{
++ char __user *start = uaddr, *end;
++
++ if (unlikely(size == 0))
++ return 0;
++ if (!PAGE_ALIGNED(uaddr)) {
++ if (unlikely(__put_user(0, uaddr) != 0))
++ return size;
++ uaddr = (char __user *)PAGE_ALIGN((unsigned long)uaddr);
++ }
++ end = (char __user *)PAGE_ALIGN((unsigned long)start + size);
++ if (unlikely(end < start))
++ end = NULL;
++ while (uaddr != end) {
++ if (unlikely(__put_user(0, uaddr) != 0))
++ goto out;
++ uaddr += PAGE_SIZE;
++ }
++
++out:
++ if (size > uaddr - start)
++ return size - (uaddr - start);
++ return 0;
++}
++EXPORT_SYMBOL(fault_in_writeable);
++
++/**
++ * fault_in_readable - fault in userspace address range for reading
++ * @uaddr: start of user address range
++ * @size: size of user address range
++ *
++ * Returns the number of bytes not faulted in (like copy_to_user() and
++ * copy_from_user()).
++ */
++size_t fault_in_readable(const char __user *uaddr, size_t size)
++{
++ const char __user *start = uaddr, *end;
++ volatile char c;
++
++ if (unlikely(size == 0))
++ return 0;
++ if (!PAGE_ALIGNED(uaddr)) {
++ if (unlikely(__get_user(c, uaddr) != 0))
++ return size;
++ uaddr = (const char __user *)PAGE_ALIGN((unsigned long)uaddr);
++ }
++ end = (const char __user *)PAGE_ALIGN((unsigned long)start + size);
++ if (unlikely(end < start))
++ end = NULL;
++ while (uaddr != end) {
++ if (unlikely(__get_user(c, uaddr) != 0))
++ goto out;
++ uaddr += PAGE_SIZE;
++ }
++
++out:
++ (void)c;
++ if (size > uaddr - start)
++ return size - (uaddr - start);
++ return 0;
++}
++EXPORT_SYMBOL(fault_in_readable);
++
++/**
+ * get_dump_page() - pin user page in memory while writing it to core dump
+ * @addr: user address
+ *
--- /dev/null
+From foo@baz Fri Apr 29 11:07:48 AM CEST 2022
+From: Anand Jain <anand.jain@oracle.com>
+Date: Fri, 15 Apr 2022 06:28:50 +0800
+Subject: iomap: Add done_before argument to iomap_dio_rw
+To: stable@vger.kernel.org
+Cc: linux-btrfs@vger.kernel.org, Andreas Gruenbacher <agruenba@redhat.com>, "Darrick J . Wong" <djwong@kernel.org>, Anand Jain <anand.jain@oracle.com>
+Message-ID: <db3340e7b4b9e65960ecdd2c4e1b08f3fe5a09ec.1649951733.git.anand.jain@oracle.com>
+
+From: Andreas Gruenbacher <agruenba@redhat.com>
+
+commit 4fdccaa0d184c202f98d73b24e3ec8eeee88ab8d upstream
+
+Add a done_before argument to iomap_dio_rw that indicates how much of
+the request has already been transferred. When the request succeeds, we
+report that done_before additional bytes were transferred. This is
+useful for finishing a request asynchronously when part of the request
+has already been completed synchronously.
+
+We'll use that to allow iomap_dio_rw to be used with page faults
+disabled: when a page fault occurs while submitting a request, we
+synchronously complete the part of the request that has already been
+submitted. The caller can then take care of the page fault and call
+iomap_dio_rw again for the rest of the request, passing in the number of
+bytes already transferred.
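+
+A hedged sketch of that retry pattern, modeled on a direct write path
+(illustrative only; "iocb", "iter", "ops" and "dops" are placeholders, and
+fault_in_iov_iter_readable() is introduced elsewhere in this series):
+
+	size_t done = 0, left;
+	ssize_t ret;
+
+retry:
+	ret = iomap_dio_rw(iocb, iter, ops, dops, IOMAP_DIO_PARTIAL, done);
+	left = iov_iter_count(iter);
+	if ((ret == -EFAULT || ret > 0) && left) {
+		if (ret > 0)
+			done = ret;	/* already includes earlier done bytes */
+		/* Fault the missing pages in; retry if that made progress. */
+		if (fault_in_iov_iter_readable(iter, left) != left)
+			goto retry;
+	}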
+
+Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Anand Jain <anand.jain@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/file.c | 5 +++--
+ fs/erofs/data.c | 2 +-
+ fs/ext4/file.c | 5 +++--
+ fs/gfs2/file.c | 4 ++--
+ fs/iomap/direct-io.c | 19 ++++++++++++++++---
+ fs/xfs/xfs_file.c | 6 +++---
+ fs/zonefs/super.c | 4 ++--
+ include/linux/iomap.h | 4 ++--
+ 8 files changed, 32 insertions(+), 17 deletions(-)
+
+--- a/fs/btrfs/file.c
++++ b/fs/btrfs/file.c
+@@ -1956,7 +1956,7 @@ relock:
+ }
+
+ dio = __iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
+- 0);
++ 0, 0);
+
+ btrfs_inode_unlock(inode, ilock_flags);
+
+@@ -3668,7 +3668,8 @@ static ssize_t btrfs_direct_read(struct
+ return 0;
+
+ btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
+- ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dio_ops, 0);
++ ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
++ 0, 0);
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
+ return ret;
+ }
+--- a/fs/erofs/data.c
++++ b/fs/erofs/data.c
+@@ -287,7 +287,7 @@ static ssize_t erofs_file_read_iter(stru
+
+ if (!err)
+ return iomap_dio_rw(iocb, to, &erofs_iomap_ops,
+- NULL, 0);
++ NULL, 0, 0);
+ if (err < 0)
+ return err;
+ }
+--- a/fs/ext4/file.c
++++ b/fs/ext4/file.c
+@@ -74,7 +74,7 @@ static ssize_t ext4_dio_read_iter(struct
+ return generic_file_read_iter(iocb, to);
+ }
+
+- ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL, 0);
++ ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL, 0, 0);
+ inode_unlock_shared(inode);
+
+ file_accessed(iocb->ki_filp);
+@@ -566,7 +566,8 @@ static ssize_t ext4_dio_write_iter(struc
+ if (ilock_shared)
+ iomap_ops = &ext4_iomap_overwrite_ops;
+ ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops,
+- (unaligned_io || extend) ? IOMAP_DIO_FORCE_WAIT : 0);
++ (unaligned_io || extend) ? IOMAP_DIO_FORCE_WAIT : 0,
++ 0);
+ if (ret == -ENOTBLK)
+ ret = 0;
+
+--- a/fs/gfs2/file.c
++++ b/fs/gfs2/file.c
+@@ -823,7 +823,7 @@ static ssize_t gfs2_file_direct_read(str
+ if (ret)
+ goto out_uninit;
+
+- ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL, 0);
++ ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL, 0, 0);
+ gfs2_glock_dq(gh);
+ out_uninit:
+ gfs2_holder_uninit(gh);
+@@ -857,7 +857,7 @@ static ssize_t gfs2_file_direct_write(st
+ if (offset + len > i_size_read(&ip->i_inode))
+ goto out;
+
+- ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL, 0);
++ ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL, 0, 0);
+ if (ret == -ENOTBLK)
+ ret = 0;
+ out:
+--- a/fs/iomap/direct-io.c
++++ b/fs/iomap/direct-io.c
+@@ -31,6 +31,7 @@ struct iomap_dio {
+ atomic_t ref;
+ unsigned flags;
+ int error;
++ size_t done_before;
+ bool wait_for_completion;
+
+ union {
+@@ -124,6 +125,9 @@ ssize_t iomap_dio_complete(struct iomap_
+ if (ret > 0 && (dio->flags & IOMAP_DIO_NEED_SYNC))
+ ret = generic_write_sync(iocb, ret);
+
++ if (ret > 0)
++ ret += dio->done_before;
++
+ kfree(dio);
+
+ return ret;
+@@ -450,13 +454,21 @@ static loff_t iomap_dio_iter(const struc
+ * may be pure data writes. In that case, we still need to do a full data sync
+ * completion.
+ *
++ * When page faults are disabled and @dio_flags includes IOMAP_DIO_PARTIAL,
++ * __iomap_dio_rw can return a partial result if it encounters a non-resident
++ * page in @iter after preparing a transfer. In that case, the non-resident
++ * pages can be faulted in and the request resumed with @done_before set to the
++ * number of bytes previously transferred. The request will then complete with
++ * the correct total number of bytes transferred; this is essential for
++ * completing partial requests asynchronously.
++ *
+ * Returns -ENOTBLK In case of a page invalidation invalidation failure for
+ * writes. The callers needs to fall back to buffered I/O in this case.
+ */
+ struct iomap_dio *
+ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
+ const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
+- unsigned int dio_flags)
++ unsigned int dio_flags, size_t done_before)
+ {
+ struct address_space *mapping = iocb->ki_filp->f_mapping;
+ struct inode *inode = file_inode(iocb->ki_filp);
+@@ -486,6 +498,7 @@ __iomap_dio_rw(struct kiocb *iocb, struc
+ dio->dops = dops;
+ dio->error = 0;
+ dio->flags = 0;
++ dio->done_before = done_before;
+
+ dio->submit.iter = iter;
+ dio->submit.waiter = current;
+@@ -652,11 +665,11 @@ EXPORT_SYMBOL_GPL(__iomap_dio_rw);
+ ssize_t
+ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
+ const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
+- unsigned int dio_flags)
++ unsigned int dio_flags, size_t done_before)
+ {
+ struct iomap_dio *dio;
+
+- dio = __iomap_dio_rw(iocb, iter, ops, dops, dio_flags);
++ dio = __iomap_dio_rw(iocb, iter, ops, dops, dio_flags, done_before);
+ if (IS_ERR_OR_NULL(dio))
+ return PTR_ERR_OR_ZERO(dio);
+ return iomap_dio_complete(dio);
+--- a/fs/xfs/xfs_file.c
++++ b/fs/xfs/xfs_file.c
+@@ -259,7 +259,7 @@ xfs_file_dio_read(
+ ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
+ if (ret)
+ return ret;
+- ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0);
++ ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0, 0);
+ xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+
+ return ret;
+@@ -569,7 +569,7 @@ xfs_file_dio_write_aligned(
+ }
+ trace_xfs_file_direct_write(iocb, from);
+ ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
+- &xfs_dio_write_ops, 0);
++ &xfs_dio_write_ops, 0, 0);
+ out_unlock:
+ if (iolock)
+ xfs_iunlock(ip, iolock);
+@@ -647,7 +647,7 @@ retry_exclusive:
+
+ trace_xfs_file_direct_write(iocb, from);
+ ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
+- &xfs_dio_write_ops, flags);
++ &xfs_dio_write_ops, flags, 0);
+
+ /*
+ * Retry unaligned I/O with exclusive blocking semantics if the DIO
+--- a/fs/zonefs/super.c
++++ b/fs/zonefs/super.c
+@@ -852,7 +852,7 @@ static ssize_t zonefs_file_dio_write(str
+ ret = zonefs_file_dio_append(iocb, from);
+ else
+ ret = iomap_dio_rw(iocb, from, &zonefs_iomap_ops,
+- &zonefs_write_dio_ops, 0);
++ &zonefs_write_dio_ops, 0, 0);
+ if (zi->i_ztype == ZONEFS_ZTYPE_SEQ &&
+ (ret > 0 || ret == -EIOCBQUEUED)) {
+ if (ret > 0)
+@@ -987,7 +987,7 @@ static ssize_t zonefs_file_read_iter(str
+ }
+ file_accessed(iocb->ki_filp);
+ ret = iomap_dio_rw(iocb, to, &zonefs_iomap_ops,
+- &zonefs_read_dio_ops, 0);
++ &zonefs_read_dio_ops, 0, 0);
+ } else {
+ ret = generic_file_read_iter(iocb, to);
+ if (ret == -EIO)
+--- a/include/linux/iomap.h
++++ b/include/linux/iomap.h
+@@ -339,10 +339,10 @@ struct iomap_dio_ops {
+
+ ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
+ const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
+- unsigned int dio_flags);
++ unsigned int dio_flags, size_t done_before);
+ struct iomap_dio *__iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
+ const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
+- unsigned int dio_flags);
++ unsigned int dio_flags, size_t done_before);
+ ssize_t iomap_dio_complete(struct iomap_dio *dio);
+ int iomap_dio_iopoll(struct kiocb *kiocb, bool spin);
+
--- /dev/null
+From foo@baz Fri Apr 29 11:07:48 AM CEST 2022
+From: Anand Jain <anand.jain@oracle.com>
+Date: Fri, 15 Apr 2022 06:28:48 +0800
+Subject: iomap: Fix iomap_dio_rw return value for user copies
+To: stable@vger.kernel.org
+Cc: linux-btrfs@vger.kernel.org, Andreas Gruenbacher <agruenba@redhat.com>, "Darrick J . Wong" <djwong@kernel.org>, Christoph Hellwig <hch@lst.de>, Anand Jain <anand.jain@oracle.com>
+Message-ID: <63440885619fdfa1a520a9528e38207311f44f2a.1649951733.git.anand.jain@oracle.com>
+
+From: Andreas Gruenbacher <agruenba@redhat.com>
+
+commit 42c498c18a94eed79896c50871889af52fa0822e upstream
+
+When a user copy fails in one of the helpers of iomap_dio_rw, fail with
+-EFAULT instead of returning 0. This matches what iomap_dio_bio_actor
+returns when it gets an -EFAULT from bio_iov_iter_get_pages. With these
+changes, iomap_dio_actor now consistently fails with -EFAULT when a user
+page cannot be faulted in.
+
+Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Anand Jain <anand.jain@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/iomap/direct-io.c | 4 ++++
+ 1 file changed, 4 insertions(+)
+
+--- a/fs/iomap/direct-io.c
++++ b/fs/iomap/direct-io.c
+@@ -371,6 +371,8 @@ static loff_t iomap_dio_hole_iter(const
+ loff_t length = iov_iter_zero(iomap_length(iter), dio->submit.iter);
+
+ dio->size += length;
++ if (!length)
++ return -EFAULT;
+ return length;
+ }
+
+@@ -402,6 +404,8 @@ static loff_t iomap_dio_inline_iter(cons
+ copied = copy_to_iter(inline_data, length, iter);
+ }
+ dio->size += copied;
++ if (!copied)
++ return -EFAULT;
+ return copied;
+ }
+
--- /dev/null
+From foo@baz Fri Apr 29 11:07:48 AM CEST 2022
+From: Anand Jain <anand.jain@oracle.com>
+Date: Fri, 15 Apr 2022 06:28:49 +0800
+Subject: iomap: Support partial direct I/O on user copy failures
+To: stable@vger.kernel.org
+Cc: linux-btrfs@vger.kernel.org, Andreas Gruenbacher <agruenba@redhat.com>, "Darrick J . Wong" <djwong@kernel.org>, Anand Jain <anand.jain@oracle.com>
+Message-ID: <a85564f9b06b5bae198a27c7f60cd02b39c2ce79.1649951733.git.anand.jain@oracle.com>
+
+From: Andreas Gruenbacher <agruenba@redhat.com>
+
+commit 97308f8b0d867e9ef59528cd97f0db55ffdf5651 upstream
+
+In iomap_dio_rw, when iomap_apply returns an -EFAULT error and the
+IOMAP_DIO_PARTIAL flag is set, complete the request synchronously and
+return a partial result. This allows the caller to deal with the page
+fault and retry the remainder of the request.
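+
+For illustration only (not part of this patch; "iocb", "from", "ops" and
+"dops" are placeholders, and at this point in the series iomap_dio_rw does
+not yet take a done_before argument), a write-side caller can now disable
+page faults around the direct I/O and handle a short result afterwards:
+
+	ssize_t ret;
+
+	pagefault_disable();
+	ret = iomap_dio_rw(iocb, from, ops, dops, IOMAP_DIO_PARTIAL);
+	pagefault_enable();
+
+	if (ret > 0 && iov_iter_count(from)) {
+		/* Partial, synchronously completed result: fault the
+		 * remaining pages in and retry the rest of the request. */
+	}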
+
+Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Anand Jain <anand.jain@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/iomap/direct-io.c | 6 ++++++
+ include/linux/iomap.h | 7 +++++++
+ 2 files changed, 13 insertions(+)
+
+--- a/fs/iomap/direct-io.c
++++ b/fs/iomap/direct-io.c
+@@ -581,6 +581,12 @@ __iomap_dio_rw(struct kiocb *iocb, struc
+ if (iov_iter_rw(iter) == READ && iomi.pos >= dio->i_size)
+ iov_iter_revert(iter, iomi.pos - dio->i_size);
+
++ if (ret == -EFAULT && dio->size && (dio_flags & IOMAP_DIO_PARTIAL)) {
++ if (!(iocb->ki_flags & IOCB_NOWAIT))
++ wait_for_completion = true;
++ ret = 0;
++ }
++
+ /* magic error code to fall back to buffered I/O */
+ if (ret == -ENOTBLK) {
+ wait_for_completion = true;
+--- a/include/linux/iomap.h
++++ b/include/linux/iomap.h
+@@ -330,6 +330,13 @@ struct iomap_dio_ops {
+ */
+ #define IOMAP_DIO_OVERWRITE_ONLY (1 << 1)
+
++/*
++ * When a page fault occurs, return a partial synchronous result and allow
++ * the caller to retry the rest of the operation after dealing with the page
++ * fault.
++ */
++#define IOMAP_DIO_PARTIAL (1 << 2)
++
+ ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
+ const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
+ unsigned int dio_flags);
--- /dev/null
+From foo@baz Fri Apr 29 11:07:48 AM CEST 2022
+From: Anand Jain <anand.jain@oracle.com>
+Date: Fri, 15 Apr 2022 06:28:41 +0800
+Subject: iov_iter: Introduce fault_in_iov_iter_writeable
+To: stable@vger.kernel.org
+Cc: linux-btrfs@vger.kernel.org, Andreas Gruenbacher <agruenba@redhat.com>, Anand Jain <anand.jain@oracle.com>
+Message-ID: <8181618a0badc14fd9bbe13e26164bc601c59df9.1649951733.git.anand.jain@oracle.com>
+
+From: Andreas Gruenbacher <agruenba@redhat.com>
+
+commit cdd591fc86e38ad3899196066219fbbd845f3162 upstream
+
+Introduce a new fault_in_iov_iter_writeable helper for safely faulting
+in an iterator for writing. It uses get_user_pages() to fault in the pages
+without actually writing to them, which would be destructive.
+
+We'll use fault_in_iov_iter_writeable in gfs2 once we've determined that
+the iterator passed to .read_iter isn't in memory.
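+
+A minimal sketch of that intended use (illustrative only; "to" is a
+placeholder for the destination iov_iter of a read):
+
+	size_t window = iov_iter_count(to), left;
+
+	/* Non-destructive: the pages are faulted in for writing without
+	 * being written to, so data already copied out is preserved. */
+	left = fault_in_iov_iter_writeable(to, window);
+	if (left == window)
+		return -EFAULT;		/* nothing could be faulted in */
+	/* Otherwise retry the read; at least part of "to" is accessible. */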
+
+Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
+Signed-off-by: Anand Jain <anand.jain@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/pagemap.h | 1
+ include/linux/uio.h | 1
+ lib/iov_iter.c | 39 +++++++++++++++++++++++++++++
+ mm/gup.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++++
+ 4 files changed, 104 insertions(+)
+
+--- a/include/linux/pagemap.h
++++ b/include/linux/pagemap.h
+@@ -736,6 +736,7 @@ extern void add_page_wait_queue(struct p
+ * Fault in userspace address range.
+ */
+ size_t fault_in_writeable(char __user *uaddr, size_t size);
++size_t fault_in_safe_writeable(const char __user *uaddr, size_t size);
+ size_t fault_in_readable(const char __user *uaddr, size_t size);
+
+ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
+--- a/include/linux/uio.h
++++ b/include/linux/uio.h
+@@ -134,6 +134,7 @@ size_t copy_page_from_iter_atomic(struct
+ void iov_iter_advance(struct iov_iter *i, size_t bytes);
+ void iov_iter_revert(struct iov_iter *i, size_t bytes);
+ size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t bytes);
++size_t fault_in_iov_iter_writeable(const struct iov_iter *i, size_t bytes);
+ size_t iov_iter_single_seg_count(const struct iov_iter *i);
+ size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
+ struct iov_iter *i);
+--- a/lib/iov_iter.c
++++ b/lib/iov_iter.c
+@@ -468,6 +468,45 @@ size_t fault_in_iov_iter_readable(const
+ }
+ EXPORT_SYMBOL(fault_in_iov_iter_readable);
+
++/*
++ * fault_in_iov_iter_writeable - fault in iov iterator for writing
++ * @i: iterator
++ * @size: maximum length
++ *
++ * Faults in the iterator using get_user_pages(), i.e., without triggering
++ * hardware page faults. This is primarily useful when we already know that
++ * some or all of the pages in @i aren't in memory.
++ *
++ * Returns the number of bytes not faulted in, like copy_to_user() and
++ * copy_from_user().
++ *
++ * Always returns 0 for non-user-space iterators.
++ */
++size_t fault_in_iov_iter_writeable(const struct iov_iter *i, size_t size)
++{
++ if (iter_is_iovec(i)) {
++ size_t count = min(size, iov_iter_count(i));
++ const struct iovec *p;
++ size_t skip;
++
++ size -= count;
++ for (p = i->iov, skip = i->iov_offset; count; p++, skip = 0) {
++ size_t len = min(count, p->iov_len - skip);
++ size_t ret;
++
++ if (unlikely(!len))
++ continue;
++ ret = fault_in_safe_writeable(p->iov_base + skip, len);
++ count -= len - ret;
++ if (ret)
++ break;
++ }
++ return count + size;
++ }
++ return 0;
++}
++EXPORT_SYMBOL(fault_in_iov_iter_writeable);
++
+ void iov_iter_init(struct iov_iter *i, unsigned int direction,
+ const struct iovec *iov, unsigned long nr_segs,
+ size_t count)
+--- a/mm/gup.c
++++ b/mm/gup.c
+@@ -1716,6 +1716,69 @@ out:
+ }
+ EXPORT_SYMBOL(fault_in_writeable);
+
++/*
++ * fault_in_safe_writeable - fault in an address range for writing
++ * @uaddr: start of address range
++ * @size: length of address range
++ *
++ * Faults in an address range using get_user_pages, i.e., without triggering
++ * hardware page faults. This is primarily useful when we already know that
++ * some or all of the pages in the address range aren't in memory.
++ *
++ * Other than fault_in_writeable(), this function is non-destructive.
++ *
++ * Note that we don't pin or otherwise hold the pages referenced that we fault
++ * in. There's no guarantee that they'll stay in memory for any duration of
++ * time.
++ *
++ * Returns the number of bytes not faulted in, like copy_to_user() and
++ * copy_from_user().
++ */
++size_t fault_in_safe_writeable(const char __user *uaddr, size_t size)
++{
++ unsigned long start = (unsigned long)untagged_addr(uaddr);
++ unsigned long end, nstart, nend;
++ struct mm_struct *mm = current->mm;
++ struct vm_area_struct *vma = NULL;
++ int locked = 0;
++
++ nstart = start & PAGE_MASK;
++ end = PAGE_ALIGN(start + size);
++ if (end < nstart)
++ end = 0;
++ for (; nstart != end; nstart = nend) {
++ unsigned long nr_pages;
++ long ret;
++
++ if (!locked) {
++ locked = 1;
++ mmap_read_lock(mm);
++ vma = find_vma(mm, nstart);
++ } else if (nstart >= vma->vm_end)
++ vma = vma->vm_next;
++ if (!vma || vma->vm_start >= end)
++ break;
++ nend = end ? min(end, vma->vm_end) : vma->vm_end;
++ if (vma->vm_flags & (VM_IO | VM_PFNMAP))
++ continue;
++ if (nstart < vma->vm_start)
++ nstart = vma->vm_start;
++ nr_pages = (nend - nstart) / PAGE_SIZE;
++ ret = __get_user_pages_locked(mm, nstart, nr_pages,
++ NULL, NULL, &locked,
++ FOLL_TOUCH | FOLL_WRITE);
++ if (ret <= 0)
++ break;
++ nend = nstart + ret * PAGE_SIZE;
++ }
++ if (locked)
++ mmap_read_unlock(mm);
++ if (nstart == end)
++ return 0;
++ return size - min_t(size_t, nstart - start, size);
++}
++EXPORT_SYMBOL(fault_in_safe_writeable);
++
+ /**
+ * fault_in_readable - fault in userspace address range for reading
+ * @uaddr: start of user address range
--- /dev/null
+From foo@baz Fri Apr 29 11:07:48 AM CEST 2022
+From: Anand Jain <anand.jain@oracle.com>
+Date: Fri, 15 Apr 2022 06:28:52 +0800
+Subject: iov_iter: Introduce nofault flag to disable page faults
+To: stable@vger.kernel.org
+Cc: linux-btrfs@vger.kernel.org, Andreas Gruenbacher <agruenba@redhat.com>, Anand Jain <anand.jain@oracle.com>
+Message-ID: <56bf354a8e9c5f2d3d9482c90510d4ff0890d996.1649951733.git.anand.jain@oracle.com>
+
+From: Andreas Gruenbacher <agruenba@redhat.com>
+
+commit 3337ab08d08b1a375f88471d9c8b1cac968cb054 upstream
+
+Introduce a new nofault flag to indicate to iov_iter_get_pages not to
+fault in user pages.
+
+This is implemented by passing the FOLL_NOFAULT flag to get_user_pages,
+which causes get_user_pages to fail when it would otherwise fault in a
+page. We'll use the ->nofault flag to prevent iomap_dio_rw from faulting
+in pages when page faults are not allowed.
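+
+A hedged sketch of that intended use (illustrative only; "iocb", "to",
+"ops", "dops" and "done" are placeholders):
+
+	ssize_t ret;
+
+	pagefault_disable();
+	to->nofault = true;		/* gup must not fault pages in */
+	ret = iomap_dio_rw(iocb, to, ops, dops, IOMAP_DIO_PARTIAL, done);
+	to->nofault = false;
+	pagefault_enable();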
+
+Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
+Signed-off-by: Anand Jain <anand.jain@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/uio.h | 1 +
+ lib/iov_iter.c | 20 +++++++++++++++-----
+ 2 files changed, 16 insertions(+), 5 deletions(-)
+
+--- a/include/linux/uio.h
++++ b/include/linux/uio.h
+@@ -35,6 +35,7 @@ struct iov_iter_state {
+
+ struct iov_iter {
+ u8 iter_type;
++ bool nofault;
+ bool data_source;
+ size_t iov_offset;
+ size_t count;
+--- a/lib/iov_iter.c
++++ b/lib/iov_iter.c
+@@ -514,6 +514,7 @@ void iov_iter_init(struct iov_iter *i, u
+ WARN_ON(direction & ~(READ | WRITE));
+ *i = (struct iov_iter) {
+ .iter_type = ITER_IOVEC,
++ .nofault = false,
+ .data_source = direction,
+ .iov = iov,
+ .nr_segs = nr_segs,
+@@ -1529,13 +1530,17 @@ ssize_t iov_iter_get_pages(struct iov_it
+ return 0;
+
+ if (likely(iter_is_iovec(i))) {
++ unsigned int gup_flags = 0;
+ unsigned long addr;
+
++ if (iov_iter_rw(i) != WRITE)
++ gup_flags |= FOLL_WRITE;
++ if (i->nofault)
++ gup_flags |= FOLL_NOFAULT;
++
+ addr = first_iovec_segment(i, &len, start, maxsize, maxpages);
+ n = DIV_ROUND_UP(len, PAGE_SIZE);
+- res = get_user_pages_fast(addr, n,
+- iov_iter_rw(i) != WRITE ? FOLL_WRITE : 0,
+- pages);
++ res = get_user_pages_fast(addr, n, gup_flags, pages);
+ if (unlikely(res <= 0))
+ return res;
+ return (res == n ? len : res * PAGE_SIZE) - *start;
+@@ -1651,15 +1656,20 @@ ssize_t iov_iter_get_pages_alloc(struct
+ return 0;
+
+ if (likely(iter_is_iovec(i))) {
++ unsigned int gup_flags = 0;
+ unsigned long addr;
+
++ if (iov_iter_rw(i) != WRITE)
++ gup_flags |= FOLL_WRITE;
++ if (i->nofault)
++ gup_flags |= FOLL_NOFAULT;
++
+ addr = first_iovec_segment(i, &len, start, maxsize, ~0U);
+ n = DIV_ROUND_UP(len, PAGE_SIZE);
+ p = get_pages_array(n);
+ if (!p)
+ return -ENOMEM;
+- res = get_user_pages_fast(addr, n,
+- iov_iter_rw(i) != WRITE ? FOLL_WRITE : 0, p);
++ res = get_user_pages_fast(addr, n, gup_flags, p);
+ if (unlikely(res <= 0)) {
+ kvfree(p);
+ *pages = NULL;
--- /dev/null
+From foo@baz Fri Apr 29 11:07:48 AM CEST 2022
+From: Anand Jain <anand.jain@oracle.com>
+Date: Fri, 15 Apr 2022 06:28:40 +0800
+Subject: iov_iter: Turn iov_iter_fault_in_readable into fault_in_iov_iter_readable
+To: stable@vger.kernel.org
+Cc: linux-btrfs@vger.kernel.org, Andreas Gruenbacher <agruenba@redhat.com>, Anand Jain <anand.jain@oracle.com>
+Message-ID: <2f18cef5634943c5bcd007b3753c3839feee9bd9.1649951733.git.anand.jain@oracle.com>
+
+From: Andreas Gruenbacher <agruenba@redhat.com>
+
+commit a6294593e8a1290091d0b078d5d33da5e0cd3dfe upstream
+
+Turn iov_iter_fault_in_readable into a function that returns the number
+of bytes not faulted in, similar to copy_to_user, instead of returning a
+non-zero value when any of the requested pages couldn't be faulted in.
+This supports the existing users that require all pages to be faulted in
+as well as new users that are happy if any pages can be faulted in.
+
+Rename iov_iter_fault_in_readable to fault_in_iov_iter_readable to make
+sure this change doesn't silently break things.
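+
+For illustration only (not part of this patch; "i" and "bytes" are
+placeholders), callers can now distinguish "nothing accessible" from
+"partially accessible":
+
+	size_t left = fault_in_iov_iter_readable(i, bytes);
+
+	if (left == bytes)
+		return -EFAULT;		/* nothing could be faulted in */
+	bytes -= left;			/* the leading part was faulted in */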
+
+Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
+Signed-off-by: Anand Jain <anand.jain@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/file.c | 2 +-
+ fs/f2fs/file.c | 2 +-
+ fs/fuse/file.c | 2 +-
+ fs/iomap/buffered-io.c | 2 +-
+ fs/ntfs/file.c | 2 +-
+ fs/ntfs3/file.c | 2 +-
+ include/linux/uio.h | 2 +-
+ lib/iov_iter.c | 33 +++++++++++++++++++++------------
+ mm/filemap.c | 2 +-
+ 9 files changed, 29 insertions(+), 20 deletions(-)
+
+--- a/fs/btrfs/file.c
++++ b/fs/btrfs/file.c
+@@ -1709,7 +1709,7 @@ static noinline ssize_t btrfs_buffered_w
+ * Fault pages before locking them in prepare_pages
+ * to avoid recursive lock
+ */
+- if (unlikely(iov_iter_fault_in_readable(i, write_bytes))) {
++ if (unlikely(fault_in_iov_iter_readable(i, write_bytes))) {
+ ret = -EFAULT;
+ break;
+ }
+--- a/fs/f2fs/file.c
++++ b/fs/f2fs/file.c
+@@ -4279,7 +4279,7 @@ static ssize_t f2fs_file_write_iter(stru
+ size_t target_size = 0;
+ int err;
+
+- if (iov_iter_fault_in_readable(from, iov_iter_count(from)))
++ if (fault_in_iov_iter_readable(from, iov_iter_count(from)))
+ set_inode_flag(inode, FI_NO_PREALLOC);
+
+ if ((iocb->ki_flags & IOCB_NOWAIT)) {
+--- a/fs/fuse/file.c
++++ b/fs/fuse/file.c
+@@ -1164,7 +1164,7 @@ static ssize_t fuse_fill_write_pages(str
+
+ again:
+ err = -EFAULT;
+- if (iov_iter_fault_in_readable(ii, bytes))
++ if (fault_in_iov_iter_readable(ii, bytes))
+ break;
+
+ err = -ENOMEM;
+--- a/fs/iomap/buffered-io.c
++++ b/fs/iomap/buffered-io.c
+@@ -757,7 +757,7 @@ again:
+ * same page as we're writing to, without it being marked
+ * up-to-date.
+ */
+- if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
++ if (unlikely(fault_in_iov_iter_readable(i, bytes))) {
+ status = -EFAULT;
+ break;
+ }
+--- a/fs/ntfs/file.c
++++ b/fs/ntfs/file.c
+@@ -1829,7 +1829,7 @@ again:
+ * pages being swapped out between us bringing them into memory
+ * and doing the actual copying.
+ */
+- if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
++ if (unlikely(fault_in_iov_iter_readable(i, bytes))) {
+ status = -EFAULT;
+ break;
+ }
+--- a/fs/ntfs3/file.c
++++ b/fs/ntfs3/file.c
+@@ -989,7 +989,7 @@ static ssize_t ntfs_compress_write(struc
+ frame_vbo = pos & ~(frame_size - 1);
+ index = frame_vbo >> PAGE_SHIFT;
+
+- if (unlikely(iov_iter_fault_in_readable(from, bytes))) {
++ if (unlikely(fault_in_iov_iter_readable(from, bytes))) {
+ err = -EFAULT;
+ goto out;
+ }
+--- a/include/linux/uio.h
++++ b/include/linux/uio.h
+@@ -133,7 +133,7 @@ size_t copy_page_from_iter_atomic(struct
+ size_t bytes, struct iov_iter *i);
+ void iov_iter_advance(struct iov_iter *i, size_t bytes);
+ void iov_iter_revert(struct iov_iter *i, size_t bytes);
+-int iov_iter_fault_in_readable(const struct iov_iter *i, size_t bytes);
++size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t bytes);
+ size_t iov_iter_single_seg_count(const struct iov_iter *i);
+ size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
+ struct iov_iter *i);
+--- a/lib/iov_iter.c
++++ b/lib/iov_iter.c
+@@ -431,33 +431,42 @@ out:
+ }
+
+ /*
++ * fault_in_iov_iter_readable - fault in iov iterator for reading
++ * @i: iterator
++ * @size: maximum length
++ *
+ * Fault in one or more iovecs of the given iov_iter, to a maximum length of
+- * bytes. For each iovec, fault in each page that constitutes the iovec.
++ * @size. For each iovec, fault in each page that constitutes the iovec.
++ *
++ * Returns the number of bytes not faulted in (like copy_to_user() and
++ * copy_from_user()).
+ *
+- * Return 0 on success, or non-zero if the memory could not be accessed (i.e.
+- * because it is an invalid address).
++ * Always returns 0 for non-userspace iterators.
+ */
+-int iov_iter_fault_in_readable(const struct iov_iter *i, size_t bytes)
++size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t size)
+ {
+ if (iter_is_iovec(i)) {
++ size_t count = min(size, iov_iter_count(i));
+ const struct iovec *p;
+ size_t skip;
+
+- if (bytes > i->count)
+- bytes = i->count;
+- for (p = i->iov, skip = i->iov_offset; bytes; p++, skip = 0) {
+- size_t len = min(bytes, p->iov_len - skip);
++ size -= count;
++ for (p = i->iov, skip = i->iov_offset; count; p++, skip = 0) {
++ size_t len = min(count, p->iov_len - skip);
++ size_t ret;
+
+ if (unlikely(!len))
+ continue;
+- if (fault_in_readable(p->iov_base + skip, len))
+- return -EFAULT;
+- bytes -= len;
++ ret = fault_in_readable(p->iov_base + skip, len);
++ count -= len - ret;
++ if (ret)
++ break;
+ }
++ return count + size;
+ }
+ return 0;
+ }
+-EXPORT_SYMBOL(iov_iter_fault_in_readable);
++EXPORT_SYMBOL(fault_in_iov_iter_readable);
+
+ void iov_iter_init(struct iov_iter *i, unsigned int direction,
+ const struct iovec *iov, unsigned long nr_segs,
+--- a/mm/filemap.c
++++ b/mm/filemap.c
+@@ -3760,7 +3760,7 @@ again:
+ * same page as we're writing to, without it being marked
+ * up-to-date.
+ */
+- if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
++ if (unlikely(fault_in_iov_iter_readable(i, bytes))) {
+ status = -EFAULT;
+ break;
+ }
--- /dev/null
+From foo@baz Fri Apr 29 11:07:48 AM CEST 2022
+From: Anand Jain <anand.jain@oracle.com>
+Date: Fri, 15 Apr 2022 06:28:56 +0800
+Subject: mm: gup: make fault_in_safe_writeable() use fixup_user_fault()
+To: stable@vger.kernel.org
+Cc: linux-btrfs@vger.kernel.org, Linus Torvalds <torvalds@linux-foundation.org>, Andreas Gruenbacher <agruenba@redhat.com>, David Hildenbrand <david@redhat.com>, Anand Jain <anand.jain@oracle.com>
+Message-ID: <f0f656eec295ba30808cdaaaf7e8187b7fae162e.1649951733.git.anand.jain@oracle.com>
+
+From: Linus Torvalds <torvalds@linux-foundation.org>
+
+commit fe673d3f5bf1fc50cdc4b754831db91a2ec10126 upstream
+
+Instead of using GUP, make fault_in_safe_writeable() actually force a
+'handle_mm_fault()' using the same fixup_user_fault() machinery that
+futexes already use.
+
+Using the GUP machinery meant that fault_in_safe_writeable() did not do
+everything that a real fault would do, ranging from not auto-expanding
+the stack segment, to not updating accessed or dirty flags in the page
+tables (GUP sets those flags on the pages themselves).
+
+The latter causes problems on architectures (like s390) that do accessed
+bit handling in software, which meant that fault_in_safe_writeable()
+didn't actually do all the fault handling it needed to, and trying to
+access the user address afterwards would still cause faults.
+
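+As a reviewer aid (not part of the upstream change): the calling
+convention is unchanged, the function still returns the number of
+trailing bytes that could not be faulted in.  A minimal, hypothetical
+caller sketch, where 'ubuf' and 'len' are illustrative names only:
+
+    size_t left;
+
+    /* Fault the destination range in for writing before retrying a
+     * copy that failed with page faults disabled. */
+    left = fault_in_safe_writeable(ubuf, len);
+    if (left == len)
+        return -EFAULT;  /* nothing could be faulted in */
+    /* otherwise retry the copy for the first (len - left) bytes */
+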
+Reported-and-tested-by: Andreas Gruenbacher <agruenba@redhat.com>
+Fixes: cdd591fc86e3 ("iov_iter: Introduce fault_in_iov_iter_writeable")
+Link: https://lore.kernel.org/all/CAHc6FU5nP+nziNGG0JAF1FUx-GV7kKFvM7aZuU_XD2_1v4vnvg@mail.gmail.com/
+Acked-by: David Hildenbrand <david@redhat.com>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Anand Jain <anand.jain@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/gup.c | 57 +++++++++++++++++++--------------------------------------
+ 1 file changed, 19 insertions(+), 38 deletions(-)
+
+--- a/mm/gup.c
++++ b/mm/gup.c
+@@ -1723,11 +1723,11 @@ EXPORT_SYMBOL(fault_in_writeable);
+ * @uaddr: start of address range
+ * @size: length of address range
+ *
+- * Faults in an address range using get_user_pages, i.e., without triggering
+- * hardware page faults. This is primarily useful when we already know that
+- * some or all of the pages in the address range aren't in memory.
++ * Faults in an address range for writing. This is primarily useful when we
++ * already know that some or all of the pages in the address range aren't in
++ * memory.
+ *
+- * Other than fault_in_writeable(), this function is non-destructive.
++ * Unlike fault_in_writeable(), this function is non-destructive.
+ *
+ * Note that we don't pin or otherwise hold the pages referenced that we fault
+ * in. There's no guarantee that they'll stay in memory for any duration of
+@@ -1738,46 +1738,27 @@ EXPORT_SYMBOL(fault_in_writeable);
+ */
+ size_t fault_in_safe_writeable(const char __user *uaddr, size_t size)
+ {
+- unsigned long start = (unsigned long)untagged_addr(uaddr);
+- unsigned long end, nstart, nend;
++ unsigned long start = (unsigned long)uaddr, end;
+ struct mm_struct *mm = current->mm;
+- struct vm_area_struct *vma = NULL;
+- int locked = 0;
++ bool unlocked = false;
+
+- nstart = start & PAGE_MASK;
++ if (unlikely(size == 0))
++ return 0;
+ end = PAGE_ALIGN(start + size);
+- if (end < nstart)
++ if (end < start)
+ end = 0;
+- for (; nstart != end; nstart = nend) {
+- unsigned long nr_pages;
+- long ret;
+
+- if (!locked) {
+- locked = 1;
+- mmap_read_lock(mm);
+- vma = find_vma(mm, nstart);
+- } else if (nstart >= vma->vm_end)
+- vma = vma->vm_next;
+- if (!vma || vma->vm_start >= end)
+- break;
+- nend = end ? min(end, vma->vm_end) : vma->vm_end;
+- if (vma->vm_flags & (VM_IO | VM_PFNMAP))
+- continue;
+- if (nstart < vma->vm_start)
+- nstart = vma->vm_start;
+- nr_pages = (nend - nstart) / PAGE_SIZE;
+- ret = __get_user_pages_locked(mm, nstart, nr_pages,
+- NULL, NULL, &locked,
+- FOLL_TOUCH | FOLL_WRITE);
+- if (ret <= 0)
++ mmap_read_lock(mm);
++ do {
++ if (fixup_user_fault(mm, start, FAULT_FLAG_WRITE, &unlocked))
+ break;
+- nend = nstart + ret * PAGE_SIZE;
+- }
+- if (locked)
+- mmap_read_unlock(mm);
+- if (nstart == end)
+- return 0;
+- return size - min_t(size_t, nstart - start, size);
++ start = (start + PAGE_SIZE) & PAGE_MASK;
++ } while (start != end);
++ mmap_read_unlock(mm);
++
++ if (size > (unsigned long)uaddr - start)
++ return size - ((unsigned long)uaddr - start);
++ return 0;
+ }
+ EXPORT_SYMBOL(fault_in_safe_writeable);
+
--- /dev/null
+From 8f0b36497303487d5a32c75789c77859cc2ee895 Mon Sep 17 00:00:00 2001
+From: Muchun Song <songmuchun@bytedance.com>
+Date: Fri, 1 Apr 2022 11:28:36 -0700
+Subject: mm: kfence: fix objcgs vector allocation
+
+From: Muchun Song <songmuchun@bytedance.com>
+
+commit 8f0b36497303487d5a32c75789c77859cc2ee895 upstream.
+
+If a kfence object is allocated to hold an objcgs vector, that slot of
+the pool ends up occupied permanently, since the vector is never freed.
+The possible solutions are (1) freeing the vector when the kfence
+object is freed or (2) allocating all vectors statically.
+
+Since the memory consumption of the object vectors is low, it is better
+to choose (2) to fix the issue; this also avoids the overhead of
+allocating vectors in the future.
+
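+As a note for reviewers (not part of the upstream change, assuming
+CONFIG_MEMCG): every KFENCE slot backs exactly one object, so a
+one-element objcg slot embedded in struct kfence_metadata is enough.
+During pool init, each object page's memcg_data is simply pointed at
+that embedded slot, as excerpted from the hunk below:
+
+    page->memcg_data = (unsigned long)&kfence_metadata[i / 2 - 1].objcg |
+                       MEMCG_DATA_OBJCGS;
+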
+Link: https://lkml.kernel.org/r/20220328132843.16624-1-songmuchun@bytedance.com
+Fixes: d3fb45f370d9 ("mm, kfence: insert KFENCE hooks for SLAB")
+Signed-off-by: Muchun Song <songmuchun@bytedance.com>
+Reviewed-by: Marco Elver <elver@google.com>
+Reviewed-by: Roman Gushchin <roman.gushchin@linux.dev>
+Cc: Alexander Potapenko <glider@google.com>
+Cc: Dmitry Vyukov <dvyukov@google.com>
+Cc: Xiongchun Duan <duanxiongchun@bytedance.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/kfence/core.c | 11 ++++++++++-
+ mm/kfence/kfence.h | 3 +++
+ 2 files changed, 13 insertions(+), 1 deletion(-)
+
+--- a/mm/kfence/core.c
++++ b/mm/kfence/core.c
+@@ -528,6 +528,8 @@ static bool __init kfence_init_pool(void
+ * enters __slab_free() slow-path.
+ */
+ for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) {
++ struct page *page = &pages[i];
++
+ if (!i || (i % 2))
+ continue;
+
+@@ -535,7 +537,11 @@ static bool __init kfence_init_pool(void
+ if (WARN_ON(compound_head(&pages[i]) != &pages[i]))
+ goto err;
+
+- __SetPageSlab(&pages[i]);
++ __SetPageSlab(page);
++#ifdef CONFIG_MEMCG
++ page->memcg_data = (unsigned long)&kfence_metadata[i / 2 - 1].objcg |
++ MEMCG_DATA_OBJCGS;
++#endif
+ }
+
+ /*
+@@ -911,6 +917,9 @@ void __kfence_free(void *addr)
+ {
+ struct kfence_metadata *meta = addr_to_metadata((unsigned long)addr);
+
++#ifdef CONFIG_MEMCG
++ KFENCE_WARN_ON(meta->objcg);
++#endif
+ /*
+ * If the objects of the cache are SLAB_TYPESAFE_BY_RCU, defer freeing
+ * the object, as the object page may be recycled for other-typed
+--- a/mm/kfence/kfence.h
++++ b/mm/kfence/kfence.h
+@@ -89,6 +89,9 @@ struct kfence_metadata {
+ struct kfence_track free_track;
+ /* For updating alloc_covered on frees. */
+ u32 alloc_stack_hash;
++#ifdef CONFIG_MEMCG
++ struct obj_cgroup *objcg;
++#endif
+ };
+
+ extern struct kfence_metadata kfence_metadata[CONFIG_KFENCE_NUM_OBJECTS];
bpf-fix-crash-due-to-out-of-bounds-access-into-reg2btf_ids.patch
spi-cadence-quadspi-fix-write-completion-support.patch
arm-dts-socfpga-change-qspi-to-intel-socfpga-qspi.patch
+mm-kfence-fix-objcgs-vector-allocation.patch
+gup-turn-fault_in_pages_-readable-writeable-into-fault_in_-readable-writeable.patch
+iov_iter-turn-iov_iter_fault_in_readable-into-fault_in_iov_iter_readable.patch
+iov_iter-introduce-fault_in_iov_iter_writeable.patch
+gfs2-add-wrapper-for-iomap_file_buffered_write.patch
+gfs2-clean-up-function-may_grant.patch
+gfs2-introduce-flag-for-glock-holder-auto-demotion.patch
+gfs2-move-the-inode-glock-locking-to-gfs2_file_buffered_write.patch
+gfs2-eliminate-ip-i_gh.patch
+gfs2-fix-mmap-page-fault-deadlocks-for-buffered-i-o.patch
+iomap-fix-iomap_dio_rw-return-value-for-user-copies.patch
+iomap-support-partial-direct-i-o-on-user-copy-failures.patch
+iomap-add-done_before-argument-to-iomap_dio_rw.patch
+gup-introduce-foll_nofault-flag-to-disable-page-faults.patch
+iov_iter-introduce-nofault-flag-to-disable-page-faults.patch
+gfs2-fix-mmap-page-fault-deadlocks-for-direct-i-o.patch
+btrfs-fix-deadlock-due-to-page-faults-during-direct-io-reads-and-writes.patch
+btrfs-fallback-to-blocking-mode-when-doing-async-dio-over-multiple-extents.patch
+mm-gup-make-fault_in_safe_writeable-use-fixup_user_fault.patch