5.10-stable patches

author Greg Kroah-Hartman <gregkh@linuxfoundation.org>

Fri, 23 Feb 2024 16:00:55 +0000 (17:00 +0100)

committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>

Fri, 23 Feb 2024 16:00:55 +0000 (17:00 +0100)
author Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Fri, 23 Feb 2024 16:00:55 +0000 (17:00 +0100)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Fri, 23 Feb 2024 16:00:55 +0000 (17:00 +0100)
diff --git a/queue-5.10/series b/queue-5.10/series

index 39d0d6259ccdecfff2eaa8fab5a87e48ce7ce5b0..285c8a1eeb07ea422f0d4006dfe249b70f3bf35d 100644 (file)
--- a/queue-5.10/series
+++ b/queue-5.10/series
@@ -5,3 +5,5 @@ smb-client-fix-oob-in-receive_encrypted_standard.patch
  smb-client-fix-potential-oobs-in-smb2_parse_contexts.patch
  smb-client-fix-parsing-of-smb3.1.1-posix-create-context.patch
  sched-rt-sysctl_sched_rr_timeslice-show-default-timeslice-after-reset.patch
+userfaultfd-fix-mmap_changing-checking-in-mfill_atomic_hugetlb.patch
+zonefs-improve-error-handling.patch
diff --git a/queue-5.10/userfaultfd-fix-mmap_changing-checking-in-mfill_atomic_hugetlb.patch b/queue-5.10/userfaultfd-fix-mmap_changing-checking-in-mfill_atomic_hugetlb.patch

new file mode 100644 (file)

index 0000000..0242def
--- /dev/null
+++ b/queue-5.10/userfaultfd-fix-mmap_changing-checking-in-mfill_atomic_hugetlb.patch
@@ -0,0 +1,80 @@
+From 67695f18d55924b2013534ef3bdc363bc9e14605 Mon Sep 17 00:00:00 2001
+From: Lokesh Gidra <lokeshgidra@google.com>
+Date: Wed, 17 Jan 2024 14:37:29 -0800
+Subject: userfaultfd: fix mmap_changing checking in mfill_atomic_hugetlb
+
+From: Lokesh Gidra <lokeshgidra@google.com>
+
+commit 67695f18d55924b2013534ef3bdc363bc9e14605 upstream.
+
+In mfill_atomic_hugetlb(), mmap_changing isn't being checked
+again if we drop mmap_lock and reacquire it. When the lock is not held,
+mmap_changing could have been incremented. This is also inconsistent
+with the behavior in mfill_atomic().
+
+Link: https://lkml.kernel.org/r/20240117223729.1444522-1-lokeshgidra@google.com
+Fixes: df2cc96e77011 ("userfaultfd: prevent non-cooperative events vs mcopy_atomic races")
+Signed-off-by: Lokesh Gidra <lokeshgidra@google.com>
+Cc: Andrea Arcangeli <aarcange@redhat.com>
+Cc: Mike Rapoport <rppt@kernel.org>
+Cc: Axel Rasmussen <axelrasmussen@google.com>
+Cc: Brian Geffon <bgeffon@google.com>
+Cc: David Hildenbrand <david@redhat.com>
+Cc: Jann Horn <jannh@google.com>
+Cc: Kalesh Singh <kaleshsingh@google.com>
+Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
+Cc: Nicolas Geoffray <ngeoffray@google.com>
+Cc: Peter Xu <peterx@redhat.com>
+Cc: Suren Baghdasaryan <surenb@google.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Mike Rapoport (IBM) <rppt@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/userfaultfd.c |   14 +++++++++++++-
+ 1 file changed, 13 insertions(+), 1 deletion(-)
+
+--- a/mm/userfaultfd.c
++++ b/mm/userfaultfd.c
+@@ -209,6 +209,7 @@ static __always_inline ssize_t __mcopy_a
+                                             unsigned long dst_start,
+                                             unsigned long src_start,
+                                             unsigned long len,
++                                            bool *mmap_changing,
+                                             bool zeropage)
+ {
+       int vm_alloc_shared = dst_vma->vm_flags & VM_SHARED;
+@@ -329,6 +330,15 @@ retry:
+                               goto out;
+                       }
+                       mmap_read_lock(dst_mm);
++                      /*
++                       * If memory mappings are changing because of non-cooperative
++                       * operation (e.g. mremap) running in parallel, bail out and
++                       * request the user to retry later
++                       */
++                      if (mmap_changing && READ_ONCE(*mmap_changing)) {
++                              err = -EAGAIN;
++                              break;
++                      }
+ 
+                       dst_vma = NULL;
+                       goto retry;
+@@ -410,6 +420,7 @@ extern ssize_t __mcopy_atomic_hugetlb(st
+                                     unsigned long dst_start,
+                                     unsigned long src_start,
+                                     unsigned long len,
++                                    bool *mmap_changing,
+                                     bool zeropage);
+ #endif /* CONFIG_HUGETLB_PAGE */
+ 
+@@ -529,7 +540,8 @@ retry:
+        */
+       if (is_vm_hugetlb_page(dst_vma))
+               return  __mcopy_atomic_hugetlb(dst_mm, dst_vma, dst_start,
+-                                              src_start, len, zeropage);
++                                             src_start, len, mmap_changing,
++                                             zeropage);
+ 
+       if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
+               goto out_unlock;
diff --git a/queue-5.10/zonefs-improve-error-handling.patch b/queue-5.10/zonefs-improve-error-handling.patch

new file mode 100644 (file)

index 0000000..b72b6a9
--- /dev/null
+++ b/queue-5.10/zonefs-improve-error-handling.patch
@@ -0,0 +1,188 @@
+From 14db5f64a971fce3d8ea35de4dfc7f443a3efb92 Mon Sep 17 00:00:00 2001
+From: Damien Le Moal <dlemoal@kernel.org>
+Date: Thu, 8 Feb 2024 17:26:59 +0900
+Subject: zonefs: Improve error handling
+
+From: Damien Le Moal <dlemoal@kernel.org>
+
+commit 14db5f64a971fce3d8ea35de4dfc7f443a3efb92 upstream.
+
+Write error handling is racy and can sometimes lead to the error recovery
+path wrongly changing the inode size of a sequential zone file to an
+incorrect value, which results in garbage data being readable at the end
+of a file. There are 2 problems:
+
+1) zonefs_file_dio_write() updates a zone file write pointer offset
+   after issuing a direct IO with iomap_dio_rw(). This update is done
+   only if the IO succeeds for synchronous direct writes. However, for
+   asynchronous direct writes, the update is done without waiting for
+   the IO completion so that the next asynchronous IO can be
+   immediately issued. However, if an asynchronous IO completes with a
+   failure right before the i_truncate_mutex lock protecting the update,
+   the update may change the value of the inode write pointer offset
+   that was corrected by the error path (zonefs_io_error() function).
+
+2) zonefs_io_error() is called when a read or write error occurs. This
+   function executes a report zone operation using the callback function
+   zonefs_io_error_cb(), which does all the error recovery handling
+   based on the current zone condition, write pointer position and
+   according to the mount options being used. However, depending on the
+   zoned device being used, a report zone callback may be executed in a
+   context that is different from the context of __zonefs_io_error(). As
+   a result, zonefs_io_error_cb() may be executed without the inode
+   truncate mutex lock held, which can lead to invalid error processing.
+
+Fix both problems as follows:
+- Problem 1: Perform the inode write pointer offset update before a
+  direct write is issued with iomap_dio_rw(). This is safe to do as
+  partial direct writes are not supported (IOMAP_DIO_PARTIAL is not
+  set) and any failed IO will trigger the execution of zonefs_io_error()
+  which will correct the inode write pointer offset to reflect the
+  current state of the one on the device.
+- Problem 2: Change zonefs_io_error_cb() into zonefs_handle_io_error()
+  and call this function directly from __zonefs_io_error() after
+  obtaining the zone information using blkdev_report_zones() with a
+  simple callback function that copies to a local stack variable the
+  struct blk_zone obtained from the device. This ensures that error
+  handling is performed holding the inode truncate mutex.
+  This change also simplifies error handling for conventional zone files
+  by bypassing the execution of report zones entirely. This is safe to
+  do because the condition of conventional zones cannot be read-only or
+  offline and conventional zone files are always fully mapped with a
+  constant file size.
+
+Reported-by: Shin'ichiro Kawasaki <shinichiro.kawasaki@wdc.com>
+Fixes: 8dcc1a9d90c1 ("fs: New zonefs file system")
+Cc: stable@vger.kernel.org
+Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
+Tested-by: Shin'ichiro Kawasaki <shinichiro.kawasaki@wdc.com>
+Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+Reviewed-by: Himanshu Madhani <himanshu.madhani@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/zonefs/super.c |   70 ++++++++++++++++++++++++++++++------------------------
+ 1 file changed, 40 insertions(+), 30 deletions(-)
+
+--- a/fs/zonefs/super.c
++++ b/fs/zonefs/super.c
+@@ -319,16 +319,18 @@ static loff_t zonefs_check_zone_conditio
+       }
+ }
+ 
+-struct zonefs_ioerr_data {
+-      struct inode    *inode;
+-      bool            write;
+-};
+-
+ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx,
+                             void *data)
+ {
+-      struct zonefs_ioerr_data *err = data;
+-      struct inode *inode = err->inode;
++      struct blk_zone *z = data;
++
++      *z = *zone;
++      return 0;
++}
++
++static void zonefs_handle_io_error(struct inode *inode, struct blk_zone *zone,
++                                 bool write)
++{
+       struct zonefs_inode_info *zi = ZONEFS_I(inode);
+       struct super_block *sb = inode->i_sb;
+       struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
+@@ -344,8 +346,8 @@ static int zonefs_io_error_cb(struct blk
+       isize = i_size_read(inode);
+       if (zone->cond != BLK_ZONE_COND_OFFLINE &&
+           zone->cond != BLK_ZONE_COND_READONLY &&
+-          !err->write && isize == data_size)
+-              return 0;
++          !write && isize == data_size)
++              return;
+ 
+       /*
+        * At this point, we detected either a bad zone or an inconsistency
+@@ -366,8 +368,9 @@ static int zonefs_io_error_cb(struct blk
+        * In all cases, warn about inode size inconsistency and handle the
+        * IO error according to the zone condition and to the mount options.
+        */
+-      if (zi->i_ztype == ZONEFS_ZTYPE_SEQ && isize != data_size)
+-              zonefs_warn(sb, "inode %lu: invalid size %lld (should be %lld)\n",
++      if (isize != data_size)
++              zonefs_warn(sb,
++                          "inode %lu: invalid size %lld (should be %lld)\n",
+                           inode->i_ino, isize, data_size);
+ 
+       /*
+@@ -427,8 +430,6 @@ static int zonefs_io_error_cb(struct blk
+       zonefs_update_stats(inode, data_size);
+       zonefs_i_size_write(inode, data_size);
+       zi->i_wpoffset = data_size;
+-
+-      return 0;
+ }
+ 
+ /*
+@@ -442,23 +443,25 @@ static void __zonefs_io_error(struct ino
+ {
+       struct zonefs_inode_info *zi = ZONEFS_I(inode);
+       struct super_block *sb = inode->i_sb;
+-      struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
+       unsigned int noio_flag;
+-      unsigned int nr_zones = 1;
+-      struct zonefs_ioerr_data err = {
+-              .inode = inode,
+-              .write = write,
+-      };
++      struct blk_zone zone;
+       int ret;
+ 
+       /*
+-       * The only files that have more than one zone are conventional zone
+-       * files with aggregated conventional zones, for which the inode zone
+-       * size is always larger than the device zone size.
+-       */
+-      if (zi->i_zone_size > bdev_zone_sectors(sb->s_bdev))
+-              nr_zones = zi->i_zone_size >>
+-                      (sbi->s_zone_sectors_shift + SECTOR_SHIFT);
++       * Conventional zone have no write pointer and cannot become read-only
++       * or offline. So simply fake a report for a single or aggregated zone
++       * and let zonefs_handle_io_error() correct the zone inode information
++       * according to the mount options.
++       */
++      if (zi->i_ztype != ZONEFS_ZTYPE_SEQ) {
++              zone.start = zi->i_zsector;
++              zone.len = zi->i_max_size >> SECTOR_SHIFT;
++              zone.wp = zone.start + zone.len;
++              zone.type = BLK_ZONE_TYPE_CONVENTIONAL;
++              zone.cond = BLK_ZONE_COND_NOT_WP;
++              zone.capacity = zone.len;
++              goto handle_io_error;
++      }
+ 
+       /*
+        * Memory allocations in blkdev_report_zones() can trigger a memory
+@@ -469,12 +472,19 @@ static void __zonefs_io_error(struct ino
+        * the GFP_NOIO context avoids both problems.
+        */
+       noio_flag = memalloc_noio_save();
+-      ret = blkdev_report_zones(sb->s_bdev, zi->i_zsector, nr_zones,
+-                                zonefs_io_error_cb, &err);
+-      if (ret != nr_zones)
++      ret = blkdev_report_zones(sb->s_bdev, zi->i_zsector, 1,
++                                zonefs_io_error_cb, &zone);
++      memalloc_noio_restore(noio_flag);
++      if (ret != 1) {
+               zonefs_err(sb, "Get inode %lu zone information failed %d\n",
+                          inode->i_ino, ret);
+-      memalloc_noio_restore(noio_flag);
++              zonefs_warn(sb, "remounting filesystem read-only\n");
++              sb->s_flags |= SB_RDONLY;
++              return;
++      }
++
++handle_io_error:
++      zonefs_handle_io_error(inode, &zone, write);
+ }
+ 
+ static void zonefs_io_error(struct inode *inode, bool write)
author	Greg Kroah-Hartman <gregkh@linuxfoundation.org>
	Fri, 23 Feb 2024 16:00:55 +0000 (17:00 +0100)
committer	Greg Kroah-Hartman <gregkh@linuxfoundation.org>
	Fri, 23 Feb 2024 16:00:55 +0000 (17:00 +0100)
queue-5.10/series		patch \| blob \| blame \| history
queue-5.10/userfaultfd-fix-mmap_changing-checking-in-mfill_atomic_hugetlb.patch	[new file with mode: 0644]	patch \| blob
queue-5.10/zonefs-improve-error-handling.patch	[new file with mode: 0644]	patch \| blob