git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
Fixes for 5.10
author    Sasha Levin <sashal@kernel.org>
          Mon, 4 Jan 2021 15:53:13 +0000 (10:53 -0500)
committer Sasha Levin <sashal@kernel.org>
          Mon, 4 Jan 2021 15:53:13 +0000 (10:53 -0500)
Signed-off-by: Sasha Levin <sashal@kernel.org>
26 files changed:
queue-5.10/alsa-pcm-clear-the-full-allocated-memory-at-hw_param.patch [new file with mode: 0644]
queue-5.10/ceph-fix-inode-refcount-leak-when-ceph_fill_inode-on.patch [new file with mode: 0644]
queue-5.10/device-dax-fix-range-release.patch [new file with mode: 0644]
queue-5.10/dm-verity-skip-verity-work-if-i-o-error-when-system-.patch [new file with mode: 0644]
queue-5.10/drm-amd-display-updated-wm-table-for-renoir.patch [new file with mode: 0644]
queue-5.10/ext4-avoid-s_mb_prefetch-to-be-zero-in-individual-sc.patch [new file with mode: 0644]
queue-5.10/f2fs-avoid-race-condition-for-shrinker-count.patch [new file with mode: 0644]
queue-5.10/f2fs-fix-race-of-pending_pages-in-decompression.patch [new file with mode: 0644]
queue-5.10/fs-namespace.c-warn-if-mnt_count-has-become-negative.patch [new file with mode: 0644]
queue-5.10/i3c-master-fix-missing-destroy_workqueue-on-error-in.patch [new file with mode: 0644]
queue-5.10/io_uring-remove-racy-overflow-list-fast-checks.patch [new file with mode: 0644]
queue-5.10/module-delay-kobject-uevent-until-after-module-init-.patch [new file with mode: 0644]
queue-5.10/module-set-module_state_going-state-when-a-module-fa.patch [new file with mode: 0644]
queue-5.10/nfsv4-fix-a-pnfs-layout-related-use-after-free-race-.patch [new file with mode: 0644]
queue-5.10/nfsv4.2-don-t-error-when-exiting-early-on-a-read_plu.patch [new file with mode: 0644]
queue-5.10/powerpc-64-irq-replay-remove-decrementer-overflow-ch.patch [new file with mode: 0644]
queue-5.10/powerpc-sysdev-add-missing-iounmap-on-error-in-mpic_.patch [new file with mode: 0644]
queue-5.10/quota-don-t-overflow-quota-file-offsets.patch [new file with mode: 0644]
queue-5.10/rtc-pl031-fix-resource-leak-in-pl031_probe.patch [new file with mode: 0644]
queue-5.10/rtc-sun6i-fix-memleak-in-sun6i_rtc_clk_init.patch [new file with mode: 0644]
queue-5.10/s390-always-clear-kernel-stack-backchain-before-call.patch [new file with mode: 0644]
queue-5.10/series
queue-5.10/tick-sched-remove-bogus-boot-safety-check.patch [new file with mode: 0644]
queue-5.10/um-random-register-random-as-hwrng-core-device.patch [new file with mode: 0644]
queue-5.10/um-ubd-submit-all-data-segments-atomically.patch [new file with mode: 0644]
queue-5.10/watchdog-rti-wdt-fix-reference-leak-in-rti_wdt_probe.patch [new file with mode: 0644]

diff --git a/queue-5.10/alsa-pcm-clear-the-full-allocated-memory-at-hw_param.patch b/queue-5.10/alsa-pcm-clear-the-full-allocated-memory-at-hw_param.patch
new file mode 100644 (file)
index 0000000..3578a52
--- /dev/null
@@ -0,0 +1,52 @@
+From 2d6a2a446f4ca540fd548786c39da13db8c1a33a Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 18 Dec 2020 15:56:25 +0100
+Subject: ALSA: pcm: Clear the full allocated memory at hw_params
+
+From: Takashi Iwai <tiwai@suse.de>
+
+[ Upstream commit 618de0f4ef11acd8cf26902e65493d46cc20cc89 ]
+
+The PCM hw_params core function tries to clear the PCM buffer
+before it is actually used, to avoid leaking information from the
+previous usages or the usage before a new allocation.  It performs the
+memset() with runtime->dma_bytes, but this might still leave some
+remaining bytes untouched; namely, the PCM buffer size is page-aligned
+for mmap, hence runtime->dma_bytes doesn't necessarily cover
+all PCM buffer pages, and the remaining bytes are exposed via mmap.
+
+This patch changes the memory clearing to cover all buffer pages
+if the stream is supposed to be mmap-ready (which guarantees that the
+buffer size is page-aligned).
+
+Reviewed-by: Lars-Peter Clausen <lars@metafoo.de>
+Link: https://lore.kernel.org/r/20201218145625.2045-3-tiwai@suse.de
+Signed-off-by: Takashi Iwai <tiwai@suse.de>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ sound/core/pcm_native.c | 9 +++++++--
+ 1 file changed, 7 insertions(+), 2 deletions(-)
+
+diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c
+index 47b155a49226f..9f3f8e953ff04 100644
+--- a/sound/core/pcm_native.c
++++ b/sound/core/pcm_native.c
+@@ -755,8 +755,13 @@ static int snd_pcm_hw_params(struct snd_pcm_substream *substream,
+               runtime->boundary *= 2;
+       /* clear the buffer for avoiding possible kernel info leaks */
+-      if (runtime->dma_area && !substream->ops->copy_user)
+-              memset(runtime->dma_area, 0, runtime->dma_bytes);
++      if (runtime->dma_area && !substream->ops->copy_user) {
++              size_t size = runtime->dma_bytes;
++
++              if (runtime->info & SNDRV_PCM_INFO_MMAP)
++                      size = PAGE_ALIGN(size);
++              memset(runtime->dma_area, 0, size);
++      }
+       snd_pcm_timer_resolution_change(substream);
+       snd_pcm_set_state(substream, SNDRV_PCM_STATE_SETUP);
+-- 
+2.27.0
+
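
A minimal user-space sketch of the rule this fix enforces may help: when a
buffer can be mmapped, whole pages become visible, so the clear has to cover
the page-aligned size. PAGE_SIZE and PAGE_ALIGN are local stand-ins for the
kernel macros, and clear_pcm_buffer() is a hypothetical reduction of the
hw_params path, not the driver code itself:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    #define PAGE_SIZE     4096UL
    #define PAGE_ALIGN(x) (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

    static void clear_pcm_buffer(char *dma_area, size_t dma_bytes, int mmap_ready)
    {
            size_t size = dma_bytes;

            if (mmap_ready)
                    size = PAGE_ALIGN(size); /* also wipe the trailing partial page */
            memset(dma_area, 0, size);
    }

    int main(void)
    {
            size_t dma_bytes = 10000;
            char *buf = malloc(PAGE_ALIGN(dma_bytes));

            clear_pcm_buffer(buf, dma_bytes, 1); /* clears 12288 bytes, not 10000 */
            printf("cleared %zu bytes\n", (size_t)PAGE_ALIGN(dma_bytes));
            free(buf);
            return 0;
    }
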
diff --git a/queue-5.10/ceph-fix-inode-refcount-leak-when-ceph_fill_inode-on.patch b/queue-5.10/ceph-fix-inode-refcount-leak-when-ceph_fill_inode-on.patch
new file mode 100644 (file)
index 0000000..b79e128
--- /dev/null
@@ -0,0 +1,34 @@
+From de7b4ea194839e2bfd2dac67a676a20d88923ec1 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 12 Nov 2020 09:37:59 -0500
+Subject: ceph: fix inode refcount leak when ceph_fill_inode on non-I_NEW inode
+ fails
+
+From: Jeff Layton <jlayton@kernel.org>
+
+[ Upstream commit 68cbb8056a4c24c6a38ad2b79e0a9764b235e8fa ]
+
+Signed-off-by: Jeff Layton <jlayton@kernel.org>
+Reviewed-by: Ilya Dryomov <idryomov@gmail.com>
+Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/ceph/inode.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
+index 526faf4778ce4..2462a9a84b956 100644
+--- a/fs/ceph/inode.c
++++ b/fs/ceph/inode.c
+@@ -1335,6 +1335,8 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
+                               in, ceph_vinop(in));
+                       if (in->i_state & I_NEW)
+                               discard_new_inode(in);
++                      else
++                              iput(in);
+                       goto done;
+               }
+               req->r_target_inode = in;
+-- 
+2.27.0
+
diff --git a/queue-5.10/device-dax-fix-range-release.patch b/queue-5.10/device-dax-fix-range-release.patch
new file mode 100644 (file)
index 0000000..3160013
--- /dev/null
@@ -0,0 +1,134 @@
+From a09dd794b26d82c969c4563bda1eea962aebf87c Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 18 Dec 2020 18:41:41 -0800
+Subject: device-dax: Fix range release
+
+From: Dan Williams <dan.j.williams@intel.com>
+
+[ Upstream commit 6268d7da4d192af339f4d688942b9ccb45a65e04 ]
+
+There are multiple locations that open-code the release of the last
+range in a device-dax instance. Consolidate this into a new
+dev_dax_trim_range() helper.
+
+This also addresses a kmemleak report:
+
+# cat /sys/kernel/debug/kmemleak
+[..]
+unreferenced object 0xffff976bd46f6240 (size 64):
+   comm "ndctl", pid 23556, jiffies 4299514316 (age 5406.733s)
+   hex dump (first 32 bytes):
+     00 00 00 00 00 00 00 00 00 00 20 c3 37 00 00 00  .......... .7...
+     ff ff ff 7f 38 00 00 00 00 00 00 00 00 00 00 00  ....8...........
+   backtrace:
+     [<00000000064003cf>] __kmalloc_track_caller+0x136/0x379
+     [<00000000d85e3c52>] krealloc+0x67/0x92
+     [<00000000d7d3ba8a>] __alloc_dev_dax_range+0x73/0x25c
+     [<0000000027d58626>] devm_create_dev_dax+0x27d/0x416
+     [<00000000434abd43>] __dax_pmem_probe+0x1c9/0x1000 [dax_pmem_core]
+     [<0000000083726c1c>] dax_pmem_probe+0x10/0x1f [dax_pmem]
+     [<00000000b5f2319c>] nvdimm_bus_probe+0x9d/0x340 [libnvdimm]
+     [<00000000c055e544>] really_probe+0x230/0x48d
+     [<000000006cabd38e>] driver_probe_device+0x122/0x13b
+     [<0000000029c7b95a>] device_driver_attach+0x5b/0x60
+     [<0000000053e5659b>] bind_store+0xb7/0xc3
+     [<00000000d3bdaadc>] drv_attr_store+0x27/0x31
+     [<00000000949069c5>] sysfs_kf_write+0x4a/0x57
+     [<000000004a8b5adf>] kernfs_fop_write+0x150/0x1e5
+     [<00000000bded60f0>] __vfs_write+0x1b/0x34
+     [<00000000b92900f0>] vfs_write+0xd8/0x1d1
+
+Reported-by: Jane Chu <jane.chu@oracle.com>
+Cc: Zhen Lei <thunder.leizhen@huawei.com>
+Link: https://lore.kernel.org/r/160834570161.1791850.14911670304441510419.stgit@dwillia2-desk3.amr.corp.intel.com
+Signed-off-by: Dan Williams <dan.j.williams@intel.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/dax/bus.c | 44 +++++++++++++++++++++-----------------------
+ 1 file changed, 21 insertions(+), 23 deletions(-)
+
+diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c
+index 27513d311242e..de7b74505e75e 100644
+--- a/drivers/dax/bus.c
++++ b/drivers/dax/bus.c
+@@ -367,19 +367,28 @@ void kill_dev_dax(struct dev_dax *dev_dax)
+ }
+ EXPORT_SYMBOL_GPL(kill_dev_dax);
+-static void free_dev_dax_ranges(struct dev_dax *dev_dax)
++static void trim_dev_dax_range(struct dev_dax *dev_dax)
+ {
++      int i = dev_dax->nr_range - 1;
++      struct range *range = &dev_dax->ranges[i].range;
+       struct dax_region *dax_region = dev_dax->region;
+-      int i;
+       device_lock_assert(dax_region->dev);
+-      for (i = 0; i < dev_dax->nr_range; i++) {
+-              struct range *range = &dev_dax->ranges[i].range;
+-
+-              __release_region(&dax_region->res, range->start,
+-                              range_len(range));
++      dev_dbg(&dev_dax->dev, "delete range[%d]: %#llx:%#llx\n", i,
++              (unsigned long long)range->start,
++              (unsigned long long)range->end);
++
++      __release_region(&dax_region->res, range->start, range_len(range));
++      if (--dev_dax->nr_range == 0) {
++              kfree(dev_dax->ranges);
++              dev_dax->ranges = NULL;
+       }
+-      dev_dax->nr_range = 0;
++}
++
++static void free_dev_dax_ranges(struct dev_dax *dev_dax)
++{
++      while (dev_dax->nr_range)
++              trim_dev_dax_range(dev_dax);
+ }
+ static void unregister_dev_dax(void *dev)
+@@ -804,15 +813,10 @@ static int alloc_dev_dax_range(struct dev_dax *dev_dax, u64 start,
+               return 0;
+       rc = devm_register_dax_mapping(dev_dax, dev_dax->nr_range - 1);
+-      if (rc) {
+-              dev_dbg(dev, "delete range[%d]: %pa:%pa\n", dev_dax->nr_range - 1,
+-                              &alloc->start, &alloc->end);
+-              dev_dax->nr_range--;
+-              __release_region(res, alloc->start, resource_size(alloc));
+-              return rc;
+-      }
++      if (rc)
++              trim_dev_dax_range(dev_dax);
+-      return 0;
++      return rc;
+ }
+ static int adjust_dev_dax_range(struct dev_dax *dev_dax, struct resource *res, resource_size_t size)
+@@ -885,12 +889,7 @@ static int dev_dax_shrink(struct dev_dax *dev_dax, resource_size_t size)
+               if (shrink >= range_len(range)) {
+                       devm_release_action(dax_region->dev,
+                                       unregister_dax_mapping, &mapping->dev);
+-                      __release_region(&dax_region->res, range->start,
+-                                      range_len(range));
+-                      dev_dax->nr_range--;
+-                      dev_dbg(dev, "delete range[%d]: %#llx:%#llx\n", i,
+-                                      (unsigned long long) range->start,
+-                                      (unsigned long long) range->end);
++                      trim_dev_dax_range(dev_dax);
+                       to_shrink -= shrink;
+                       if (!to_shrink)
+                               break;
+@@ -1274,7 +1273,6 @@ static void dev_dax_release(struct device *dev)
+       put_dax(dax_dev);
+       free_dev_dax_id(dev_dax);
+       dax_region_put(dax_region);
+-      kfree(dev_dax->ranges);
+       kfree(dev_dax->pgmap);
+       kfree(dev_dax);
+ }
+-- 
+2.27.0
+
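
For reference, a stand-alone sketch of the "trim the last element, free the
array once it is empty" pattern that the new dev_dax_trim_range() helper
centralizes; the struct names here are illustrative rather than the driver's,
and the __release_region() call is reduced to a comment:

    #include <stdio.h>
    #include <stdlib.h>

    struct range { unsigned long start, end; };

    struct dev_ranges {
            struct range *ranges;
            int nr_range;
    };

    /* Release the last range; once none remain, free the backing array
     * exactly once. The open-coded error paths skipped this final free,
     * which is the allocation kmemleak flagged. */
    static void trim_last_range(struct dev_ranges *d)
    {
            /* the real helper __release_region()s ranges[nr_range - 1] here */
            if (--d->nr_range == 0) {
                    free(d->ranges);
                    d->ranges = NULL;
            }
    }

    int main(void)
    {
            struct dev_ranges d = { calloc(4, sizeof(struct range)), 4 };

            while (d.nr_range)      /* the free_dev_dax_ranges() loop */
                    trim_last_range(&d);
            printf("ranges pointer after teardown: %p\n", (void *)d.ranges);
            return 0;
    }
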
diff --git a/queue-5.10/dm-verity-skip-verity-work-if-i-o-error-when-system-.patch b/queue-5.10/dm-verity-skip-verity-work-if-i-o-error-when-system-.patch
new file mode 100644 (file)
index 0000000..a4b22dc
--- /dev/null
@@ -0,0 +1,59 @@
+From 5e7a77e76c8ba928983f571ef463fe624bc25d0c Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 3 Dec 2020 09:46:59 +0900
+Subject: dm verity: skip verity work if I/O error when system is shutting down
+
+From: Hyeongseok Kim <hyeongseok@gmail.com>
+
+[ Upstream commit 252bd1256396cebc6fc3526127fdb0b317601318 ]
+
+If an emergency system shutdown is invoked, for example by a thermal
+shutdown, a dm device could still be alive after the block device can
+no longer process I/O requests. In this state, handling the I/O errors
+seen by new dm I/O requests, or by those already in flight, can push
+verity into a corruption state, which is a misjudgment.
+
+So, skip verity work in response to an I/O error when the system is
+shutting down.
+
+Signed-off-by: Hyeongseok Kim <hyeongseok@gmail.com>
+Reviewed-by: Sami Tolvanen <samitolvanen@google.com>
+Signed-off-by: Mike Snitzer <snitzer@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/md/dm-verity-target.c | 12 +++++++++++-
+ 1 file changed, 11 insertions(+), 1 deletion(-)
+
+diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c
+index f74982dcbea0d..6b8e5bdd8526d 100644
+--- a/drivers/md/dm-verity-target.c
++++ b/drivers/md/dm-verity-target.c
+@@ -537,6 +537,15 @@ static int verity_verify_io(struct dm_verity_io *io)
+       return 0;
+ }
++/*
++ * Skip verity work in response to I/O error when system is shutting down.
++ */
++static inline bool verity_is_system_shutting_down(void)
++{
++      return system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF
++              || system_state == SYSTEM_RESTART;
++}
++
+ /*
+  * End one "io" structure with a given error.
+  */
+@@ -564,7 +573,8 @@ static void verity_end_io(struct bio *bio)
+ {
+       struct dm_verity_io *io = bio->bi_private;
+-      if (bio->bi_status && !verity_fec_is_enabled(io->v)) {
++      if (bio->bi_status &&
++          (!verity_fec_is_enabled(io->v) || verity_is_system_shutting_down())) {
+               verity_finish_io(io, bio->bi_status);
+               return;
+       }
+-- 
+2.27.0
+
diff --git a/queue-5.10/drm-amd-display-updated-wm-table-for-renoir.patch b/queue-5.10/drm-amd-display-updated-wm-table-for-renoir.patch
new file mode 100644 (file)
index 0000000..f6382b6
--- /dev/null
@@ -0,0 +1,63 @@
+From 395e3945ad577c47057fe6b8883920be561092c7 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 3 Dec 2020 14:05:56 -0500
+Subject: drm/amd/display: updated wm table for Renoir
+
+From: Jake Wang <haonan.wang2@amd.com>
+
+[ Upstream commit 410066d24cfc1071be25e402510367aca9db5cb6 ]
+
+[Why]
+For certain timings, Renoir may underflow due to sr exit
+latency being too slow.
+
+[How]
+Updated wm table for renoir.
+
+Signed-off-by: Jake Wang <haonan.wang2@amd.com>
+Reviewed-by: Yongqiang Sun <yongqiang.sun@amd.com>
+Acked-by: Qingqing Zhuo <qingqing.zhuo@amd.com>
+Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ .../drm/amd/display/dc/clk_mgr/dcn21/rn_clk_mgr.c    | 12 ++++++------
+ 1 file changed, 6 insertions(+), 6 deletions(-)
+
+diff --git a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn21/rn_clk_mgr.c b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn21/rn_clk_mgr.c
+index 6b431db146cd9..1c6e401dd4cce 100644
+--- a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn21/rn_clk_mgr.c
++++ b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn21/rn_clk_mgr.c
+@@ -704,24 +704,24 @@ static struct wm_table ddr4_wm_table_rn = {
+                       .wm_inst = WM_B,
+                       .wm_type = WM_TYPE_PSTATE_CHG,
+                       .pstate_latency_us = 11.72,
+-                      .sr_exit_time_us = 10.12,
+-                      .sr_enter_plus_exit_time_us = 11.48,
++                      .sr_exit_time_us = 11.12,
++                      .sr_enter_plus_exit_time_us = 12.48,
+                       .valid = true,
+               },
+               {
+                       .wm_inst = WM_C,
+                       .wm_type = WM_TYPE_PSTATE_CHG,
+                       .pstate_latency_us = 11.72,
+-                      .sr_exit_time_us = 10.12,
+-                      .sr_enter_plus_exit_time_us = 11.48,
++                      .sr_exit_time_us = 11.12,
++                      .sr_enter_plus_exit_time_us = 12.48,
+                       .valid = true,
+               },
+               {
+                       .wm_inst = WM_D,
+                       .wm_type = WM_TYPE_PSTATE_CHG,
+                       .pstate_latency_us = 11.72,
+-                      .sr_exit_time_us = 10.12,
+-                      .sr_enter_plus_exit_time_us = 11.48,
++                      .sr_exit_time_us = 11.12,
++                      .sr_enter_plus_exit_time_us = 12.48,
+                       .valid = true,
+               },
+       }
+-- 
+2.27.0
+
diff --git a/queue-5.10/ext4-avoid-s_mb_prefetch-to-be-zero-in-individual-sc.patch b/queue-5.10/ext4-avoid-s_mb_prefetch-to-be-zero-in-individual-sc.patch
new file mode 100644 (file)
index 0000000..2644928
--- /dev/null
@@ -0,0 +1,71 @@
+From 0781df1b520bf59b12c373d6925f3300f23c801d Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 4 Dec 2020 11:05:43 +0800
+Subject: ext4: avoid s_mb_prefetch to be zero in individual scenarios
+
+From: Chunguang Xu <brookxu@tencent.com>
+
+[ Upstream commit 82ef1370b0c1757ab4ce29f34c52b4e93839b0aa ]
+
+Commit cfd732377221 ("ext4: add prefetching for block allocation
+bitmaps") introduced block bitmap prefetch, and expects to read block
+bitmaps of flex_bg through an IO.  However, it seems to ignore the
+value range of s_log_groups_per_flex.  In the scenario where the value
+of s_log_groups_per_flex is greater than 27, s_mb_prefetch or
+s_mb_prefetch_limit will overflow, causing a divide-by-zero exception.
+
+In addition, the logic of calculating nr is also flawed, because the
+size of flexbg is fixed during a single mount, but s_mb_prefetch can
+be modified, which causes nr to fail to meet the value condition of
+[1, flexbg_size].
+
+To solve this problem, we need to set the upper limit of
+s_mb_prefetch.  Since we expect to load block bitmaps of a flex_bg
+through an IO, we can consider determining a reasonable upper limit
+among the IO limit parameters.  After consideration, we chose
+BLK_MAX_SEGMENT_SIZE.  This is a good choice for solving the
+divide-by-zero problem while avoiding performance degradation.
+
+[ Some minor code simplifications to make the changes easy to follow -- TYT ]
+
+Reported-by: Tosk Robot <tencent_os_robot@tencent.com>
+Signed-off-by: Chunguang Xu <brookxu@tencent.com>
+Reviewed-by: Samuel Liao <samuelliao@tencent.com>
+Reviewed-by: Andreas Dilger <adilger@dilger.ca>
+Link: https://lore.kernel.org/r/1607051143-24508-1-git-send-email-brookxu@tencent.com
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/ext4/mballoc.c | 9 +++++----
+ 1 file changed, 5 insertions(+), 4 deletions(-)
+
+diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
+index 37a619bf1ac7c..e67d5de6f28ca 100644
+--- a/fs/ext4/mballoc.c
++++ b/fs/ext4/mballoc.c
+@@ -2395,9 +2395,9 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
+                               nr = sbi->s_mb_prefetch;
+                               if (ext4_has_feature_flex_bg(sb)) {
+-                                      nr = (group / sbi->s_mb_prefetch) *
+-                                              sbi->s_mb_prefetch;
+-                                      nr = nr + sbi->s_mb_prefetch - group;
++                                      nr = 1 << sbi->s_log_groups_per_flex;
++                                      nr -= group & (nr - 1);
++                                      nr = min(nr, sbi->s_mb_prefetch);
+                               }
+                               prefetch_grp = ext4_mb_prefetch(sb, group,
+                                                       nr, &prefetch_ios);
+@@ -2733,7 +2733,8 @@ static int ext4_mb_init_backend(struct super_block *sb)
+       if (ext4_has_feature_flex_bg(sb)) {
+               /* a single flex group is supposed to be read by a single IO */
+-              sbi->s_mb_prefetch = 1 << sbi->s_es->s_log_groups_per_flex;
++              sbi->s_mb_prefetch = min(1 << sbi->s_es->s_log_groups_per_flex,
++                      BLK_MAX_SEGMENT_SIZE >> (sb->s_blocksize_bits - 9));
+               sbi->s_mb_prefetch *= 8; /* 8 prefetch IOs in flight at most */
+       } else {
+               sbi->s_mb_prefetch = 32;
+-- 
+2.27.0
+
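
One detail worth unpacking: the fixed computation relies on the flexbg size
being a power of two, so "group & (nr - 1)" is the group's offset within its
flex group, and nr always lands in [1, flexbg_size]. A small self-contained
illustration with made-up numbers:

    #include <stdio.h>

    int main(void)
    {
            unsigned int log_groups_per_flex = 4;  /* flexbg of 16 groups */
            unsigned int group = 37;               /* arbitrary group number */
            unsigned int prefetch_limit = 8;       /* stand-in for sbi->s_mb_prefetch */
            unsigned int nr;

            nr = 1U << log_groups_per_flex;  /* groups per flex_bg (power of two) */
            nr -= group & (nr - 1);          /* groups left before the flexbg boundary */
            if (nr > prefetch_limit)         /* min(nr, s_mb_prefetch) in the patch */
                    nr = prefetch_limit;

            printf("prefetch %u groups starting at group %u\n", nr, group);
            return 0;
    }
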
diff --git a/queue-5.10/f2fs-avoid-race-condition-for-shrinker-count.patch b/queue-5.10/f2fs-avoid-race-condition-for-shrinker-count.patch
new file mode 100644 (file)
index 0000000..1f99b36
--- /dev/null
@@ -0,0 +1,236 @@
+From b113fd5a4b803810a8474423935612e94c5ef906 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 6 Nov 2020 13:22:05 -0800
+Subject: f2fs: avoid race condition for shrinker count
+
+From: Jaegeuk Kim <jaegeuk@kernel.org>
+
+[ Upstream commit a95ba66ac1457b76fe472c8e092ab1006271f16c ]
+
+Light reported that sometimes the shrinker sees nat_cnt < dirty_nat_cnt,
+resulting in wrong do_shrinker work. Let's avoid returning an insanely
+overflowed value by adding a single tracking value.
+
+Reported-by: Light Hsieh <Light.Hsieh@mediatek.com>
+Reviewed-by: Chao Yu <yuchao0@huawei.com>
+Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/f2fs/checkpoint.c |  2 +-
+ fs/f2fs/debug.c      | 11 ++++++-----
+ fs/f2fs/f2fs.h       | 10 ++++++++--
+ fs/f2fs/node.c       | 29 ++++++++++++++++++-----------
+ fs/f2fs/node.h       |  4 ++--
+ fs/f2fs/shrinker.c   |  4 +---
+ 6 files changed, 36 insertions(+), 24 deletions(-)
+
+diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
+index 023462e80e58d..b39bf416d5114 100644
+--- a/fs/f2fs/checkpoint.c
++++ b/fs/f2fs/checkpoint.c
+@@ -1600,7 +1600,7 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
+                       goto out;
+               }
+-              if (NM_I(sbi)->dirty_nat_cnt == 0 &&
++              if (NM_I(sbi)->nat_cnt[DIRTY_NAT] == 0 &&
+                               SIT_I(sbi)->dirty_sentries == 0 &&
+                               prefree_segments(sbi) == 0) {
+                       f2fs_flush_sit_entries(sbi, cpc);
+diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
+index a8357fd4f5fab..197c914119da8 100644
+--- a/fs/f2fs/debug.c
++++ b/fs/f2fs/debug.c
+@@ -145,8 +145,8 @@ static void update_general_status(struct f2fs_sb_info *sbi)
+               si->node_pages = NODE_MAPPING(sbi)->nrpages;
+       if (sbi->meta_inode)
+               si->meta_pages = META_MAPPING(sbi)->nrpages;
+-      si->nats = NM_I(sbi)->nat_cnt;
+-      si->dirty_nats = NM_I(sbi)->dirty_nat_cnt;
++      si->nats = NM_I(sbi)->nat_cnt[TOTAL_NAT];
++      si->dirty_nats = NM_I(sbi)->nat_cnt[DIRTY_NAT];
+       si->sits = MAIN_SEGS(sbi);
+       si->dirty_sits = SIT_I(sbi)->dirty_sentries;
+       si->free_nids = NM_I(sbi)->nid_cnt[FREE_NID];
+@@ -278,9 +278,10 @@ static void update_mem_info(struct f2fs_sb_info *sbi)
+       si->cache_mem += (NM_I(sbi)->nid_cnt[FREE_NID] +
+                               NM_I(sbi)->nid_cnt[PREALLOC_NID]) *
+                               sizeof(struct free_nid);
+-      si->cache_mem += NM_I(sbi)->nat_cnt * sizeof(struct nat_entry);
+-      si->cache_mem += NM_I(sbi)->dirty_nat_cnt *
+-                                      sizeof(struct nat_entry_set);
++      si->cache_mem += NM_I(sbi)->nat_cnt[TOTAL_NAT] *
++                              sizeof(struct nat_entry);
++      si->cache_mem += NM_I(sbi)->nat_cnt[DIRTY_NAT] *
++                              sizeof(struct nat_entry_set);
+       si->cache_mem += si->inmem_pages * sizeof(struct inmem_pages);
+       for (i = 0; i < MAX_INO_ENTRY; i++)
+               si->cache_mem += sbi->im[i].ino_num * sizeof(struct ino_entry);
+diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
+index 9a321c52facec..e4344d98a780c 100644
+--- a/fs/f2fs/f2fs.h
++++ b/fs/f2fs/f2fs.h
+@@ -894,6 +894,13 @@ enum nid_state {
+       MAX_NID_STATE,
+ };
++enum nat_state {
++      TOTAL_NAT,
++      DIRTY_NAT,
++      RECLAIMABLE_NAT,
++      MAX_NAT_STATE,
++};
++
+ struct f2fs_nm_info {
+       block_t nat_blkaddr;            /* base disk address of NAT */
+       nid_t max_nid;                  /* maximum possible node ids */
+@@ -909,8 +916,7 @@ struct f2fs_nm_info {
+       struct rw_semaphore nat_tree_lock;      /* protect nat_tree_lock */
+       struct list_head nat_entries;   /* cached nat entry list (clean) */
+       spinlock_t nat_list_lock;       /* protect clean nat entry list */
+-      unsigned int nat_cnt;           /* the # of cached nat entries */
+-      unsigned int dirty_nat_cnt;     /* total num of nat entries in set */
++      unsigned int nat_cnt[MAX_NAT_STATE]; /* the # of cached nat entries */
+       unsigned int nat_blocks;        /* # of nat blocks */
+       /* free node ids management */
+diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
+index 42394de6c7eb1..e65d73293a3f6 100644
+--- a/fs/f2fs/node.c
++++ b/fs/f2fs/node.c
+@@ -62,8 +62,8 @@ bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type)
+                               sizeof(struct free_nid)) >> PAGE_SHIFT;
+               res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2);
+       } else if (type == NAT_ENTRIES) {
+-              mem_size = (nm_i->nat_cnt * sizeof(struct nat_entry)) >>
+-                                                      PAGE_SHIFT;
++              mem_size = (nm_i->nat_cnt[TOTAL_NAT] *
++                              sizeof(struct nat_entry)) >> PAGE_SHIFT;
+               res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2);
+               if (excess_cached_nats(sbi))
+                       res = false;
+@@ -177,7 +177,8 @@ static struct nat_entry *__init_nat_entry(struct f2fs_nm_info *nm_i,
+       list_add_tail(&ne->list, &nm_i->nat_entries);
+       spin_unlock(&nm_i->nat_list_lock);
+-      nm_i->nat_cnt++;
++      nm_i->nat_cnt[TOTAL_NAT]++;
++      nm_i->nat_cnt[RECLAIMABLE_NAT]++;
+       return ne;
+ }
+@@ -207,7 +208,8 @@ static unsigned int __gang_lookup_nat_cache(struct f2fs_nm_info *nm_i,
+ static void __del_from_nat_cache(struct f2fs_nm_info *nm_i, struct nat_entry *e)
+ {
+       radix_tree_delete(&nm_i->nat_root, nat_get_nid(e));
+-      nm_i->nat_cnt--;
++      nm_i->nat_cnt[TOTAL_NAT]--;
++      nm_i->nat_cnt[RECLAIMABLE_NAT]--;
+       __free_nat_entry(e);
+ }
+@@ -253,7 +255,8 @@ static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i,
+       if (get_nat_flag(ne, IS_DIRTY))
+               goto refresh_list;
+-      nm_i->dirty_nat_cnt++;
++      nm_i->nat_cnt[DIRTY_NAT]++;
++      nm_i->nat_cnt[RECLAIMABLE_NAT]--;
+       set_nat_flag(ne, IS_DIRTY, true);
+ refresh_list:
+       spin_lock(&nm_i->nat_list_lock);
+@@ -273,7 +276,8 @@ static void __clear_nat_cache_dirty(struct f2fs_nm_info *nm_i,
+       set_nat_flag(ne, IS_DIRTY, false);
+       set->entry_cnt--;
+-      nm_i->dirty_nat_cnt--;
++      nm_i->nat_cnt[DIRTY_NAT]--;
++      nm_i->nat_cnt[RECLAIMABLE_NAT]++;
+ }
+ static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i,
+@@ -2944,14 +2948,17 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
+       LIST_HEAD(sets);
+       int err = 0;
+-      /* during unmount, let's flush nat_bits before checking dirty_nat_cnt */
++      /*
++       * during unmount, let's flush nat_bits before checking
++       * nat_cnt[DIRTY_NAT].
++       */
+       if (enabled_nat_bits(sbi, cpc)) {
+               down_write(&nm_i->nat_tree_lock);
+               remove_nats_in_journal(sbi);
+               up_write(&nm_i->nat_tree_lock);
+       }
+-      if (!nm_i->dirty_nat_cnt)
++      if (!nm_i->nat_cnt[DIRTY_NAT])
+               return 0;
+       down_write(&nm_i->nat_tree_lock);
+@@ -2962,7 +2969,8 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
+        * into nat entry set.
+        */
+       if (enabled_nat_bits(sbi, cpc) ||
+-              !__has_cursum_space(journal, nm_i->dirty_nat_cnt, NAT_JOURNAL))
++              !__has_cursum_space(journal,
++                      nm_i->nat_cnt[DIRTY_NAT], NAT_JOURNAL))
+               remove_nats_in_journal(sbi);
+       while ((found = __gang_lookup_nat_set(nm_i,
+@@ -3086,7 +3094,6 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
+                                               F2FS_RESERVED_NODE_NUM;
+       nm_i->nid_cnt[FREE_NID] = 0;
+       nm_i->nid_cnt[PREALLOC_NID] = 0;
+-      nm_i->nat_cnt = 0;
+       nm_i->ram_thresh = DEF_RAM_THRESHOLD;
+       nm_i->ra_nid_pages = DEF_RA_NID_PAGES;
+       nm_i->dirty_nats_ratio = DEF_DIRTY_NAT_RATIO_THRESHOLD;
+@@ -3220,7 +3227,7 @@ void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi)
+                       __del_from_nat_cache(nm_i, natvec[idx]);
+               }
+       }
+-      f2fs_bug_on(sbi, nm_i->nat_cnt);
++      f2fs_bug_on(sbi, nm_i->nat_cnt[TOTAL_NAT]);
+       /* destroy nat set cache */
+       nid = 0;
+diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
+index 69e5859e993cf..f84541b57acbb 100644
+--- a/fs/f2fs/node.h
++++ b/fs/f2fs/node.h
+@@ -126,13 +126,13 @@ static inline void raw_nat_from_node_info(struct f2fs_nat_entry *raw_ne,
+ static inline bool excess_dirty_nats(struct f2fs_sb_info *sbi)
+ {
+-      return NM_I(sbi)->dirty_nat_cnt >= NM_I(sbi)->max_nid *
++      return NM_I(sbi)->nat_cnt[DIRTY_NAT] >= NM_I(sbi)->max_nid *
+                                       NM_I(sbi)->dirty_nats_ratio / 100;
+ }
+ static inline bool excess_cached_nats(struct f2fs_sb_info *sbi)
+ {
+-      return NM_I(sbi)->nat_cnt >= DEF_NAT_CACHE_THRESHOLD;
++      return NM_I(sbi)->nat_cnt[TOTAL_NAT] >= DEF_NAT_CACHE_THRESHOLD;
+ }
+ static inline bool excess_dirty_nodes(struct f2fs_sb_info *sbi)
+diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c
+index d66de5999a26d..dd3c3c7a90ec8 100644
+--- a/fs/f2fs/shrinker.c
++++ b/fs/f2fs/shrinker.c
+@@ -18,9 +18,7 @@ static unsigned int shrinker_run_no;
+ static unsigned long __count_nat_entries(struct f2fs_sb_info *sbi)
+ {
+-      long count = NM_I(sbi)->nat_cnt - NM_I(sbi)->dirty_nat_cnt;
+-
+-      return count > 0 ? count : 0;
++      return NM_I(sbi)->nat_cnt[RECLAIMABLE_NAT];
+ }
+ static unsigned long __count_free_nids(struct f2fs_sb_info *sbi)
+-- 
+2.27.0
+
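
The essence of the fix, reduced to a runnable sketch: deriving the
reclaimable count as total minus dirty from two counters updated at
different moments can transiently underflow, while a dedicated counter
updated in the same step cannot. The enum mirrors the patch; locking is
elided:

    #include <stdio.h>

    enum nat_state { TOTAL_NAT, DIRTY_NAT, RECLAIMABLE_NAT, MAX_NAT_STATE };

    static unsigned int nat_cnt[MAX_NAT_STATE];

    static void add_entry(void)
    {
            nat_cnt[TOTAL_NAT]++;
            nat_cnt[RECLAIMABLE_NAT]++;
    }

    static void set_dirty(void)
    {
            nat_cnt[DIRTY_NAT]++;
            nat_cnt[RECLAIMABLE_NAT]--;     /* both counters move in one step */
    }

    /* The old shrinker computed nat_cnt - dirty_nat_cnt; read while another
     * CPU was between its two counter updates, dirty could exceed total and
     * the unsigned subtraction wrapped. The dedicated counter has no such
     * window. */
    int main(void)
    {
            add_entry();
            set_dirty();
            printf("reclaimable: %u\n", nat_cnt[RECLAIMABLE_NAT]);
            return 0;
    }
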
diff --git a/queue-5.10/f2fs-fix-race-of-pending_pages-in-decompression.patch b/queue-5.10/f2fs-fix-race-of-pending_pages-in-decompression.patch
new file mode 100644 (file)
index 0000000..682cdf5
--- /dev/null
@@ -0,0 +1,240 @@
+From cdbb5d70b70a016bc4b9045dfc7adb08e8718ff6 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sat, 5 Dec 2020 13:26:26 +0900
+Subject: f2fs: fix race of pending_pages in decompression
+
+From: Daeho Jeong <daehojeong@google.com>
+
+[ Upstream commit 6422a71ef40e4751d59b8c9412e7e2dafe085878 ]
+
+I found out f2fs_free_dic() was invoked at the wrong time, while
+f2fs_verify_bio() still needed the dic info, which triggered the
+kernel panic below. It was caused by a race condition on the
+pending_pages value between the decompression and verity logic,
+when the same compression cluster had been split across different
+bios. With split bios, f2fs_verify_bio() ended up decreasing the
+pending_pages value before it was reset to nr_cpages by
+f2fs_decompress_pages(), which caused the kernel panic.
+
+[ 4416.564763] Unable to handle kernel NULL pointer dereference
+               at virtual address 0000000000000000
+...
+[ 4416.896016] Workqueue: fsverity_read_queue f2fs_verity_work
+[ 4416.908515] pc : fsverity_verify_page+0x20/0x78
+[ 4416.913721] lr : f2fs_verify_bio+0x11c/0x29c
+[ 4416.913722] sp : ffffffc019533cd0
+[ 4416.913723] x29: ffffffc019533cd0 x28: 0000000000000402
+[ 4416.913724] x27: 0000000000000001 x26: 0000000000000100
+[ 4416.913726] x25: 0000000000000001 x24: 0000000000000004
+[ 4416.913727] x23: 0000000000001000 x22: 0000000000000000
+[ 4416.913728] x21: 0000000000000000 x20: ffffffff2076f9c0
+[ 4416.913729] x19: ffffffff2076f9c0 x18: ffffff8a32380c30
+[ 4416.913731] x17: ffffffc01f966d97 x16: 0000000000000298
+[ 4416.913732] x15: 0000000000000000 x14: 0000000000000000
+[ 4416.913733] x13: f074faec89ffffff x12: 0000000000000000
+[ 4416.913734] x11: 0000000000001000 x10: 0000000000001000
+[ 4416.929176] x9 : ffffffff20d1f5c7 x8 : 0000000000000000
+[ 4416.929178] x7 : 626d7464ff286b6b x6 : ffffffc019533ade
+[ 4416.929179] x5 : 000000008049000e x4 : ffffffff2793e9e0
+[ 4416.929180] x3 : 000000008049000e x2 : ffffff89ecfa74d0
+[ 4416.929181] x1 : 0000000000000c40 x0 : ffffffff2076f9c0
+[ 4416.929184] Call trace:
+[ 4416.929187]  fsverity_verify_page+0x20/0x78
+[ 4416.929189]  f2fs_verify_bio+0x11c/0x29c
+[ 4416.929192]  f2fs_verity_work+0x58/0x84
+[ 4417.050667]  process_one_work+0x270/0x47c
+[ 4417.055354]  worker_thread+0x27c/0x4d8
+[ 4417.059784]  kthread+0x13c/0x320
+[ 4417.063693]  ret_from_fork+0x10/0x18
+
+Chao pointed this can happen by the below race condition.
+
+Thread A        f2fs_post_read_wq          fsverity_wq
+- f2fs_read_multi_pages()
+  - f2fs_alloc_dic
+   - dic->pending_pages = 2
+   - submit_bio()
+   - submit_bio()
+               - f2fs_post_read_work() handle first bio
+                - f2fs_decompress_work()
+                 - __read_end_io()
+                  - f2fs_decompress_pages()
+                   - dic->pending_pages--
+                - enqueue f2fs_verity_work()
+                                           - f2fs_verity_work() handle first bio
+                                            - f2fs_verify_bio()
+                                             - dic->pending_pages--
+               - f2fs_post_read_work() handle second bio
+                - f2fs_decompress_work()
+                - enqueue f2fs_verity_work()
+                                            - f2fs_verify_pages()
+                                            - f2fs_free_dic()
+
+                                          - f2fs_verity_work() handle second bio
+                                           - f2fs_verify_bio()
+                                                 - use-after-free on dic
+
+Signed-off-by: Daeho Jeong <daehojeong@google.com>
+Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/f2fs/compress.c |  2 --
+ fs/f2fs/data.c     | 58 +++++++++++++++++++++++++++++++++++++---------
+ fs/f2fs/f2fs.h     |  1 +
+ 3 files changed, 48 insertions(+), 13 deletions(-)
+
+diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
+index 14262e0f1cd60..c5fee4d7ea72f 100644
+--- a/fs/f2fs/compress.c
++++ b/fs/f2fs/compress.c
+@@ -798,8 +798,6 @@ void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity)
+       if (cops->destroy_decompress_ctx)
+               cops->destroy_decompress_ctx(dic);
+ out_free_dic:
+-      if (verity)
+-              atomic_set(&dic->pending_pages, dic->nr_cpages);
+       if (!verity)
+               f2fs_decompress_end_io(dic->rpages, dic->cluster_size,
+                                                               ret, false);
+diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
+index be4da52604edc..b29243ee1c3e5 100644
+--- a/fs/f2fs/data.c
++++ b/fs/f2fs/data.c
+@@ -202,7 +202,7 @@ static void f2fs_verify_bio(struct bio *bio)
+               dic = (struct decompress_io_ctx *)page_private(page);
+               if (dic) {
+-                      if (atomic_dec_return(&dic->pending_pages))
++                      if (atomic_dec_return(&dic->verity_pages))
+                               continue;
+                       f2fs_verify_pages(dic->rpages,
+                                               dic->cluster_size);
+@@ -1027,7 +1027,8 @@ static inline bool f2fs_need_verity(const struct inode *inode, pgoff_t idx)
+ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr,
+                                     unsigned nr_pages, unsigned op_flag,
+-                                    pgoff_t first_idx, bool for_write)
++                                    pgoff_t first_idx, bool for_write,
++                                    bool for_verity)
+ {
+       struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+       struct bio *bio;
+@@ -1049,7 +1050,7 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr,
+               post_read_steps |= 1 << STEP_DECRYPT;
+       if (f2fs_compressed_file(inode))
+               post_read_steps |= 1 << STEP_DECOMPRESS_NOWQ;
+-      if (f2fs_need_verity(inode, first_idx))
++      if (for_verity && f2fs_need_verity(inode, first_idx))
+               post_read_steps |= 1 << STEP_VERITY;
+       if (post_read_steps) {
+@@ -1079,7 +1080,7 @@ static int f2fs_submit_page_read(struct inode *inode, struct page *page,
+       struct bio *bio;
+       bio = f2fs_grab_read_bio(inode, blkaddr, 1, op_flags,
+-                                      page->index, for_write);
++                                      page->index, for_write, true);
+       if (IS_ERR(bio))
+               return PTR_ERR(bio);
+@@ -2133,7 +2134,7 @@ static int f2fs_read_single_page(struct inode *inode, struct page *page,
+       if (bio == NULL) {
+               bio = f2fs_grab_read_bio(inode, block_nr, nr_pages,
+                               is_readahead ? REQ_RAHEAD : 0, page->index,
+-                              false);
++                              false, true);
+               if (IS_ERR(bio)) {
+                       ret = PTR_ERR(bio);
+                       bio = NULL;
+@@ -2180,6 +2181,8 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
+       const unsigned blkbits = inode->i_blkbits;
+       const unsigned blocksize = 1 << blkbits;
+       struct decompress_io_ctx *dic = NULL;
++      struct bio_post_read_ctx *ctx;
++      bool for_verity = false;
+       int i;
+       int ret = 0;
+@@ -2245,10 +2248,29 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
+               goto out_put_dnode;
+       }
++      /*
++       * It's possible to enable fsverity on the fly when handling a cluster,
++       * which requires complicated error handling. Instead of adding more
++       * complexity, let's give a rule where end_io post-processes fsverity
++       * per cluster. In order to do that, we need to submit bio, if previous
++       * bio sets a different post-process policy.
++       */
++      if (fsverity_active(cc->inode)) {
++              atomic_set(&dic->verity_pages, cc->nr_cpages);
++              for_verity = true;
++
++              if (bio) {
++                      ctx = bio->bi_private;
++                      if (!(ctx->enabled_steps & (1 << STEP_VERITY))) {
++                              __submit_bio(sbi, bio, DATA);
++                              bio = NULL;
++                      }
++              }
++      }
++
+       for (i = 0; i < dic->nr_cpages; i++) {
+               struct page *page = dic->cpages[i];
+               block_t blkaddr;
+-              struct bio_post_read_ctx *ctx;
+               blkaddr = data_blkaddr(dn.inode, dn.node_page,
+                                               dn.ofs_in_node + i + 1);
+@@ -2264,17 +2286,31 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
+               if (!bio) {
+                       bio = f2fs_grab_read_bio(inode, blkaddr, nr_pages,
+                                       is_readahead ? REQ_RAHEAD : 0,
+-                                      page->index, for_write);
++                                      page->index, for_write, for_verity);
+                       if (IS_ERR(bio)) {
++                              unsigned int remained = dic->nr_cpages - i;
++                              bool release = false;
++
+                               ret = PTR_ERR(bio);
+                               dic->failed = true;
+-                              if (!atomic_sub_return(dic->nr_cpages - i,
+-                                                      &dic->pending_pages)) {
++
++                              if (for_verity) {
++                                      if (!atomic_sub_return(remained,
++                                              &dic->verity_pages))
++                                              release = true;
++                              } else {
++                                      if (!atomic_sub_return(remained,
++                                              &dic->pending_pages))
++                                              release = true;
++                              }
++
++                              if (release) {
+                                       f2fs_decompress_end_io(dic->rpages,
+-                                                      cc->cluster_size, true,
+-                                                      false);
++                                              cc->cluster_size, true,
++                                              false);
+                                       f2fs_free_dic(dic);
+                               }
++
+                               f2fs_put_dnode(&dn);
+                               *bio_ret = NULL;
+                               return ret;
+diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
+index e4344d98a780c..06e5a6053f3f9 100644
+--- a/fs/f2fs/f2fs.h
++++ b/fs/f2fs/f2fs.h
+@@ -1410,6 +1410,7 @@ struct decompress_io_ctx {
+       size_t rlen;                    /* valid data length in rbuf */
+       size_t clen;                    /* valid data length in cbuf */
+       atomic_t pending_pages;         /* in-flight compressed page count */
++      atomic_t verity_pages;          /* in-flight page count for verity */
+       bool failed;                    /* indicate IO error during decompression */
+       void *private;                  /* payload buffer for specified decompression algorithm */
+       void *private2;                 /* extra payload buffer */
+-- 
+2.27.0
+
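
The shape of the fix, in a reduced sketch: one refcount per post-processing
stage, each stage dropping only its own, so the verity worker can no longer
observe a pending_pages value the decompress path is about to reset. C11
atomics stand in for the kernel's atomic_t:

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    struct dic {
            atomic_int pending_pages; /* owned by the decompress stage */
            atomic_int verity_pages;  /* owned by the verity stage */
    };

    /* The thread that takes verity_pages to zero knows every split bio has
     * been verified and may free the dic safely. */
    static bool verity_page_done(struct dic *d)
    {
            return atomic_fetch_sub(&d->verity_pages, 1) == 1;
    }

    int main(void)
    {
            struct dic d;

            atomic_init(&d.pending_pages, 2);
            atomic_init(&d.verity_pages, 2);
            verity_page_done(&d);             /* first split bio */
            if (verity_page_done(&d))         /* second split bio */
                    printf("last verity page done: free the dic here\n");
            return 0;
    }
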
diff --git a/queue-5.10/fs-namespace.c-warn-if-mnt_count-has-become-negative.patch b/queue-5.10/fs-namespace.c-warn-if-mnt_count-has-become-negative.patch
new file mode 100644 (file)
index 0000000..c2c9e18
--- /dev/null
@@ -0,0 +1,87 @@
+From 74e61d8287006d6d2ce2df987bafe3b803da24fb Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sat, 31 Oct 2020 21:40:21 -0700
+Subject: fs/namespace.c: WARN if mnt_count has become negative
+
+From: Eric Biggers <ebiggers@google.com>
+
+[ Upstream commit edf7ddbf1c5eb98b720b063b73e20e8a4a1ce673 ]
+
+Missing calls to mntget() (or equivalently, too many calls to mntput())
+are hard to detect because mntput() delays freeing mounts using
+task_work_add(), then again using call_rcu().  As a result, mnt_count
+can often be decremented to -1 without getting a KASAN use-after-free
+report.  Such cases are still bugs though, and they point to real
+use-after-frees being possible.
+
+For an example of this, see the bug fixed by commit 1b0b9cc8d379
+("vfs: fsmount: add missing mntget()"), discussed at
+https://lkml.kernel.org/linux-fsdevel/20190605135401.GB30925@xxxxxxxxxxxxxxxxxxxxxxxxx/T/#u.
+This bug *should* have been trivial to find.  But actually, it wasn't
+found until syzkaller happened to use fchdir() to manipulate the
+reference count just right for the bug to be noticeable.
+
+Address this by making mntput_no_expire() issue a WARN if mnt_count has
+become negative.
+
+Suggested-by: Miklos Szeredi <miklos@szeredi.hu>
+Signed-off-by: Eric Biggers <ebiggers@google.com>
+Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/namespace.c | 9 ++++++---
+ fs/pnode.h     | 2 +-
+ 2 files changed, 7 insertions(+), 4 deletions(-)
+
+diff --git a/fs/namespace.c b/fs/namespace.c
+index cebaa3e817940..93006abe7946a 100644
+--- a/fs/namespace.c
++++ b/fs/namespace.c
+@@ -156,10 +156,10 @@ static inline void mnt_add_count(struct mount *mnt, int n)
+ /*
+  * vfsmount lock must be held for write
+  */
+-unsigned int mnt_get_count(struct mount *mnt)
++int mnt_get_count(struct mount *mnt)
+ {
+ #ifdef CONFIG_SMP
+-      unsigned int count = 0;
++      int count = 0;
+       int cpu;
+       for_each_possible_cpu(cpu) {
+@@ -1139,6 +1139,7 @@ static DECLARE_DELAYED_WORK(delayed_mntput_work, delayed_mntput);
+ static void mntput_no_expire(struct mount *mnt)
+ {
+       LIST_HEAD(list);
++      int count;
+       rcu_read_lock();
+       if (likely(READ_ONCE(mnt->mnt_ns))) {
+@@ -1162,7 +1163,9 @@ static void mntput_no_expire(struct mount *mnt)
+        */
+       smp_mb();
+       mnt_add_count(mnt, -1);
+-      if (mnt_get_count(mnt)) {
++      count = mnt_get_count(mnt);
++      if (count != 0) {
++              WARN_ON(count < 0);
+               rcu_read_unlock();
+               unlock_mount_hash();
+               return;
+diff --git a/fs/pnode.h b/fs/pnode.h
+index 49a058c73e4c7..26f74e092bd98 100644
+--- a/fs/pnode.h
++++ b/fs/pnode.h
+@@ -44,7 +44,7 @@ int propagate_mount_busy(struct mount *, int);
+ void propagate_mount_unlock(struct mount *);
+ void mnt_release_group_id(struct mount *);
+ int get_dominating_id(struct mount *mnt, const struct path *root);
+-unsigned int mnt_get_count(struct mount *mnt);
++int mnt_get_count(struct mount *mnt);
+ void mnt_set_mountpoint(struct mount *, struct mountpoint *,
+                       struct mount *);
+ void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp,
+-- 
+2.27.0
+
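
The type change is the heart of the patch and is easy to demonstrate: with
an unsigned count, a refcount that has dropped below zero reads back as a
huge positive number, so no check can tell "still referenced" apart from
"over-put". A minimal illustration:

    #include <stdio.h>

    int main(void)
    {
            unsigned int ucount = 0;
            int scount = 0;

            ucount--;       /* one mntput() too many */
            scount--;

            /* unsigned: wraps to 4294967295 and still looks referenced */
            printf("unsigned count: %u\n", ucount);
            /* signed: the underflow is visible, so the kernel can WARN */
            if (scount < 0)
                    printf("signed count: %d, WARN_ON fires\n", scount);
            return 0;
    }
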
diff --git a/queue-5.10/i3c-master-fix-missing-destroy_workqueue-on-error-in.patch b/queue-5.10/i3c-master-fix-missing-destroy_workqueue-on-error-in.patch
new file mode 100644 (file)
index 0000000..cc804fe
--- /dev/null
@@ -0,0 +1,47 @@
+From 0bbe3ee7a0495873767eef9bc1a81a0d426b7ec3 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 28 Oct 2020 17:15:43 +0800
+Subject: i3c master: fix missing destroy_workqueue() on error in
+ i3c_master_register
+
+From: Qinglang Miao <miaoqinglang@huawei.com>
+
+[ Upstream commit 59165d16c699182b86b5c65181013f1fd88feb62 ]
+
+Add the missing destroy_workqueue() before return from
+i3c_master_register in the error handling case.
+
+Signed-off-by: Qinglang Miao <miaoqinglang@huawei.com>
+Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
+Link: https://lore.kernel.org/linux-i3c/20201028091543.136167-1-miaoqinglang@huawei.com
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/i3c/master.c | 5 ++++-
+ 1 file changed, 4 insertions(+), 1 deletion(-)
+
+diff --git a/drivers/i3c/master.c b/drivers/i3c/master.c
+index 1c6b78ad5ade4..b61bf53ec07af 100644
+--- a/drivers/i3c/master.c
++++ b/drivers/i3c/master.c
+@@ -2537,7 +2537,7 @@ int i3c_master_register(struct i3c_master_controller *master,
+       ret = i3c_master_bus_init(master);
+       if (ret)
+-              goto err_put_dev;
++              goto err_destroy_wq;
+       ret = device_add(&master->dev);
+       if (ret)
+@@ -2568,6 +2568,9 @@ int i3c_master_register(struct i3c_master_controller *master,
+ err_cleanup_bus:
+       i3c_master_bus_cleanup(master);
++err_destroy_wq:
++      destroy_workqueue(master->wq);
++
+ err_put_dev:
+       put_device(&master->dev);
+-- 
+2.27.0
+
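
The fix follows the usual kernel pattern: error labels unwind in the reverse
order of acquisition, so a resource created between two existing steps gets
its own label between their labels. A generic sketch with hypothetical stub
functions, not the actual i3c code:

    #include <stdio.h>

    static int  alloc_wq(void)    { return 0; }
    static int  init_bus(void)    { return -1; }  /* force the fixed path */
    static int  add_device(void)  { return 0; }
    static void cleanup_bus(void) { puts("bus cleanup"); }
    static void destroy_wq(void)  { puts("destroy_workqueue"); }
    static void put_dev(void)     { puts("put_device"); }

    /* Before the fix, init_bus() failure jumped straight to err_put_dev
     * and the workqueue was never destroyed. */
    static int register_master(void)
    {
            int ret = alloc_wq();

            if (ret)
                    goto err_put_dev;
            ret = init_bus();
            if (ret)
                    goto err_destroy_wq;    /* the one-line fix */
            ret = add_device();
            if (ret)
                    goto err_cleanup_bus;
            return 0;

    err_cleanup_bus:
            cleanup_bus();
    err_destroy_wq:
            destroy_wq();
    err_put_dev:
            put_dev();
            return ret;
    }

    int main(void) { return register_master() ? 1 : 0; }
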
diff --git a/queue-5.10/io_uring-remove-racy-overflow-list-fast-checks.patch b/queue-5.10/io_uring-remove-racy-overflow-list-fast-checks.patch
new file mode 100644 (file)
index 0000000..ba404e4
--- /dev/null
@@ -0,0 +1,48 @@
+From 4f47f5ce6be4ce5f29501a5511bf56080b0bd25c Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 17 Dec 2020 00:24:36 +0000
+Subject: io_uring: remove racy overflow list fast checks
+
+From: Pavel Begunkov <asml.silence@gmail.com>
+
+[ Upstream commit 9cd2be519d05ee78876d55e8e902b7125f78b74f ]
+
+list_empty_careful() is race-free only if certain conditions are met,
+i.e. no re-adds after del_init. io_cqring_overflow_flush() does
+list_move(), so it's actually racy.
+
+Remove those checks, we have ->cq_check_overflow for the fast path.
+
+Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/io_uring.c | 5 +----
+ 1 file changed, 1 insertion(+), 4 deletions(-)
+
+diff --git a/fs/io_uring.c b/fs/io_uring.c
+index e28eedab5365f..1f798c5c4213e 100644
+--- a/fs/io_uring.c
++++ b/fs/io_uring.c
+@@ -1636,8 +1636,6 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
+       LIST_HEAD(list);
+       if (!force) {
+-              if (list_empty_careful(&ctx->cq_overflow_list))
+-                      return true;
+               if ((ctx->cached_cq_tail - READ_ONCE(rings->cq.head) ==
+                   rings->cq_ring_entries))
+                       return false;
+@@ -6579,8 +6577,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
+       /* if we have a backlog and couldn't flush it all, return BUSY */
+       if (test_bit(0, &ctx->sq_check_overflow)) {
+-              if (!list_empty(&ctx->cq_overflow_list) &&
+-                  !io_cqring_overflow_flush(ctx, false, NULL, NULL))
++              if (!io_cqring_overflow_flush(ctx, false, NULL, NULL))
+                       return -EBUSY;
+       }
+-- 
+2.27.0
+
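
For context, a close paraphrase of the v5.10 helper from include/linux/list.h
(reproduced from memory, so treat the exact form as approximate): it
tolerates a concurrent list_del_init(), where both pointers end up back at
head, but list_move() is a delete followed by a re-add, so an unlocked
reader can see a transiently "empty" list that is about to be repopulated:

    struct list_head { struct list_head *next, *prev; };

    /* Returns "empty" only if next and prev both point back at head;
     * safe against del_init, not against delete-then-re-add. */
    static inline int list_empty_careful(const struct list_head *head)
    {
            struct list_head *next = head->next;

            return (next == head) && (next == head->prev);
    }
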
diff --git a/queue-5.10/module-delay-kobject-uevent-until-after-module-init-.patch b/queue-5.10/module-delay-kobject-uevent-until-after-module-init-.patch
new file mode 100644 (file)
index 0000000..ce0bf05
--- /dev/null
@@ -0,0 +1,72 @@
+From 92413ae233b6e4d6dbc685871a90c8a74fc43573 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 27 Nov 2020 10:09:39 +0100
+Subject: module: delay kobject uevent until after module init call
+
+From: Jessica Yu <jeyu@kernel.org>
+
+[ Upstream commit 38dc717e97153e46375ee21797aa54777e5498f3 ]
+
+Apparently there has been a longstanding race between udev/systemd and
+the module loader. Currently, the module loader sends a uevent right
+after sysfs initialization, but before the module calls its init
+function. However, some udev rules expect that the module has
+initialized already upon receiving the uevent.
+
+This race has been triggered recently (see link in references) in some
+systemd mount unit files. For instance, the configfs module creates the
+/sys/kernel/config mount point in its init function, however the module
+loader issues the uevent before this happens. sys-kernel-config.mount
+expects to be able to mount /sys/kernel/config upon receipt of the
+module loading uevent, but if the configfs module has not called its
+init function yet, then this directory will not exist and the mount unit
+fails. A similar situation exists for sys-fs-fuse-connections.mount, as
+the fuse sysfs mount point is created during the fuse module's init
+function. If udev is faster than module initialization, the mount
+unit fails in a similar fashion.
+
+To fix this race, delay the module KOBJ_ADD uevent until after the
+module has finished calling its init routine.
+
+References: https://github.com/systemd/systemd/issues/17586
+Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Tested-By: Nicolas Morey-Chaisemartin <nmoreychaisemartin@suse.com>
+Signed-off-by: Jessica Yu <jeyu@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/module.c | 5 +++--
+ 1 file changed, 3 insertions(+), 2 deletions(-)
+
+diff --git a/kernel/module.c b/kernel/module.c
+index b34235082394b..e20499309b2af 100644
+--- a/kernel/module.c
++++ b/kernel/module.c
+@@ -1895,7 +1895,6 @@ static int mod_sysfs_init(struct module *mod)
+       if (err)
+               mod_kobject_put(mod);
+-      /* delay uevent until full sysfs population */
+ out:
+       return err;
+ }
+@@ -1932,7 +1931,6 @@ static int mod_sysfs_setup(struct module *mod,
+       add_sect_attrs(mod, info);
+       add_notes_attrs(mod, info);
+-      kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD);
+       return 0;
+ out_unreg_modinfo_attrs:
+@@ -3639,6 +3637,9 @@ static noinline int do_init_module(struct module *mod)
+       blocking_notifier_call_chain(&module_notify_list,
+                                    MODULE_STATE_LIVE, mod);
++      /* Delay uevent until module has finished its init routine */
++      kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD);
++
+       /*
+        * We need to finish all async code before the module init sequence
+        * is done.  This has potential to deadlock.  For example, a newly
+-- 
+2.27.0
+
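
The resulting order, compressed into a runnable sketch with hypothetical
stand-ins for the module loader steps:

    #include <stdio.h>

    static void sysfs_setup(void) { puts("sysfs populated, uevent deferred"); }
    static int  mod_init(void)    { puts("init runs, e.g. creates /sys/kernel/config"); return 0; }
    static void uevent_add(void)  { puts("KOBJ_ADD uevent, udev reacts now"); }

    /* After the patch, the uevent udev reacts to is emitted only once the
     * module's init routine has finished, so mount units that depend on
     * init-created sysfs entries can no longer race with it. */
    int main(void)
    {
            sysfs_setup();
            if (mod_init())
                    return 1;
            uevent_add();
            return 0;
    }
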
diff --git a/queue-5.10/module-set-module_state_going-state-when-a-module-fa.patch b/queue-5.10/module-set-module_state_going-state-when-a-module-fa.patch
new file mode 100644 (file)
index 0000000..437f30b
--- /dev/null
@@ -0,0 +1,36 @@
+From d2bb58a0425a26b741ef1868664c1dd83906a6ae Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 27 Oct 2020 15:03:36 +0100
+Subject: module: set MODULE_STATE_GOING state when a module fails to load
+
+From: Miroslav Benes <mbenes@suse.cz>
+
+[ Upstream commit 5e8ed280dab9eeabc1ba0b2db5dbe9fe6debb6b5 ]
+
+If a module fails to load due to an error in prepare_coming_module(),
+the subsequent error handling in load_module() runs with the module's
+state still set to MODULE_STATE_COMING. Fix it by correctly setting
+MODULE_STATE_GOING under the "bug_cleanup" label.
+
+Signed-off-by: Miroslav Benes <mbenes@suse.cz>
+Signed-off-by: Jessica Yu <jeyu@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/module.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+diff --git a/kernel/module.c b/kernel/module.c
+index a4fa44a652a75..b34235082394b 100644
+--- a/kernel/module.c
++++ b/kernel/module.c
+@@ -3991,6 +3991,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
+                                    MODULE_STATE_GOING, mod);
+       klp_module_going(mod);
+  bug_cleanup:
++      mod->state = MODULE_STATE_GOING;
+       /* module_bug_cleanup needs module_mutex protection */
+       mutex_lock(&module_mutex);
+       module_bug_cleanup(mod);
+-- 
+2.27.0
+
diff --git a/queue-5.10/nfsv4-fix-a-pnfs-layout-related-use-after-free-race-.patch b/queue-5.10/nfsv4-fix-a-pnfs-layout-related-use-after-free-race-.patch
new file mode 100644 (file)
index 0000000..b6547ca
--- /dev/null
@@ -0,0 +1,131 @@
+From 8b2e2c9ed5303e21f128fc068ff6d3eb0577996f Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 25 Nov 2020 12:06:14 -0500
+Subject: NFSv4: Fix a pNFS layout related use-after-free race when freeing the
+ inode
+
+From: Trond Myklebust <trond.myklebust@hammerspace.com>
+
+[ Upstream commit b6d49ecd1081740b6e632366428b960461f8158b ]
+
+When returning the layout in nfs4_evict_inode(), we need to ensure that
+the layout is actually done being freed before we can proceed to free the
+inode itself.
+
+Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/nfs/nfs4super.c |  2 +-
+ fs/nfs/pnfs.c      | 33 +++++++++++++++++++++++++++++++--
+ fs/nfs/pnfs.h      |  5 +++++
+ 3 files changed, 37 insertions(+), 3 deletions(-)
+
+diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c
+index 93f5c1678ec29..984cc42ee54d8 100644
+--- a/fs/nfs/nfs4super.c
++++ b/fs/nfs/nfs4super.c
+@@ -67,7 +67,7 @@ static void nfs4_evict_inode(struct inode *inode)
+       nfs_inode_evict_delegation(inode);
+       /* Note that above delegreturn would trigger pnfs return-on-close */
+       pnfs_return_layout(inode);
+-      pnfs_destroy_layout(NFS_I(inode));
++      pnfs_destroy_layout_final(NFS_I(inode));
+       /* First call standard NFS clear_inode() code */
+       nfs_clear_inode(inode);
+       nfs4_xattr_cache_zap(inode);
+diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
+index 0e50b9d45c320..07f59dc8cb2e7 100644
+--- a/fs/nfs/pnfs.c
++++ b/fs/nfs/pnfs.c
+@@ -294,6 +294,7 @@ void
+ pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo)
+ {
+       struct inode *inode;
++      unsigned long i_state;
+       if (!lo)
+               return;
+@@ -304,8 +305,12 @@ pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo)
+               if (!list_empty(&lo->plh_segs))
+                       WARN_ONCE(1, "NFS: BUG unfreed layout segments.\n");
+               pnfs_detach_layout_hdr(lo);
++              i_state = inode->i_state;
+               spin_unlock(&inode->i_lock);
+               pnfs_free_layout_hdr(lo);
++              /* Notify pnfs_destroy_layout_final() that we're done */
++              if (i_state & (I_FREEING | I_CLEAR))
++                      wake_up_var(lo);
+       }
+ }
+@@ -734,8 +739,7 @@ pnfs_free_lseg_list(struct list_head *free_me)
+       }
+ }
+-void
+-pnfs_destroy_layout(struct nfs_inode *nfsi)
++static struct pnfs_layout_hdr *__pnfs_destroy_layout(struct nfs_inode *nfsi)
+ {
+       struct pnfs_layout_hdr *lo;
+       LIST_HEAD(tmp_list);
+@@ -753,9 +757,34 @@ pnfs_destroy_layout(struct nfs_inode *nfsi)
+               pnfs_put_layout_hdr(lo);
+       } else
+               spin_unlock(&nfsi->vfs_inode.i_lock);
++      return lo;
++}
++
++void pnfs_destroy_layout(struct nfs_inode *nfsi)
++{
++      __pnfs_destroy_layout(nfsi);
+ }
+ EXPORT_SYMBOL_GPL(pnfs_destroy_layout);
++static bool pnfs_layout_removed(struct nfs_inode *nfsi,
++                              struct pnfs_layout_hdr *lo)
++{
++      bool ret;
++
++      spin_lock(&nfsi->vfs_inode.i_lock);
++      ret = nfsi->layout != lo;
++      spin_unlock(&nfsi->vfs_inode.i_lock);
++      return ret;
++}
++
++void pnfs_destroy_layout_final(struct nfs_inode *nfsi)
++{
++      struct pnfs_layout_hdr *lo = __pnfs_destroy_layout(nfsi);
++
++      if (lo)
++              wait_var_event(lo, pnfs_layout_removed(nfsi, lo));
++}
++
+ static bool
+ pnfs_layout_add_bulk_destroy_list(struct inode *inode,
+               struct list_head *layout_list)
+diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
+index 2661c44c62db4..78c3893918486 100644
+--- a/fs/nfs/pnfs.h
++++ b/fs/nfs/pnfs.h
+@@ -266,6 +266,7 @@ struct pnfs_layout_segment *pnfs_layout_process(struct nfs4_layoutget *lgp);
+ void pnfs_layoutget_free(struct nfs4_layoutget *lgp);
+ void pnfs_free_lseg_list(struct list_head *tmp_list);
+ void pnfs_destroy_layout(struct nfs_inode *);
++void pnfs_destroy_layout_final(struct nfs_inode *);
+ void pnfs_destroy_all_layouts(struct nfs_client *);
+ int pnfs_destroy_layouts_byfsid(struct nfs_client *clp,
+               struct nfs_fsid *fsid,
+@@ -710,6 +711,10 @@ static inline void pnfs_destroy_layout(struct nfs_inode *nfsi)
+ {
+ }
++static inline void pnfs_destroy_layout_final(struct nfs_inode *nfsi)
++{
++}
++
+ static inline struct pnfs_layout_segment *
+ pnfs_get_lseg(struct pnfs_layout_segment *lseg)
+ {
+-- 
+2.27.0
+
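
The handshake added above, wait_var_event() on the destroy side paired with
wake_up_var() from the final put, is a standard wait-for-teardown pattern. A
hedged userspace model using pthreads (the kernel primitives are not condition
variables, but the ordering guarantee being illustrated is the same):

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t  freed = PTHREAD_COND_INITIALIZER;
    static bool layout_present = true;

    /* Plays the role of the final pnfs_put_layout_hdr(). */
    static void *last_put(void *arg)
    {
            (void)arg;
            pthread_mutex_lock(&lock);
            layout_present = false;        /* free the layout */
            pthread_cond_signal(&freed);   /* wake_up_var(lo) */
            pthread_mutex_unlock(&lock);
            return NULL;
    }

    int main(void)
    {
            pthread_t t;

            pthread_create(&t, NULL, last_put, NULL);

            /* Plays the role of pnfs_destroy_layout_final(): do not
             * let inode teardown proceed until the layout is gone. */
            pthread_mutex_lock(&lock);
            while (layout_present)         /* wait_var_event(lo, ...) */
                    pthread_cond_wait(&freed, &lock);
            pthread_mutex_unlock(&lock);

            puts("layout gone; safe to free the inode");
            pthread_join(t, NULL);
            return 0;
    }
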
diff --git a/queue-5.10/nfsv4.2-don-t-error-when-exiting-early-on-a-read_plu.patch b/queue-5.10/nfsv4.2-don-t-error-when-exiting-early-on-a-read_plu.patch
new file mode 100644 (file)
index 0000000..077d921
--- /dev/null
@@ -0,0 +1,118 @@
+From 3b77fb9e8c4f6a6b5e1ae071b96467ef3619aa34 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 8 Dec 2020 07:51:29 -0500
+Subject: NFSv4.2: Don't error when exiting early on a READ_PLUS buffer
+ overflow
+
+From: Trond Myklebust <trond.myklebust@hammerspace.com>
+
+[ Upstream commit 503b934a752f7e789a5f33217520e0a79f3096ac ]
+
+Expanding the READ_PLUS extents can cause the read buffer to overflow.
+If it does, then don't error, but just exit early.
+
+Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/nfs/nfs42xdr.c | 36 +++++++++++++++++-------------------
+ 1 file changed, 17 insertions(+), 19 deletions(-)
+
+diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
+index 8432bd6b95f08..c078f88552695 100644
+--- a/fs/nfs/nfs42xdr.c
++++ b/fs/nfs/nfs42xdr.c
+@@ -1019,29 +1019,24 @@ static int decode_deallocate(struct xdr_stream *xdr, struct nfs42_falloc_res *re
+       return decode_op_hdr(xdr, OP_DEALLOCATE);
+ }
+-static int decode_read_plus_data(struct xdr_stream *xdr, struct nfs_pgio_res *res,
+-                               uint32_t *eof)
++static int decode_read_plus_data(struct xdr_stream *xdr,
++                               struct nfs_pgio_res *res)
+ {
+       uint32_t count, recvd;
+       uint64_t offset;
+       __be32 *p;
+       p = xdr_inline_decode(xdr, 8 + 4);
+-      if (unlikely(!p))
+-              return -EIO;
++      if (!p)
++              return 1;
+       p = xdr_decode_hyper(p, &offset);
+       count = be32_to_cpup(p);
+       recvd = xdr_align_data(xdr, res->count, count);
+       res->count += recvd;
+-      if (count > recvd) {
+-              dprintk("NFS: server cheating in read reply: "
+-                              "count %u > recvd %u\n", count, recvd);
+-              *eof = 0;
++      if (count > recvd)
+               return 1;
+-      }
+-
+       return 0;
+ }
+@@ -1052,18 +1047,16 @@ static int decode_read_plus_hole(struct xdr_stream *xdr, struct nfs_pgio_res *re
+       __be32 *p;
+       p = xdr_inline_decode(xdr, 8 + 8);
+-      if (unlikely(!p))
+-              return -EIO;
++      if (!p)
++              return 1;
+       p = xdr_decode_hyper(p, &offset);
+       p = xdr_decode_hyper(p, &length);
+       recvd = xdr_expand_hole(xdr, res->count, length);
+       res->count += recvd;
+-      if (recvd < length) {
+-              *eof = 0;
++      if (recvd < length)
+               return 1;
+-      }
+       return 0;
+ }
+@@ -1088,12 +1081,12 @@ static int decode_read_plus(struct xdr_stream *xdr, struct nfs_pgio_res *res)
+       for (i = 0; i < segments; i++) {
+               p = xdr_inline_decode(xdr, 4);
+-              if (unlikely(!p))
+-                      return -EIO;
++              if (!p)
++                      goto early_out;
+               type = be32_to_cpup(p++);
+               if (type == NFS4_CONTENT_DATA)
+-                      status = decode_read_plus_data(xdr, res, &eof);
++                      status = decode_read_plus_data(xdr, res);
+               else if (type == NFS4_CONTENT_HOLE)
+                       status = decode_read_plus_hole(xdr, res, &eof);
+               else
+@@ -1102,12 +1095,17 @@ static int decode_read_plus(struct xdr_stream *xdr, struct nfs_pgio_res *res)
+               if (status < 0)
+                       return status;
+               if (status > 0)
+-                      break;
++                      goto early_out;
+       }
+ out:
+       res->eof = eof;
+       return 0;
++early_out:
++      if (unlikely(!i))
++              return -EIO;
++      res->eof = 0;
++      return 0;
+ }
+ static int decode_seek(struct xdr_stream *xdr, struct nfs42_seek_res *res)
+-- 
+2.27.0
+
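
The control-flow change above boils down to one convention: decoders report
buffer exhaustion as a positive status, and the caller converts that into a
short but successful read with eof cleared, reserving -EIO for the case where
not even the first segment header fit. A small standalone sketch of that
convention (toy decoder, not the NFS XDR code):

    #include <errno.h>
    #include <stdio.h>

    /* Returns 0 on success, 1 when the buffer ran out mid-stream. */
    static int decode_segment(int space_left)
    {
            return space_left > 0 ? 0 : 1;
    }

    static int decode_read_plus(int segments, int space_left, int *eof)
    {
            int i, status;

            for (i = 0; i < segments; i++, space_left--) {
                    status = decode_segment(space_left);
                    if (status > 0)
                            goto early_out;
            }
            return 0;

    early_out:
            if (i == 0)
                    return -EIO;   /* nothing decoded at all */
            *eof = 0;              /* short read, not an error */
            return 0;
    }

    int main(void)
    {
            int eof = 1;
            int ret = decode_read_plus(4, 2, &eof);

            printf("ret=%d eof=%d\n", ret, eof);   /* ret=0 eof=0 */
            return 0;
    }
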
diff --git a/queue-5.10/powerpc-64-irq-replay-remove-decrementer-overflow-ch.patch b/queue-5.10/powerpc-64-irq-replay-remove-decrementer-overflow-ch.patch
new file mode 100644 (file)
index 0000000..ab65a13
--- /dev/null
@@ -0,0 +1,171 @@
+From a2d81a7069f97534eff5249a8448a288358086c3 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sat, 7 Nov 2020 11:43:36 +1000
+Subject: powerpc/64: irq replay remove decrementer overflow check
+
+From: Nicholas Piggin <npiggin@gmail.com>
+
+[ Upstream commit 59d512e4374b2d8a6ad341475dc94c4a4bdec7d3 ]
+
+This is a way to catch some cases of decrementer overflow, when the
+decrementer has underflowed an odd number of times, while MSR[EE] was
+disabled.
+
+With a typical small decrementer, a timer that fires when MSR[EE] is
+disabled will be "lost" if MSR[EE] remains disabled for between 4.3 and
+8.6 seconds after the timer expires. In any case, the decrementer
+interrupt would be taken at 8.6 seconds and the timer would be found at
+that point.
+
+So this check is for catching extreme latency events, and it prevents
+those latencies from growing by a further few seconds.  It's not obvious
+this is a good tradeoff: this is already a watchdog-magnitude event, and
+that situation is not improved significantly by this check. For large
+decrementers, it's useless.
+
+Therefore remove this check, which avoids a mftb when enabling hard
+disabled interrupts (e.g., when enabling after coming from hardware
+interrupt handlers). Perhaps more importantly, it also removes the
+clunky MSR[EE] vs PACA_IRQ_HARD_DIS incoherency in soft-interrupt replay
+which simplifies the code.
+
+Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
+Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
+Link: https://lore.kernel.org/r/20201107014336.2337337-1-npiggin@gmail.com
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/powerpc/kernel/irq.c             | 53 ++-------------------------
+ arch/powerpc/kernel/time.c            |  9 ++---
+ arch/powerpc/platforms/powernv/opal.c |  2 +-
+ 3 files changed, 8 insertions(+), 56 deletions(-)
+
+diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
+index 7d0f7682d01df..6b1eca53e36cc 100644
+--- a/arch/powerpc/kernel/irq.c
++++ b/arch/powerpc/kernel/irq.c
+@@ -102,14 +102,6 @@ static inline notrace unsigned long get_irq_happened(void)
+       return happened;
+ }
+-static inline notrace int decrementer_check_overflow(void)
+-{
+-      u64 now = get_tb();
+-      u64 *next_tb = this_cpu_ptr(&decrementers_next_tb);
+- 
+-      return now >= *next_tb;
+-}
+-
+ #ifdef CONFIG_PPC_BOOK3E
+ /* This is called whenever we are re-enabling interrupts
+@@ -142,35 +134,6 @@ notrace unsigned int __check_irq_replay(void)
+       trace_hardirqs_on();
+       trace_hardirqs_off();
+-      /*
+-       * We are always hard disabled here, but PACA_IRQ_HARD_DIS may
+-       * not be set, which means interrupts have only just been hard
+-       * disabled as part of the local_irq_restore or interrupt return
+-       * code. In that case, skip the decrementr check becaus it's
+-       * expensive to read the TB.
+-       *
+-       * HARD_DIS then gets cleared here, but it's reconciled later.
+-       * Either local_irq_disable will replay the interrupt and that
+-       * will reconcile state like other hard interrupts. Or interrupt
+-       * retur will replay the interrupt and in that case it sets
+-       * PACA_IRQ_HARD_DIS by hand (see comments in entry_64.S).
+-       */
+-      if (happened & PACA_IRQ_HARD_DIS) {
+-              local_paca->irq_happened &= ~PACA_IRQ_HARD_DIS;
+-
+-              /*
+-               * We may have missed a decrementer interrupt if hard disabled.
+-               * Check the decrementer register in case we had a rollover
+-               * while hard disabled.
+-               */
+-              if (!(happened & PACA_IRQ_DEC)) {
+-                      if (decrementer_check_overflow()) {
+-                              local_paca->irq_happened |= PACA_IRQ_DEC;
+-                              happened |= PACA_IRQ_DEC;
+-                      }
+-              }
+-      }
+-
+       if (happened & PACA_IRQ_DEC) {
+               local_paca->irq_happened &= ~PACA_IRQ_DEC;
+               return 0x900;
+@@ -186,6 +149,9 @@ notrace unsigned int __check_irq_replay(void)
+               return 0x280;
+       }
++      if (happened & PACA_IRQ_HARD_DIS)
++              local_paca->irq_happened &= ~PACA_IRQ_HARD_DIS;
++
+       /* There should be nothing left ! */
+       BUG_ON(local_paca->irq_happened != 0);
+@@ -229,18 +195,6 @@ void replay_soft_interrupts(void)
+       if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG))
+               WARN_ON_ONCE(mfmsr() & MSR_EE);
+-      if (happened & PACA_IRQ_HARD_DIS) {
+-              /*
+-               * We may have missed a decrementer interrupt if hard disabled.
+-               * Check the decrementer register in case we had a rollover
+-               * while hard disabled.
+-               */
+-              if (!(happened & PACA_IRQ_DEC)) {
+-                      if (decrementer_check_overflow())
+-                              happened |= PACA_IRQ_DEC;
+-              }
+-      }
+-
+       /*
+        * Force the delivery of pending soft-disabled interrupts on PS3.
+        * Any HV call will have this side effect.
+@@ -345,6 +299,7 @@ notrace void arch_local_irq_restore(unsigned long mask)
+               if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG))
+                       WARN_ON_ONCE(!(mfmsr() & MSR_EE));
+               __hard_irq_disable();
++              local_paca->irq_happened |= PACA_IRQ_HARD_DIS;
+       } else {
+               /*
+                * We should already be hard disabled here. We had bugs
+diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
+index 74efe46f55327..7d372ff3504b2 100644
+--- a/arch/powerpc/kernel/time.c
++++ b/arch/powerpc/kernel/time.c
+@@ -552,14 +552,11 @@ void timer_interrupt(struct pt_regs *regs)
+       struct pt_regs *old_regs;
+       u64 now;
+-      /* Some implementations of hotplug will get timer interrupts while
+-       * offline, just ignore these and we also need to set
+-       * decrementers_next_tb as MAX to make sure __check_irq_replay
+-       * don't replay timer interrupt when return, otherwise we'll trap
+-       * here infinitely :(
++      /*
++       * Some implementations of hotplug will get timer interrupts while
++       * offline, just ignore these.
+        */
+       if (unlikely(!cpu_online(smp_processor_id()))) {
+-              *next_tb = ~(u64)0;
+               set_dec(decrementer_max);
+               return;
+       }
+diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
+index d95954ad4c0af..c61c3b62c8c62 100644
+--- a/arch/powerpc/platforms/powernv/opal.c
++++ b/arch/powerpc/platforms/powernv/opal.c
+@@ -731,7 +731,7 @@ int opal_hmi_exception_early2(struct pt_regs *regs)
+       return 1;
+ }
+-/* HMI exception handler called in virtual mode during check_irq_replay. */
++/* HMI exception handler called in virtual mode when irqs are next enabled. */
+ int opal_handle_hmi_exception(struct pt_regs *regs)
+ {
+       /*
+-- 
+2.27.0
+
diff --git a/queue-5.10/powerpc-sysdev-add-missing-iounmap-on-error-in-mpic_.patch b/queue-5.10/powerpc-sysdev-add-missing-iounmap-on-error-in-mpic_.patch
new file mode 100644 (file)
index 0000000..157ee1a
--- /dev/null
@@ -0,0 +1,39 @@
+From 0679d328640a54ecbb2338a544b32d729121fd31 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 28 Oct 2020 17:15:51 +0800
+Subject: powerpc: sysdev: add missing iounmap() on error in mpic_msgr_probe()
+
+From: Qinglang Miao <miaoqinglang@huawei.com>
+
+[ Upstream commit ffa1797040c5da391859a9556be7b735acbe1242 ]
+
+The iounmap() of msgr_block_addr is missing before the return from
+mpic_msgr_probe() in the error handling case. Use devm_ioremap()
+instead of plain ioremap() when remapping the message register block,
+so the mapping is automatically released on probe failure.
+
+Signed-off-by: Qinglang Miao <miaoqinglang@huawei.com>
+Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
+Link: https://lore.kernel.org/r/20201028091551.136400-1-miaoqinglang@huawei.com
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/powerpc/sysdev/mpic_msgr.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/arch/powerpc/sysdev/mpic_msgr.c b/arch/powerpc/sysdev/mpic_msgr.c
+index f6b253e2be409..36ec0bdd8b63c 100644
+--- a/arch/powerpc/sysdev/mpic_msgr.c
++++ b/arch/powerpc/sysdev/mpic_msgr.c
+@@ -191,7 +191,7 @@ static int mpic_msgr_probe(struct platform_device *dev)
+       /* IO map the message register block. */
+       of_address_to_resource(np, 0, &rsrc);
+-      msgr_block_addr = ioremap(rsrc.start, resource_size(&rsrc));
++      msgr_block_addr = devm_ioremap(&dev->dev, rsrc.start, resource_size(&rsrc));
+       if (!msgr_block_addr) {
+               dev_err(&dev->dev, "Failed to iomap MPIC message registers");
+               return -EFAULT;
+-- 
+2.27.0
+
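
The devm_ioremap() switch works because device-managed resources are recorded
against the device and released in bulk by the driver core when probe fails,
so no individual error path needs its own iounmap(). A rough userspace model
of that idiom (hypothetical helpers; the real devres machinery is far more
general):

    #include <stdio.h>
    #include <stdlib.h>

    struct device {
            void *res[8];
            int nres;
    };

    /* Stand-in for devm_ioremap()/devm_kmalloc(): the allocation is
     * remembered on the device so it can be released automatically. */
    static void *devm_alloc(struct device *dev, size_t n)
    {
            void *p = malloc(n);
            if (p)
                    dev->res[dev->nres++] = p;
            return p;
    }

    /* What the driver core does after a failed probe. */
    static void devm_release_all(struct device *dev)
    {
            while (dev->nres > 0)
                    free(dev->res[--dev->nres]);
    }

    static int probe(struct device *dev)
    {
            if (!devm_alloc(dev, 128))     /* the "ioremap" */
                    return -1;
            return -1;                     /* a later probe step fails */
    }

    int main(void)
    {
            struct device dev = { .nres = 0 };

            if (probe(&dev) < 0)
                    devm_release_all(&dev);
            printf("outstanding resources: %d\n", dev.nres);   /* 0 */
            return 0;
    }
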
diff --git a/queue-5.10/quota-don-t-overflow-quota-file-offsets.patch b/queue-5.10/quota-don-t-overflow-quota-file-offsets.patch
new file mode 100644 (file)
index 0000000..046f67b
--- /dev/null
@@ -0,0 +1,66 @@
+From 127af98fb71671f76c2e8a5e2c0bd5187ee8bab7 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 2 Nov 2020 16:32:10 +0100
+Subject: quota: Don't overflow quota file offsets
+
+From: Jan Kara <jack@suse.cz>
+
+[ Upstream commit 10f04d40a9fa29785206c619f80d8beedb778837 ]
+
+The on-disk quota format supports quota files with up to 2^32 blocks. Be
+careful when computing quota file offsets from block numbers, as they
+can overflow 32-bit types. Since quota files larger than 4GB would
+require ~26 million quota users, this is mostly a theoretical concern
+for now, but better to be careful; fuzzers would find the problem sooner
+or later anyway...
+
+Reviewed-by: Andreas Dilger <adilger@dilger.ca>
+Signed-off-by: Jan Kara <jack@suse.cz>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/quota/quota_tree.c | 8 ++++----
+ 1 file changed, 4 insertions(+), 4 deletions(-)
+
+diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c
+index a6f856f341dc7..c5562c871c8be 100644
+--- a/fs/quota/quota_tree.c
++++ b/fs/quota/quota_tree.c
+@@ -62,7 +62,7 @@ static ssize_t read_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf)
+       memset(buf, 0, info->dqi_usable_bs);
+       return sb->s_op->quota_read(sb, info->dqi_type, buf,
+-             info->dqi_usable_bs, blk << info->dqi_blocksize_bits);
++             info->dqi_usable_bs, (loff_t)blk << info->dqi_blocksize_bits);
+ }
+ static ssize_t write_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf)
+@@ -71,7 +71,7 @@ static ssize_t write_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf)
+       ssize_t ret;
+       ret = sb->s_op->quota_write(sb, info->dqi_type, buf,
+-             info->dqi_usable_bs, blk << info->dqi_blocksize_bits);
++             info->dqi_usable_bs, (loff_t)blk << info->dqi_blocksize_bits);
+       if (ret != info->dqi_usable_bs) {
+               quota_error(sb, "dquota write failed");
+               if (ret >= 0)
+@@ -284,7 +284,7 @@ static uint find_free_dqentry(struct qtree_mem_dqinfo *info,
+                           blk);
+               goto out_buf;
+       }
+-      dquot->dq_off = (blk << info->dqi_blocksize_bits) +
++      dquot->dq_off = ((loff_t)blk << info->dqi_blocksize_bits) +
+                       sizeof(struct qt_disk_dqdbheader) +
+                       i * info->dqi_entry_size;
+       kfree(buf);
+@@ -559,7 +559,7 @@ static loff_t find_block_dqentry(struct qtree_mem_dqinfo *info,
+               ret = -EIO;
+               goto out_buf;
+       } else {
+-              ret = (blk << info->dqi_blocksize_bits) + sizeof(struct
++              ret = ((loff_t)blk << info->dqi_blocksize_bits) + sizeof(struct
+                 qt_disk_dqdbheader) + i * info->dqi_entry_size;
+       }
+ out_buf:
+-- 
+2.27.0
+
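
The failure mode is easy to reproduce in plain C: the shift is evaluated in
the 32-bit type of the block number and wraps before being widened to the
64-bit file offset. A minimal standalone demonstration (hypothetical block
number and block size, not the quota code itself):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint32_t blk = 5000000;      /* block well past the 4GB mark */
            int blocksize_bits = 10;     /* 1 KiB quota tree blocks */

            /* Broken: the shift happens in 32 bits and wraps. */
            int64_t bad = blk << blocksize_bits;

            /* Fixed: widen first, as (loff_t)blk << bits does above. */
            int64_t good = (int64_t)blk << blocksize_bits;

            printf("wrapped offset: %lld\n", (long long)bad);
            printf("correct offset: %lld\n", (long long)good);
            return 0;
    }
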
diff --git a/queue-5.10/rtc-pl031-fix-resource-leak-in-pl031_probe.patch b/queue-5.10/rtc-pl031-fix-resource-leak-in-pl031_probe.patch
new file mode 100644 (file)
index 0000000..767ebf6
--- /dev/null
@@ -0,0 +1,42 @@
+From a8587cba324a6acf49b45b72d0f93f686592a0a1 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 12 Nov 2020 17:31:39 +0800
+Subject: rtc: pl031: fix resource leak in pl031_probe
+
+From: Zheng Liang <zhengliang6@huawei.com>
+
+[ Upstream commit 1eab0fea2514b269e384c117f5b5772b882761f0 ]
+
+When devm_rtc_allocate_device() fails in pl031_probe(), the memory
+regions requested for the device should be released before returning.
+
+Reported-by: Hulk Robot <hulkci@huawei.com>
+Signed-off-by: Zheng Liang <zhengliang6@huawei.com>
+Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
+Acked-by: Linus Walleij <linus.walleij@linaro.org>
+Link: https://lore.kernel.org/r/20201112093139.32566-1-zhengliang6@huawei.com
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/rtc/rtc-pl031.c | 6 ++++--
+ 1 file changed, 4 insertions(+), 2 deletions(-)
+
+diff --git a/drivers/rtc/rtc-pl031.c b/drivers/rtc/rtc-pl031.c
+index c6b89273feba8..d4b2ab7861266 100644
+--- a/drivers/rtc/rtc-pl031.c
++++ b/drivers/rtc/rtc-pl031.c
+@@ -361,8 +361,10 @@ static int pl031_probe(struct amba_device *adev, const struct amba_id *id)
+       device_init_wakeup(&adev->dev, true);
+       ldata->rtc = devm_rtc_allocate_device(&adev->dev);
+-      if (IS_ERR(ldata->rtc))
+-              return PTR_ERR(ldata->rtc);
++      if (IS_ERR(ldata->rtc)) {
++              ret = PTR_ERR(ldata->rtc);
++              goto out;
++      }
+       ldata->rtc->ops = ops;
+       ldata->rtc->range_min = vendor->range_min;
+-- 
+2.27.0
+
diff --git a/queue-5.10/rtc-sun6i-fix-memleak-in-sun6i_rtc_clk_init.patch b/queue-5.10/rtc-sun6i-fix-memleak-in-sun6i_rtc_clk_init.patch
new file mode 100644 (file)
index 0000000..4532f68
--- /dev/null
@@ -0,0 +1,65 @@
+From 2893a626a352c4d938c9d49cf0d0ba7d0a0cf95d Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 20 Oct 2020 14:12:26 +0800
+Subject: rtc: sun6i: Fix memleak in sun6i_rtc_clk_init
+
+From: Dinghao Liu <dinghao.liu@zju.edu.cn>
+
+[ Upstream commit 28d211919e422f58c1e6c900e5810eee4f1ce4c8 ]
+
+When clk_hw_register_fixed_rate_with_accuracy() fails,
+clk_data should be freed. The same holds for the two
+subsequent error paths, where the already registered
+clocks must also be unregistered.
+
+Signed-off-by: Dinghao Liu <dinghao.liu@zju.edu.cn>
+Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
+Link: https://lore.kernel.org/r/20201020061226.6572-1-dinghao.liu@zju.edu.cn
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/rtc/rtc-sun6i.c | 8 +++++---
+ 1 file changed, 5 insertions(+), 3 deletions(-)
+
+diff --git a/drivers/rtc/rtc-sun6i.c b/drivers/rtc/rtc-sun6i.c
+index e2b8b150bcb44..f2818cdd11d82 100644
+--- a/drivers/rtc/rtc-sun6i.c
++++ b/drivers/rtc/rtc-sun6i.c
+@@ -272,7 +272,7 @@ static void __init sun6i_rtc_clk_init(struct device_node *node,
+                                                               300000000);
+       if (IS_ERR(rtc->int_osc)) {
+               pr_crit("Couldn't register the internal oscillator\n");
+-              return;
++              goto err;
+       }
+       parents[0] = clk_hw_get_name(rtc->int_osc);
+@@ -290,7 +290,7 @@ static void __init sun6i_rtc_clk_init(struct device_node *node,
+       rtc->losc = clk_register(NULL, &rtc->hw);
+       if (IS_ERR(rtc->losc)) {
+               pr_crit("Couldn't register the LOSC clock\n");
+-              return;
++              goto err_register;
+       }
+       of_property_read_string_index(node, "clock-output-names", 1,
+@@ -301,7 +301,7 @@ static void __init sun6i_rtc_clk_init(struct device_node *node,
+                                         &rtc->lock);
+       if (IS_ERR(rtc->ext_losc)) {
+               pr_crit("Couldn't register the LOSC external gate\n");
+-              return;
++              goto err_register;
+       }
+       clk_data->num = 2;
+@@ -314,6 +314,8 @@ static void __init sun6i_rtc_clk_init(struct device_node *node,
+       of_clk_add_hw_provider(node, of_clk_hw_onecell_get, clk_data);
+       return;
++err_register:
++      clk_hw_unregister_fixed_rate(rtc->int_osc);
+ err:
+       kfree(clk_data);
+ }
+-- 
+2.27.0
+
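
The shape of the fix is the kernel's usual staged unwind: each failure jumps
to the label that releases exactly what has been acquired so far. A generic
runnable sketch of that shape (heap buffers standing in for the clocks;
fail_at picks which step fails):

    #include <stdio.h>
    #include <stdlib.h>

    /* fail_at: 1 = first registration fails, 2 = second fails,
     * 3 = everything succeeds. */
    static int clk_init(int fail_at)
    {
            char *clk_data = malloc(32);       /* the allocated provider */
            if (!clk_data)
                    return -1;

            char *int_osc = fail_at > 1 ? malloc(32) : NULL;
            if (!int_osc)
                    goto err;                  /* only clk_data to free */

            char *losc = fail_at > 2 ? malloc(32) : NULL;
            if (!losc)
                    goto err_register;         /* must also drop int_osc */

            free(losc);
            free(int_osc);
            free(clk_data);
            return 0;

    err_register:
            free(int_osc);     /* clk_hw_unregister_fixed_rate() above */
    err:
            free(clk_data);    /* kfree(clk_data) above */
            return -1;
    }

    int main(void)
    {
            for (int i = 1; i <= 3; i++)
                    printf("fail_at=%d -> %d\n", i, clk_init(i));
            return 0;
    }
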
diff --git a/queue-5.10/s390-always-clear-kernel-stack-backchain-before-call.patch b/queue-5.10/s390-always-clear-kernel-stack-backchain-before-call.patch
new file mode 100644 (file)
index 0000000..69d886c
--- /dev/null
@@ -0,0 +1,87 @@
+From 35edc5975bac95b303541b9e83f4ed12d5a4684c Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 4 Dec 2020 17:56:57 +0100
+Subject: s390: always clear kernel stack backchain before calling functions
+
+From: Heiko Carstens <hca@linux.ibm.com>
+
+[ Upstream commit 9365965db0c7ca7fc81eee27c21d8522d7102c32 ]
+
+Clear the kernel stack backchain before potentially calling the
+lockdep trace_hardirqs_off/on functions. Without this, walking the
+kernel backchain, e.g. during a panic, might stop too early.
+
+Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/s390/kernel/entry.S | 12 ++++++------
+ 1 file changed, 6 insertions(+), 6 deletions(-)
+
+diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
+index 6343dca0dbeb6..71203324ff42b 100644
+--- a/arch/s390/kernel/entry.S
++++ b/arch/s390/kernel/entry.S
+@@ -406,6 +406,7 @@ ENTRY(system_call)
+       mvc     __PT_PSW(16,%r11),__LC_SVC_OLD_PSW
+       mvc     __PT_INT_CODE(4,%r11),__LC_SVC_ILC
+       stg     %r14,__PT_FLAGS(%r11)
++      xc      __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15)
+       ENABLE_INTS
+ .Lsysc_do_svc:
+       # clear user controlled register to prevent speculative use
+@@ -422,7 +423,6 @@ ENTRY(system_call)
+       jnl     .Lsysc_nr_ok
+       slag    %r8,%r1,3
+ .Lsysc_nr_ok:
+-      xc      __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15)
+       stg     %r2,__PT_ORIG_GPR2(%r11)
+       stg     %r7,STACK_FRAME_OVERHEAD(%r15)
+       lg      %r9,0(%r8,%r10)                 # get system call add.
+@@ -712,8 +712,8 @@ ENTRY(pgm_check_handler)
+       mvc     __THREAD_per_address(8,%r14),__LC_PER_ADDRESS
+       mvc     __THREAD_per_cause(2,%r14),__LC_PER_CODE
+       mvc     __THREAD_per_paid(1,%r14),__LC_PER_ACCESS_ID
+-6:    RESTORE_SM_CLEAR_PER
+-      xc      __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15)
++6:    xc      __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15)
++      RESTORE_SM_CLEAR_PER
+       larl    %r1,pgm_check_table
+       llgh    %r10,__PT_INT_CODE+2(%r11)
+       nill    %r10,0x007f
+@@ -734,8 +734,8 @@ ENTRY(pgm_check_handler)
+ # PER event in supervisor state, must be kprobes
+ #
+ .Lpgm_kprobe:
+-      RESTORE_SM_CLEAR_PER
+       xc      __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15)
++      RESTORE_SM_CLEAR_PER
+       lgr     %r2,%r11                # pass pointer to pt_regs
+       brasl   %r14,do_per_trap
+       j       .Lpgm_return
+@@ -777,10 +777,10 @@ ENTRY(io_int_handler)
+       stmg    %r8,%r9,__PT_PSW(%r11)
+       mvc     __PT_INT_CODE(12,%r11),__LC_SUBCHANNEL_ID
+       xc      __PT_FLAGS(8,%r11),__PT_FLAGS(%r11)
++      xc      __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15)
+       TSTMSK  __LC_CPU_FLAGS,_CIF_IGNORE_IRQ
+       jo      .Lio_restore
+       TRACE_IRQS_OFF
+-      xc      __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15)
+ .Lio_loop:
+       lgr     %r2,%r11                # pass pointer to pt_regs
+       lghi    %r3,IO_INTERRUPT
+@@ -980,10 +980,10 @@ ENTRY(ext_int_handler)
+       mvc     __PT_INT_PARM(4,%r11),__LC_EXT_PARAMS
+       mvc     __PT_INT_PARM_LONG(8,%r11),0(%r1)
+       xc      __PT_FLAGS(8,%r11),__PT_FLAGS(%r11)
++      xc      __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15)
+       TSTMSK  __LC_CPU_FLAGS,_CIF_IGNORE_IRQ
+       jo      .Lio_restore
+       TRACE_IRQS_OFF
+-      xc      __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15)
+       lgr     %r2,%r11                # pass pointer to pt_regs
+       lghi    %r3,EXT_INTERRUPT
+       brasl   %r14,do_IRQ
+-- 
+2.27.0
+
diff --git a/queue-5.10/series b/queue-5.10/series
index 79a867120136cb6f8669244518b677cb4051e5b9..104f3aebf601b9bf9d2af8477116de898f7cd038 100644 (file)
--- a/queue-5.10/series
@@ -36,3 +36,28 @@ bfs-don-t-use-warning-string-when-it-s-just-info.patch
 ext4-check-for-invalid-block-size-early-when-mounting-a-file-system.patch
 fcntl-fix-potential-deadlock-in-send_sig-io-urg.patch
 io_uring-check-kthread-stopped-flag-when-sq-thread-is-unparked.patch
+rtc-sun6i-fix-memleak-in-sun6i_rtc_clk_init.patch
+module-set-module_state_going-state-when-a-module-fa.patch
+quota-don-t-overflow-quota-file-offsets.patch
+rtc-pl031-fix-resource-leak-in-pl031_probe.patch
+powerpc-sysdev-add-missing-iounmap-on-error-in-mpic_.patch
+i3c-master-fix-missing-destroy_workqueue-on-error-in.patch
+nfsv4-fix-a-pnfs-layout-related-use-after-free-race-.patch
+f2fs-avoid-race-condition-for-shrinker-count.patch
+f2fs-fix-race-of-pending_pages-in-decompression.patch
+module-delay-kobject-uevent-until-after-module-init-.patch
+powerpc-64-irq-replay-remove-decrementer-overflow-ch.patch
+fs-namespace.c-warn-if-mnt_count-has-become-negative.patch
+watchdog-rti-wdt-fix-reference-leak-in-rti_wdt_probe.patch
+um-random-register-random-as-hwrng-core-device.patch
+um-ubd-submit-all-data-segments-atomically.patch
+nfsv4.2-don-t-error-when-exiting-early-on-a-read_plu.patch
+ceph-fix-inode-refcount-leak-when-ceph_fill_inode-on.patch
+drm-amd-display-updated-wm-table-for-renoir.patch
+tick-sched-remove-bogus-boot-safety-check.patch
+s390-always-clear-kernel-stack-backchain-before-call.patch
+io_uring-remove-racy-overflow-list-fast-checks.patch
+alsa-pcm-clear-the-full-allocated-memory-at-hw_param.patch
+dm-verity-skip-verity-work-if-i-o-error-when-system-.patch
+ext4-avoid-s_mb_prefetch-to-be-zero-in-individual-sc.patch
+device-dax-fix-range-release.patch
diff --git a/queue-5.10/tick-sched-remove-bogus-boot-safety-check.patch b/queue-5.10/tick-sched-remove-bogus-boot-safety-check.patch
new file mode 100644 (file)
index 0000000..9584760
--- /dev/null
@@ -0,0 +1,49 @@
+From b2866008cf5603219f25f52b1523c8732df89c3f Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sun, 6 Dec 2020 22:12:55 +0100
+Subject: tick/sched: Remove bogus boot "safety" check
+
+From: Thomas Gleixner <tglx@linutronix.de>
+
+[ Upstream commit ba8ea8e7dd6e1662e34e730eadfc52aa6816f9dd ]
+
+can_stop_idle_tick() checks whether the do_timer() duty has been taken over
+by a CPU on boot. That's silly because the boot CPU always takes over with
+the initial clockevent device.
+
+But even if no CPU had installed a clockevent and taken over the duty,
+the question of whether the tick on the current CPU can be stopped is
+moot. In that case the current CPU would have no clockevent either, so
+there would be nothing to keep ticking.
+
+Remove it.
+
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Acked-by: Frederic Weisbecker <frederic@kernel.org>
+Link: https://lore.kernel.org/r/20201206212002.725238293@linutronix.de
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/time/tick-sched.c | 7 -------
+ 1 file changed, 7 deletions(-)
+
+diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
+index 81632cd5e3b72..e8d351b7f9b03 100644
+--- a/kernel/time/tick-sched.c
++++ b/kernel/time/tick-sched.c
+@@ -941,13 +941,6 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
+                */
+               if (tick_do_timer_cpu == cpu)
+                       return false;
+-              /*
+-               * Boot safety: make sure the timekeeping duty has been
+-               * assigned before entering dyntick-idle mode,
+-               * tick_do_timer_cpu is TICK_DO_TIMER_BOOT
+-               */
+-              if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_BOOT))
+-                      return false;
+               /* Should not happen for nohz-full */
+               if (WARN_ON_ONCE(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
+-- 
+2.27.0
+
diff --git a/queue-5.10/um-random-register-random-as-hwrng-core-device.patch b/queue-5.10/um-random-register-random-as-hwrng-core-device.patch
new file mode 100644 (file)
index 0000000..93917e3
--- /dev/null
@@ -0,0 +1,254 @@
+From 76ddf8266cfb65258a05dbbca4c4fd6b4f956a55 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 27 Oct 2020 15:30:22 +0000
+Subject: um: random: Register random as hwrng-core device
+
+From: Christopher Obbard <chris.obbard@collabora.com>
+
+[ Upstream commit 72d3e093afae79611fa38f8f2cfab9a888fe66f2 ]
+
+The UML random driver creates a dummy device under the guest,
+/dev/hw_random. When this file is read from the guest, the driver
+reads from the host machine's /dev/random, in-turn reading from
+the host kernel's entropy pool. This entropy pool could have been
+filled by a hardware random number generator or just the host
+kernel's internal software entropy generator.
+
+Currently the driver does not fill the guest kernel's entropy pool;
+that requires a userspace tool running inside the guest (like
+rng-tools) to read from the dummy device provided by this driver,
+which then fills the guest's internal entropy pool.
+
+This all seems quite pointless when we are already reading from an
+entropy pool, so this patch aims to register the device as a hwrng
+device using the hwrng-core framework. This not only improves and
+cleans up the driver, but also fills the guest's entropy pool
+without having to resort to using extra userspace tools in the guest.
+
+This is typically a nuisance when booting a guest: the random pool
+takes a long time (~200s) to build up enough entropy since the dummy
+hwrng is not used to fill the guest's pool.
+
+This port was originally attempted by Alexander Neville "dark" (in CC,
+discussion in Link), but the conversation there stalled because that
+attempt removed the handling of -EAGAIN errors, leaving them unhandled
+by the driver. This patch keeps the existing method of error handling
+but utilises the new hwrng core.
+
+The issue can be noticed when booting a UML guest:
+
+    [    2.560000] random: fast init done
+    [  214.000000] random: crng init done
+
+With the patch applied, filling the pool becomes a lot quicker:
+
+    [    2.560000] random: fast init done
+    [   12.000000] random: crng init done
+
+Cc: Alexander Neville <dark@volatile.bz>
+Link: https://lore.kernel.org/lkml/20190828204609.02a7ff70@TheDarkness/
+Link: https://lore.kernel.org/lkml/20190829135001.6a5ff940@TheDarkness.local/
+Cc: Sjoerd Simons <sjoerd.simons@collabora.co.uk>
+Signed-off-by: Christopher Obbard <chris.obbard@collabora.com>
+Acked-by: Anton Ivanov <anton.ivanov@cambridgegreys.com>
+Signed-off-by: Richard Weinberger <richard@nod.at>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/um/drivers/random.c       | 101 ++++++++-------------------------
+ drivers/char/hw_random/Kconfig |  16 +++---
+ 2 files changed, 33 insertions(+), 84 deletions(-)
+
+diff --git a/arch/um/drivers/random.c b/arch/um/drivers/random.c
+index ce115fce52f02..e4b9b2ce9abf4 100644
+--- a/arch/um/drivers/random.c
++++ b/arch/um/drivers/random.c
+@@ -11,6 +11,7 @@
+ #include <linux/fs.h>
+ #include <linux/interrupt.h>
+ #include <linux/miscdevice.h>
++#include <linux/hw_random.h>
+ #include <linux/delay.h>
+ #include <linux/uaccess.h>
+ #include <init.h>
+@@ -18,9 +19,8 @@
+ #include <os.h>
+ /*
+- * core module and version information
++ * core module information
+  */
+-#define RNG_VERSION "1.0.0"
+ #define RNG_MODULE_NAME "hw_random"
+ /* Changed at init time, in the non-modular case, and at module load
+@@ -28,88 +28,36 @@
+  * protects against a module being loaded twice at the same time.
+  */
+ static int random_fd = -1;
+-static DECLARE_WAIT_QUEUE_HEAD(host_read_wait);
++static struct hwrng hwrng = { 0, };
++static DECLARE_COMPLETION(have_data);
+-static int rng_dev_open (struct inode *inode, struct file *filp)
++static int rng_dev_read(struct hwrng *rng, void *buf, size_t max, bool block)
+ {
+-      /* enforce read-only access to this chrdev */
+-      if ((filp->f_mode & FMODE_READ) == 0)
+-              return -EINVAL;
+-      if ((filp->f_mode & FMODE_WRITE) != 0)
+-              return -EINVAL;
++      int ret;
+-      return 0;
+-}
+-
+-static atomic_t host_sleep_count = ATOMIC_INIT(0);
+-
+-static ssize_t rng_dev_read (struct file *filp, char __user *buf, size_t size,
+-                           loff_t *offp)
+-{
+-      u32 data;
+-      int n, ret = 0, have_data;
+-
+-      while (size) {
+-              n = os_read_file(random_fd, &data, sizeof(data));
+-              if (n > 0) {
+-                      have_data = n;
+-                      while (have_data && size) {
+-                              if (put_user((u8) data, buf++)) {
+-                                      ret = ret ? : -EFAULT;
+-                                      break;
+-                              }
+-                              size--;
+-                              ret++;
+-                              have_data--;
+-                              data >>= 8;
+-                      }
+-              }
+-              else if (n == -EAGAIN) {
+-                      DECLARE_WAITQUEUE(wait, current);
+-
+-                      if (filp->f_flags & O_NONBLOCK)
+-                              return ret ? : -EAGAIN;
+-
+-                      atomic_inc(&host_sleep_count);
++      for (;;) {
++              ret = os_read_file(random_fd, buf, max);
++              if (block && ret == -EAGAIN) {
+                       add_sigio_fd(random_fd);
+-                      add_wait_queue(&host_read_wait, &wait);
+-                      set_current_state(TASK_INTERRUPTIBLE);
++                      ret = wait_for_completion_killable(&have_data);
+-                      schedule();
+-                      remove_wait_queue(&host_read_wait, &wait);
++                      ignore_sigio_fd(random_fd);
++                      deactivate_fd(random_fd, RANDOM_IRQ);
+-                      if (atomic_dec_and_test(&host_sleep_count)) {
+-                              ignore_sigio_fd(random_fd);
+-                              deactivate_fd(random_fd, RANDOM_IRQ);
+-                      }
++                      if (ret < 0)
++                              break;
++              } else {
++                      break;
+               }
+-              else
+-                      return n;
+-
+-              if (signal_pending (current))
+-                      return ret ? : -ERESTARTSYS;
+       }
+-      return ret;
+-}
+-static const struct file_operations rng_chrdev_ops = {
+-      .owner          = THIS_MODULE,
+-      .open           = rng_dev_open,
+-      .read           = rng_dev_read,
+-      .llseek         = noop_llseek,
+-};
+-
+-/* rng_init shouldn't be called more than once at boot time */
+-static struct miscdevice rng_miscdev = {
+-      HWRNG_MINOR,
+-      RNG_MODULE_NAME,
+-      &rng_chrdev_ops,
+-};
++      return ret != -EAGAIN ? ret : 0;
++}
+ static irqreturn_t random_interrupt(int irq, void *data)
+ {
+-      wake_up(&host_read_wait);
++      complete(&have_data);
+       return IRQ_HANDLED;
+ }
+@@ -126,18 +74,19 @@ static int __init rng_init (void)
+               goto out;
+       random_fd = err;
+-
+       err = um_request_irq(RANDOM_IRQ, random_fd, IRQ_READ, random_interrupt,
+                            0, "random", NULL);
+       if (err)
+               goto err_out_cleanup_hw;
+       sigio_broken(random_fd, 1);
++      hwrng.name = RNG_MODULE_NAME;
++      hwrng.read = rng_dev_read;
++      hwrng.quality = 1024;
+-      err = misc_register (&rng_miscdev);
++      err = hwrng_register(&hwrng);
+       if (err) {
+-              printk (KERN_ERR RNG_MODULE_NAME ": misc device register "
+-                      "failed\n");
++              pr_err(RNG_MODULE_NAME " registering failed (%d)\n", err);
+               goto err_out_cleanup_hw;
+       }
+ out:
+@@ -161,8 +110,8 @@ static void cleanup(void)
+ static void __exit rng_cleanup(void)
+ {
++      hwrng_unregister(&hwrng);
+       os_close_file(random_fd);
+-      misc_deregister (&rng_miscdev);
+ }
+ module_init (rng_init);
+diff --git a/drivers/char/hw_random/Kconfig b/drivers/char/hw_random/Kconfig
+index e92c4d9469d82..5952210526aaa 100644
+--- a/drivers/char/hw_random/Kconfig
++++ b/drivers/char/hw_random/Kconfig
+@@ -540,15 +540,15 @@ endif # HW_RANDOM
+ config UML_RANDOM
+       depends on UML
+-      tristate "Hardware random number generator"
++      select HW_RANDOM
++      tristate "UML Random Number Generator support"
+       help
+         This option enables UML's "hardware" random number generator.  It
+         attaches itself to the host's /dev/random, supplying as much entropy
+         as the host has, rather than the small amount the UML gets from its
+-        own drivers.  It registers itself as a standard hardware random number
+-        generator, major 10, minor 183, and the canonical device name is
+-        /dev/hwrng.
+-        The way to make use of this is to install the rng-tools package
+-        (check your distro, or download from
+-        http://sourceforge.net/projects/gkernel/).  rngd periodically reads
+-        /dev/hwrng and injects the entropy into /dev/random.
++        own drivers. It registers itself as a rng-core driver thus providing
++        a device which is usually called /dev/hwrng. This hardware random
++        number generator does feed into the kernel's random number generator
++        entropy pool.
++
++        If unsure, say Y.
+-- 
+2.27.0
+
diff --git a/queue-5.10/um-ubd-submit-all-data-segments-atomically.patch b/queue-5.10/um-ubd-submit-all-data-segments-atomically.patch
new file mode 100644 (file)
index 0000000..1ad5f11
--- /dev/null
@@ -0,0 +1,434 @@
+From 5b730860e1976f7f024014b59e4bd8b568c5ccb8 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sat, 21 Nov 2020 23:13:56 -0500
+Subject: um: ubd: Submit all data segments atomically
+
+From: Gabriel Krisman Bertazi <krisman@collabora.com>
+
+[ Upstream commit fc6b6a872dcd48c6f39c7975836d75113db67d37 ]
+
+Internally, UBD treats each physical IO segment as a separate command to
+be submitted in the execution pipe.  If the pipe returns a transient
+error after a few segments have already been written, UBD will tell the
+block layer to requeue the request, but there is no way to reclaim the
+segments already submitted.  When a new attempt to dispatch the request
+is made, those segments will be submitted a second time, causing the
+WARN_ON below in the best case, and potentially data corruption.
+
+On my system, running a UML instance with 2GB of RAM and a 50M UBD disk,
+I can reproduce the WARN_ON by simply running mkfs.vfat against the
+disk on a freshly booted system.
+
+There are a few ways to work around this, like reducing the pressure on
+the pipe by reducing the queue depth, which almost eliminates the
+occurrence of the problem, increasing the pipe buffer size on the host
+system, or limiting the request to one physical segment, which causes
+the block layer to submit far more requests to resolve a single
+operation.
+
+Instead, this patch modifies the format of a UBD command, such that all
+segments are sent through a single element in the communication pipe,
+turning the command submission atomic from the point of view of the
+block layer.  The new format has a variable size, depending on the
+number of elements, and looks like this:
+
++------------+-----------+-----------+------------
+| cmd_header | segment 0 | segment 1 | segment ...
++------------+-----------+-----------+------------
+
+With this format, we push a pointer to cmd_header in the submission
+pipe.
+
+This has the advantage of reducing the memory footprint of executing a
+single request, since it allows us to merge some fields in the header.
+It is possible to reduce each segment's memory footprint even further,
+for instance by merging bitmap_words and cow_offset, but that is not the
+focus of this patch and is left as future work.  One issue with the
+patch is that for a large number of segments we now perform one big
+memory allocation instead of multiple small ones, but I wasn't able to
+trigger any real issues or -ENOMEM from this change that wouldn't
+reproduce otherwise.
+
+This was tested using fio with the verify-crc32 option, and by running
+an ext4 filesystem over this UBD device.
+
+The original WARN_ON was:
+
+------------[ cut here ]------------
+WARNING: CPU: 0 PID: 0 at lib/refcount.c:28 refcount_warn_saturate+0x13f/0x141
+refcount_t: underflow; use-after-free.
+Modules linked in:
+CPU: 0 PID: 0 Comm: swapper Not tainted 5.5.0-rc6-00002-g2a5bb2cf75c8 #346
+Stack:
+ 6084eed0 6063dc77 00000009 6084ef60
+ 00000000 604b8d9f 6084eee0 6063dcbc
+ 6084ef40 6006ab8d e013d780 1c00000000
+Call Trace:
+ [<600a0c1c>] ? printk+0x0/0x94
+ [<6004a888>] show_stack+0x13b/0x155
+ [<6063dc77>] ? dump_stack_print_info+0xdf/0xe8
+ [<604b8d9f>] ? refcount_warn_saturate+0x13f/0x141
+ [<6063dcbc>] dump_stack+0x2a/0x2c
+ [<6006ab8d>] __warn+0x107/0x134
+ [<6008da6c>] ? wake_up_process+0x17/0x19
+ [<60487628>] ? blk_queue_max_discard_sectors+0x0/0xd
+ [<6006b05f>] warn_slowpath_fmt+0xd1/0xdf
+ [<6006af8e>] ? warn_slowpath_fmt+0x0/0xdf
+ [<600acc14>] ? raw_read_seqcount_begin.constprop.0+0x0/0x15
+ [<600619ae>] ? os_nsecs+0x1d/0x2b
+ [<604b8d9f>] refcount_warn_saturate+0x13f/0x141
+ [<6048bc8f>] refcount_sub_and_test.constprop.0+0x2f/0x37
+ [<6048c8de>] blk_mq_free_request+0xf1/0x10d
+ [<6048ca06>] __blk_mq_end_request+0x10c/0x114
+ [<6005ac0f>] ubd_intr+0xb5/0x169
+ [<600a1a37>] __handle_irq_event_percpu+0x6b/0x17e
+ [<600a1b70>] handle_irq_event_percpu+0x26/0x69
+ [<600a1bd9>] handle_irq_event+0x26/0x34
+ [<600a1bb3>] ? handle_irq_event+0x0/0x34
+ [<600a5186>] ? unmask_irq+0x0/0x37
+ [<600a57e6>] handle_edge_irq+0xbc/0xd6
+ [<600a131a>] generic_handle_irq+0x21/0x29
+ [<60048f6e>] do_IRQ+0x39/0x54
+ [...]
+---[ end trace c6e7444e55386c0f ]---
+
+Cc: Christopher Obbard <chris.obbard@collabora.com>
+Reported-by: Martyn Welch <martyn@collabora.com>
+Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.com>
+Tested-by: Christopher Obbard <chris.obbard@collabora.com>
+Acked-by: Anton Ivanov <anton.ivanov@cambridgegreys.com>
+Signed-off-by: Richard Weinberger <richard@nod.at>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/um/drivers/ubd_kern.c | 191 ++++++++++++++++++++++---------------
+ 1 file changed, 115 insertions(+), 76 deletions(-)
+
+diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c
+index eae8c83364f71..b12c1b0d3e1d0 100644
+--- a/arch/um/drivers/ubd_kern.c
++++ b/arch/um/drivers/ubd_kern.c
+@@ -47,18 +47,25 @@
+ /* Max request size is determined by sector mask - 32K */
+ #define UBD_MAX_REQUEST (8 * sizeof(long))
++struct io_desc {
++      char *buffer;
++      unsigned long length;
++      unsigned long sector_mask;
++      unsigned long long cow_offset;
++      unsigned long bitmap_words[2];
++};
++
+ struct io_thread_req {
+       struct request *req;
+       int fds[2];
+       unsigned long offsets[2];
+       unsigned long long offset;
+-      unsigned long length;
+-      char *buffer;
+       int sectorsize;
+-      unsigned long sector_mask;
+-      unsigned long long cow_offset;
+-      unsigned long bitmap_words[2];
+       int error;
++
++      int desc_cnt;
++      /* io_desc has to be the last element of the struct */
++      struct io_desc io_desc[];
+ };
+@@ -525,12 +532,7 @@ static void ubd_handler(void)
+                               blk_queue_max_write_zeroes_sectors(io_req->req->q, 0);
+                               blk_queue_flag_clear(QUEUE_FLAG_DISCARD, io_req->req->q);
+                       }
+-                      if ((io_req->error) || (io_req->buffer == NULL))
+-                              blk_mq_end_request(io_req->req, io_req->error);
+-                      else {
+-                              if (!blk_update_request(io_req->req, io_req->error, io_req->length))
+-                                      __blk_mq_end_request(io_req->req, io_req->error);
+-                      }
++                      blk_mq_end_request(io_req->req, io_req->error);
+                       kfree(io_req);
+               }
+       }
+@@ -946,6 +948,7 @@ static int ubd_add(int n, char **error_out)
+       blk_queue_write_cache(ubd_dev->queue, true, false);
+       blk_queue_max_segments(ubd_dev->queue, MAX_SG);
++      blk_queue_segment_boundary(ubd_dev->queue, PAGE_SIZE - 1);
+       err = ubd_disk_register(UBD_MAJOR, ubd_dev->size, n, &ubd_gendisk[n]);
+       if(err){
+               *error_out = "Failed to register device";
+@@ -1289,37 +1292,74 @@ static void cowify_bitmap(__u64 io_offset, int length, unsigned long *cow_mask,
+       *cow_offset += bitmap_offset;
+ }
+-static void cowify_req(struct io_thread_req *req, unsigned long *bitmap,
++static void cowify_req(struct io_thread_req *req, struct io_desc *segment,
++                     unsigned long offset, unsigned long *bitmap,
+                      __u64 bitmap_offset, __u64 bitmap_len)
+ {
+-      __u64 sector = req->offset >> SECTOR_SHIFT;
++      __u64 sector = offset >> SECTOR_SHIFT;
+       int i;
+-      if (req->length > (sizeof(req->sector_mask) * 8) << SECTOR_SHIFT)
++      if (segment->length > (sizeof(segment->sector_mask) * 8) << SECTOR_SHIFT)
+               panic("Operation too long");
+       if (req_op(req->req) == REQ_OP_READ) {
+-              for (i = 0; i < req->length >> SECTOR_SHIFT; i++) {
++              for (i = 0; i < segment->length >> SECTOR_SHIFT; i++) {
+                       if(ubd_test_bit(sector + i, (unsigned char *) bitmap))
+                               ubd_set_bit(i, (unsigned char *)
+-                                          &req->sector_mask);
++                                          &segment->sector_mask);
++              }
++      } else {
++              cowify_bitmap(offset, segment->length, &segment->sector_mask,
++                            &segment->cow_offset, bitmap, bitmap_offset,
++                            segment->bitmap_words, bitmap_len);
++      }
++}
++
++static void ubd_map_req(struct ubd *dev, struct io_thread_req *io_req,
++                      struct request *req)
++{
++      struct bio_vec bvec;
++      struct req_iterator iter;
++      int i = 0;
++      unsigned long byte_offset = io_req->offset;
++      int op = req_op(req);
++
++      if (op == REQ_OP_WRITE_ZEROES || op == REQ_OP_DISCARD) {
++              io_req->io_desc[0].buffer = NULL;
++              io_req->io_desc[0].length = blk_rq_bytes(req);
++      } else {
++              rq_for_each_segment(bvec, req, iter) {
++                      BUG_ON(i >= io_req->desc_cnt);
++
++                      io_req->io_desc[i].buffer =
++                              page_address(bvec.bv_page) + bvec.bv_offset;
++                      io_req->io_desc[i].length = bvec.bv_len;
++                      i++;
++              }
++      }
++
++      if (dev->cow.file) {
++              for (i = 0; i < io_req->desc_cnt; i++) {
++                      cowify_req(io_req, &io_req->io_desc[i], byte_offset,
++                                 dev->cow.bitmap, dev->cow.bitmap_offset,
++                                 dev->cow.bitmap_len);
++                      byte_offset += io_req->io_desc[i].length;
+               }
++
+       }
+-      else cowify_bitmap(req->offset, req->length, &req->sector_mask,
+-                         &req->cow_offset, bitmap, bitmap_offset,
+-                         req->bitmap_words, bitmap_len);
+ }
+-static int ubd_queue_one_vec(struct blk_mq_hw_ctx *hctx, struct request *req,
+-              u64 off, struct bio_vec *bvec)
++static struct io_thread_req *ubd_alloc_req(struct ubd *dev, struct request *req,
++                                         int desc_cnt)
+ {
+-      struct ubd *dev = hctx->queue->queuedata;
+       struct io_thread_req *io_req;
+-      int ret;
++      int i;
+-      io_req = kmalloc(sizeof(struct io_thread_req), GFP_ATOMIC);
++      io_req = kmalloc(sizeof(*io_req) +
++                       (desc_cnt * sizeof(struct io_desc)),
++                       GFP_ATOMIC);
+       if (!io_req)
+-              return -ENOMEM;
++              return NULL;
+       io_req->req = req;
+       if (dev->cow.file)
+@@ -1327,26 +1367,41 @@ static int ubd_queue_one_vec(struct blk_mq_hw_ctx *hctx, struct request *req,
+       else
+               io_req->fds[0] = dev->fd;
+       io_req->error = 0;
+-
+-      if (bvec != NULL) {
+-              io_req->buffer = page_address(bvec->bv_page) + bvec->bv_offset;
+-              io_req->length = bvec->bv_len;
+-      } else {
+-              io_req->buffer = NULL;
+-              io_req->length = blk_rq_bytes(req);
+-      }
+-
+       io_req->sectorsize = SECTOR_SIZE;
+       io_req->fds[1] = dev->fd;
+-      io_req->cow_offset = -1;
+-      io_req->offset = off;
+-      io_req->sector_mask = 0;
++      io_req->offset = (u64) blk_rq_pos(req) << SECTOR_SHIFT;
+       io_req->offsets[0] = 0;
+       io_req->offsets[1] = dev->cow.data_offset;
+-      if (dev->cow.file)
+-              cowify_req(io_req, dev->cow.bitmap,
+-                         dev->cow.bitmap_offset, dev->cow.bitmap_len);
++      for (i = 0 ; i < desc_cnt; i++) {
++              io_req->io_desc[i].sector_mask = 0;
++              io_req->io_desc[i].cow_offset = -1;
++      }
++
++      return io_req;
++}
++
++static int ubd_submit_request(struct ubd *dev, struct request *req)
++{
++      int segs = 0;
++      struct io_thread_req *io_req;
++      int ret;
++      int op = req_op(req);
++
++      if (op == REQ_OP_FLUSH)
++              segs = 0;
++      else if (op == REQ_OP_WRITE_ZEROES || op == REQ_OP_DISCARD)
++              segs = 1;
++      else
++              segs = blk_rq_nr_phys_segments(req);
++
++      io_req = ubd_alloc_req(dev, req, segs);
++      if (!io_req)
++              return -ENOMEM;
++
++      io_req->desc_cnt = segs;
++      if (segs)
++              ubd_map_req(dev, io_req, req);
+       ret = os_write_file(thread_fd, &io_req, sizeof(io_req));
+       if (ret != sizeof(io_req)) {
+@@ -1357,22 +1412,6 @@ static int ubd_queue_one_vec(struct blk_mq_hw_ctx *hctx, struct request *req,
+       return ret;
+ }
+-static int queue_rw_req(struct blk_mq_hw_ctx *hctx, struct request *req)
+-{
+-      struct req_iterator iter;
+-      struct bio_vec bvec;
+-      int ret;
+-      u64 off = (u64)blk_rq_pos(req) << SECTOR_SHIFT;
+-
+-      rq_for_each_segment(bvec, req, iter) {
+-              ret = ubd_queue_one_vec(hctx, req, off, &bvec);
+-              if (ret < 0)
+-                      return ret;
+-              off += bvec.bv_len;
+-      }
+-      return 0;
+-}
+-
+ static blk_status_t ubd_queue_rq(struct blk_mq_hw_ctx *hctx,
+                                const struct blk_mq_queue_data *bd)
+ {
+@@ -1385,17 +1424,12 @@ static blk_status_t ubd_queue_rq(struct blk_mq_hw_ctx *hctx,
+       spin_lock_irq(&ubd_dev->lock);
+       switch (req_op(req)) {
+-      /* operations with no lentgth/offset arguments */
+       case REQ_OP_FLUSH:
+-              ret = ubd_queue_one_vec(hctx, req, 0, NULL);
+-              break;
+       case REQ_OP_READ:
+       case REQ_OP_WRITE:
+-              ret = queue_rw_req(hctx, req);
+-              break;
+       case REQ_OP_DISCARD:
+       case REQ_OP_WRITE_ZEROES:
+-              ret = ubd_queue_one_vec(hctx, req, (u64)blk_rq_pos(req) << 9, NULL);
++              ret = ubd_submit_request(ubd_dev, req);
+               break;
+       default:
+               WARN_ON_ONCE(1);
+@@ -1483,22 +1517,22 @@ static int map_error(int error_code)
+  * will result in unpredictable behaviour and/or crashes.
+  */
+-static int update_bitmap(struct io_thread_req *req)
++static int update_bitmap(struct io_thread_req *req, struct io_desc *segment)
+ {
+       int n;
+-      if(req->cow_offset == -1)
++      if (segment->cow_offset == -1)
+               return map_error(0);
+-      n = os_pwrite_file(req->fds[1], &req->bitmap_words,
+-                        sizeof(req->bitmap_words), req->cow_offset);
+-      if (n != sizeof(req->bitmap_words))
++      n = os_pwrite_file(req->fds[1], &segment->bitmap_words,
++                        sizeof(segment->bitmap_words), segment->cow_offset);
++      if (n != sizeof(segment->bitmap_words))
+               return map_error(-n);
+       return map_error(0);
+ }
+-static void do_io(struct io_thread_req *req)
++static void do_io(struct io_thread_req *req, struct io_desc *desc)
+ {
+       char *buf = NULL;
+       unsigned long len;
+@@ -1513,21 +1547,20 @@ static void do_io(struct io_thread_req *req)
+               return;
+       }
+-      nsectors = req->length / req->sectorsize;
++      nsectors = desc->length / req->sectorsize;
+       start = 0;
+       do {
+-              bit = ubd_test_bit(start, (unsigned char *) &req->sector_mask);
++              bit = ubd_test_bit(start, (unsigned char *) &desc->sector_mask);
+               end = start;
+               while((end < nsectors) &&
+-                    (ubd_test_bit(end, (unsigned char *)
+-                                  &req->sector_mask) == bit))
++                    (ubd_test_bit(end, (unsigned char *) &desc->sector_mask) == bit))
+                       end++;
+               off = req->offset + req->offsets[bit] +
+                       start * req->sectorsize;
+               len = (end - start) * req->sectorsize;
+-              if (req->buffer != NULL)
+-                      buf = &req->buffer[start * req->sectorsize];
++              if (desc->buffer != NULL)
++                      buf = &desc->buffer[start * req->sectorsize];
+               switch (req_op(req->req)) {
+               case REQ_OP_READ:
+@@ -1567,7 +1600,8 @@ static void do_io(struct io_thread_req *req)
+               start = end;
+       } while(start < nsectors);
+-      req->error = update_bitmap(req);
++      req->offset += len;
++      req->error = update_bitmap(req, desc);
+ }
+ /* Changed in start_io_thread, which is serialized by being called only
+@@ -1600,8 +1634,13 @@ int io_thread(void *arg)
+               }
+               for (count = 0; count < n/sizeof(struct io_thread_req *); count++) {
++                      struct io_thread_req *req = (*io_req_buffer)[count];
++                      int i;
++
+                       io_count++;
+-                      do_io((*io_req_buffer)[count]);
++                      for (i = 0; !req->error && i < req->desc_cnt; i++)
++                              do_io(req, &(req->io_desc[i]));
++
+               }
+               written = 0;
+-- 
+2.27.0
+
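
The single-allocation layout in the diagram above is C's flexible-array-member
idiom: the header and a variable number of trailing segment descriptors live
in one block, so the whole command can be queued, completed, and freed as a
unit. A minimal userspace model (field names are illustrative):

    #include <stdio.h>
    #include <stdlib.h>

    struct segment {
            void *buffer;
            unsigned long length;
    };

    struct cmd_header {
            int error;
            int desc_cnt;
            struct segment segs[];   /* segments follow the header */
    };

    int main(void)
    {
            int nsegs = 4;

            /* One allocation covers the header plus all segments, so
             * the command crosses the pipe as a single pointer. */
            struct cmd_header *cmd =
                    malloc(sizeof(*cmd) + nsegs * sizeof(struct segment));
            if (!cmd)
                    return 1;

            cmd->error = 0;
            cmd->desc_cnt = nsegs;
            for (int i = 0; i < nsegs; i++) {
                    cmd->segs[i].buffer = NULL;
                    cmd->segs[i].length = 4096;
            }

            printf("one block, %d segments\n", cmd->desc_cnt);
            free(cmd);
            return 0;
    }
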
diff --git a/queue-5.10/watchdog-rti-wdt-fix-reference-leak-in-rti_wdt_probe.patch b/queue-5.10/watchdog-rti-wdt-fix-reference-leak-in-rti_wdt_probe.patch
new file mode 100644 (file)
index 0000000..aec25b4
--- /dev/null
@@ -0,0 +1,42 @@
+From 49a4a3d8b7fc4a26b17c0f556b693bc0cc084f52 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 30 Oct 2020 23:49:09 +0800
+Subject: watchdog: rti-wdt: fix reference leak in rti_wdt_probe
+
+From: Zhang Qilong <zhangqilong3@huawei.com>
+
+[ Upstream commit 8711071e9700b67045fe5518161d63f7a03e3c9e ]
+
+pm_runtime_get_sync() will increment the pm usage counter even when
+it fails. Forgetting to call pm_runtime_put_noidle() results in a
+reference leak in rti_wdt_probe(), so fix it.
+
+Signed-off-by: Zhang Qilong <zhangqilong3@huawei.com>
+Reviewed-by: Guenter Roeck <linux@roeck-us.net>
+Link: https://lore.kernel.org/r/20201030154909.100023-1-zhangqilong3@huawei.com
+Signed-off-by: Guenter Roeck <linux@roeck-us.net>
+Signed-off-by: Wim Van Sebroeck <wim@linux-watchdog.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/watchdog/rti_wdt.c | 4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+diff --git a/drivers/watchdog/rti_wdt.c b/drivers/watchdog/rti_wdt.c
+index 836319cbaca9d..359302f71f7ef 100644
+--- a/drivers/watchdog/rti_wdt.c
++++ b/drivers/watchdog/rti_wdt.c
+@@ -227,8 +227,10 @@ static int rti_wdt_probe(struct platform_device *pdev)
+       pm_runtime_enable(dev);
+       ret = pm_runtime_get_sync(dev);
+-      if (ret)
++      if (ret) {
++              pm_runtime_put_noidle(dev);
+               return dev_err_probe(dev, ret, "runtime pm failed\n");
++      }
+       platform_set_drvdata(pdev, wdt);
+-- 
+2.27.0
+
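
The subtlety worth remembering is that pm_runtime_get_sync() bumps the usage
counter even when it fails, so every error path needs a balancing put. A tiny
userspace model of that counter behaviour (get_sync/put_noidle are stand-ins,
not the kernel API):

    #include <stdio.h>

    static int usage_count;

    /* Models pm_runtime_get_sync(): the counter is incremented
     * even when the resume step fails. */
    static int get_sync(int resume_ok)
    {
            usage_count++;
            return resume_ok ? 0 : -1;
    }

    static void put_noidle(void)
    {
            usage_count--;
    }

    int main(void)
    {
            if (get_sync(0) < 0) {
                    /* Without this put, the counter would leak on
                     * the error path; that is the bug fixed above. */
                    put_noidle();
            }
            printf("usage_count = %d\n", usage_count);   /* 0 */
            return 0;
    }
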