From 9e1fcaa37a92a4ae6857194a74b98599aaa58af8 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Mon, 4 Jan 2021 10:53:13 -0500 Subject: [PATCH] Fixes for 5.10 Signed-off-by: Sasha Levin --- ...he-full-allocated-memory-at-hw_param.patch | 52 +++ ...efcount-leak-when-ceph_fill_inode-on.patch | 34 ++ queue-5.10/device-dax-fix-range-release.patch | 134 ++++++ ...erity-work-if-i-o-error-when-system-.patch | 59 +++ ...-display-updated-wm-table-for-renoir.patch | 63 +++ ...prefetch-to-be-zero-in-individual-sc.patch | 71 +++ ...id-race-condition-for-shrinker-count.patch | 236 ++++++++++ ...ce-of-pending_pages-in-decompression.patch | 240 ++++++++++ ...arn-if-mnt_count-has-become-negative.patch | 87 ++++ ...issing-destroy_workqueue-on-error-in.patch | 47 ++ ...emove-racy-overflow-list-fast-checks.patch | 48 ++ ...ject-uevent-until-after-module-init-.patch | 72 +++ ...e_state_going-state-when-a-module-fa.patch | 36 ++ ...-layout-related-use-after-free-race-.patch | 131 ++++++ ...ror-when-exiting-early-on-a-read_plu.patch | 118 +++++ ...eplay-remove-decrementer-overflow-ch.patch | 171 +++++++ ...dd-missing-iounmap-on-error-in-mpic_.patch | 39 ++ ...ta-don-t-overflow-quota-file-offsets.patch | 66 +++ ...031-fix-resource-leak-in-pl031_probe.patch | 42 ++ ...6i-fix-memleak-in-sun6i_rtc_clk_init.patch | 65 +++ ...r-kernel-stack-backchain-before-call.patch | 87 ++++ queue-5.10/series | 25 + ...sched-remove-bogus-boot-safety-check.patch | 49 ++ ...register-random-as-hwrng-core-device.patch | 254 ++++++++++ ...-submit-all-data-segments-atomically.patch | 434 ++++++++++++++++++ ...-fix-reference-leak-in-rti_wdt_probe.patch | 42 ++ 26 files changed, 2702 insertions(+) create mode 100644 queue-5.10/alsa-pcm-clear-the-full-allocated-memory-at-hw_param.patch create mode 100644 queue-5.10/ceph-fix-inode-refcount-leak-when-ceph_fill_inode-on.patch create mode 100644 queue-5.10/device-dax-fix-range-release.patch create mode 100644 queue-5.10/dm-verity-skip-verity-work-if-i-o-error-when-system-.patch create mode 100644 queue-5.10/drm-amd-display-updated-wm-table-for-renoir.patch create mode 100644 queue-5.10/ext4-avoid-s_mb_prefetch-to-be-zero-in-individual-sc.patch create mode 100644 queue-5.10/f2fs-avoid-race-condition-for-shrinker-count.patch create mode 100644 queue-5.10/f2fs-fix-race-of-pending_pages-in-decompression.patch create mode 100644 queue-5.10/fs-namespace.c-warn-if-mnt_count-has-become-negative.patch create mode 100644 queue-5.10/i3c-master-fix-missing-destroy_workqueue-on-error-in.patch create mode 100644 queue-5.10/io_uring-remove-racy-overflow-list-fast-checks.patch create mode 100644 queue-5.10/module-delay-kobject-uevent-until-after-module-init-.patch create mode 100644 queue-5.10/module-set-module_state_going-state-when-a-module-fa.patch create mode 100644 queue-5.10/nfsv4-fix-a-pnfs-layout-related-use-after-free-race-.patch create mode 100644 queue-5.10/nfsv4.2-don-t-error-when-exiting-early-on-a-read_plu.patch create mode 100644 queue-5.10/powerpc-64-irq-replay-remove-decrementer-overflow-ch.patch create mode 100644 queue-5.10/powerpc-sysdev-add-missing-iounmap-on-error-in-mpic_.patch create mode 100644 queue-5.10/quota-don-t-overflow-quota-file-offsets.patch create mode 100644 queue-5.10/rtc-pl031-fix-resource-leak-in-pl031_probe.patch create mode 100644 queue-5.10/rtc-sun6i-fix-memleak-in-sun6i_rtc_clk_init.patch create mode 100644 queue-5.10/s390-always-clear-kernel-stack-backchain-before-call.patch create mode 100644 queue-5.10/tick-sched-remove-bogus-boot-safety-check.patch create mode 
100644 queue-5.10/um-random-register-random-as-hwrng-core-device.patch create mode 100644 queue-5.10/um-ubd-submit-all-data-segments-atomically.patch create mode 100644 queue-5.10/watchdog-rti-wdt-fix-reference-leak-in-rti_wdt_probe.patch diff --git a/queue-5.10/alsa-pcm-clear-the-full-allocated-memory-at-hw_param.patch b/queue-5.10/alsa-pcm-clear-the-full-allocated-memory-at-hw_param.patch new file mode 100644 index 00000000000..3578a5241ea --- /dev/null +++ b/queue-5.10/alsa-pcm-clear-the-full-allocated-memory-at-hw_param.patch @@ -0,0 +1,52 @@ +From 2d6a2a446f4ca540fd548786c39da13db8c1a33a Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 18 Dec 2020 15:56:25 +0100 +Subject: ALSA: pcm: Clear the full allocated memory at hw_params + +From: Takashi Iwai + +[ Upstream commit 618de0f4ef11acd8cf26902e65493d46cc20cc89 ] + +The PCM hw_params core function tries to clear the PCM buffer +before it is actually used, in order to avoid leaking information from +previous usages or from the usage before a new allocation. It performs the +memset() with runtime->dma_bytes, but this might still leave some +remaining bytes untouched; namely, the PCM buffer size is aligned to the +page size for mmap, hence runtime->dma_bytes doesn't necessarily cover +all PCM buffer pages, and the remaining bytes are exposed via mmap. + +This patch changes the memory clearing to cover all buffer pages +if the stream is supposed to be mmap-ready (which guarantees that the +buffer size is page-aligned). + +Reviewed-by: Lars-Peter Clausen +Link: https://lore.kernel.org/r/20201218145625.2045-3-tiwai@suse.de +Signed-off-by: Takashi Iwai +Signed-off-by: Sasha Levin +--- + sound/core/pcm_native.c | 9 +++++++-- + 1 file changed, 7 insertions(+), 2 deletions(-) + +diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c +index 47b155a49226f..9f3f8e953ff04 100644 +--- a/sound/core/pcm_native.c ++++ b/sound/core/pcm_native.c +@@ -755,8 +755,13 @@ static int snd_pcm_hw_params(struct snd_pcm_substream *substream, + runtime->boundary *= 2; + + /* clear the buffer for avoiding possible kernel info leaks */ +- if (runtime->dma_area && !substream->ops->copy_user) +- memset(runtime->dma_area, 0, runtime->dma_bytes); ++ if (runtime->dma_area && !substream->ops->copy_user) { ++ size_t size = runtime->dma_bytes; ++ ++ if (runtime->info & SNDRV_PCM_INFO_MMAP) ++ size = PAGE_ALIGN(size); ++ memset(runtime->dma_area, 0, size); ++ } + + snd_pcm_timer_resolution_change(substream); + snd_pcm_set_state(substream, SNDRV_PCM_STATE_SETUP); +-- +2.27.0 + diff --git a/queue-5.10/ceph-fix-inode-refcount-leak-when-ceph_fill_inode-on.patch b/queue-5.10/ceph-fix-inode-refcount-leak-when-ceph_fill_inode-on.patch new file mode 100644 index 00000000000..b79e128d22e --- /dev/null +++ b/queue-5.10/ceph-fix-inode-refcount-leak-when-ceph_fill_inode-on.patch @@ -0,0 +1,34 @@ +From de7b4ea194839e2bfd2dac67a676a20d88923ec1 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 12 Nov 2020 09:37:59 -0500 +Subject: ceph: fix inode refcount leak when ceph_fill_inode on non-I_NEW inode + fails + +From: Jeff Layton + +[ Upstream commit 68cbb8056a4c24c6a38ad2b79e0a9764b235e8fa ] + +Signed-off-by: Jeff Layton +Reviewed-by: Ilya Dryomov +Signed-off-by: Ilya Dryomov +Signed-off-by: Sasha Levin +--- + fs/ceph/inode.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c +index 526faf4778ce4..2462a9a84b956 100644 +--- a/fs/ceph/inode.c ++++ b/fs/ceph/inode.c +@@ -1335,6 +1335,8 @@ int ceph_fill_trace(struct super_block *sb, 
struct ceph_mds_request *req) + in, ceph_vinop(in)); + if (in->i_state & I_NEW) + discard_new_inode(in); ++ else ++ iput(in); + goto done; + } + req->r_target_inode = in; +-- +2.27.0 + diff --git a/queue-5.10/device-dax-fix-range-release.patch b/queue-5.10/device-dax-fix-range-release.patch new file mode 100644 index 00000000000..31600133a33 --- /dev/null +++ b/queue-5.10/device-dax-fix-range-release.patch @@ -0,0 +1,134 @@ +From a09dd794b26d82c969c4563bda1eea962aebf87c Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 18 Dec 2020 18:41:41 -0800 +Subject: device-dax: Fix range release + +From: Dan Williams + +[ Upstream commit 6268d7da4d192af339f4d688942b9ccb45a65e04 ] + +There are multiple locations that open-code the release of the last +range in a device-dax instance. Consolidate this into a new +dev_dax_trim_range() helper. + +This also addresses a kmemleak report: + +# cat /sys/kernel/debug/kmemleak +[..] +unreferenced object 0xffff976bd46f6240 (size 64): + comm "ndctl", pid 23556, jiffies 4299514316 (age 5406.733s) + hex dump (first 32 bytes): + 00 00 00 00 00 00 00 00 00 00 20 c3 37 00 00 00 .......... .7... + ff ff ff 7f 38 00 00 00 00 00 00 00 00 00 00 00 ....8........... + backtrace: + [<00000000064003cf>] __kmalloc_track_caller+0x136/0x379 + [<00000000d85e3c52>] krealloc+0x67/0x92 + [<00000000d7d3ba8a>] __alloc_dev_dax_range+0x73/0x25c + [<0000000027d58626>] devm_create_dev_dax+0x27d/0x416 + [<00000000434abd43>] __dax_pmem_probe+0x1c9/0x1000 [dax_pmem_core] + [<0000000083726c1c>] dax_pmem_probe+0x10/0x1f [dax_pmem] + [<00000000b5f2319c>] nvdimm_bus_probe+0x9d/0x340 [libnvdimm] + [<00000000c055e544>] really_probe+0x230/0x48d + [<000000006cabd38e>] driver_probe_device+0x122/0x13b + [<0000000029c7b95a>] device_driver_attach+0x5b/0x60 + [<0000000053e5659b>] bind_store+0xb7/0xc3 + [<00000000d3bdaadc>] drv_attr_store+0x27/0x31 + [<00000000949069c5>] sysfs_kf_write+0x4a/0x57 + [<000000004a8b5adf>] kernfs_fop_write+0x150/0x1e5 + [<00000000bded60f0>] __vfs_write+0x1b/0x34 + [<00000000b92900f0>] vfs_write+0xd8/0x1d1 + +Reported-by: Jane Chu +Cc: Zhen Lei +Link: https://lore.kernel.org/r/160834570161.1791850.14911670304441510419.stgit@dwillia2-desk3.amr.corp.intel.com +Signed-off-by: Dan Williams +Signed-off-by: Sasha Levin +--- + drivers/dax/bus.c | 44 +++++++++++++++++++++----------------------- + 1 file changed, 21 insertions(+), 23 deletions(-) + +diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c +index 27513d311242e..de7b74505e75e 100644 +--- a/drivers/dax/bus.c ++++ b/drivers/dax/bus.c +@@ -367,19 +367,28 @@ void kill_dev_dax(struct dev_dax *dev_dax) + } + EXPORT_SYMBOL_GPL(kill_dev_dax); + +-static void free_dev_dax_ranges(struct dev_dax *dev_dax) ++static void trim_dev_dax_range(struct dev_dax *dev_dax) + { ++ int i = dev_dax->nr_range - 1; ++ struct range *range = &dev_dax->ranges[i].range; + struct dax_region *dax_region = dev_dax->region; +- int i; + + device_lock_assert(dax_region->dev); +- for (i = 0; i < dev_dax->nr_range; i++) { +- struct range *range = &dev_dax->ranges[i].range; +- +- __release_region(&dax_region->res, range->start, +- range_len(range)); ++ dev_dbg(&dev_dax->dev, "delete range[%d]: %#llx:%#llx\n", i, ++ (unsigned long long)range->start, ++ (unsigned long long)range->end); ++ ++ __release_region(&dax_region->res, range->start, range_len(range)); ++ if (--dev_dax->nr_range == 0) { ++ kfree(dev_dax->ranges); ++ dev_dax->ranges = NULL; + } +- dev_dax->nr_range = 0; ++} ++ ++static void free_dev_dax_ranges(struct dev_dax *dev_dax) ++{ ++ while 
(dev_dax->nr_range) ++ trim_dev_dax_range(dev_dax); + } + + static void unregister_dev_dax(void *dev) +@@ -804,15 +813,10 @@ static int alloc_dev_dax_range(struct dev_dax *dev_dax, u64 start, + return 0; + + rc = devm_register_dax_mapping(dev_dax, dev_dax->nr_range - 1); +- if (rc) { +- dev_dbg(dev, "delete range[%d]: %pa:%pa\n", dev_dax->nr_range - 1, +- &alloc->start, &alloc->end); +- dev_dax->nr_range--; +- __release_region(res, alloc->start, resource_size(alloc)); +- return rc; +- } ++ if (rc) ++ trim_dev_dax_range(dev_dax); + +- return 0; ++ return rc; + } + + static int adjust_dev_dax_range(struct dev_dax *dev_dax, struct resource *res, resource_size_t size) +@@ -885,12 +889,7 @@ static int dev_dax_shrink(struct dev_dax *dev_dax, resource_size_t size) + if (shrink >= range_len(range)) { + devm_release_action(dax_region->dev, + unregister_dax_mapping, &mapping->dev); +- __release_region(&dax_region->res, range->start, +- range_len(range)); +- dev_dax->nr_range--; +- dev_dbg(dev, "delete range[%d]: %#llx:%#llx\n", i, +- (unsigned long long) range->start, +- (unsigned long long) range->end); ++ trim_dev_dax_range(dev_dax); + to_shrink -= shrink; + if (!to_shrink) + break; +@@ -1274,7 +1273,6 @@ static void dev_dax_release(struct device *dev) + put_dax(dax_dev); + free_dev_dax_id(dev_dax); + dax_region_put(dax_region); +- kfree(dev_dax->ranges); + kfree(dev_dax->pgmap); + kfree(dev_dax); + } +-- +2.27.0 + diff --git a/queue-5.10/dm-verity-skip-verity-work-if-i-o-error-when-system-.patch b/queue-5.10/dm-verity-skip-verity-work-if-i-o-error-when-system-.patch new file mode 100644 index 00000000000..a4b22dc4b18 --- /dev/null +++ b/queue-5.10/dm-verity-skip-verity-work-if-i-o-error-when-system-.patch @@ -0,0 +1,59 @@ +From 5e7a77e76c8ba928983f571ef463fe624bc25d0c Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 3 Dec 2020 09:46:59 +0900 +Subject: dm verity: skip verity work if I/O error when system is shutting down + +From: Hyeongseok Kim + +[ Upstream commit 252bd1256396cebc6fc3526127fdb0b317601318 ] + +If emergency system shutdown is called, like by thermal shutdown, +a dm device could be alive when the block device couldn't process +I/O requests anymore. In this state, the handling of I/O errors +by new dm I/O requests or by those already in-flight can lead to +a verity corruption state, which is a misjudgment. + +So, skip verity work in response to I/O error when system is shutting +down. + +Signed-off-by: Hyeongseok Kim +Reviewed-by: Sami Tolvanen +Signed-off-by: Mike Snitzer +Signed-off-by: Sasha Levin +--- + drivers/md/dm-verity-target.c | 12 +++++++++++- + 1 file changed, 11 insertions(+), 1 deletion(-) + +diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c +index f74982dcbea0d..6b8e5bdd8526d 100644 +--- a/drivers/md/dm-verity-target.c ++++ b/drivers/md/dm-verity-target.c +@@ -537,6 +537,15 @@ static int verity_verify_io(struct dm_verity_io *io) + return 0; + } + ++/* ++ * Skip verity work in response to I/O error when system is shutting down. ++ */ ++static inline bool verity_is_system_shutting_down(void) ++{ ++ return system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF ++ || system_state == SYSTEM_RESTART; ++} ++ + /* + * End one "io" structure with a given error. 
+ */ +@@ -564,7 +573,8 @@ static void verity_end_io(struct bio *bio) + { + struct dm_verity_io *io = bio->bi_private; + +- if (bio->bi_status && !verity_fec_is_enabled(io->v)) { ++ if (bio->bi_status && ++ (!verity_fec_is_enabled(io->v) || verity_is_system_shutting_down())) { + verity_finish_io(io, bio->bi_status); + return; + } +-- +2.27.0 + diff --git a/queue-5.10/drm-amd-display-updated-wm-table-for-renoir.patch b/queue-5.10/drm-amd-display-updated-wm-table-for-renoir.patch new file mode 100644 index 00000000000..f6382b6f98c --- /dev/null +++ b/queue-5.10/drm-amd-display-updated-wm-table-for-renoir.patch @@ -0,0 +1,63 @@ +From 395e3945ad577c47057fe6b8883920be561092c7 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 3 Dec 2020 14:05:56 -0500 +Subject: drm/amd/display: updated wm table for Renoir + +From: Jake Wang + +[ Upstream commit 410066d24cfc1071be25e402510367aca9db5cb6 ] + +[Why] +For certain timings, Renoir may underflow due to sr exit +latency being too slow. + +[How] +Updated wm table for Renoir. + +Signed-off-by: Jake Wang +Reviewed-by: Yongqiang Sun +Acked-by: Qingqing Zhuo +Signed-off-by: Alex Deucher +Signed-off-by: Sasha Levin +--- + .../drm/amd/display/dc/clk_mgr/dcn21/rn_clk_mgr.c | 12 ++++++------ + 1 file changed, 6 insertions(+), 6 deletions(-) + +diff --git a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn21/rn_clk_mgr.c b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn21/rn_clk_mgr.c +index 6b431db146cd9..1c6e401dd4cce 100644 +--- a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn21/rn_clk_mgr.c ++++ b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn21/rn_clk_mgr.c +@@ -704,24 +704,24 @@ static struct wm_table ddr4_wm_table_rn = { + .wm_inst = WM_B, + .wm_type = WM_TYPE_PSTATE_CHG, + .pstate_latency_us = 11.72, +- .sr_exit_time_us = 10.12, +- .sr_enter_plus_exit_time_us = 11.48, ++ .sr_exit_time_us = 11.12, ++ .sr_enter_plus_exit_time_us = 12.48, + .valid = true, + }, + { + .wm_inst = WM_C, + .wm_type = WM_TYPE_PSTATE_CHG, + .pstate_latency_us = 11.72, +- .sr_exit_time_us = 10.12, +- .sr_enter_plus_exit_time_us = 11.48, ++ .sr_exit_time_us = 11.12, ++ .sr_enter_plus_exit_time_us = 12.48, + .valid = true, + }, + { + .wm_inst = WM_D, + .wm_type = WM_TYPE_PSTATE_CHG, + .pstate_latency_us = 11.72, +- .sr_exit_time_us = 10.12, +- .sr_enter_plus_exit_time_us = 11.48, ++ .sr_exit_time_us = 11.12, ++ .sr_enter_plus_exit_time_us = 12.48, + .valid = true, + }, + } +-- +2.27.0 + diff --git a/queue-5.10/ext4-avoid-s_mb_prefetch-to-be-zero-in-individual-sc.patch b/queue-5.10/ext4-avoid-s_mb_prefetch-to-be-zero-in-individual-sc.patch new file mode 100644 index 00000000000..2644928c52c --- /dev/null +++ b/queue-5.10/ext4-avoid-s_mb_prefetch-to-be-zero-in-individual-sc.patch @@ -0,0 +1,71 @@ +From 0781df1b520bf59b12c373d6925f3300f23c801d Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 4 Dec 2020 11:05:43 +0800 +Subject: ext4: avoid s_mb_prefetch to be zero in individual scenarios + +From: Chunguang Xu + +[ Upstream commit 82ef1370b0c1757ab4ce29f34c52b4e93839b0aa ] + +Commit cfd732377221 ("ext4: add prefetching for block allocation +bitmaps") introduced block bitmap prefetch, and expects to read block +bitmaps of flex_bg through an IO. However, it seems to ignore the +value range of s_log_groups_per_flex. In the scenario where the value +of s_log_groups_per_flex is greater than 27, s_mb_prefetch or +s_mb_prefetch_limit will overflow, causing a divide-by-zero exception.
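To make the arithmetic concrete, here is a minimal userspace sketch of the wraparound described above (illustrative only -- the variable names mirror the ext4 fields, and the flex-group shift of 30 is a hypothetical value chosen so that the multiply by 8 exceeds 32 bits):

#include <stdio.h>

int main(void)
{
    /* stand-ins for sbi->s_es->s_log_groups_per_flex and friends */
    unsigned int log_groups_per_flex = 30;  /* hypothetical large value */
    /* 2^30 * 8 == 2^33, which wraps to 0 in a 32-bit unsigned counter */
    unsigned int s_mb_prefetch = (1U << log_groups_per_flex) * 8;
    unsigned int group = 12345;

    if (s_mb_prefetch == 0)
        printf("s_mb_prefetch wrapped to 0; 'group / s_mb_prefetch' would divide by zero\n");
    else
        printf("nr = %u\n", (group / s_mb_prefetch) * s_mb_prefetch);
    return 0;
}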
+ + In addition, the logic of calculating nr is also flawed, because the +size of flexbg is fixed during a single mount, but s_mb_prefetch can +be modified, which can cause nr to fall outside the valid range of +[1, flexbg_size]. + +To solve this problem, we need to set the upper limit of +s_mb_prefetch. Since we expect to load block bitmaps of a flex_bg +through an IO, we can consider determining a reasonable upper limit +among the IO limit parameters. After consideration, we chose +BLK_MAX_SEGMENT_SIZE. This is a good choice for solving the divide-by-zero +problem while avoiding performance degradation. + +[ Some minor code simplifications to make the changes easy to follow -- TYT ] + +Reported-by: Tosk Robot +Signed-off-by: Chunguang Xu +Reviewed-by: Samuel Liao +Reviewed-by: Andreas Dilger +Link: https://lore.kernel.org/r/1607051143-24508-1-git-send-email-brookxu@tencent.com +Signed-off-by: Theodore Ts'o +Signed-off-by: Sasha Levin +--- + fs/ext4/mballoc.c | 9 +++++---- + 1 file changed, 5 insertions(+), 4 deletions(-) + +diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c +index 37a619bf1ac7c..e67d5de6f28ca 100644 +--- a/fs/ext4/mballoc.c ++++ b/fs/ext4/mballoc.c +@@ -2395,9 +2395,9 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) + + nr = sbi->s_mb_prefetch; + if (ext4_has_feature_flex_bg(sb)) { +- nr = (group / sbi->s_mb_prefetch) * +- sbi->s_mb_prefetch; +- nr = nr + sbi->s_mb_prefetch - group; ++ nr = 1 << sbi->s_log_groups_per_flex; ++ nr -= group & (nr - 1); ++ nr = min(nr, sbi->s_mb_prefetch); + } + prefetch_grp = ext4_mb_prefetch(sb, group, + nr, &prefetch_ios); +@@ -2733,7 +2733,8 @@ static int ext4_mb_init_backend(struct super_block *sb) + + if (ext4_has_feature_flex_bg(sb)) { + /* a single flex group is supposed to be read by a single IO */ +- sbi->s_mb_prefetch = 1 << sbi->s_es->s_log_groups_per_flex; ++ sbi->s_mb_prefetch = min(1 << sbi->s_es->s_log_groups_per_flex, ++ BLK_MAX_SEGMENT_SIZE >> (sb->s_blocksize_bits - 9)); + sbi->s_mb_prefetch *= 8; /* 8 prefetch IOs in flight at most */ + } else { + sbi->s_mb_prefetch = 32; +-- +2.27.0 + diff --git a/queue-5.10/f2fs-avoid-race-condition-for-shrinker-count.patch b/queue-5.10/f2fs-avoid-race-condition-for-shrinker-count.patch new file mode 100644 index 00000000000..1f99b36d807 --- /dev/null +++ b/queue-5.10/f2fs-avoid-race-condition-for-shrinker-count.patch @@ -0,0 +1,236 @@ +From b113fd5a4b803810a8474423935612e94c5ef906 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 6 Nov 2020 13:22:05 -0800 +Subject: f2fs: avoid race condition for shrinker count + +From: Jaegeuk Kim + +[ Upstream commit a95ba66ac1457b76fe472c8e092ab1006271f16c ] + +Light reported that the shrinker sometimes sees nat_cnt < dirty_nat_cnt, resulting in +wrong do_shrinker work. Let's avoid returning an insanely overflowed value by adding +a single tracking value.
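The failure mode is easy to reproduce in miniature; the following self-contained sketch (simplified stand-ins, not the real f2fs structures) shows how deriving the reclaimable count from two separately updated counters can transiently underflow, while a single dedicated counter has nothing to skew:

#include <stdio.h>

/* simplified stand-ins for NM_I(sbi)->nat_cnt and dirty_nat_cnt */
static unsigned int total_nat = 10;
static unsigned int dirty_nat = 10;

/* old scheme: a racing CPU may bump dirty_nat before the matching
 * update to total_nat is visible, so the lockless subtraction can
 * observe dirty_nat > total_nat and wrap to a huge value */
static unsigned int count_reclaimable_racy(void)
{
    dirty_nat++;                    /* simulate the racing update */
    return total_nat - dirty_nat;   /* wraps to UINT_MAX */
}

/* new scheme: one counter tracks exactly the reclaimable entries */
static unsigned int reclaimable_nat;

int main(void)
{
    printf("racy count:    %u\n", count_reclaimable_racy());
    printf("tracked count: %u\n", reclaimable_nat);
    return 0;
}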
+ +Reported-by: Light Hsieh +Reviewed-by: Chao Yu +Signed-off-by: Jaegeuk Kim +Signed-off-by: Sasha Levin +--- + fs/f2fs/checkpoint.c | 2 +- + fs/f2fs/debug.c | 11 ++++++----- + fs/f2fs/f2fs.h | 10 ++++++++-- + fs/f2fs/node.c | 29 ++++++++++++++++++----------- + fs/f2fs/node.h | 4 ++-- + fs/f2fs/shrinker.c | 4 +--- + 6 files changed, 36 insertions(+), 24 deletions(-) + +diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c +index 023462e80e58d..b39bf416d5114 100644 +--- a/fs/f2fs/checkpoint.c ++++ b/fs/f2fs/checkpoint.c +@@ -1600,7 +1600,7 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) + goto out; + } + +- if (NM_I(sbi)->dirty_nat_cnt == 0 && ++ if (NM_I(sbi)->nat_cnt[DIRTY_NAT] == 0 && + SIT_I(sbi)->dirty_sentries == 0 && + prefree_segments(sbi) == 0) { + f2fs_flush_sit_entries(sbi, cpc); +diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c +index a8357fd4f5fab..197c914119da8 100644 +--- a/fs/f2fs/debug.c ++++ b/fs/f2fs/debug.c +@@ -145,8 +145,8 @@ static void update_general_status(struct f2fs_sb_info *sbi) + si->node_pages = NODE_MAPPING(sbi)->nrpages; + if (sbi->meta_inode) + si->meta_pages = META_MAPPING(sbi)->nrpages; +- si->nats = NM_I(sbi)->nat_cnt; +- si->dirty_nats = NM_I(sbi)->dirty_nat_cnt; ++ si->nats = NM_I(sbi)->nat_cnt[TOTAL_NAT]; ++ si->dirty_nats = NM_I(sbi)->nat_cnt[DIRTY_NAT]; + si->sits = MAIN_SEGS(sbi); + si->dirty_sits = SIT_I(sbi)->dirty_sentries; + si->free_nids = NM_I(sbi)->nid_cnt[FREE_NID]; +@@ -278,9 +278,10 @@ static void update_mem_info(struct f2fs_sb_info *sbi) + si->cache_mem += (NM_I(sbi)->nid_cnt[FREE_NID] + + NM_I(sbi)->nid_cnt[PREALLOC_NID]) * + sizeof(struct free_nid); +- si->cache_mem += NM_I(sbi)->nat_cnt * sizeof(struct nat_entry); +- si->cache_mem += NM_I(sbi)->dirty_nat_cnt * +- sizeof(struct nat_entry_set); ++ si->cache_mem += NM_I(sbi)->nat_cnt[TOTAL_NAT] * ++ sizeof(struct nat_entry); ++ si->cache_mem += NM_I(sbi)->nat_cnt[DIRTY_NAT] * ++ sizeof(struct nat_entry_set); + si->cache_mem += si->inmem_pages * sizeof(struct inmem_pages); + for (i = 0; i < MAX_INO_ENTRY; i++) + si->cache_mem += sbi->im[i].ino_num * sizeof(struct ino_entry); +diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h +index 9a321c52facec..e4344d98a780c 100644 +--- a/fs/f2fs/f2fs.h ++++ b/fs/f2fs/f2fs.h +@@ -894,6 +894,13 @@ enum nid_state { + MAX_NID_STATE, + }; + ++enum nat_state { ++ TOTAL_NAT, ++ DIRTY_NAT, ++ RECLAIMABLE_NAT, ++ MAX_NAT_STATE, ++}; ++ + struct f2fs_nm_info { + block_t nat_blkaddr; /* base disk address of NAT */ + nid_t max_nid; /* maximum possible node ids */ +@@ -909,8 +916,7 @@ struct f2fs_nm_info { + struct rw_semaphore nat_tree_lock; /* protect nat_tree_lock */ + struct list_head nat_entries; /* cached nat entry list (clean) */ + spinlock_t nat_list_lock; /* protect clean nat entry list */ +- unsigned int nat_cnt; /* the # of cached nat entries */ +- unsigned int dirty_nat_cnt; /* total num of nat entries in set */ ++ unsigned int nat_cnt[MAX_NAT_STATE]; /* the # of cached nat entries */ + unsigned int nat_blocks; /* # of nat blocks */ + + /* free node ids management */ +diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c +index 42394de6c7eb1..e65d73293a3f6 100644 +--- a/fs/f2fs/node.c ++++ b/fs/f2fs/node.c +@@ -62,8 +62,8 @@ bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type) + sizeof(struct free_nid)) >> PAGE_SHIFT; + res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2); + } else if (type == NAT_ENTRIES) { +- mem_size = (nm_i->nat_cnt * sizeof(struct nat_entry)) >> +- PAGE_SHIFT; ++ mem_size = 
(nm_i->nat_cnt[TOTAL_NAT] * ++ sizeof(struct nat_entry)) >> PAGE_SHIFT; + res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2); + if (excess_cached_nats(sbi)) + res = false; +@@ -177,7 +177,8 @@ static struct nat_entry *__init_nat_entry(struct f2fs_nm_info *nm_i, + list_add_tail(&ne->list, &nm_i->nat_entries); + spin_unlock(&nm_i->nat_list_lock); + +- nm_i->nat_cnt++; ++ nm_i->nat_cnt[TOTAL_NAT]++; ++ nm_i->nat_cnt[RECLAIMABLE_NAT]++; + return ne; + } + +@@ -207,7 +208,8 @@ static unsigned int __gang_lookup_nat_cache(struct f2fs_nm_info *nm_i, + static void __del_from_nat_cache(struct f2fs_nm_info *nm_i, struct nat_entry *e) + { + radix_tree_delete(&nm_i->nat_root, nat_get_nid(e)); +- nm_i->nat_cnt--; ++ nm_i->nat_cnt[TOTAL_NAT]--; ++ nm_i->nat_cnt[RECLAIMABLE_NAT]--; + __free_nat_entry(e); + } + +@@ -253,7 +255,8 @@ static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i, + if (get_nat_flag(ne, IS_DIRTY)) + goto refresh_list; + +- nm_i->dirty_nat_cnt++; ++ nm_i->nat_cnt[DIRTY_NAT]++; ++ nm_i->nat_cnt[RECLAIMABLE_NAT]--; + set_nat_flag(ne, IS_DIRTY, true); + refresh_list: + spin_lock(&nm_i->nat_list_lock); +@@ -273,7 +276,8 @@ static void __clear_nat_cache_dirty(struct f2fs_nm_info *nm_i, + + set_nat_flag(ne, IS_DIRTY, false); + set->entry_cnt--; +- nm_i->dirty_nat_cnt--; ++ nm_i->nat_cnt[DIRTY_NAT]--; ++ nm_i->nat_cnt[RECLAIMABLE_NAT]++; + } + + static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i, +@@ -2944,14 +2948,17 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) + LIST_HEAD(sets); + int err = 0; + +- /* during unmount, let's flush nat_bits before checking dirty_nat_cnt */ ++ /* ++ * during unmount, let's flush nat_bits before checking ++ * nat_cnt[DIRTY_NAT]. ++ */ + if (enabled_nat_bits(sbi, cpc)) { + down_write(&nm_i->nat_tree_lock); + remove_nats_in_journal(sbi); + up_write(&nm_i->nat_tree_lock); + } + +- if (!nm_i->dirty_nat_cnt) ++ if (!nm_i->nat_cnt[DIRTY_NAT]) + return 0; + + down_write(&nm_i->nat_tree_lock); +@@ -2962,7 +2969,8 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) + * into nat entry set. 
+ */ + if (enabled_nat_bits(sbi, cpc) || +- !__has_cursum_space(journal, nm_i->dirty_nat_cnt, NAT_JOURNAL)) ++ !__has_cursum_space(journal, ++ nm_i->nat_cnt[DIRTY_NAT], NAT_JOURNAL)) + remove_nats_in_journal(sbi); + + while ((found = __gang_lookup_nat_set(nm_i, +@@ -3086,7 +3094,6 @@ static int init_node_manager(struct f2fs_sb_info *sbi) + F2FS_RESERVED_NODE_NUM; + nm_i->nid_cnt[FREE_NID] = 0; + nm_i->nid_cnt[PREALLOC_NID] = 0; +- nm_i->nat_cnt = 0; + nm_i->ram_thresh = DEF_RAM_THRESHOLD; + nm_i->ra_nid_pages = DEF_RA_NID_PAGES; + nm_i->dirty_nats_ratio = DEF_DIRTY_NAT_RATIO_THRESHOLD; +@@ -3220,7 +3227,7 @@ void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi) + __del_from_nat_cache(nm_i, natvec[idx]); + } + } +- f2fs_bug_on(sbi, nm_i->nat_cnt); ++ f2fs_bug_on(sbi, nm_i->nat_cnt[TOTAL_NAT]); + + /* destroy nat set cache */ + nid = 0; +diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h +index 69e5859e993cf..f84541b57acbb 100644 +--- a/fs/f2fs/node.h ++++ b/fs/f2fs/node.h +@@ -126,13 +126,13 @@ static inline void raw_nat_from_node_info(struct f2fs_nat_entry *raw_ne, + + static inline bool excess_dirty_nats(struct f2fs_sb_info *sbi) + { +- return NM_I(sbi)->dirty_nat_cnt >= NM_I(sbi)->max_nid * ++ return NM_I(sbi)->nat_cnt[DIRTY_NAT] >= NM_I(sbi)->max_nid * + NM_I(sbi)->dirty_nats_ratio / 100; + } + + static inline bool excess_cached_nats(struct f2fs_sb_info *sbi) + { +- return NM_I(sbi)->nat_cnt >= DEF_NAT_CACHE_THRESHOLD; ++ return NM_I(sbi)->nat_cnt[TOTAL_NAT] >= DEF_NAT_CACHE_THRESHOLD; + } + + static inline bool excess_dirty_nodes(struct f2fs_sb_info *sbi) +diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c +index d66de5999a26d..dd3c3c7a90ec8 100644 +--- a/fs/f2fs/shrinker.c ++++ b/fs/f2fs/shrinker.c +@@ -18,9 +18,7 @@ static unsigned int shrinker_run_no; + + static unsigned long __count_nat_entries(struct f2fs_sb_info *sbi) + { +- long count = NM_I(sbi)->nat_cnt - NM_I(sbi)->dirty_nat_cnt; +- +- return count > 0 ? count : 0; ++ return NM_I(sbi)->nat_cnt[RECLAIMABLE_NAT]; + } + + static unsigned long __count_free_nids(struct f2fs_sb_info *sbi) +-- +2.27.0 + diff --git a/queue-5.10/f2fs-fix-race-of-pending_pages-in-decompression.patch b/queue-5.10/f2fs-fix-race-of-pending_pages-in-decompression.patch new file mode 100644 index 00000000000..682cdf589c7 --- /dev/null +++ b/queue-5.10/f2fs-fix-race-of-pending_pages-in-decompression.patch @@ -0,0 +1,240 @@ +From cdbb5d70b70a016bc4b9045dfc7adb08e8718ff6 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sat, 5 Dec 2020 13:26:26 +0900 +Subject: f2fs: fix race of pending_pages in decompression + +From: Daeho Jeong + +[ Upstream commit 6422a71ef40e4751d59b8c9412e7e2dafe085878 ] + +I found that f2fs_free_dic() was invoked at the wrong time, while +f2fs_verify_bio() still needed the dic info, and it triggered the +kernel panic below. It was caused by a race on the +pending_pages value between the decompression and verity logic, when +the same compression cluster had been split across different bios. +With split bios, f2fs_verify_bio() ended up decrementing the +pending_pages value before it was reset to nr_cpages by +f2fs_decompress_pages(), causing the kernel panic. + +[ 4416.564763] Unable to handle kernel NULL pointer dereference + at virtual address 0000000000000000 +...
+[ 4416.896016] Workqueue: fsverity_read_queue f2fs_verity_work +[ 4416.908515] pc : fsverity_verify_page+0x20/0x78 +[ 4416.913721] lr : f2fs_verify_bio+0x11c/0x29c +[ 4416.913722] sp : ffffffc019533cd0 +[ 4416.913723] x29: ffffffc019533cd0 x28: 0000000000000402 +[ 4416.913724] x27: 0000000000000001 x26: 0000000000000100 +[ 4416.913726] x25: 0000000000000001 x24: 0000000000000004 +[ 4416.913727] x23: 0000000000001000 x22: 0000000000000000 +[ 4416.913728] x21: 0000000000000000 x20: ffffffff2076f9c0 +[ 4416.913729] x19: ffffffff2076f9c0 x18: ffffff8a32380c30 +[ 4416.913731] x17: ffffffc01f966d97 x16: 0000000000000298 +[ 4416.913732] x15: 0000000000000000 x14: 0000000000000000 +[ 4416.913733] x13: f074faec89ffffff x12: 0000000000000000 +[ 4416.913734] x11: 0000000000001000 x10: 0000000000001000 +[ 4416.929176] x9 : ffffffff20d1f5c7 x8 : 0000000000000000 +[ 4416.929178] x7 : 626d7464ff286b6b x6 : ffffffc019533ade +[ 4416.929179] x5 : 000000008049000e x4 : ffffffff2793e9e0 +[ 4416.929180] x3 : 000000008049000e x2 : ffffff89ecfa74d0 +[ 4416.929181] x1 : 0000000000000c40 x0 : ffffffff2076f9c0 +[ 4416.929184] Call trace: +[ 4416.929187] fsverity_verify_page+0x20/0x78 +[ 4416.929189] f2fs_verify_bio+0x11c/0x29c +[ 4416.929192] f2fs_verity_work+0x58/0x84 +[ 4417.050667] process_one_work+0x270/0x47c +[ 4417.055354] worker_thread+0x27c/0x4d8 +[ 4417.059784] kthread+0x13c/0x320 +[ 4417.063693] ret_from_fork+0x10/0x18 + +Chao pointed this can happen by the below race condition. + +Thread A f2fs_post_read_wq fsverity_wq +- f2fs_read_multi_pages() + - f2fs_alloc_dic + - dic->pending_pages = 2 + - submit_bio() + - submit_bio() + - f2fs_post_read_work() handle first bio + - f2fs_decompress_work() + - __read_end_io() + - f2fs_decompress_pages() + - dic->pending_pages-- + - enqueue f2fs_verity_work() + - f2fs_verity_work() handle first bio + - f2fs_verify_bio() + - dic->pending_pages-- + - f2fs_post_read_work() handle second bio + - f2fs_decompress_work() + - enqueue f2fs_verity_work() + - f2fs_verify_pages() + - f2fs_free_dic() + + - f2fs_verity_work() handle second bio + - f2fs_verfy_bio() + - use-after-free on dic + +Signed-off-by: Daeho Jeong +Signed-off-by: Jaegeuk Kim +Signed-off-by: Sasha Levin +--- + fs/f2fs/compress.c | 2 -- + fs/f2fs/data.c | 58 +++++++++++++++++++++++++++++++++++++--------- + fs/f2fs/f2fs.h | 1 + + 3 files changed, 48 insertions(+), 13 deletions(-) + +diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c +index 14262e0f1cd60..c5fee4d7ea72f 100644 +--- a/fs/f2fs/compress.c ++++ b/fs/f2fs/compress.c +@@ -798,8 +798,6 @@ void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity) + if (cops->destroy_decompress_ctx) + cops->destroy_decompress_ctx(dic); + out_free_dic: +- if (verity) +- atomic_set(&dic->pending_pages, dic->nr_cpages); + if (!verity) + f2fs_decompress_end_io(dic->rpages, dic->cluster_size, + ret, false); +diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c +index be4da52604edc..b29243ee1c3e5 100644 +--- a/fs/f2fs/data.c ++++ b/fs/f2fs/data.c +@@ -202,7 +202,7 @@ static void f2fs_verify_bio(struct bio *bio) + dic = (struct decompress_io_ctx *)page_private(page); + + if (dic) { +- if (atomic_dec_return(&dic->pending_pages)) ++ if (atomic_dec_return(&dic->verity_pages)) + continue; + f2fs_verify_pages(dic->rpages, + dic->cluster_size); +@@ -1027,7 +1027,8 @@ static inline bool f2fs_need_verity(const struct inode *inode, pgoff_t idx) + + static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, + unsigned nr_pages, unsigned op_flag, +- pgoff_t 
first_idx, bool for_write) ++ pgoff_t first_idx, bool for_write, ++ bool for_verity) + { + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct bio *bio; +@@ -1049,7 +1050,7 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, + post_read_steps |= 1 << STEP_DECRYPT; + if (f2fs_compressed_file(inode)) + post_read_steps |= 1 << STEP_DECOMPRESS_NOWQ; +- if (f2fs_need_verity(inode, first_idx)) ++ if (for_verity && f2fs_need_verity(inode, first_idx)) + post_read_steps |= 1 << STEP_VERITY; + + if (post_read_steps) { +@@ -1079,7 +1080,7 @@ static int f2fs_submit_page_read(struct inode *inode, struct page *page, + struct bio *bio; + + bio = f2fs_grab_read_bio(inode, blkaddr, 1, op_flags, +- page->index, for_write); ++ page->index, for_write, true); + if (IS_ERR(bio)) + return PTR_ERR(bio); + +@@ -2133,7 +2134,7 @@ static int f2fs_read_single_page(struct inode *inode, struct page *page, + if (bio == NULL) { + bio = f2fs_grab_read_bio(inode, block_nr, nr_pages, + is_readahead ? REQ_RAHEAD : 0, page->index, +- false); ++ false, true); + if (IS_ERR(bio)) { + ret = PTR_ERR(bio); + bio = NULL; +@@ -2180,6 +2181,8 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, + const unsigned blkbits = inode->i_blkbits; + const unsigned blocksize = 1 << blkbits; + struct decompress_io_ctx *dic = NULL; ++ struct bio_post_read_ctx *ctx; ++ bool for_verity = false; + int i; + int ret = 0; + +@@ -2245,10 +2248,29 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, + goto out_put_dnode; + } + ++ /* ++ * It's possible to enable fsverity on the fly when handling a cluster, ++ * which requires complicated error handling. Instead of adding more ++ * complexity, let's give a rule where end_io post-processes fsverity ++ * per cluster. In order to do that, we need to submit bio, if previous ++ * bio sets a different post-process policy. ++ */ ++ if (fsverity_active(cc->inode)) { ++ atomic_set(&dic->verity_pages, cc->nr_cpages); ++ for_verity = true; ++ ++ if (bio) { ++ ctx = bio->bi_private; ++ if (!(ctx->enabled_steps & (1 << STEP_VERITY))) { ++ __submit_bio(sbi, bio, DATA); ++ bio = NULL; ++ } ++ } ++ } ++ + for (i = 0; i < dic->nr_cpages; i++) { + struct page *page = dic->cpages[i]; + block_t blkaddr; +- struct bio_post_read_ctx *ctx; + + blkaddr = data_blkaddr(dn.inode, dn.node_page, + dn.ofs_in_node + i + 1); +@@ -2264,17 +2286,31 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, + if (!bio) { + bio = f2fs_grab_read_bio(inode, blkaddr, nr_pages, + is_readahead ? 
REQ_RAHEAD : 0, +- page->index, for_write); ++ page->index, for_write, for_verity); + if (IS_ERR(bio)) { ++ unsigned int remained = dic->nr_cpages - i; ++ bool release = false; ++ + ret = PTR_ERR(bio); + dic->failed = true; +- if (!atomic_sub_return(dic->nr_cpages - i, +- &dic->pending_pages)) { ++ ++ if (for_verity) { ++ if (!atomic_sub_return(remained, ++ &dic->verity_pages)) ++ release = true; ++ } else { ++ if (!atomic_sub_return(remained, ++ &dic->pending_pages)) ++ release = true; ++ } ++ ++ if (release) { + f2fs_decompress_end_io(dic->rpages, +- cc->cluster_size, true, +- false); ++ cc->cluster_size, true, ++ false); + f2fs_free_dic(dic); + } ++ + f2fs_put_dnode(&dn); + *bio_ret = NULL; + return ret; +diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h +index e4344d98a780c..06e5a6053f3f9 100644 +--- a/fs/f2fs/f2fs.h ++++ b/fs/f2fs/f2fs.h +@@ -1410,6 +1410,7 @@ struct decompress_io_ctx { + size_t rlen; /* valid data length in rbuf */ + size_t clen; /* valid data length in cbuf */ + atomic_t pending_pages; /* in-flight compressed page count */ ++ atomic_t verity_pages; /* in-flight page count for verity */ + bool failed; /* indicate IO error during decompression */ + void *private; /* payload buffer for specified decompression algorithm */ + void *private2; /* extra payload buffer */ +-- +2.27.0 + diff --git a/queue-5.10/fs-namespace.c-warn-if-mnt_count-has-become-negative.patch b/queue-5.10/fs-namespace.c-warn-if-mnt_count-has-become-negative.patch new file mode 100644 index 00000000000..c2c9e187331 --- /dev/null +++ b/queue-5.10/fs-namespace.c-warn-if-mnt_count-has-become-negative.patch @@ -0,0 +1,87 @@ +From 74e61d8287006d6d2ce2df987bafe3b803da24fb Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sat, 31 Oct 2020 21:40:21 -0700 +Subject: fs/namespace.c: WARN if mnt_count has become negative + +From: Eric Biggers + +[ Upstream commit edf7ddbf1c5eb98b720b063b73e20e8a4a1ce673 ] + +Missing calls to mntget() (or equivalently, too many calls to mntput()) +are hard to detect because mntput() delays freeing mounts using +task_work_add(), then again using call_rcu(). As a result, mnt_count +can often be decremented to -1 without getting a KASAN use-after-free +report. Such cases are still bugs though, and they point to real +use-after-frees being possible. + +For an example of this, see the bug fixed by commit 1b0b9cc8d379 +("vfs: fsmount: add missing mntget()"), discussed at +https://lkml.kernel.org/linux-fsdevel/20190605135401.GB30925@xxxxxxxxxxxxxxxxxxxxxxxxx/T/#u. +This bug *should* have been trivial to find. But actually, it wasn't +found until syzkaller happened to use fchdir() to manipulate the +reference count just right for the bug to be noticeable. + +Address this by making mntput_no_expire() issue a WARN if mnt_count has +become negative. 
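The companion type change below (mnt_get_count() returning int instead of unsigned int) is what makes the WARN possible at all; a minimal sketch with hypothetical per-CPU deltas shows why an unsigned sum silently hides the underflow:

#include <stdio.h>

int main(void)
{
    /* per-CPU counters hold deltas; a missing mntget() skews them */
    int per_cpu_count[2] = { 3, -4 };
    unsigned int usum = 0;
    int ssum = 0;

    for (int i = 0; i < 2; i++) {
        usum += per_cpu_count[i];
        ssum += per_cpu_count[i];
    }

    printf("unsigned sum: %u (underflow hidden)\n", usum);
    if (ssum < 0)
        printf("signed sum: %d -> WARN: mnt_count went negative\n", ssum);
    return 0;
}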
+ +Suggested-by: Miklos Szeredi +Signed-off-by: Eric Biggers +Signed-off-by: Al Viro +Signed-off-by: Sasha Levin +--- + fs/namespace.c | 9 ++++++--- + fs/pnode.h | 2 +- + 2 files changed, 7 insertions(+), 4 deletions(-) + +diff --git a/fs/namespace.c b/fs/namespace.c +index cebaa3e817940..93006abe7946a 100644 +--- a/fs/namespace.c ++++ b/fs/namespace.c +@@ -156,10 +156,10 @@ static inline void mnt_add_count(struct mount *mnt, int n) + /* + * vfsmount lock must be held for write + */ +-unsigned int mnt_get_count(struct mount *mnt) ++int mnt_get_count(struct mount *mnt) + { + #ifdef CONFIG_SMP +- unsigned int count = 0; ++ int count = 0; + int cpu; + + for_each_possible_cpu(cpu) { +@@ -1139,6 +1139,7 @@ static DECLARE_DELAYED_WORK(delayed_mntput_work, delayed_mntput); + static void mntput_no_expire(struct mount *mnt) + { + LIST_HEAD(list); ++ int count; + + rcu_read_lock(); + if (likely(READ_ONCE(mnt->mnt_ns))) { +@@ -1162,7 +1163,9 @@ static void mntput_no_expire(struct mount *mnt) + */ + smp_mb(); + mnt_add_count(mnt, -1); +- if (mnt_get_count(mnt)) { ++ count = mnt_get_count(mnt); ++ if (count != 0) { ++ WARN_ON(count < 0); + rcu_read_unlock(); + unlock_mount_hash(); + return; +diff --git a/fs/pnode.h b/fs/pnode.h +index 49a058c73e4c7..26f74e092bd98 100644 +--- a/fs/pnode.h ++++ b/fs/pnode.h +@@ -44,7 +44,7 @@ int propagate_mount_busy(struct mount *, int); + void propagate_mount_unlock(struct mount *); + void mnt_release_group_id(struct mount *); + int get_dominating_id(struct mount *mnt, const struct path *root); +-unsigned int mnt_get_count(struct mount *mnt); ++int mnt_get_count(struct mount *mnt); + void mnt_set_mountpoint(struct mount *, struct mountpoint *, + struct mount *); + void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, +-- +2.27.0 + diff --git a/queue-5.10/i3c-master-fix-missing-destroy_workqueue-on-error-in.patch b/queue-5.10/i3c-master-fix-missing-destroy_workqueue-on-error-in.patch new file mode 100644 index 00000000000..cc804fe0943 --- /dev/null +++ b/queue-5.10/i3c-master-fix-missing-destroy_workqueue-on-error-in.patch @@ -0,0 +1,47 @@ +From 0bbe3ee7a0495873767eef9bc1a81a0d426b7ec3 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 28 Oct 2020 17:15:43 +0800 +Subject: i3c master: fix missing destroy_workqueue() on error in + i3c_master_register + +From: Qinglang Miao + +[ Upstream commit 59165d16c699182b86b5c65181013f1fd88feb62 ] + +Add the missing destroy_workqueue() before return from +i3c_master_register in the error handling case. 
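The shape of the fix is the usual reverse-order unwind ladder. Here is a self-contained sketch of the pattern the patch restores (simplified stand-in names, with bus_init() forced to fail so the new label runs):

#include <stdio.h>
#include <stdlib.h>

struct wq { int unused; };
static struct wq *alloc_wq(void)     { return malloc(sizeof(struct wq)); }
static void destroy_wq(struct wq *w) { free(w); printf("wq destroyed\n"); }
static int bus_init(void)            { return -1; /* force the failure path */ }
static void bus_cleanup(void)        { }
static int device_add(void)          { return 0; }

static int master_register(void)
{
    struct wq *wq = alloc_wq();
    int ret;

    if (!wq)
        return -1;

    ret = bus_init();
    if (ret)
        goto err_destroy_wq;  /* previously returned directly, leaking wq */

    ret = device_add();
    if (ret)
        goto err_cleanup_bus;

    return 0;

err_cleanup_bus:
    bus_cleanup();
err_destroy_wq:
    destroy_wq(wq);
    return ret;
}

int main(void)
{
    printf("register: %d\n", master_register());
    return 0;
}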
+ +Signed-off-by: Qinglang Miao +Signed-off-by: Boris Brezillon +Link: https://lore.kernel.org/linux-i3c/20201028091543.136167-1-miaoqinglang@huawei.com +Signed-off-by: Sasha Levin +--- + drivers/i3c/master.c | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +diff --git a/drivers/i3c/master.c b/drivers/i3c/master.c +index 1c6b78ad5ade4..b61bf53ec07af 100644 +--- a/drivers/i3c/master.c ++++ b/drivers/i3c/master.c +@@ -2537,7 +2537,7 @@ int i3c_master_register(struct i3c_master_controller *master, + + ret = i3c_master_bus_init(master); + if (ret) +- goto err_put_dev; ++ goto err_destroy_wq; + + ret = device_add(&master->dev); + if (ret) +@@ -2568,6 +2568,9 @@ int i3c_master_register(struct i3c_master_controller *master, + err_cleanup_bus: + i3c_master_bus_cleanup(master); + ++err_destroy_wq: ++ destroy_workqueue(master->wq); ++ + err_put_dev: + put_device(&master->dev); + +-- +2.27.0 + diff --git a/queue-5.10/io_uring-remove-racy-overflow-list-fast-checks.patch b/queue-5.10/io_uring-remove-racy-overflow-list-fast-checks.patch new file mode 100644 index 00000000000..ba404e46ca5 --- /dev/null +++ b/queue-5.10/io_uring-remove-racy-overflow-list-fast-checks.patch @@ -0,0 +1,48 @@ +From 4f47f5ce6be4ce5f29501a5511bf56080b0bd25c Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 17 Dec 2020 00:24:36 +0000 +Subject: io_uring: remove racy overflow list fast checks + +From: Pavel Begunkov + +[ Upstream commit 9cd2be519d05ee78876d55e8e902b7125f78b74f ] + +list_empty_careful() is not racy only if some conditions are met, i.e. +no re-adds after del_init. io_cqring_overflow_flush() does list_move(), +so it's actually racy. + +Remove those checks, we have ->cq_check_overflow for the fast path. + +Signed-off-by: Pavel Begunkov +Signed-off-by: Jens Axboe +Signed-off-by: Sasha Levin +--- + fs/io_uring.c | 5 +---- + 1 file changed, 1 insertion(+), 4 deletions(-) + +diff --git a/fs/io_uring.c b/fs/io_uring.c +index e28eedab5365f..1f798c5c4213e 100644 +--- a/fs/io_uring.c ++++ b/fs/io_uring.c +@@ -1636,8 +1636,6 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force, + LIST_HEAD(list); + + if (!force) { +- if (list_empty_careful(&ctx->cq_overflow_list)) +- return true; + if ((ctx->cached_cq_tail - READ_ONCE(rings->cq.head) == + rings->cq_ring_entries)) + return false; +@@ -6579,8 +6577,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) + + /* if we have a backlog and couldn't flush it all, return BUSY */ + if (test_bit(0, &ctx->sq_check_overflow)) { +- if (!list_empty(&ctx->cq_overflow_list) && +- !io_cqring_overflow_flush(ctx, false, NULL, NULL)) ++ if (!io_cqring_overflow_flush(ctx, false, NULL, NULL)) + return -EBUSY; + } + +-- +2.27.0 + diff --git a/queue-5.10/module-delay-kobject-uevent-until-after-module-init-.patch b/queue-5.10/module-delay-kobject-uevent-until-after-module-init-.patch new file mode 100644 index 00000000000..ce0bf05791a --- /dev/null +++ b/queue-5.10/module-delay-kobject-uevent-until-after-module-init-.patch @@ -0,0 +1,72 @@ +From 92413ae233b6e4d6dbc685871a90c8a74fc43573 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 27 Nov 2020 10:09:39 +0100 +Subject: module: delay kobject uevent until after module init call + +From: Jessica Yu + +[ Upstream commit 38dc717e97153e46375ee21797aa54777e5498f3 ] + +Apparently there has been a longstanding race between udev/systemd and +the module loader. Currently, the module loader sends a uevent right +after sysfs initialization, but before the module calls its init +function. 
However, some udev rules expect that the module has +initialized already upon receiving the uevent. + +This race has been triggered recently (see link in references) in some +systemd mount unit files. For instance, the configfs module creates the +/sys/kernel/config mount point in its init function; however, the module +loader issues the uevent before this happens. sys-kernel-config.mount +expects to be able to mount /sys/kernel/config upon receipt of the +module loading uevent, but if the configfs module has not called its +init function yet, then this directory will not exist and the mount unit +fails. A similar situation exists for sys-fs-fuse-connections.mount, as +the fuse sysfs mount point is created during the fuse module's init +function. If udev is faster than module initialization then the mount +unit would fail in a similar fashion. + +To fix this race, delay the module KOBJ_ADD uevent until after the +module has finished calling its init routine. + +References: https://github.com/systemd/systemd/issues/17586 +Reviewed-by: Greg Kroah-Hartman +Tested-By: Nicolas Morey-Chaisemartin +Signed-off-by: Jessica Yu +Signed-off-by: Sasha Levin +--- + kernel/module.c | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +diff --git a/kernel/module.c b/kernel/module.c +index b34235082394b..e20499309b2af 100644 +--- a/kernel/module.c ++++ b/kernel/module.c +@@ -1895,7 +1895,6 @@ static int mod_sysfs_init(struct module *mod) + if (err) + mod_kobject_put(mod); + +- /* delay uevent until full sysfs population */ + out: + return err; + } +@@ -1932,7 +1931,6 @@ static int mod_sysfs_setup(struct module *mod, + add_sect_attrs(mod, info); + add_notes_attrs(mod, info); + +- kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD); + return 0; + + out_unreg_modinfo_attrs: +@@ -3639,6 +3637,9 @@ static noinline int do_init_module(struct module *mod) + blocking_notifier_call_chain(&module_notify_list, + MODULE_STATE_LIVE, mod); + ++ /* Delay uevent until module has finished its init routine */ ++ kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD); ++ + /* + * We need to finish all async code before the module init sequence + * is done. This has potential to deadlock. For example, a newly +-- +2.27.0 + diff --git a/queue-5.10/module-set-module_state_going-state-when-a-module-fa.patch b/queue-5.10/module-set-module_state_going-state-when-a-module-fa.patch new file mode 100644 index 00000000000..437f30b3850 --- /dev/null +++ b/queue-5.10/module-set-module_state_going-state-when-a-module-fa.patch @@ -0,0 +1,36 @@ +From d2bb58a0425a26b741ef1868664c1dd83906a6ae Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 27 Oct 2020 15:03:36 +0100 +Subject: module: set MODULE_STATE_GOING state when a module fails to load + +From: Miroslav Benes + +[ Upstream commit 5e8ed280dab9eeabc1ba0b2db5dbe9fe6debb6b5 ] + +If a module fails to load due to an error in prepare_coming_module(), +the subsequent error handling in load_module() runs with the +module's state still set to MODULE_STATE_COMING. Fix it by correctly setting +MODULE_STATE_GOING under the "bug_cleanup" label.
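A minimal sketch of the invariant being restored (a hypothetical miniature of the state machine; in the real flow the COMING transition and the failure both happen inside load_module()):

#include <stdio.h>

enum module_state {
    MODULE_STATE_LIVE,
    MODULE_STATE_COMING,
    MODULE_STATE_GOING,
    MODULE_STATE_UNFORMED,
};

struct module { enum module_state state; };

static int prepare_coming_module(struct module *mod)
{
    mod->state = MODULE_STATE_COMING;
    return -1;  /* simulate a coming-notifier failure */
}

int main(void)
{
    struct module mod = { .state = MODULE_STATE_UNFORMED };

    if (prepare_coming_module(&mod)) {
        /* the fix: mirror the COMING transition before unwinding */
        mod.state = MODULE_STATE_GOING;
        printf("unwinding with state %d (GOING)\n", mod.state);
    }
    return 0;
}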
+ +Signed-off-by: Miroslav Benes +Signed-off-by: Jessica Yu +Signed-off-by: Sasha Levin +--- + kernel/module.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/kernel/module.c b/kernel/module.c +index a4fa44a652a75..b34235082394b 100644 +--- a/kernel/module.c ++++ b/kernel/module.c +@@ -3991,6 +3991,7 @@ static int load_module(struct load_info *info, const char __user *uargs, + MODULE_STATE_GOING, mod); + klp_module_going(mod); + bug_cleanup: ++ mod->state = MODULE_STATE_GOING; + /* module_bug_cleanup needs module_mutex protection */ + mutex_lock(&module_mutex); + module_bug_cleanup(mod); +-- +2.27.0 + diff --git a/queue-5.10/nfsv4-fix-a-pnfs-layout-related-use-after-free-race-.patch b/queue-5.10/nfsv4-fix-a-pnfs-layout-related-use-after-free-race-.patch new file mode 100644 index 00000000000..b6547caf7de --- /dev/null +++ b/queue-5.10/nfsv4-fix-a-pnfs-layout-related-use-after-free-race-.patch @@ -0,0 +1,131 @@ +From 8b2e2c9ed5303e21f128fc068ff6d3eb0577996f Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 25 Nov 2020 12:06:14 -0500 +Subject: NFSv4: Fix a pNFS layout related use-after-free race when freeing the + inode + +From: Trond Myklebust + +[ Upstream commit b6d49ecd1081740b6e632366428b960461f8158b ] + +When returning the layout in nfs4_evict_inode(), we need to ensure that +the layout is actually done being freed before we can proceed to free the +inode itself. + +Signed-off-by: Trond Myklebust +Signed-off-by: Sasha Levin +--- + fs/nfs/nfs4super.c | 2 +- + fs/nfs/pnfs.c | 33 +++++++++++++++++++++++++++++++-- + fs/nfs/pnfs.h | 5 +++++ + 3 files changed, 37 insertions(+), 3 deletions(-) + +diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c +index 93f5c1678ec29..984cc42ee54d8 100644 +--- a/fs/nfs/nfs4super.c ++++ b/fs/nfs/nfs4super.c +@@ -67,7 +67,7 @@ static void nfs4_evict_inode(struct inode *inode) + nfs_inode_evict_delegation(inode); + /* Note that above delegreturn would trigger pnfs return-on-close */ + pnfs_return_layout(inode); +- pnfs_destroy_layout(NFS_I(inode)); ++ pnfs_destroy_layout_final(NFS_I(inode)); + /* First call standard NFS clear_inode() code */ + nfs_clear_inode(inode); + nfs4_xattr_cache_zap(inode); +diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c +index 0e50b9d45c320..07f59dc8cb2e7 100644 +--- a/fs/nfs/pnfs.c ++++ b/fs/nfs/pnfs.c +@@ -294,6 +294,7 @@ void + pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo) + { + struct inode *inode; ++ unsigned long i_state; + + if (!lo) + return; +@@ -304,8 +305,12 @@ pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo) + if (!list_empty(&lo->plh_segs)) + WARN_ONCE(1, "NFS: BUG unfreed layout segments.\n"); + pnfs_detach_layout_hdr(lo); ++ i_state = inode->i_state; + spin_unlock(&inode->i_lock); + pnfs_free_layout_hdr(lo); ++ /* Notify pnfs_destroy_layout_final() that we're done */ ++ if (i_state & (I_FREEING | I_CLEAR)) ++ wake_up_var(lo); + } + } + +@@ -734,8 +739,7 @@ pnfs_free_lseg_list(struct list_head *free_me) + } + } + +-void +-pnfs_destroy_layout(struct nfs_inode *nfsi) ++static struct pnfs_layout_hdr *__pnfs_destroy_layout(struct nfs_inode *nfsi) + { + struct pnfs_layout_hdr *lo; + LIST_HEAD(tmp_list); +@@ -753,9 +757,34 @@ pnfs_destroy_layout(struct nfs_inode *nfsi) + pnfs_put_layout_hdr(lo); + } else + spin_unlock(&nfsi->vfs_inode.i_lock); ++ return lo; ++} ++ ++void pnfs_destroy_layout(struct nfs_inode *nfsi) ++{ ++ __pnfs_destroy_layout(nfsi); + } + EXPORT_SYMBOL_GPL(pnfs_destroy_layout); + ++static bool pnfs_layout_removed(struct nfs_inode *nfsi, ++ struct pnfs_layout_hdr *lo) ++{ ++ bool ret; ++ ++ 
spin_lock(&nfsi->vfs_inode.i_lock); ++ ret = nfsi->layout != lo; ++ spin_unlock(&nfsi->vfs_inode.i_lock); ++ return ret; ++} ++ ++void pnfs_destroy_layout_final(struct nfs_inode *nfsi) ++{ ++ struct pnfs_layout_hdr *lo = __pnfs_destroy_layout(nfsi); ++ ++ if (lo) ++ wait_var_event(lo, pnfs_layout_removed(nfsi, lo)); ++} ++ + static bool + pnfs_layout_add_bulk_destroy_list(struct inode *inode, + struct list_head *layout_list) +diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h +index 2661c44c62db4..78c3893918486 100644 +--- a/fs/nfs/pnfs.h ++++ b/fs/nfs/pnfs.h +@@ -266,6 +266,7 @@ struct pnfs_layout_segment *pnfs_layout_process(struct nfs4_layoutget *lgp); + void pnfs_layoutget_free(struct nfs4_layoutget *lgp); + void pnfs_free_lseg_list(struct list_head *tmp_list); + void pnfs_destroy_layout(struct nfs_inode *); ++void pnfs_destroy_layout_final(struct nfs_inode *); + void pnfs_destroy_all_layouts(struct nfs_client *); + int pnfs_destroy_layouts_byfsid(struct nfs_client *clp, + struct nfs_fsid *fsid, +@@ -710,6 +711,10 @@ static inline void pnfs_destroy_layout(struct nfs_inode *nfsi) + { + } + ++static inline void pnfs_destroy_layout_final(struct nfs_inode *nfsi) ++{ ++} ++ + static inline struct pnfs_layout_segment * + pnfs_get_lseg(struct pnfs_layout_segment *lseg) + { +-- +2.27.0 + diff --git a/queue-5.10/nfsv4.2-don-t-error-when-exiting-early-on-a-read_plu.patch b/queue-5.10/nfsv4.2-don-t-error-when-exiting-early-on-a-read_plu.patch new file mode 100644 index 00000000000..077d921639b --- /dev/null +++ b/queue-5.10/nfsv4.2-don-t-error-when-exiting-early-on-a-read_plu.patch @@ -0,0 +1,118 @@ +From 3b77fb9e8c4f6a6b5e1ae071b96467ef3619aa34 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 8 Dec 2020 07:51:29 -0500 +Subject: NFSv4.2: Don't error when exiting early on a READ_PLUS buffer + overflow + +From: Trond Myklebust + +[ Upstream commit 503b934a752f7e789a5f33217520e0a79f3096ac ] + +Expanding the READ_PLUS extents can cause the read buffer to overflow. +If it does, then don't error, but just exit early. 
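The contract the patch adopts can be stated in a few lines; here is a self-contained sketch of the early-exit rule (hypothetical segment sizes, not the real XDR code) -- report a short, eof-cleared read on buffer exhaustion, and fail only if not even one segment could be decoded:

#include <errno.h>
#include <stdio.h>

struct res { unsigned int count; int eof; };

static int decode_segments(const unsigned int *seg_bytes, unsigned int nsegs,
                           struct res *res)
{
    unsigned int i;

    for (i = 0; i < nsegs; i++) {
        if (seg_bytes[i] == 0)      /* reply buffer ran out here */
            goto early_out;
        res->count += seg_bytes[i];
    }
    res->eof = 1;
    return 0;

early_out:
    if (i == 0)
        return -EIO;    /* nothing decoded at all: real error */
    res->eof = 0;       /* partial data: the client will read again */
    return 0;
}

int main(void)
{
    unsigned int segs[3] = { 4096, 4096, 0 };
    struct res res = { 0, 0 };
    int err = decode_segments(segs, 3, &res);

    printf("err=%d count=%u eof=%d\n", err, res.count, res.eof);
    return 0;
}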
+ +Signed-off-by: Trond Myklebust +Signed-off-by: Sasha Levin +--- + fs/nfs/nfs42xdr.c | 36 +++++++++++++++++------------------- + 1 file changed, 17 insertions(+), 19 deletions(-) + +diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c +index 8432bd6b95f08..c078f88552695 100644 +--- a/fs/nfs/nfs42xdr.c ++++ b/fs/nfs/nfs42xdr.c +@@ -1019,29 +1019,24 @@ static int decode_deallocate(struct xdr_stream *xdr, struct nfs42_falloc_res *re + return decode_op_hdr(xdr, OP_DEALLOCATE); + } + +-static int decode_read_plus_data(struct xdr_stream *xdr, struct nfs_pgio_res *res, +- uint32_t *eof) ++static int decode_read_plus_data(struct xdr_stream *xdr, ++ struct nfs_pgio_res *res) + { + uint32_t count, recvd; + uint64_t offset; + __be32 *p; + + p = xdr_inline_decode(xdr, 8 + 4); +- if (unlikely(!p)) +- return -EIO; ++ if (!p) ++ return 1; + + p = xdr_decode_hyper(p, &offset); + count = be32_to_cpup(p); + recvd = xdr_align_data(xdr, res->count, count); + res->count += recvd; + +- if (count > recvd) { +- dprintk("NFS: server cheating in read reply: " +- "count %u > recvd %u\n", count, recvd); +- *eof = 0; ++ if (count > recvd) + return 1; +- } +- + return 0; + } + +@@ -1052,18 +1047,16 @@ static int decode_read_plus_hole(struct xdr_stream *xdr, struct nfs_pgio_res *re + __be32 *p; + + p = xdr_inline_decode(xdr, 8 + 8); +- if (unlikely(!p)) +- return -EIO; ++ if (!p) ++ return 1; + + p = xdr_decode_hyper(p, &offset); + p = xdr_decode_hyper(p, &length); + recvd = xdr_expand_hole(xdr, res->count, length); + res->count += recvd; + +- if (recvd < length) { +- *eof = 0; ++ if (recvd < length) + return 1; +- } + return 0; + } + +@@ -1088,12 +1081,12 @@ static int decode_read_plus(struct xdr_stream *xdr, struct nfs_pgio_res *res) + + for (i = 0; i < segments; i++) { + p = xdr_inline_decode(xdr, 4); +- if (unlikely(!p)) +- return -EIO; ++ if (!p) ++ goto early_out; + + type = be32_to_cpup(p++); + if (type == NFS4_CONTENT_DATA) +- status = decode_read_plus_data(xdr, res, &eof); ++ status = decode_read_plus_data(xdr, res); + else if (type == NFS4_CONTENT_HOLE) + status = decode_read_plus_hole(xdr, res, &eof); + else +@@ -1102,12 +1095,17 @@ static int decode_read_plus(struct xdr_stream *xdr, struct nfs_pgio_res *res) + if (status < 0) + return status; + if (status > 0) +- break; ++ goto early_out; + } + + out: + res->eof = eof; + return 0; ++early_out: ++ if (unlikely(!i)) ++ return -EIO; ++ res->eof = 0; ++ return 0; + } + + static int decode_seek(struct xdr_stream *xdr, struct nfs42_seek_res *res) +-- +2.27.0 + diff --git a/queue-5.10/powerpc-64-irq-replay-remove-decrementer-overflow-ch.patch b/queue-5.10/powerpc-64-irq-replay-remove-decrementer-overflow-ch.patch new file mode 100644 index 00000000000..ab65a135d64 --- /dev/null +++ b/queue-5.10/powerpc-64-irq-replay-remove-decrementer-overflow-ch.patch @@ -0,0 +1,171 @@ +From a2d81a7069f97534eff5249a8448a288358086c3 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sat, 7 Nov 2020 11:43:36 +1000 +Subject: powerpc/64: irq replay remove decrementer overflow check + +From: Nicholas Piggin + +[ Upstream commit 59d512e4374b2d8a6ad341475dc94c4a4bdec7d3 ] + +This is a way to catch some cases of decrementer overflow, when the +decrementer has underflowed an odd number of times, while MSR[EE] was +disabled. + +With a typical small decrementer, a timer that fires when MSR[EE] is +disabled will be "lost" if MSR[EE] remains disabled for between 4.3 and +8.6 seconds after the timer expires.
In any case, the decrementer +interrupt would be taken at 8.6 seconds and the timer would be found at +that point. + +So this check is for catching extreme latency events, and it prevents +those latencies from being a further few seconds long. It's not obvious +this is a good tradeoff. This is already a watchdog magnitude event and +that situation is not significantly improved by this check. For +large decrementers, it's useless. + +Therefore remove this check, which avoids an mftb when enabling hard +disabled interrupts (e.g., when enabling after coming from hardware +interrupt handlers). Perhaps more importantly, it also removes the +clunky MSR[EE] vs PACA_IRQ_HARD_DIS incoherency in soft-interrupt replay, +which simplifies the code. + +Signed-off-by: Nicholas Piggin +Signed-off-by: Michael Ellerman +Link: https://lore.kernel.org/r/20201107014336.2337337-1-npiggin@gmail.com +Signed-off-by: Sasha Levin +--- + arch/powerpc/kernel/irq.c | 53 ++------------------------- + arch/powerpc/kernel/time.c | 9 ++--- + arch/powerpc/platforms/powernv/opal.c | 2 +- + 3 files changed, 8 insertions(+), 56 deletions(-) + +diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c +index 7d0f7682d01df..6b1eca53e36cc 100644 +--- a/arch/powerpc/kernel/irq.c ++++ b/arch/powerpc/kernel/irq.c +@@ -102,14 +102,6 @@ static inline notrace unsigned long get_irq_happened(void) + return happened; + } + +-static inline notrace int decrementer_check_overflow(void) +-{ +- u64 now = get_tb(); +- u64 *next_tb = this_cpu_ptr(&decrementers_next_tb); +- +- return now >= *next_tb; +-} +- + #ifdef CONFIG_PPC_BOOK3E + + /* This is called whenever we are re-enabling interrupts +@@ -142,35 +134,6 @@ notrace unsigned int __check_irq_replay(void) + trace_hardirqs_on(); + trace_hardirqs_off(); + +- /* +- * We are always hard disabled here, but PACA_IRQ_HARD_DIS may +- * not be set, which means interrupts have only just been hard +- * disabled as part of the local_irq_restore or interrupt return +- * code. In that case, skip the decrementr check becaus it's +- * expensive to read the TB. +- * +- * HARD_DIS then gets cleared here, but it's reconciled later. +- * Either local_irq_disable will replay the interrupt and that +- * will reconcile state like other hard interrupts. Or interrupt +- * retur will replay the interrupt and in that case it sets +- * PACA_IRQ_HARD_DIS by hand (see comments in entry_64.S). +- */ +- if (happened & PACA_IRQ_HARD_DIS) { +- local_paca->irq_happened &= ~PACA_IRQ_HARD_DIS; +- +- /* +- * We may have missed a decrementer interrupt if hard disabled. +- * Check the decrementer register in case we had a rollover +- * while hard disabled. +- */ +- if (!(happened & PACA_IRQ_DEC)) { +- if (decrementer_check_overflow()) { +- local_paca->irq_happened |= PACA_IRQ_DEC; +- happened |= PACA_IRQ_DEC; +- } +- } +- } +- + if (happened & PACA_IRQ_DEC) { + local_paca->irq_happened &= ~PACA_IRQ_DEC; + return 0x900; +@@ -186,6 +149,9 @@ notrace unsigned int __check_irq_replay(void) + return 0x280; + } + ++ if (happened & PACA_IRQ_HARD_DIS) ++ local_paca->irq_happened &= ~PACA_IRQ_HARD_DIS; ++ + /* There should be nothing left ! */ + BUG_ON(local_paca->irq_happened != 0); + +@@ -229,18 +195,6 @@ void replay_soft_interrupts(void) + if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG)) + WARN_ON_ONCE(mfmsr() & MSR_EE); + +- if (happened & PACA_IRQ_HARD_DIS) { +- /* +- * We may have missed a decrementer interrupt if hard disabled. +- * Check the decrementer register in case we had a rollover +- * while hard disabled. 
+-		 */
+-		if (!(happened & PACA_IRQ_DEC)) {
+-			if (decrementer_check_overflow())
+-				happened |= PACA_IRQ_DEC;
+-		}
+-	}
+-
+ 	/*
+ 	 * Force the delivery of pending soft-disabled interrupts on PS3.
+ 	 * Any HV call will have this side effect.
+@@ -345,6 +299,7 @@ notrace void arch_local_irq_restore(unsigned long mask)
+ 		if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG))
+ 			WARN_ON_ONCE(!(mfmsr() & MSR_EE));
+ 		__hard_irq_disable();
++		local_paca->irq_happened |= PACA_IRQ_HARD_DIS;
+ 	} else {
+ 		/*
+ 		 * We should already be hard disabled here. We had bugs
+diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
+index 74efe46f55327..7d372ff3504b2 100644
+--- a/arch/powerpc/kernel/time.c
++++ b/arch/powerpc/kernel/time.c
+@@ -552,14 +552,11 @@ void timer_interrupt(struct pt_regs *regs)
+ 	struct pt_regs *old_regs;
+ 	u64 now;
+ 
+-	/* Some implementations of hotplug will get timer interrupts while
+-	 * offline, just ignore these and we also need to set
+-	 * decrementers_next_tb as MAX to make sure __check_irq_replay
+-	 * don't replay timer interrupt when return, otherwise we'll trap
+-	 * here infinitely :(
++	/*
++	 * Some implementations of hotplug will get timer interrupts while
++	 * offline, just ignore these.
+ 	 */
+ 	if (unlikely(!cpu_online(smp_processor_id()))) {
+-		*next_tb = ~(u64)0;
+ 		set_dec(decrementer_max);
+ 		return;
+ 	}
+diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
+index d95954ad4c0af..c61c3b62c8c62 100644
+--- a/arch/powerpc/platforms/powernv/opal.c
++++ b/arch/powerpc/platforms/powernv/opal.c
+@@ -731,7 +731,7 @@ int opal_hmi_exception_early2(struct pt_regs *regs)
+ 	return 1;
+ }
+ 
+-/* HMI exception handler called in virtual mode during check_irq_replay. */
++/* HMI exception handler called in virtual mode when irqs are next enabled. */
+ int opal_handle_hmi_exception(struct pt_regs *regs)
+ {
+ 	/*
+-- 
+2.27.0
+
diff --git a/queue-5.10/powerpc-sysdev-add-missing-iounmap-on-error-in-mpic_.patch b/queue-5.10/powerpc-sysdev-add-missing-iounmap-on-error-in-mpic_.patch
new file mode 100644
index 00000000000..157ee1ae192
--- /dev/null
+++ b/queue-5.10/powerpc-sysdev-add-missing-iounmap-on-error-in-mpic_.patch
@@ -0,0 +1,39 @@
+From 0679d328640a54ecbb2338a544b32d729121fd31 Mon Sep 17 00:00:00 2001
+From: Sasha Levin
+Date: Wed, 28 Oct 2020 17:15:51 +0800
+Subject: powerpc: sysdev: add missing iounmap() on error in mpic_msgr_probe()
+
+From: Qinglang Miao
+
+[ Upstream commit ffa1797040c5da391859a9556be7b735acbe1242 ]
+
+The error handling paths of mpic_msgr_probe() return without calling
+iounmap() on msgr_block_addr. Use devm_ioremap() instead of plain
+ioremap() when remapping the message register block, so the mapping
+is automatically released on probe failure.
+
+Signed-off-by: Qinglang Miao
+Signed-off-by: Michael Ellerman
+Link: https://lore.kernel.org/r/20201028091551.136400-1-miaoqinglang@huawei.com
+Signed-off-by: Sasha Levin
+---
+ arch/powerpc/sysdev/mpic_msgr.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/arch/powerpc/sysdev/mpic_msgr.c b/arch/powerpc/sysdev/mpic_msgr.c
+index f6b253e2be409..36ec0bdd8b63c 100644
+--- a/arch/powerpc/sysdev/mpic_msgr.c
++++ b/arch/powerpc/sysdev/mpic_msgr.c
+@@ -191,7 +191,7 @@ static int mpic_msgr_probe(struct platform_device *dev)
+ 
+ 	/* IO map the message register block. */
+ 	of_address_to_resource(np, 0, &rsrc);
+-	msgr_block_addr = ioremap(rsrc.start, resource_size(&rsrc));
++	msgr_block_addr = devm_ioremap(&dev->dev, rsrc.start, resource_size(&rsrc));
+ 	if (!msgr_block_addr) {
+ 		dev_err(&dev->dev, "Failed to iomap MPIC message registers");
+ 		return -EFAULT;
+-- 
+2.27.0
+
diff --git a/queue-5.10/quota-don-t-overflow-quota-file-offsets.patch b/queue-5.10/quota-don-t-overflow-quota-file-offsets.patch
new file mode 100644
index 00000000000..046f67b6cb3
--- /dev/null
+++ b/queue-5.10/quota-don-t-overflow-quota-file-offsets.patch
@@ -0,0 +1,66 @@
+From 127af98fb71671f76c2e8a5e2c0bd5187ee8bab7 Mon Sep 17 00:00:00 2001
+From: Sasha Levin
+Date: Mon, 2 Nov 2020 16:32:10 +0100
+Subject: quota: Don't overflow quota file offsets
+
+From: Jan Kara
+
+[ Upstream commit 10f04d40a9fa29785206c619f80d8beedb778837 ]
+
+The on-disk quota format supports quota files with up to 2^32 blocks.
+Be careful when computing quota file offsets from block numbers, as
+they can overflow 32-bit types. Since quota files larger than 4GB would
+require ~26 million quota users, this is mostly a theoretical concern
+for now, but better to be careful; fuzzers would find the problem
+sooner or later anyway...
+
+Reviewed-by: Andreas Dilger
+Signed-off-by: Jan Kara
+Signed-off-by: Sasha Levin
+---
+ fs/quota/quota_tree.c | 8 ++++----
+ 1 file changed, 4 insertions(+), 4 deletions(-)
+
+diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c
+index a6f856f341dc7..c5562c871c8be 100644
+--- a/fs/quota/quota_tree.c
++++ b/fs/quota/quota_tree.c
+@@ -62,7 +62,7 @@ static ssize_t read_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf)
+ 
+ 	memset(buf, 0, info->dqi_usable_bs);
+ 	return sb->s_op->quota_read(sb, info->dqi_type, buf,
+-	       info->dqi_usable_bs, blk << info->dqi_blocksize_bits);
++	       info->dqi_usable_bs, (loff_t)blk << info->dqi_blocksize_bits);
+ }
+ 
+ static ssize_t write_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf)
+@@ -71,7 +71,7 @@ static ssize_t write_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf)
+ 	ssize_t ret;
+ 
+ 	ret = sb->s_op->quota_write(sb, info->dqi_type, buf,
+-	      info->dqi_usable_bs, blk << info->dqi_blocksize_bits);
++	      info->dqi_usable_bs, (loff_t)blk << info->dqi_blocksize_bits);
+ 	if (ret != info->dqi_usable_bs) {
+ 		quota_error(sb, "dquota write failed");
+ 		if (ret >= 0)
+@@ -284,7 +284,7 @@ static uint find_free_dqentry(struct qtree_mem_dqinfo *info,
+ 			blk);
+ 		goto out_buf;
+ 	}
+-	dquot->dq_off = (blk << info->dqi_blocksize_bits) +
++	dquot->dq_off = ((loff_t)blk << info->dqi_blocksize_bits) +
+ 			sizeof(struct qt_disk_dqdbheader) +
+ 			i * info->dqi_entry_size;
+ 	kfree(buf);
+@@ -559,7 +559,7 @@ static loff_t find_block_dqentry(struct qtree_mem_dqinfo *info,
+ 		ret = -EIO;
+ 		goto out_buf;
+ 	} else {
+-		ret = (blk << info->dqi_blocksize_bits) + sizeof(struct
++		ret = ((loff_t)blk << info->dqi_blocksize_bits) + sizeof(struct
+ 		      qt_disk_dqdbheader) + i * info->dqi_entry_size;
+ 	}
+ out_buf:
+-- 
+2.27.0
+
diff --git a/queue-5.10/rtc-pl031-fix-resource-leak-in-pl031_probe.patch b/queue-5.10/rtc-pl031-fix-resource-leak-in-pl031_probe.patch
new file mode 100644
index 00000000000..767ebf6e127
--- /dev/null
+++ b/queue-5.10/rtc-pl031-fix-resource-leak-in-pl031_probe.patch
@@ -0,0 +1,42 @@
+From a8587cba324a6acf49b45b72d0f93f686592a0a1 Mon Sep 17 00:00:00 2001
+From: Sasha Levin
+Date: Thu, 12 Nov 2020 17:31:39 +0800
+Subject: rtc: pl031: fix resource leak in pl031_probe
+
+From: Zheng Liang
+
+[ Upstream commit 1eab0fea2514b269e384c117f5b5772b882761f0 ]
+
+When devm_rtc_allocate_device() fails in pl031_probe(), the memory
+regions requested for the device should be released.
+
+Reported-by: Hulk Robot
+Signed-off-by: Zheng Liang
+Signed-off-by: Alexandre Belloni
+Acked-by: Linus Walleij
+Link: https://lore.kernel.org/r/20201112093139.32566-1-zhengliang6@huawei.com
+Signed-off-by: Sasha Levin
+---
+ drivers/rtc/rtc-pl031.c | 6 ++++--
+ 1 file changed, 4 insertions(+), 2 deletions(-)
+
+diff --git a/drivers/rtc/rtc-pl031.c b/drivers/rtc/rtc-pl031.c
+index c6b89273feba8..d4b2ab7861266 100644
+--- a/drivers/rtc/rtc-pl031.c
++++ b/drivers/rtc/rtc-pl031.c
+@@ -361,8 +361,10 @@ static int pl031_probe(struct amba_device *adev, const struct amba_id *id)
+ 
+ 	device_init_wakeup(&adev->dev, true);
+ 	ldata->rtc = devm_rtc_allocate_device(&adev->dev);
+-	if (IS_ERR(ldata->rtc))
+-		return PTR_ERR(ldata->rtc);
++	if (IS_ERR(ldata->rtc)) {
++		ret = PTR_ERR(ldata->rtc);
++		goto out;
++	}
+ 
+ 	ldata->rtc->ops = ops;
+ 	ldata->rtc->range_min = vendor->range_min;
+-- 
+2.27.0
+
diff --git a/queue-5.10/rtc-sun6i-fix-memleak-in-sun6i_rtc_clk_init.patch b/queue-5.10/rtc-sun6i-fix-memleak-in-sun6i_rtc_clk_init.patch
new file mode 100644
index 00000000000..4532f689699
--- /dev/null
+++ b/queue-5.10/rtc-sun6i-fix-memleak-in-sun6i_rtc_clk_init.patch
@@ -0,0 +1,65 @@
+From 2893a626a352c4d938c9d49cf0d0ba7d0a0cf95d Mon Sep 17 00:00:00 2001
+From: Sasha Levin
+Date: Tue, 20 Oct 2020 14:12:26 +0800
+Subject: rtc: sun6i: Fix memleak in sun6i_rtc_clk_init
+
+From: Dinghao Liu
+
+[ Upstream commit 28d211919e422f58c1e6c900e5810eee4f1ce4c8 ]
+
+When clk_hw_register_fixed_rate_with_accuracy() fails,
+clk_data should be freed. It's the same for the subsequent
+two error paths, but we should also unregister the already
+registered clocks in them.
+
+Signed-off-by: Dinghao Liu
+Signed-off-by: Alexandre Belloni
+Link: https://lore.kernel.org/r/20201020061226.6572-1-dinghao.liu@zju.edu.cn
+Signed-off-by: Sasha Levin
+---
+ drivers/rtc/rtc-sun6i.c | 8 +++++---
+ 1 file changed, 5 insertions(+), 3 deletions(-)
+
+diff --git a/drivers/rtc/rtc-sun6i.c b/drivers/rtc/rtc-sun6i.c
+index e2b8b150bcb44..f2818cdd11d82 100644
+--- a/drivers/rtc/rtc-sun6i.c
++++ b/drivers/rtc/rtc-sun6i.c
+@@ -272,7 +272,7 @@ static void __init sun6i_rtc_clk_init(struct device_node *node,
+ 					      300000000);
+ 	if (IS_ERR(rtc->int_osc)) {
+ 		pr_crit("Couldn't register the internal oscillator\n");
+-		return;
++		goto err;
+ 	}
+ 
+ 	parents[0] = clk_hw_get_name(rtc->int_osc);
+@@ -290,7 +290,7 @@ static void __init sun6i_rtc_clk_init(struct device_node *node,
+ 	rtc->losc = clk_register(NULL, &rtc->hw);
+ 	if (IS_ERR(rtc->losc)) {
+ 		pr_crit("Couldn't register the LOSC clock\n");
+-		return;
++		goto err_register;
+ 	}
+ 
+ 	of_property_read_string_index(node, "clock-output-names", 1,
+@@ -301,7 +301,7 @@ static void __init sun6i_rtc_clk_init(struct device_node *node,
+ 					  &rtc->lock);
+ 	if (IS_ERR(rtc->ext_losc)) {
+ 		pr_crit("Couldn't register the LOSC external gate\n");
+-		return;
++		goto err_register;
+ 	}
+ 
+ 	clk_data->num = 2;
+@@ -314,6 +314,8 @@ static void __init sun6i_rtc_clk_init(struct device_node *node,
+ 	of_clk_add_hw_provider(node, of_clk_hw_onecell_get, clk_data);
+ 	return;
+ 
++err_register:
++	clk_hw_unregister_fixed_rate(rtc->int_osc);
+ err:
+ 	kfree(clk_data);
+ }
+-- 
+2.27.0
+
diff --git a/queue-5.10/s390-always-clear-kernel-stack-backchain-before-call.patch b/queue-5.10/s390-always-clear-kernel-stack-backchain-before-call.patch
new file mode 100644
index 00000000000..69d886c55ba
--- /dev/null
+++ b/queue-5.10/s390-always-clear-kernel-stack-backchain-before-call.patch
@@ -0,0 +1,87 @@
+From 35edc5975bac95b303541b9e83f4ed12d5a4684c Mon Sep 17 00:00:00 2001
+From: Sasha Levin
+Date: Fri, 4 Dec 2020 17:56:57 +0100
+Subject: s390: always clear kernel stack backchain before calling functions
+
+From: Heiko Carstens
+
+[ Upstream commit 9365965db0c7ca7fc81eee27c21d8522d7102c32 ]
+
+Clear the kernel stack backchain before potentially calling the
+lockdep trace_hardirqs_off/on functions. Without this, walking the
+kernel backchain, e.g. during a panic, might stop too early.
+
+Signed-off-by: Heiko Carstens
+Signed-off-by: Sasha Levin
+---
+ arch/s390/kernel/entry.S | 12 ++++++------
+ 1 file changed, 6 insertions(+), 6 deletions(-)
+
+diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
+index 6343dca0dbeb6..71203324ff42b 100644
+--- a/arch/s390/kernel/entry.S
++++ b/arch/s390/kernel/entry.S
+@@ -406,6 +406,7 @@ ENTRY(system_call)
+ 	mvc	__PT_PSW(16,%r11),__LC_SVC_OLD_PSW
+ 	mvc	__PT_INT_CODE(4,%r11),__LC_SVC_ILC
+ 	stg	%r14,__PT_FLAGS(%r11)
++	xc	__SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15)
+ 	ENABLE_INTS
+ .Lsysc_do_svc:
+ 	# clear user controlled register to prevent speculative use
+@@ -422,7 +423,6 @@ ENTRY(system_call)
+ 	jnl	.Lsysc_nr_ok
+ 	slag	%r8,%r1,3
+ .Lsysc_nr_ok:
+-	xc	__SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15)
+ 	stg	%r2,__PT_ORIG_GPR2(%r11)
+ 	stg	%r7,STACK_FRAME_OVERHEAD(%r15)
+ 	lg	%r9,0(%r8,%r10)			# get system call add.
+@@ -712,8 +712,8 @@ ENTRY(pgm_check_handler)
+ 	mvc	__THREAD_per_address(8,%r14),__LC_PER_ADDRESS
+ 	mvc	__THREAD_per_cause(2,%r14),__LC_PER_CODE
+ 	mvc	__THREAD_per_paid(1,%r14),__LC_PER_ACCESS_ID
+-6:	RESTORE_SM_CLEAR_PER
+-	xc	__SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15)
++6:	xc	__SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15)
++	RESTORE_SM_CLEAR_PER
+ 	larl	%r1,pgm_check_table
+ 	llgh	%r10,__PT_INT_CODE+2(%r11)
+ 	nill	%r10,0x007f
+@@ -734,8 +734,8 @@ ENTRY(pgm_check_handler)
+ # PER event in supervisor state, must be kprobes
+ #
+ .Lpgm_kprobe:
+-	RESTORE_SM_CLEAR_PER
+ 	xc	__SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15)
++	RESTORE_SM_CLEAR_PER
+ 	lgr	%r2,%r11		# pass pointer to pt_regs
+ 	brasl	%r14,do_per_trap
+ 	j	.Lpgm_return
+@@ -777,10 +777,10 @@ ENTRY(io_int_handler)
+ 	stmg	%r8,%r9,__PT_PSW(%r11)
+ 	mvc	__PT_INT_CODE(12,%r11),__LC_SUBCHANNEL_ID
+ 	xc	__PT_FLAGS(8,%r11),__PT_FLAGS(%r11)
++	xc	__SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15)
+ 	TSTMSK	__LC_CPU_FLAGS,_CIF_IGNORE_IRQ
+ 	jo	.Lio_restore
+ 	TRACE_IRQS_OFF
+-	xc	__SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15)
+ .Lio_loop:
+ 	lgr	%r2,%r11		# pass pointer to pt_regs
+ 	lghi	%r3,IO_INTERRUPT
+@@ -980,10 +980,10 @@ ENTRY(ext_int_handler)
+ 	mvc	__PT_INT_PARM(4,%r11),__LC_EXT_PARAMS
+ 	mvc	__PT_INT_PARM_LONG(8,%r11),0(%r1)
+ 	xc	__PT_FLAGS(8,%r11),__PT_FLAGS(%r11)
++	xc	__SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15)
+ 	TSTMSK	__LC_CPU_FLAGS,_CIF_IGNORE_IRQ
+ 	jo	.Lio_restore
+ 	TRACE_IRQS_OFF
+-	xc	__SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15)
+ 	lgr	%r2,%r11		# pass pointer to pt_regs
+ 	lghi	%r3,EXT_INTERRUPT
+ 	brasl	%r14,do_IRQ
+-- 
+2.27.0
+
diff --git a/queue-5.10/series b/queue-5.10/series
index 79a86712013..104f3aebf60 100644
--- a/queue-5.10/series
+++ b/queue-5.10/series
@@ -36,3 +36,28 @@ bfs-don-t-use-warning-string-when-it-s-just-info.patch
 ext4-check-for-invalid-block-size-early-when-mounting-a-file-system.patch
 fcntl-fix-potential-deadlock-in-send_sig-io-urg.patch
 io_uring-check-kthread-stopped-flag-when-sq-thread-is-unparked.patch
+rtc-sun6i-fix-memleak-in-sun6i_rtc_clk_init.patch
+module-set-module_state_going-state-when-a-module-fa.patch
+quota-don-t-overflow-quota-file-offsets.patch
+rtc-pl031-fix-resource-leak-in-pl031_probe.patch
+powerpc-sysdev-add-missing-iounmap-on-error-in-mpic_.patch
+i3c-master-fix-missing-destroy_workqueue-on-error-in.patch
+nfsv4-fix-a-pnfs-layout-related-use-after-free-race-.patch
+f2fs-avoid-race-condition-for-shrinker-count.patch
+f2fs-fix-race-of-pending_pages-in-decompression.patch
+module-delay-kobject-uevent-until-after-module-init-.patch
+powerpc-64-irq-replay-remove-decrementer-overflow-ch.patch
+fs-namespace.c-warn-if-mnt_count-has-become-negative.patch
+watchdog-rti-wdt-fix-reference-leak-in-rti_wdt_probe.patch
+um-random-register-random-as-hwrng-core-device.patch
+um-ubd-submit-all-data-segments-atomically.patch
+nfsv4.2-don-t-error-when-exiting-early-on-a-read_plu.patch
+ceph-fix-inode-refcount-leak-when-ceph_fill_inode-on.patch
+drm-amd-display-updated-wm-table-for-renoir.patch
+tick-sched-remove-bogus-boot-safety-check.patch
+s390-always-clear-kernel-stack-backchain-before-call.patch
+io_uring-remove-racy-overflow-list-fast-checks.patch
+alsa-pcm-clear-the-full-allocated-memory-at-hw_param.patch
+dm-verity-skip-verity-work-if-i-o-error-when-system-.patch
+ext4-avoid-s_mb_prefetch-to-be-zero-in-individual-sc.patch
+device-dax-fix-range-release.patch
diff --git a/queue-5.10/tick-sched-remove-bogus-boot-safety-check.patch b/queue-5.10/tick-sched-remove-bogus-boot-safety-check.patch
new file mode 100644
index 00000000000..9584760e462
--- /dev/null
+++ b/queue-5.10/tick-sched-remove-bogus-boot-safety-check.patch
@@ -0,0 +1,49 @@
+From b2866008cf5603219f25f52b1523c8732df89c3f Mon Sep 17 00:00:00 2001
+From: Sasha Levin
+Date: Sun, 6 Dec 2020 22:12:55 +0100
+Subject: tick/sched: Remove bogus boot "safety" check
+
+From: Thomas Gleixner
+
+[ Upstream commit ba8ea8e7dd6e1662e34e730eadfc52aa6816f9dd ]
+
+can_stop_idle_tick() checks whether the do_timer() duty has been taken over
+by a CPU on boot. That's silly because the boot CPU always takes over with
+the initial clockevent device.
+
+But even if no CPU had installed a clockevent and taken over the duty,
+the question of whether the tick on the current CPU can be stopped would
+be moot: the current CPU would have no clockevent either, so there would
+be nothing to keep ticking.
+
+Remove it.
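+
+With the check gone, the do_timer() duty tests left in
+can_stop_idle_tick() reduce to the following (a sketch assembled from
+the hunk below, surrounding code elided):
+
+	if (tick_do_timer_cpu == cpu)
+		return false;
+
+	/* Should not happen for nohz-full */
+	if (WARN_ON_ONCE(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
+		return false;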
+
+Signed-off-by: Thomas Gleixner
+Acked-by: Frederic Weisbecker
+Link: https://lore.kernel.org/r/20201206212002.725238293@linutronix.de
+Signed-off-by: Sasha Levin
+---
+ kernel/time/tick-sched.c | 7 -------
+ 1 file changed, 7 deletions(-)
+
+diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
+index 81632cd5e3b72..e8d351b7f9b03 100644
+--- a/kernel/time/tick-sched.c
++++ b/kernel/time/tick-sched.c
+@@ -941,13 +941,6 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
+ 	 */
+ 	if (tick_do_timer_cpu == cpu)
+ 		return false;
+-	/*
+-	 * Boot safety: make sure the timekeeping duty has been
+-	 * assigned before entering dyntick-idle mode,
+-	 * tick_do_timer_cpu is TICK_DO_TIMER_BOOT
+-	 */
+-	if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_BOOT))
+-		return false;
+ 
+ 	/* Should not happen for nohz-full */
+ 	if (WARN_ON_ONCE(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
+-- 
+2.27.0
+
diff --git a/queue-5.10/um-random-register-random-as-hwrng-core-device.patch b/queue-5.10/um-random-register-random-as-hwrng-core-device.patch
new file mode 100644
index 00000000000..93917e39e9d
--- /dev/null
+++ b/queue-5.10/um-random-register-random-as-hwrng-core-device.patch
@@ -0,0 +1,254 @@
+From 76ddf8266cfb65258a05dbbca4c4fd6b4f956a55 Mon Sep 17 00:00:00 2001
+From: Sasha Levin
+Date: Tue, 27 Oct 2020 15:30:22 +0000
+Subject: um: random: Register random as hwrng-core device
+
+From: Christopher Obbard
+
+[ Upstream commit 72d3e093afae79611fa38f8f2cfab9a888fe66f2 ]
+
+The UML random driver creates a dummy device under the guest,
+/dev/hw_random. When this file is read from the guest, the driver
+reads from the host machine's /dev/random, in turn reading from
+the host kernel's entropy pool. This entropy pool could have been
+filled by a hardware random number generator or just the host
+kernel's internal software entropy generator.
+
+Currently the driver does not fill the guest's kernel entropy pool;
+this requires a userspace tool running inside the guest (like
+rng-tools) to read from the dummy device provided by this driver,
+which then would fill the guest's internal entropy pool.
+
+This all seems quite pointless when we are already reading from an
+entropy pool, so this patch aims to register the device as a hwrng
+device using the hwrng-core framework. This not only improves and
+cleans up the driver, but also fills the guest's entropy pool
+without having to resort to using extra userspace tools in the guest.
+
+This is typically a nuisance when booting a guest: the random pool
+takes a long time (~200s) to build up enough entropy since the dummy
+hwrng is not used to fill the guest's pool.
+
+This port was originally attempted by Alexander Neville "dark" (in CC,
+discussion in Link), but the conversation there stalled since the
+handling of -EAGAIN errors was removed and no longer done by the
+driver. This patch attempts to use the existing method of error
+handling but utilises the new hwrng core.
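+
+The hwrng-core registration this boils down to is small; a sketch of
+the pattern the diff below adopts (error handling elided, see
+rng_init() for the real thing):
+
+	static struct hwrng hwrng;
+
+	hwrng.name = RNG_MODULE_NAME;
+	hwrng.read = rng_dev_read;	/* pulls bytes from the host fd */
+	hwrng.quality = 1024;
+	err = hwrng_register(&hwrng);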
+ +The issue can be noticed when booting a UML guest: + + [ 2.560000] random: fast init done + [ 214.000000] random: crng init done + +With the patch applied, filling the pool becomes a lot quicker: + + [ 2.560000] random: fast init done + [ 12.000000] random: crng init done + +Cc: Alexander Neville +Link: https://lore.kernel.org/lkml/20190828204609.02a7ff70@TheDarkness/ +Link: https://lore.kernel.org/lkml/20190829135001.6a5ff940@TheDarkness.local/ +Cc: Sjoerd Simons +Signed-off-by: Christopher Obbard +Acked-by: Anton Ivanov +Signed-off-by: Richard Weinberger +Signed-off-by: Sasha Levin +--- + arch/um/drivers/random.c | 101 ++++++++------------------------- + drivers/char/hw_random/Kconfig | 16 +++--- + 2 files changed, 33 insertions(+), 84 deletions(-) + +diff --git a/arch/um/drivers/random.c b/arch/um/drivers/random.c +index ce115fce52f02..e4b9b2ce9abf4 100644 +--- a/arch/um/drivers/random.c ++++ b/arch/um/drivers/random.c +@@ -11,6 +11,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -18,9 +19,8 @@ + #include + + /* +- * core module and version information ++ * core module information + */ +-#define RNG_VERSION "1.0.0" + #define RNG_MODULE_NAME "hw_random" + + /* Changed at init time, in the non-modular case, and at module load +@@ -28,88 +28,36 @@ + * protects against a module being loaded twice at the same time. + */ + static int random_fd = -1; +-static DECLARE_WAIT_QUEUE_HEAD(host_read_wait); ++static struct hwrng hwrng = { 0, }; ++static DECLARE_COMPLETION(have_data); + +-static int rng_dev_open (struct inode *inode, struct file *filp) ++static int rng_dev_read(struct hwrng *rng, void *buf, size_t max, bool block) + { +- /* enforce read-only access to this chrdev */ +- if ((filp->f_mode & FMODE_READ) == 0) +- return -EINVAL; +- if ((filp->f_mode & FMODE_WRITE) != 0) +- return -EINVAL; ++ int ret; + +- return 0; +-} +- +-static atomic_t host_sleep_count = ATOMIC_INIT(0); +- +-static ssize_t rng_dev_read (struct file *filp, char __user *buf, size_t size, +- loff_t *offp) +-{ +- u32 data; +- int n, ret = 0, have_data; +- +- while (size) { +- n = os_read_file(random_fd, &data, sizeof(data)); +- if (n > 0) { +- have_data = n; +- while (have_data && size) { +- if (put_user((u8) data, buf++)) { +- ret = ret ? : -EFAULT; +- break; +- } +- size--; +- ret++; +- have_data--; +- data >>= 8; +- } +- } +- else if (n == -EAGAIN) { +- DECLARE_WAITQUEUE(wait, current); +- +- if (filp->f_flags & O_NONBLOCK) +- return ret ? : -EAGAIN; +- +- atomic_inc(&host_sleep_count); ++ for (;;) { ++ ret = os_read_file(random_fd, buf, max); ++ if (block && ret == -EAGAIN) { + add_sigio_fd(random_fd); + +- add_wait_queue(&host_read_wait, &wait); +- set_current_state(TASK_INTERRUPTIBLE); ++ ret = wait_for_completion_killable(&have_data); + +- schedule(); +- remove_wait_queue(&host_read_wait, &wait); ++ ignore_sigio_fd(random_fd); ++ deactivate_fd(random_fd, RANDOM_IRQ); + +- if (atomic_dec_and_test(&host_sleep_count)) { +- ignore_sigio_fd(random_fd); +- deactivate_fd(random_fd, RANDOM_IRQ); +- } ++ if (ret < 0) ++ break; ++ } else { ++ break; + } +- else +- return n; +- +- if (signal_pending (current)) +- return ret ? 
: -ERESTARTSYS; + } +- return ret; +-} + +-static const struct file_operations rng_chrdev_ops = { +- .owner = THIS_MODULE, +- .open = rng_dev_open, +- .read = rng_dev_read, +- .llseek = noop_llseek, +-}; +- +-/* rng_init shouldn't be called more than once at boot time */ +-static struct miscdevice rng_miscdev = { +- HWRNG_MINOR, +- RNG_MODULE_NAME, +- &rng_chrdev_ops, +-}; ++ return ret != -EAGAIN ? ret : 0; ++} + + static irqreturn_t random_interrupt(int irq, void *data) + { +- wake_up(&host_read_wait); ++ complete(&have_data); + + return IRQ_HANDLED; + } +@@ -126,18 +74,19 @@ static int __init rng_init (void) + goto out; + + random_fd = err; +- + err = um_request_irq(RANDOM_IRQ, random_fd, IRQ_READ, random_interrupt, + 0, "random", NULL); + if (err) + goto err_out_cleanup_hw; + + sigio_broken(random_fd, 1); ++ hwrng.name = RNG_MODULE_NAME; ++ hwrng.read = rng_dev_read; ++ hwrng.quality = 1024; + +- err = misc_register (&rng_miscdev); ++ err = hwrng_register(&hwrng); + if (err) { +- printk (KERN_ERR RNG_MODULE_NAME ": misc device register " +- "failed\n"); ++ pr_err(RNG_MODULE_NAME " registering failed (%d)\n", err); + goto err_out_cleanup_hw; + } + out: +@@ -161,8 +110,8 @@ static void cleanup(void) + + static void __exit rng_cleanup(void) + { ++ hwrng_unregister(&hwrng); + os_close_file(random_fd); +- misc_deregister (&rng_miscdev); + } + + module_init (rng_init); +diff --git a/drivers/char/hw_random/Kconfig b/drivers/char/hw_random/Kconfig +index e92c4d9469d82..5952210526aaa 100644 +--- a/drivers/char/hw_random/Kconfig ++++ b/drivers/char/hw_random/Kconfig +@@ -540,15 +540,15 @@ endif # HW_RANDOM + + config UML_RANDOM + depends on UML +- tristate "Hardware random number generator" ++ select HW_RANDOM ++ tristate "UML Random Number Generator support" + help + This option enables UML's "hardware" random number generator. It + attaches itself to the host's /dev/random, supplying as much entropy + as the host has, rather than the small amount the UML gets from its +- own drivers. It registers itself as a standard hardware random number +- generator, major 10, minor 183, and the canonical device name is +- /dev/hwrng. +- The way to make use of this is to install the rng-tools package +- (check your distro, or download from +- http://sourceforge.net/projects/gkernel/). rngd periodically reads +- /dev/hwrng and injects the entropy into /dev/random. ++ own drivers. It registers itself as a rng-core driver thus providing ++ a device which is usually called /dev/hwrng. This hardware random ++ number generator does feed into the kernel's random number generator ++ entropy pool. ++ ++ If unsure, say Y. +-- +2.27.0 + diff --git a/queue-5.10/um-ubd-submit-all-data-segments-atomically.patch b/queue-5.10/um-ubd-submit-all-data-segments-atomically.patch new file mode 100644 index 00000000000..1ad5f117ea0 --- /dev/null +++ b/queue-5.10/um-ubd-submit-all-data-segments-atomically.patch @@ -0,0 +1,434 @@ +From 5b730860e1976f7f024014b59e4bd8b568c5ccb8 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sat, 21 Nov 2020 23:13:56 -0500 +Subject: um: ubd: Submit all data segments atomically + +From: Gabriel Krisman Bertazi + +[ Upstream commit fc6b6a872dcd48c6f39c7975836d75113db67d37 ] + +Internally, UBD treats each physical IO segment as a separate command to +be submitted in the execution pipe. If the pipe returns a transient +error after a few segments have already been written, UBD will tell the +block layer to requeue the request, but there is no way to reclaim the +segments already submitted. 
When a new attempt to dispatch the request
+is made, those segments already submitted will get duplicated, causing
+the WARN_ON below in the best case, and potentially data corruption.
+
+On my system, running a UML instance with 2GB of RAM and a 50M UBD disk,
+I can reproduce the WARN_ON by simply running mkfs.vfat against the
+disk on a freshly booted system.
+
+There are a few ways to work around this, like reducing the pressure on
+the pipe by reducing the queue depth, which almost eliminates the
+occurrence of the problem, increasing the pipe buffer size on the host
+system, or by limiting the request to one physical segment, which causes
+the block layer to submit way more requests to resolve a single
+operation.
+
+Instead, this patch modifies the format of a UBD command, such that all
+segments are sent through a single element in the communication pipe,
+turning the command submission atomic from the point of view of the
+block layer. The new format has a variable size, depending on the
+number of elements, and looks like this:
+
++------------+-----------+-----------+------------
+| cmd_header | segment 0 | segment 1 | segment ...
++------------+-----------+-----------+------------
+
+With this format, we push a pointer to cmd_header in the submission
+pipe.
+
+This has the advantage of reducing the memory footprint of executing a
+single request, since it allows us to merge some fields in the header.
+It is possible to reduce each segment's memory footprint even further,
+by merging bitmap_words and cow_offset, for instance, but this is not
+the focus of this patch and is left as future work. One issue with the
+patch is that for a big number of segments, we now perform one big
+memory allocation instead of multiple small ones, but I wasn't able to
+trigger any real issues or -ENOMEM because of this change that wouldn't
+be reproducible otherwise.
+
+This was tested using fio with the verify-crc32 option, and by running
+an ext4 filesystem over this UBD device.
+
+The original WARN_ON was:
+
+------------[ cut here ]------------
+WARNING: CPU: 0 PID: 0 at lib/refcount.c:28 refcount_warn_saturate+0x13f/0x141
+refcount_t: underflow; use-after-free.
+Modules linked in:
+CPU: 0 PID: 0 Comm: swapper Not tainted 5.5.0-rc6-00002-g2a5bb2cf75c8 #346
+Stack:
+ 6084eed0 6063dc77 00000009 6084ef60
+ 00000000 604b8d9f 6084eee0 6063dcbc
+ 6084ef40 6006ab8d e013d780 1c00000000
+Call Trace:
+ [<600a0c1c>] ? printk+0x0/0x94
+ [<6004a888>] show_stack+0x13b/0x155
+ [<6063dc77>] ? dump_stack_print_info+0xdf/0xe8
+ [<604b8d9f>] ? refcount_warn_saturate+0x13f/0x141
+ [<6063dcbc>] dump_stack+0x2a/0x2c
+ [<6006ab8d>] __warn+0x107/0x134
+ [<6008da6c>] ? wake_up_process+0x17/0x19
+ [<60487628>] ? blk_queue_max_discard_sectors+0x0/0xd
+ [<6006b05f>] warn_slowpath_fmt+0xd1/0xdf
+ [<6006af8e>] ? warn_slowpath_fmt+0x0/0xdf
+ [<600acc14>] ? raw_read_seqcount_begin.constprop.0+0x0/0x15
+ [<600619ae>] ? os_nsecs+0x1d/0x2b
+ [<604b8d9f>] refcount_warn_saturate+0x13f/0x141
+ [<6048bc8f>] refcount_sub_and_test.constprop.0+0x2f/0x37
+ [<6048c8de>] blk_mq_free_request+0xf1/0x10d
+ [<6048ca06>] __blk_mq_end_request+0x10c/0x114
+ [<6005ac0f>] ubd_intr+0xb5/0x169
+ [<600a1a37>] __handle_irq_event_percpu+0x6b/0x17e
+ [<600a1b70>] handle_irq_event_percpu+0x26/0x69
+ [<600a1bd9>] handle_irq_event+0x26/0x34
+ [<600a1bb3>] ? handle_irq_event+0x0/0x34
+ [<600a5186>] ? unmask_irq+0x0/0x37
+ [<600a57e6>] handle_edge_irq+0xbc/0xd6
+ [<600a131a>] generic_handle_irq+0x21/0x29
+ [<60048f6e>] do_IRQ+0x39/0x54
+ [...]
+---[ end trace c6e7444e55386c0f ]--- + +Cc: Christopher Obbard +Reported-by: Martyn Welch +Signed-off-by: Gabriel Krisman Bertazi +Tested-by: Christopher Obbard +Acked-by: Anton Ivanov +Signed-off-by: Richard Weinberger +Signed-off-by: Sasha Levin +--- + arch/um/drivers/ubd_kern.c | 191 ++++++++++++++++++++++--------------- + 1 file changed, 115 insertions(+), 76 deletions(-) + +diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c +index eae8c83364f71..b12c1b0d3e1d0 100644 +--- a/arch/um/drivers/ubd_kern.c ++++ b/arch/um/drivers/ubd_kern.c +@@ -47,18 +47,25 @@ + /* Max request size is determined by sector mask - 32K */ + #define UBD_MAX_REQUEST (8 * sizeof(long)) + ++struct io_desc { ++ char *buffer; ++ unsigned long length; ++ unsigned long sector_mask; ++ unsigned long long cow_offset; ++ unsigned long bitmap_words[2]; ++}; ++ + struct io_thread_req { + struct request *req; + int fds[2]; + unsigned long offsets[2]; + unsigned long long offset; +- unsigned long length; +- char *buffer; + int sectorsize; +- unsigned long sector_mask; +- unsigned long long cow_offset; +- unsigned long bitmap_words[2]; + int error; ++ ++ int desc_cnt; ++ /* io_desc has to be the last element of the struct */ ++ struct io_desc io_desc[]; + }; + + +@@ -525,12 +532,7 @@ static void ubd_handler(void) + blk_queue_max_write_zeroes_sectors(io_req->req->q, 0); + blk_queue_flag_clear(QUEUE_FLAG_DISCARD, io_req->req->q); + } +- if ((io_req->error) || (io_req->buffer == NULL)) +- blk_mq_end_request(io_req->req, io_req->error); +- else { +- if (!blk_update_request(io_req->req, io_req->error, io_req->length)) +- __blk_mq_end_request(io_req->req, io_req->error); +- } ++ blk_mq_end_request(io_req->req, io_req->error); + kfree(io_req); + } + } +@@ -946,6 +948,7 @@ static int ubd_add(int n, char **error_out) + blk_queue_write_cache(ubd_dev->queue, true, false); + + blk_queue_max_segments(ubd_dev->queue, MAX_SG); ++ blk_queue_segment_boundary(ubd_dev->queue, PAGE_SIZE - 1); + err = ubd_disk_register(UBD_MAJOR, ubd_dev->size, n, &ubd_gendisk[n]); + if(err){ + *error_out = "Failed to register device"; +@@ -1289,37 +1292,74 @@ static void cowify_bitmap(__u64 io_offset, int length, unsigned long *cow_mask, + *cow_offset += bitmap_offset; + } + +-static void cowify_req(struct io_thread_req *req, unsigned long *bitmap, ++static void cowify_req(struct io_thread_req *req, struct io_desc *segment, ++ unsigned long offset, unsigned long *bitmap, + __u64 bitmap_offset, __u64 bitmap_len) + { +- __u64 sector = req->offset >> SECTOR_SHIFT; ++ __u64 sector = offset >> SECTOR_SHIFT; + int i; + +- if (req->length > (sizeof(req->sector_mask) * 8) << SECTOR_SHIFT) ++ if (segment->length > (sizeof(segment->sector_mask) * 8) << SECTOR_SHIFT) + panic("Operation too long"); + + if (req_op(req->req) == REQ_OP_READ) { +- for (i = 0; i < req->length >> SECTOR_SHIFT; i++) { ++ for (i = 0; i < segment->length >> SECTOR_SHIFT; i++) { + if(ubd_test_bit(sector + i, (unsigned char *) bitmap)) + ubd_set_bit(i, (unsigned char *) +- &req->sector_mask); ++ &segment->sector_mask); ++ } ++ } else { ++ cowify_bitmap(offset, segment->length, &segment->sector_mask, ++ &segment->cow_offset, bitmap, bitmap_offset, ++ segment->bitmap_words, bitmap_len); ++ } ++} ++ ++static void ubd_map_req(struct ubd *dev, struct io_thread_req *io_req, ++ struct request *req) ++{ ++ struct bio_vec bvec; ++ struct req_iterator iter; ++ int i = 0; ++ unsigned long byte_offset = io_req->offset; ++ int op = req_op(req); ++ ++ if (op == REQ_OP_WRITE_ZEROES || op == 
REQ_OP_DISCARD) { ++ io_req->io_desc[0].buffer = NULL; ++ io_req->io_desc[0].length = blk_rq_bytes(req); ++ } else { ++ rq_for_each_segment(bvec, req, iter) { ++ BUG_ON(i >= io_req->desc_cnt); ++ ++ io_req->io_desc[i].buffer = ++ page_address(bvec.bv_page) + bvec.bv_offset; ++ io_req->io_desc[i].length = bvec.bv_len; ++ i++; ++ } ++ } ++ ++ if (dev->cow.file) { ++ for (i = 0; i < io_req->desc_cnt; i++) { ++ cowify_req(io_req, &io_req->io_desc[i], byte_offset, ++ dev->cow.bitmap, dev->cow.bitmap_offset, ++ dev->cow.bitmap_len); ++ byte_offset += io_req->io_desc[i].length; + } ++ + } +- else cowify_bitmap(req->offset, req->length, &req->sector_mask, +- &req->cow_offset, bitmap, bitmap_offset, +- req->bitmap_words, bitmap_len); + } + +-static int ubd_queue_one_vec(struct blk_mq_hw_ctx *hctx, struct request *req, +- u64 off, struct bio_vec *bvec) ++static struct io_thread_req *ubd_alloc_req(struct ubd *dev, struct request *req, ++ int desc_cnt) + { +- struct ubd *dev = hctx->queue->queuedata; + struct io_thread_req *io_req; +- int ret; ++ int i; + +- io_req = kmalloc(sizeof(struct io_thread_req), GFP_ATOMIC); ++ io_req = kmalloc(sizeof(*io_req) + ++ (desc_cnt * sizeof(struct io_desc)), ++ GFP_ATOMIC); + if (!io_req) +- return -ENOMEM; ++ return NULL; + + io_req->req = req; + if (dev->cow.file) +@@ -1327,26 +1367,41 @@ static int ubd_queue_one_vec(struct blk_mq_hw_ctx *hctx, struct request *req, + else + io_req->fds[0] = dev->fd; + io_req->error = 0; +- +- if (bvec != NULL) { +- io_req->buffer = page_address(bvec->bv_page) + bvec->bv_offset; +- io_req->length = bvec->bv_len; +- } else { +- io_req->buffer = NULL; +- io_req->length = blk_rq_bytes(req); +- } +- + io_req->sectorsize = SECTOR_SIZE; + io_req->fds[1] = dev->fd; +- io_req->cow_offset = -1; +- io_req->offset = off; +- io_req->sector_mask = 0; ++ io_req->offset = (u64) blk_rq_pos(req) << SECTOR_SHIFT; + io_req->offsets[0] = 0; + io_req->offsets[1] = dev->cow.data_offset; + +- if (dev->cow.file) +- cowify_req(io_req, dev->cow.bitmap, +- dev->cow.bitmap_offset, dev->cow.bitmap_len); ++ for (i = 0 ; i < desc_cnt; i++) { ++ io_req->io_desc[i].sector_mask = 0; ++ io_req->io_desc[i].cow_offset = -1; ++ } ++ ++ return io_req; ++} ++ ++static int ubd_submit_request(struct ubd *dev, struct request *req) ++{ ++ int segs = 0; ++ struct io_thread_req *io_req; ++ int ret; ++ int op = req_op(req); ++ ++ if (op == REQ_OP_FLUSH) ++ segs = 0; ++ else if (op == REQ_OP_WRITE_ZEROES || op == REQ_OP_DISCARD) ++ segs = 1; ++ else ++ segs = blk_rq_nr_phys_segments(req); ++ ++ io_req = ubd_alloc_req(dev, req, segs); ++ if (!io_req) ++ return -ENOMEM; ++ ++ io_req->desc_cnt = segs; ++ if (segs) ++ ubd_map_req(dev, io_req, req); + + ret = os_write_file(thread_fd, &io_req, sizeof(io_req)); + if (ret != sizeof(io_req)) { +@@ -1357,22 +1412,6 @@ static int ubd_queue_one_vec(struct blk_mq_hw_ctx *hctx, struct request *req, + return ret; + } + +-static int queue_rw_req(struct blk_mq_hw_ctx *hctx, struct request *req) +-{ +- struct req_iterator iter; +- struct bio_vec bvec; +- int ret; +- u64 off = (u64)blk_rq_pos(req) << SECTOR_SHIFT; +- +- rq_for_each_segment(bvec, req, iter) { +- ret = ubd_queue_one_vec(hctx, req, off, &bvec); +- if (ret < 0) +- return ret; +- off += bvec.bv_len; +- } +- return 0; +-} +- + static blk_status_t ubd_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) + { +@@ -1385,17 +1424,12 @@ static blk_status_t ubd_queue_rq(struct blk_mq_hw_ctx *hctx, + spin_lock_irq(&ubd_dev->lock); + + switch (req_op(req)) { +- /* 
operations with no lentgth/offset arguments */
+ 	case REQ_OP_FLUSH:
+-		ret = ubd_queue_one_vec(hctx, req, 0, NULL);
+-		break;
+ 	case REQ_OP_READ:
+ 	case REQ_OP_WRITE:
+-		ret = queue_rw_req(hctx, req);
+-		break;
+ 	case REQ_OP_DISCARD:
+ 	case REQ_OP_WRITE_ZEROES:
+-		ret = ubd_queue_one_vec(hctx, req, (u64)blk_rq_pos(req) << 9, NULL);
++		ret = ubd_submit_request(ubd_dev, req);
+ 		break;
+ 	default:
+ 		WARN_ON_ONCE(1);
+@@ -1483,22 +1517,22 @@ static int map_error(int error_code)
+  * will result in unpredictable behaviour and/or crashes.
+  */
+ 
+-static int update_bitmap(struct io_thread_req *req)
++static int update_bitmap(struct io_thread_req *req, struct io_desc *segment)
+ {
+ 	int n;
+ 
+-	if(req->cow_offset == -1)
++	if (segment->cow_offset == -1)
+ 		return map_error(0);
+ 
+-	n = os_pwrite_file(req->fds[1], &req->bitmap_words,
+-			   sizeof(req->bitmap_words), req->cow_offset);
+-	if (n != sizeof(req->bitmap_words))
++	n = os_pwrite_file(req->fds[1], &segment->bitmap_words,
++			   sizeof(segment->bitmap_words), segment->cow_offset);
++	if (n != sizeof(segment->bitmap_words))
+ 		return map_error(-n);
+ 
+ 	return map_error(0);
+ }
+ 
+-static void do_io(struct io_thread_req *req)
++static void do_io(struct io_thread_req *req, struct io_desc *desc)
+ {
+ 	char *buf = NULL;
+ 	unsigned long len;
+@@ -1513,21 +1547,20 @@ static void do_io(struct io_thread_req *req)
+ 		return;
+ 	}
+ 
+-	nsectors = req->length / req->sectorsize;
++	nsectors = desc->length / req->sectorsize;
+ 	start = 0;
+ 	do {
+-		bit = ubd_test_bit(start, (unsigned char *) &req->sector_mask);
++		bit = ubd_test_bit(start, (unsigned char *) &desc->sector_mask);
+ 		end = start;
+ 		while((end < nsectors) &&
+-		      (ubd_test_bit(end, (unsigned char *)
+-				    &req->sector_mask) == bit))
++		      (ubd_test_bit(end, (unsigned char *) &desc->sector_mask) == bit))
+ 			end++;
+ 
+ 		off = req->offset + req->offsets[bit] +
+ 			start * req->sectorsize;
+ 		len = (end - start) * req->sectorsize;
+-		if (req->buffer != NULL)
+-			buf = &req->buffer[start * req->sectorsize];
++		if (desc->buffer != NULL)
++			buf = &desc->buffer[start * req->sectorsize];
+ 
+ 		switch (req_op(req->req)) {
+ 		case REQ_OP_READ:
+@@ -1567,7 +1600,8 @@ static void do_io(struct io_thread_req *req)
+ 		start = end;
+ 	} while(start < nsectors);
+ 
+-	req->error = update_bitmap(req);
++	req->offset += len;
++	req->error = update_bitmap(req, desc);
+ }
+ 
+ /* Changed in start_io_thread, which is serialized by being called only
+@@ -1600,8 +1634,13 @@ int io_thread(void *arg)
+ 	}
+ 
+ 	for (count = 0; count < n/sizeof(struct io_thread_req *); count++) {
++		struct io_thread_req *req = (*io_req_buffer)[count];
++		int i;
++
+ 		io_count++;
+-		do_io((*io_req_buffer)[count]);
++		for (i = 0; !req->error && i < req->desc_cnt; i++)
++			do_io(req, &(req->io_desc[i]));
++
+ 	}
+ 
+ 	written = 0;
+-- 
+2.27.0
+
diff --git a/queue-5.10/watchdog-rti-wdt-fix-reference-leak-in-rti_wdt_probe.patch b/queue-5.10/watchdog-rti-wdt-fix-reference-leak-in-rti_wdt_probe.patch
new file mode 100644
index 00000000000..aec25b4d7d1
--- /dev/null
+++ b/queue-5.10/watchdog-rti-wdt-fix-reference-leak-in-rti_wdt_probe.patch
@@ -0,0 +1,42 @@
+From 49a4a3d8b7fc4a26b17c0f556b693bc0cc084f52 Mon Sep 17 00:00:00 2001
+From: Sasha Levin
+Date: Fri, 30 Oct 2020 23:49:09 +0800
+Subject: watchdog: rti-wdt: fix reference leak in rti_wdt_probe
+
+From: Zhang Qilong
+
+[ Upstream commit 8711071e9700b67045fe5518161d63f7a03e3c9e ]
+
+pm_runtime_get_sync() will increment the pm usage counter even if it
+fails.
+Forgetting to call pm_runtime_put_noidle() will result in a
+reference leak in rti_wdt_probe(), so we should fix it.
+
+Signed-off-by: Zhang Qilong
+Reviewed-by: Guenter Roeck
+Link: https://lore.kernel.org/r/20201030154909.100023-1-zhangqilong3@huawei.com
+Signed-off-by: Guenter Roeck
+Signed-off-by: Wim Van Sebroeck
+Signed-off-by: Sasha Levin
+---
+ drivers/watchdog/rti_wdt.c | 4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+diff --git a/drivers/watchdog/rti_wdt.c b/drivers/watchdog/rti_wdt.c
+index 836319cbaca9d..359302f71f7ef 100644
+--- a/drivers/watchdog/rti_wdt.c
++++ b/drivers/watchdog/rti_wdt.c
+@@ -227,8 +227,10 @@ static int rti_wdt_probe(struct platform_device *pdev)
+ 
+ 	pm_runtime_enable(dev);
+ 	ret = pm_runtime_get_sync(dev);
+-	if (ret)
++	if (ret) {
++		pm_runtime_put_noidle(dev);
+ 		return dev_err_probe(dev, ret, "runtime pm failed\n");
++	}
+ 
+ 	platform_set_drvdata(pdev, wdt);
+ 
+-- 
+2.27.0
+
-- 
2.47.3