From: Sasha Levin Date: Sat, 4 Jul 2020 16:29:56 +0000 (-0400) Subject: Fixes for 5.7 X-Git-Tag: v4.4.230~40 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=fe5c4a21e3242bfd5986f7bfe8abc2c75dba5084;p=thirdparty%2Fkernel%2Fstable-queue.git Fixes for 5.7 Signed-off-by: Sasha Levin --- diff --git a/queue-5.7/drm-amdgpu-disable-ras-query-and-iject-during-gpu-re.patch b/queue-5.7/drm-amdgpu-disable-ras-query-and-iject-during-gpu-re.patch new file mode 100644 index 00000000000..c55ef7e3bb6 --- /dev/null +++ b/queue-5.7/drm-amdgpu-disable-ras-query-and-iject-during-gpu-re.patch @@ -0,0 +1,129 @@ +From 2d3b41695e0b3b40306466929f2bdf5ad9edea01 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 25 Mar 2020 16:01:14 +0800 +Subject: drm/amdgpu: disable ras query and iject during gpu reset + +From: John Clements + +[ Upstream commit 61380faa4b4cc577df8a7ff5db5859bac6b351f7 ] + +added flag to ras context to indicate if ras query functionality is ready + +Reviewed-by: Hawking Zhang +Signed-off-by: John Clements +Signed-off-by: Alex Deucher +Signed-off-by: Sasha Levin +--- + drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 +++ + drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 24 +++++++++++++++++++--- + drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 4 ++++ + 3 files changed, 28 insertions(+), 3 deletions(-) + +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +index affde2de2a0db..59288653412db 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +@@ -4091,6 +4091,8 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, + need_full_reset = job_signaled = false; + INIT_LIST_HEAD(&device_list); + ++ amdgpu_ras_set_error_query_ready(adev, false); ++ + dev_info(adev->dev, "GPU %s begin!\n", + (in_ras_intr && !use_baco) ? 
"jobs stop":"reset"); + +@@ -4147,6 +4149,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, + /* block all schedulers and reset given job's ring */ + list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { + if (tmp_adev != adev) { ++ amdgpu_ras_set_error_query_ready(tmp_adev, false); + amdgpu_device_lock_adev(tmp_adev, false); + if (!amdgpu_sriov_vf(tmp_adev)) + amdgpu_amdkfd_pre_reset(tmp_adev); +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +index ab379b44679cc..aa6148d12d5a4 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +@@ -80,6 +80,20 @@ atomic_t amdgpu_ras_in_intr = ATOMIC_INIT(0); + static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev, + uint64_t addr); + ++void amdgpu_ras_set_error_query_ready(struct amdgpu_device *adev, bool ready) ++{ ++ if (adev) ++ amdgpu_ras_get_context(adev)->error_query_ready = ready; ++} ++ ++bool amdgpu_ras_get_error_query_ready(struct amdgpu_device *adev) ++{ ++ if (adev) ++ return amdgpu_ras_get_context(adev)->error_query_ready; ++ ++ return false; ++} ++ + static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf, + size_t size, loff_t *pos) + { +@@ -281,7 +295,7 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user * + struct ras_debug_if data; + int ret = 0; + +- if (amdgpu_ras_intr_triggered()) { ++ if (!amdgpu_ras_get_error_query_ready(adev)) { + DRM_WARN("RAS WARN: error injection currently inaccessible\n"); + return size; + } +@@ -399,7 +413,7 @@ static ssize_t amdgpu_ras_sysfs_read(struct device *dev, + .head = obj->head, + }; + +- if (amdgpu_ras_intr_triggered()) ++ if (!amdgpu_ras_get_error_query_ready(obj->adev)) + return snprintf(buf, PAGE_SIZE, + "Query currently inaccessible\n"); + +@@ -1896,8 +1910,10 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev, + } + + /* in resume phase, no need to create ras fs node */ +- if (adev->in_suspend || adev->in_gpu_reset) ++ if (adev->in_suspend || adev->in_gpu_reset) { ++ amdgpu_ras_set_error_query_ready(adev, true); + return 0; ++ } + + if (ih_info->cb) { + r = amdgpu_ras_interrupt_add_handler(adev, ih_info); +@@ -1909,6 +1925,8 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev, + if (r) + goto sysfs; + ++ amdgpu_ras_set_error_query_ready(adev, true); ++ + return 0; + cleanup: + amdgpu_ras_sysfs_remove(adev, ras_block); +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +index 55c3eceb390d4..e7df5d8429f82 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +@@ -334,6 +334,8 @@ struct amdgpu_ras { + uint32_t flags; + bool reboot; + struct amdgpu_ras_eeprom_control eeprom_control; ++ ++ bool error_query_ready; + }; + + struct ras_fs_data { +@@ -629,4 +631,6 @@ static inline void amdgpu_ras_intr_cleared(void) + + void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev); + ++void amdgpu_ras_set_error_query_ready(struct amdgpu_device *adev, bool ready); ++ + #endif +-- +2.25.1 + diff --git a/queue-5.7/drm-amdgpu-fix-kernel-page-fault-issue-by-ras-recove.patch b/queue-5.7/drm-amdgpu-fix-kernel-page-fault-issue-by-ras-recove.patch new file mode 100644 index 00000000000..11d1c58a614 --- /dev/null +++ b/queue-5.7/drm-amdgpu-fix-kernel-page-fault-issue-by-ras-recove.patch @@ -0,0 +1,50 @@ +From 5a211368384eb937b650d165da935777996f9977 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 16 Apr 2020 23:41:07 +0800 +Subject: 
drm/amdgpu: fix kernel page fault issue by ras recovery on sGPU + +From: Guchun Chen + +[ Upstream commit 12c17b9d62663c14a5343d6742682b3e67280754 ] + +When running ras uncorrectable error injection and triggering GPU +reset on sGPU, below issue is observed. It's caused by the list +uninitialized when accessing. + +[ 80.047227] BUG: unable to handle page fault for address: ffffffffc0f4f750 +[ 80.047300] #PF: supervisor write access in kernel mode +[ 80.047351] #PF: error_code(0x0003) - permissions violation +[ 80.047404] PGD 12c20e067 P4D 12c20e067 PUD 12c210067 PMD 41c4ee067 PTE 404316061 +[ 80.047477] Oops: 0003 [#1] SMP PTI +[ 80.047516] CPU: 7 PID: 377 Comm: kworker/7:2 Tainted: G OE 5.4.0-rc7-guchchen #1 +[ 80.047594] Hardware name: System manufacturer System Product Name/TUF Z370-PLUS GAMING II, BIOS 0411 09/21/2018 +[ 80.047888] Workqueue: events amdgpu_ras_do_recovery [amdgpu] + +Signed-off-by: Guchun Chen +Reviewed-by: John Clements +Signed-off-by: Alex Deucher +Signed-off-by: Sasha Levin +--- + drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +index b0aa4e1ed4df7..cd18596b47d33 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +@@ -1444,9 +1444,10 @@ static void amdgpu_ras_do_recovery(struct work_struct *work) + struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, false); + + /* Build list of devices to query RAS related errors */ +- if (hive && adev->gmc.xgmi.num_physical_nodes > 1) { ++ if (hive && adev->gmc.xgmi.num_physical_nodes > 1) + device_list_handle = &hive->device_list; +- } else { ++ else { ++ INIT_LIST_HEAD(&device_list); + list_add_tail(&adev->gmc.xgmi.head, &device_list); + device_list_handle = &device_list; + } +-- +2.25.1 + diff --git a/queue-5.7/drm-amdgpu-fix-non-pointer-dereference-for-non-ras-s.patch b/queue-5.7/drm-amdgpu-fix-non-pointer-dereference-for-non-ras-s.patch new file mode 100644 index 00000000000..5f72d3474c8 --- /dev/null +++ b/queue-5.7/drm-amdgpu-fix-non-pointer-dereference-for-non-ras-s.patch @@ -0,0 +1,69 @@ +From 45e2259f997932f1ffe6344d8edc124c3562ec48 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 27 Mar 2020 15:39:06 +0800 +Subject: drm/amdgpu: fix non-pointer dereference for non-RAS supported + +From: Evan Quan + +[ Upstream commit a9d82d2f91297679cfafd7e61c4bccdca6cd550d ] + +Backtrace on gpu recover test on Navi10. 
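+
+On parts without RAS support, amdgpu_ras_get_context() returns NULL, so
+the ready-flag helpers introduced earlier in this series dereference a
+NULL context. A minimal sketch of the guarded form the fix below
+switches to (equivalent to the two-call check in the diff, shown here
+only for illustration):
+
+	void amdgpu_ras_set_error_query_ready(struct amdgpu_device *adev,
+					      bool ready)
+	{
+		struct amdgpu_ras *con = adev ?
+			amdgpu_ras_get_context(adev) : NULL;
+
+		/* parts without RAS support have no context to flag */
+		if (con)
+			con->error_query_ready = ready;
+	}
+
+The trace from the unguarded path: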
+ +[ 1324.516681] RIP: 0010:amdgpu_ras_set_error_query_ready+0x15/0x20 [amdgpu] +[ 1324.523778] Code: 4c 89 f7 e8 cd a2 a0 d8 e9 99 fe ff ff 45 31 ff e9 91 fe ff ff 0f 1f 44 00 00 55 48 85 ff 48 89 e5 74 0e 48 8b 87 d8 2b 01 00 <40> 88 b0 38 01 00 00 5d c3 66 90 0f 1f 44 00 00 55 31 c0 48 85 ff +[ 1324.543452] RSP: 0018:ffffaa1040e4bd28 EFLAGS: 00010286 +[ 1324.549025] RAX: 0000000000000000 RBX: ffff911198b20000 RCX: 0000000000000000 +[ 1324.556217] RDX: 00000000000c0a01 RSI: 0000000000000000 RDI: ffff911198b20000 +[ 1324.563514] RBP: ffffaa1040e4bd28 R08: 0000000000001000 R09: ffff91119d0028c0 +[ 1324.570804] R10: ffffffff9a606b40 R11: 0000000000000000 R12: 0000000000000000 +[ 1324.578413] R13: ffffaa1040e4bd70 R14: ffff911198b20000 R15: 0000000000000000 +[ 1324.586464] FS: 00007f4441cbf540(0000) GS:ffff91119ed80000(0000) knlGS:0000000000000000 +[ 1324.595434] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 +[ 1324.601345] CR2: 0000000000000138 CR3: 00000003fcdf8004 CR4: 00000000003606e0 +[ 1324.608694] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 +[ 1324.616303] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 +[ 1324.623678] Call Trace: +[ 1324.626270] amdgpu_device_gpu_recover+0x6e7/0xc50 [amdgpu] +[ 1324.632018] ? seq_printf+0x4e/0x70 +[ 1324.636652] amdgpu_debugfs_gpu_recover+0x50/0x80 [amdgpu] +[ 1324.643371] seq_read+0xda/0x420 +[ 1324.647601] full_proxy_read+0x5c/0x90 +[ 1324.652426] __vfs_read+0x1b/0x40 +[ 1324.656734] vfs_read+0x8e/0x130 +[ 1324.660981] ksys_read+0xa7/0xe0 +[ 1324.665201] __x64_sys_read+0x1a/0x20 +[ 1324.669907] do_syscall_64+0x57/0x1c0 +[ 1324.674517] entry_SYSCALL_64_after_hwframe+0x44/0xa9 +[ 1324.680654] RIP: 0033:0x7f44417cf081 + +Signed-off-by: Evan Quan +Reviewed-by: John Clements +Signed-off-by: Alex Deucher +Signed-off-by: Sasha Levin +--- + drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +index aa6148d12d5a4..b0aa4e1ed4df7 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +@@ -82,13 +82,13 @@ static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev, + + void amdgpu_ras_set_error_query_ready(struct amdgpu_device *adev, bool ready) + { +- if (adev) ++ if (adev && amdgpu_ras_get_context(adev)) + amdgpu_ras_get_context(adev)->error_query_ready = ready; + } + + bool amdgpu_ras_get_error_query_ready(struct amdgpu_device *adev) + { +- if (adev) ++ if (adev && amdgpu_ras_get_context(adev)) + return amdgpu_ras_get_context(adev)->error_query_ready; + + return false; +-- +2.25.1 + diff --git a/queue-5.7/drm-i915-gt-mark-timeline-cacheline-as-destroyed-aft.patch b/queue-5.7/drm-i915-gt-mark-timeline-cacheline-as-destroyed-aft.patch new file mode 100644 index 00000000000..375a649b685 --- /dev/null +++ b/queue-5.7/drm-i915-gt-mark-timeline-cacheline-as-destroyed-aft.patch @@ -0,0 +1,86 @@ +From a43143a8d4bebae62e9222ed3e35be96fefcfa12 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 23 Mar 2020 09:28:34 +0000 +Subject: drm/i915/gt: Mark timeline->cacheline as destroyed after rcu grace + period + +From: Chris Wilson + +[ Upstream commit 8e87e0139aff59c5961347ab1ef06814f092c439 ] + +Since we take advantage of RCU for some i915_active objects, like the +intel_timeline_cacheline, we need to delay the i915_active_fini until +after the RCU grace period and we perform the kfree -- that is until +after all RCU protected 
readers. + +<3> [108.204873] ODEBUG: assert_init not available (active state 0) object type: i915_active hint: __cacheline_active+0x0/0x80 [i915] +<4> [108.207377] WARNING: CPU: 3 PID: 2342 at lib/debugobjects.c:488 debug_print_object+0x67/0x90 +<4> [108.207400] Modules linked in: vgem snd_hda_codec_hdmi x86_pkg_temp_thermal coretemp crct10dif_pclmul crc32_pclmul snd_hda_intel ghash_clmulni_intel snd_intel_dspcfg snd_hda_codec ax88179_178a snd_hwdep usbnet btusb snd_hda_core btrtl mii btbcm btintel snd_pcm bluetooth ecdh_generic ecc i915 i2c_hid pinctrl_sunrisepoint pinctrl_intel intel_lpss_pci prime_numbers +<4> [108.207587] CPU: 3 PID: 2342 Comm: gem_exec_parall Tainted: G U 5.6.0-rc6-CI-Patchwork_17047+ #1 +<4> [108.207609] Hardware name: Google Soraka/Soraka, BIOS MrChromebox-4.10 08/25/2019 +<4> [108.207639] RIP: 0010:debug_print_object+0x67/0x90 +<4> [108.207668] Code: 83 c2 01 8b 4b 14 4c 8b 45 00 89 15 87 d2 8a 02 8b 53 10 4c 89 e6 48 c7 c7 38 2b 32 82 48 8b 14 d5 80 2f 07 82 e8 49 d5 b7 ff <0f> 0b 5b 83 05 c3 f6 22 01 01 5d 41 5c c3 83 05 b8 f6 22 01 01 c3 +<4> [108.207692] RSP: 0018:ffffc90000e7f890 EFLAGS: 00010282 +<4> [108.207723] RAX: 0000000000000000 RBX: ffffc90000e7f8b0 RCX: 0000000000000001 +<4> [108.207747] RDX: 0000000080000001 RSI: ffff88817ada8cb8 RDI: 00000000ffffffff +<4> [108.207770] RBP: ffffffffa0341cc0 R08: ffff88816b5a8948 R09: 0000000000000000 +<4> [108.207792] R10: 0000000000000000 R11: 0000000000000000 R12: ffffffff82322d54 +<4> [108.207814] R13: ffffffffa0341cc0 R14: ffffffff83df9568 R15: ffff88816064f400 +<4> [108.207839] FS: 00007f437d753700(0000) GS:ffff88817ad80000(0000) knlGS:0000000000000000 +<4> [108.207863] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 +<4> [108.207887] CR2: 00007f2ad1fb5000 CR3: 00000001725d8004 CR4: 00000000003606e0 +<4> [108.207907] Call Trace: +<4> [108.207959] debug_object_assert_init+0x15c/0x180 +<4> [108.208475] ? i915_active_acquire_if_busy+0x10/0x50 [i915] +<4> [108.208513] ? rcu_read_lock_held+0x4d/0x60 +<4> [108.208970] i915_active_acquire_if_busy+0x10/0x50 [i915] +<4> [108.209380] intel_timeline_read_hwsp+0x81/0x540 [i915] +<4> [108.210262] __emit_semaphore_wait+0x45/0x1b0 [i915] +<4> [108.210726] ? i915_request_await_dma_fence+0x143/0x560 [i915] +<4> [108.211156] i915_request_await_dma_fence+0x28a/0x560 [i915] +<4> [108.211633] i915_request_await_object+0x24a/0x3f0 [i915] +<4> [108.212102] eb_submit.isra.47+0x58f/0x920 [i915] +<4> [108.212622] i915_gem_do_execbuffer+0x1706/0x2c70 [i915] +<4> [108.213071] ? 
i915_gem_execbuffer2_ioctl+0xc0/0x470 [i915] + +Signed-off-by: Chris Wilson +Reviewed-by: Matthew Auld +Link: https://patchwork.freedesktop.org/patch/msgid/20200323092841.22240-1-chris@chris-wilson.co.uk +Signed-off-by: Sasha Levin +--- + drivers/gpu/drm/i915/gt/intel_timeline.c | 12 ++++++++++-- + 1 file changed, 10 insertions(+), 2 deletions(-) + +diff --git a/drivers/gpu/drm/i915/gt/intel_timeline.c b/drivers/gpu/drm/i915/gt/intel_timeline.c +index 08b56d7ab4f45..92da746f01c1e 100644 +--- a/drivers/gpu/drm/i915/gt/intel_timeline.c ++++ b/drivers/gpu/drm/i915/gt/intel_timeline.c +@@ -119,6 +119,15 @@ static void __idle_hwsp_free(struct intel_timeline_hwsp *hwsp, int cacheline) + spin_unlock_irqrestore(>->hwsp_lock, flags); + } + ++static void __rcu_cacheline_free(struct rcu_head *rcu) ++{ ++ struct intel_timeline_cacheline *cl = ++ container_of(rcu, typeof(*cl), rcu); ++ ++ i915_active_fini(&cl->active); ++ kfree(cl); ++} ++ + static void __idle_cacheline_free(struct intel_timeline_cacheline *cl) + { + GEM_BUG_ON(!i915_active_is_idle(&cl->active)); +@@ -127,8 +136,7 @@ static void __idle_cacheline_free(struct intel_timeline_cacheline *cl) + i915_vma_put(cl->hwsp->vma); + __idle_hwsp_free(cl->hwsp, ptr_unmask_bits(cl->vaddr, CACHELINE_BITS)); + +- i915_active_fini(&cl->active); +- kfree_rcu(cl, rcu); ++ call_rcu(&cl->rcu, __rcu_cacheline_free); + } + + __i915_active_call +-- +2.25.1 + diff --git a/queue-5.7/io_uring-fix-current-mm-null-dereference-on-exit.patch b/queue-5.7/io_uring-fix-current-mm-null-dereference-on-exit.patch new file mode 100644 index 00000000000..aff19186ed6 --- /dev/null +++ b/queue-5.7/io_uring-fix-current-mm-null-dereference-on-exit.patch @@ -0,0 +1,80 @@ +From d40214fc74233df6621c6c3b1248aa1bec2dbcf2 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 25 Jun 2020 12:37:11 +0300 +Subject: io_uring: fix current->mm NULL dereference on exit + +From: Pavel Begunkov + +[ Upstream commit d60b5fbc1ce8210759b568da49d149b868e7c6d3 ] + +Don't reissue requests from io_iopoll_reap_events(), the task may not +have mm, which ends up with NULL. It's better to kill everything off on +exit anyway. + +[ 677.734670] RIP: 0010:io_iopoll_complete+0x27e/0x630 +... +[ 677.734679] Call Trace: +[ 677.734695] ? __send_signal+0x1f2/0x420 +[ 677.734698] ? _raw_spin_unlock_irqrestore+0x24/0x40 +[ 677.734699] ? send_signal+0xf5/0x140 +[ 677.734700] io_iopoll_getevents+0x12f/0x1a0 +[ 677.734702] io_iopoll_reap_events.part.0+0x5e/0xa0 +[ 677.734703] io_ring_ctx_wait_and_kill+0x132/0x1c0 +[ 677.734704] io_uring_release+0x20/0x30 +[ 677.734706] __fput+0xcd/0x230 +[ 677.734707] ____fput+0xe/0x10 +[ 677.734709] task_work_run+0x67/0xa0 +[ 677.734710] do_exit+0x35d/0xb70 +[ 677.734712] do_group_exit+0x43/0xa0 +[ 677.734713] get_signal+0x140/0x900 +[ 677.734715] do_signal+0x37/0x780 +[ 677.734717] ? enqueue_hrtimer+0x41/0xb0 +[ 677.734718] ? recalibrate_cpu_khz+0x10/0x10 +[ 677.734720] ? ktime_get+0x3e/0xa0 +[ 677.734721] ? lapic_next_deadline+0x26/0x30 +[ 677.734723] ? tick_program_event+0x4d/0x90 +[ 677.734724] ? __hrtimer_get_next_event+0x4d/0x80 +[ 677.734726] __prepare_exit_to_usermode+0x126/0x1c0 +[ 677.734741] prepare_exit_to_usermode+0x9/0x40 +[ 677.734742] idtentry_exit_cond_rcu+0x4c/0x60 +[ 677.734743] sysvec_reschedule_ipi+0x92/0x160 +[ 677.734744] ? 
asm_sysvec_reschedule_ipi+0xa/0x20 +[ 677.734745] asm_sysvec_reschedule_ipi+0x12/0x20 + +Signed-off-by: Pavel Begunkov +Signed-off-by: Jens Axboe +Signed-off-by: Sasha Levin +--- + fs/io_uring.c | 9 +++++++++ + 1 file changed, 9 insertions(+) + +diff --git a/fs/io_uring.c b/fs/io_uring.c +index 63a456921903e..71d281f68ed83 100644 +--- a/fs/io_uring.c ++++ b/fs/io_uring.c +@@ -858,6 +858,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, + struct io_uring_files_update *ip, + unsigned nr_args); + static int io_grab_files(struct io_kiocb *req); ++static void io_complete_rw_common(struct kiocb *kiocb, long res); + static void io_cleanup_req(struct io_kiocb *req); + static int io_file_get(struct io_submit_state *state, struct io_kiocb *req, + int fd, struct file **out_file, bool fixed); +@@ -1697,6 +1698,14 @@ static void io_iopoll_queue(struct list_head *again) + do { + req = list_first_entry(again, struct io_kiocb, list); + list_del(&req->list); ++ ++ /* shouldn't happen unless io_uring is dying, cancel reqs */ ++ if (unlikely(!current->mm)) { ++ io_complete_rw_common(&req->rw.kiocb, -EAGAIN); ++ io_put_req(req); ++ continue; ++ } ++ + refcount_inc(&req->refs); + io_queue_async_work(req); + } while (!list_empty(again)); +-- +2.25.1 + diff --git a/queue-5.7/io_uring-fix-io_sq_thread-no-schedule-when-busy.patch b/queue-5.7/io_uring-fix-io_sq_thread-no-schedule-when-busy.patch new file mode 100644 index 00000000000..6d8c8efdbf9 --- /dev/null +++ b/queue-5.7/io_uring-fix-io_sq_thread-no-schedule-when-busy.patch @@ -0,0 +1,54 @@ +From 4facbc6278d7946f8f884a73a288270194e98c6c Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 23 Jun 2020 19:34:06 +0800 +Subject: io_uring: fix io_sq_thread no schedule when busy + +From: Xuan Zhuo + +[ Upstream commit b772f07add1c0b22e02c0f1e96f647560679d3a9 ] + +When the user consumes and generates sqe at a fast rate, +io_sqring_entries can always get sqe, and ret will not be equal to -EBUSY, +so that io_sq_thread will never call cond_resched or schedule, and then +we will get the following system error prompt: + +rcu: INFO: rcu_sched self-detected stall on CPU +or +watchdog: BUG: soft lockup-CPU#23 stuck for 112s! [io_uring-sq:1863] + +This patch checks whether need to call cond_resched() by checking +the need_resched() function every cycle. + +Suggested-by: Jens Axboe +Signed-off-by: Xuan Zhuo +Signed-off-by: Jens Axboe +Signed-off-by: Sasha Levin +--- + fs/io_uring.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/fs/io_uring.c b/fs/io_uring.c +index bb74e45941af2..63a456921903e 100644 +--- a/fs/io_uring.c ++++ b/fs/io_uring.c +@@ -6084,7 +6084,7 @@ static int io_sq_thread(void *data) + * If submit got -EBUSY, flag us as needing the application + * to enter the kernel to reap and flush events. + */ +- if (!to_submit || ret == -EBUSY) { ++ if (!to_submit || ret == -EBUSY || need_resched()) { + /* + * Drop cur_mm before scheduling, we can't hold it for + * long periods (or over schedule()). Do this before +@@ -6100,7 +6100,7 @@ static int io_sq_thread(void *data) + * more IO, we should wait for the application to + * reap events and wake us up. 
+ */ +- if (!list_empty(&ctx->poll_list) || ++ if (!list_empty(&ctx->poll_list) || need_resched() || + (!time_after(jiffies, timeout) && ret != -EBUSY && + !percpu_ref_is_dying(&ctx->refs))) { + if (current->task_works) +-- +2.25.1 + diff --git a/queue-5.7/io_uring-fix-sq-io-poll-with-unsupported-opcodes.patch b/queue-5.7/io_uring-fix-sq-io-poll-with-unsupported-opcodes.patch new file mode 100644 index 00000000000..6d550e266ec --- /dev/null +++ b/queue-5.7/io_uring-fix-sq-io-poll-with-unsupported-opcodes.patch @@ -0,0 +1,128 @@ +From 63b7431c62d87074ff975d1d32dcd2915d679727 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 3 Jun 2020 18:03:22 +0300 +Subject: io_uring: fix {SQ,IO}POLL with unsupported opcodes + +From: Pavel Begunkov + +[ Upstream commit 3232dd02af65f2d01be641120d2a710176b0c7a7 ] + +IORING_SETUP_IOPOLL is defined only for read/write, other opcodes should +be disallowed, otherwise it'll get an error as below. Also refuse +open/close with SQPOLL, as the polling thread wouldn't know which file +table to use. + +RIP: 0010:io_iopoll_getevents+0x111/0x5a0 +Call Trace: + ? _raw_spin_unlock_irqrestore+0x24/0x40 + ? do_send_sig_info+0x64/0x90 + io_iopoll_reap_events.part.0+0x5e/0xa0 + io_ring_ctx_wait_and_kill+0x132/0x1c0 + io_uring_release+0x20/0x30 + __fput+0xcd/0x230 + ____fput+0xe/0x10 + task_work_run+0x67/0xa0 + do_exit+0x353/0xb10 + ? handle_mm_fault+0xd4/0x200 + ? syscall_trace_enter+0x18c/0x2c0 + do_group_exit+0x43/0xa0 + __x64_sys_exit_group+0x18/0x20 + do_syscall_64+0x60/0x1e0 + entry_SYSCALL_64_after_hwframe+0x44/0xa9 + +Signed-off-by: Pavel Begunkov +[axboe: allow provide/remove buffers and files update] +Signed-off-by: Jens Axboe +Signed-off-by: Sasha Levin +--- + fs/io_uring.c | 18 ++++++++++++++++++ + 1 file changed, 18 insertions(+) + +diff --git a/fs/io_uring.c b/fs/io_uring.c +index 4ab1728de247c..bb74e45941af2 100644 +--- a/fs/io_uring.c ++++ b/fs/io_uring.c +@@ -2748,6 +2748,8 @@ static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) + + if (req->flags & REQ_F_NEED_CLEANUP) + return 0; ++ if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) ++ return -EINVAL; + + sp->file_in = NULL; + sp->off_in = READ_ONCE(sqe->splice_off_in); +@@ -2910,6 +2912,8 @@ static int io_fallocate_prep(struct io_kiocb *req, + { + if (sqe->ioprio || sqe->buf_index || sqe->rw_flags) + return -EINVAL; ++ if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) ++ return -EINVAL; + + req->sync.off = READ_ONCE(sqe->off); + req->sync.len = READ_ONCE(sqe->addr); +@@ -2935,6 +2939,8 @@ static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) + const char __user *fname; + int ret; + ++ if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) ++ return -EINVAL; + if (sqe->ioprio || sqe->buf_index) + return -EINVAL; + if (req->flags & REQ_F_FIXED_FILE) +@@ -2968,6 +2974,8 @@ static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) + size_t len; + int ret; + ++ if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) ++ return -EINVAL; + if (sqe->ioprio || sqe->buf_index) + return -EINVAL; + if (req->flags & REQ_F_FIXED_FILE) +@@ -3207,6 +3215,8 @@ static int io_epoll_ctl_prep(struct io_kiocb *req, + #if defined(CONFIG_EPOLL) + if (sqe->ioprio || sqe->buf_index) + return -EINVAL; ++ if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) ++ return -EINVAL; + + req->epoll.epfd = READ_ONCE(sqe->fd); + req->epoll.op = READ_ONCE(sqe->len); +@@ -3251,6 +3261,8 @@ static int io_madvise_prep(struct 
io_kiocb *req, const struct io_uring_sqe *sqe) + #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) + if (sqe->ioprio || sqe->buf_index || sqe->off) + return -EINVAL; ++ if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) ++ return -EINVAL; + + req->madvise.addr = READ_ONCE(sqe->addr); + req->madvise.len = READ_ONCE(sqe->len); +@@ -3285,6 +3297,8 @@ static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) + { + if (sqe->ioprio || sqe->buf_index || sqe->addr) + return -EINVAL; ++ if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) ++ return -EINVAL; + + req->fadvise.offset = READ_ONCE(sqe->off); + req->fadvise.len = READ_ONCE(sqe->len); +@@ -3322,6 +3336,8 @@ static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) + unsigned lookup_flags; + int ret; + ++ if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) ++ return -EINVAL; + if (sqe->ioprio || sqe->buf_index) + return -EINVAL; + if (req->flags & REQ_F_FIXED_FILE) +@@ -3402,6 +3418,8 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) + */ + req->work.flags |= IO_WQ_WORK_NO_CANCEL; + ++ if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) ++ return -EINVAL; + if (sqe->ioprio || sqe->off || sqe->addr || sqe->len || + sqe->rw_flags || sqe->buf_index) + return -EINVAL; +-- +2.25.1 + diff --git a/queue-5.7/kgdb-avoid-suspicious-rcu-usage-warning.patch b/queue-5.7/kgdb-avoid-suspicious-rcu-usage-warning.patch new file mode 100644 index 00000000000..1c8932f9d18 --- /dev/null +++ b/queue-5.7/kgdb-avoid-suspicious-rcu-usage-warning.patch @@ -0,0 +1,109 @@ +From 85cb1d24dc2d267f0a1d12045fcc9c39b4931703 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 2 Jun 2020 15:47:39 -0700 +Subject: kgdb: Avoid suspicious RCU usage warning + +From: Douglas Anderson + +[ Upstream commit 440ab9e10e2e6e5fd677473ee6f9e3af0f6904d6 ] + +At times when I'm using kgdb I see a splat on my console about +suspicious RCU usage. I managed to come up with a case that could +reproduce this that looked like this: + + WARNING: suspicious RCU usage + 5.7.0-rc4+ #609 Not tainted + ----------------------------- + kernel/pid.c:395 find_task_by_pid_ns() needs rcu_read_lock() protection! 
+ + other info that might help us debug this: + + rcu_scheduler_active = 2, debug_locks = 1 + 3 locks held by swapper/0/1: + #0: ffffff81b6b8e988 (&dev->mutex){....}-{3:3}, at: __device_attach+0x40/0x13c + #1: ffffffd01109e9e8 (dbg_master_lock){....}-{2:2}, at: kgdb_cpu_enter+0x20c/0x7ac + #2: ffffffd01109ea90 (dbg_slave_lock){....}-{2:2}, at: kgdb_cpu_enter+0x3ec/0x7ac + + stack backtrace: + CPU: 7 PID: 1 Comm: swapper/0 Not tainted 5.7.0-rc4+ #609 + Hardware name: Google Cheza (rev3+) (DT) + Call trace: + dump_backtrace+0x0/0x1b8 + show_stack+0x1c/0x24 + dump_stack+0xd4/0x134 + lockdep_rcu_suspicious+0xf0/0x100 + find_task_by_pid_ns+0x5c/0x80 + getthread+0x8c/0xb0 + gdb_serial_stub+0x9d4/0xd04 + kgdb_cpu_enter+0x284/0x7ac + kgdb_handle_exception+0x174/0x20c + kgdb_brk_fn+0x24/0x30 + call_break_hook+0x6c/0x7c + brk_handler+0x20/0x5c + do_debug_exception+0x1c8/0x22c + el1_sync_handler+0x3c/0xe4 + el1_sync+0x7c/0x100 + rpmh_rsc_probe+0x38/0x420 + platform_drv_probe+0x94/0xb4 + really_probe+0x134/0x300 + driver_probe_device+0x68/0x100 + __device_attach_driver+0x90/0xa8 + bus_for_each_drv+0x84/0xcc + __device_attach+0xb4/0x13c + device_initial_probe+0x18/0x20 + bus_probe_device+0x38/0x98 + device_add+0x38c/0x420 + +If I understand properly we should just be able to blanket kgdb under +one big RCU read lock and the problem should go away. We'll add it to +the beast-of-a-function known as kgdb_cpu_enter(). + +With this I no longer get any splats and things seem to work fine. + +Signed-off-by: Douglas Anderson +Link: https://lore.kernel.org/r/20200602154729.v2.1.I70e0d4fd46d5ed2aaf0c98a355e8e1b7a5bb7e4e@changeid +Signed-off-by: Daniel Thompson +Signed-off-by: Sasha Levin +--- + kernel/debug/debug_core.c | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c +index d47c7d6656cd3..9be6accf8fe3d 100644 +--- a/kernel/debug/debug_core.c ++++ b/kernel/debug/debug_core.c +@@ -577,6 +577,7 @@ static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs, + arch_kgdb_ops.disable_hw_break(regs); + + acquirelock: ++ rcu_read_lock(); + /* + * Interrupts will be restored by the 'trap return' code, except when + * single stepping. 
+@@ -636,6 +637,7 @@ return_normal: + atomic_dec(&slaves_in_kgdb); + dbg_touch_watchdogs(); + local_irq_restore(flags); ++ rcu_read_unlock(); + return 0; + } + cpu_relax(); +@@ -654,6 +656,7 @@ return_normal: + raw_spin_unlock(&dbg_master_lock); + dbg_touch_watchdogs(); + local_irq_restore(flags); ++ rcu_read_unlock(); + + goto acquirelock; + } +@@ -777,6 +780,7 @@ kgdb_restore: + raw_spin_unlock(&dbg_master_lock); + dbg_touch_watchdogs(); + local_irq_restore(flags); ++ rcu_read_unlock(); + + return kgdb_info[cpu].ret_state; + } +-- +2.25.1 + diff --git a/queue-5.7/mm-dump_page-do-not-crash-with-invalid-mapping-point.patch b/queue-5.7/mm-dump_page-do-not-crash-with-invalid-mapping-point.patch new file mode 100644 index 00000000000..5a269522bfb --- /dev/null +++ b/queue-5.7/mm-dump_page-do-not-crash-with-invalid-mapping-point.patch @@ -0,0 +1,168 @@ +From a5f36522bacc5ad37ab1f9e7f5261a922a922ffb Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 1 Jun 2020 21:46:03 -0700 +Subject: mm, dump_page(): do not crash with invalid mapping pointer + +From: Vlastimil Babka + +[ Upstream commit 002ae7057069538aa3afd500f6f60a429cb948b2 ] + +We have seen a following problem on a RPi4 with 1G RAM: + + BUG: Bad page state in process systemd-hwdb pfn:35601 + page:ffff7e0000d58040 refcount:15 mapcount:131221 mapping:efd8fe765bc80080 index:0x1 compound_mapcount: -32767 + Unable to handle kernel paging request at virtual address efd8fe765bc80080 + Mem abort info: + ESR = 0x96000004 + Exception class = DABT (current EL), IL = 32 bits + SET = 0, FnV = 0 + EA = 0, S1PTW = 0 + Data abort info: + ISV = 0, ISS = 0x00000004 + CM = 0, WnR = 0 + [efd8fe765bc80080] address between user and kernel address ranges + Internal error: Oops: 96000004 [#1] SMP + Modules linked in: btrfs libcrc32c xor xor_neon zlib_deflate raid6_pq mmc_block xhci_pci xhci_hcd usbcore sdhci_iproc sdhci_pltfm sdhci mmc_core clk_raspberrypi gpio_raspberrypi_exp pcie_brcmstb bcm2835_dma gpio_regulator phy_generic fixed sg scsi_mod efivarfs + Supported: No, Unreleased kernel + CPU: 3 PID: 408 Comm: systemd-hwdb Not tainted 5.3.18-8-default #1 SLE15-SP2 (unreleased) + Hardware name: raspberrypi rpi/rpi, BIOS 2020.01 02/21/2020 + pstate: 40000085 (nZcv daIf -PAN -UAO) + pc : __dump_page+0x268/0x368 + lr : __dump_page+0xc4/0x368 + sp : ffff000012563860 + x29: ffff000012563860 x28: ffff80003ddc4300 + x27: 0000000000000010 x26: 000000000000003f + x25: ffff7e0000d58040 x24: 000000000000000f + x23: efd8fe765bc80080 x22: 0000000000020095 + x21: efd8fe765bc80080 x20: ffff000010ede8b0 + x19: ffff7e0000d58040 x18: ffffffffffffffff + x17: 0000000000000001 x16: 0000000000000007 + x15: ffff000011689708 x14: 3030386362353637 + x13: 6566386466653a67 x12: 6e697070616d2031 + x11: 32323133313a746e x10: 756f6370616d2035 + x9 : ffff00001168a840 x8 : ffff00001077a670 + x7 : 000000000000013d x6 : ffff0000118a43b5 + x5 : 0000000000000001 x4 : ffff80003dd9e2c8 + x3 : ffff80003dd9e2c8 x2 : 911c8d7c2f483500 + x1 : dead000000000100 x0 : efd8fe765bc80080 + Call trace: + __dump_page+0x268/0x368 + bad_page+0xd4/0x168 + check_new_page_bad+0x80/0xb8 + rmqueue_bulk.constprop.26+0x4d8/0x788 + get_page_from_freelist+0x4d4/0x1228 + __alloc_pages_nodemask+0x134/0xe48 + alloc_pages_vma+0x198/0x1c0 + do_anonymous_page+0x1a4/0x4d8 + __handle_mm_fault+0x4e8/0x560 + handle_mm_fault+0x104/0x1e0 + do_page_fault+0x1e8/0x4c0 + do_translation_fault+0xb0/0xc0 + do_mem_abort+0x50/0xb0 + el0_da+0x24/0x28 + Code: f9401025 8b8018a0 9a851005 17ffffca (f94002a0) + +Besides the underlying issue with 
page->mapping containing a bogus value +for some reason, we can see that __dump_page() crashed by trying to read +the pointer at mapping->host, turning a recoverable warning into full +Oops. + +It can be expected that when page is reported as bad state for some +reason, the pointers there should not be trusted blindly. + +So this patch treats all data in __dump_page() that depends on +page->mapping as lava, using probe_kernel_read_strict(). Ideally this +would include the dentry->d_parent recursively, but that would mean +changing printk handler for %pd. Chances of reaching the dentry +printing part with an initially bogus mapping pointer should be rather +low, though. + +Also prefix printing mapping->a_ops with a description of what is being +printed. In case the value is bogus, %ps will print raw value instead +of the symbol name and then it's not obvious at all that it's printing +a_ops. + +Reported-by: Petr Tesarik +Signed-off-by: Vlastimil Babka +Signed-off-by: Andrew Morton +Acked-by: Kirill A. Shutemov +Cc: Matthew Wilcox +Cc: John Hubbard +Link: http://lkml.kernel.org/r/20200331165454.12263-1-vbabka@suse.cz +Signed-off-by: Linus Torvalds +Signed-off-by: Sasha Levin +--- + mm/debug.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++------ + 1 file changed, 50 insertions(+), 6 deletions(-) + +diff --git a/mm/debug.c b/mm/debug.c +index 2189357f09871..f2ede2df585a9 100644 +--- a/mm/debug.c ++++ b/mm/debug.c +@@ -110,13 +110,57 @@ void __dump_page(struct page *page, const char *reason) + else if (PageAnon(page)) + type = "anon "; + else if (mapping) { +- if (mapping->host && mapping->host->i_dentry.first) { +- struct dentry *dentry; +- dentry = container_of(mapping->host->i_dentry.first, struct dentry, d_u.d_alias); +- pr_warn("%ps name:\"%pd\"\n", mapping->a_ops, dentry); +- } else +- pr_warn("%ps\n", mapping->a_ops); ++ const struct inode *host; ++ const struct address_space_operations *a_ops; ++ const struct hlist_node *dentry_first; ++ const struct dentry *dentry_ptr; ++ struct dentry dentry; ++ ++ /* ++ * mapping can be invalid pointer and we don't want to crash ++ * accessing it, so probe everything depending on it carefully ++ */ ++ if (probe_kernel_read_strict(&host, &mapping->host, ++ sizeof(struct inode *)) || ++ probe_kernel_read_strict(&a_ops, &mapping->a_ops, ++ sizeof(struct address_space_operations *))) { ++ pr_warn("failed to read mapping->host or a_ops, mapping not a valid kernel address?\n"); ++ goto out_mapping; ++ } ++ ++ if (!host) { ++ pr_warn("mapping->a_ops:%ps\n", a_ops); ++ goto out_mapping; ++ } ++ ++ if (probe_kernel_read_strict(&dentry_first, ++ &host->i_dentry.first, sizeof(struct hlist_node *))) { ++ pr_warn("mapping->a_ops:%ps with invalid mapping->host inode address %px\n", ++ a_ops, host); ++ goto out_mapping; ++ } ++ ++ if (!dentry_first) { ++ pr_warn("mapping->a_ops:%ps\n", a_ops); ++ goto out_mapping; ++ } ++ ++ dentry_ptr = container_of(dentry_first, struct dentry, d_u.d_alias); ++ if (probe_kernel_read_strict(&dentry, dentry_ptr, ++ sizeof(struct dentry))) { ++ pr_warn("mapping->aops:%ps with invalid mapping->host->i_dentry.first %px\n", ++ a_ops, dentry_ptr); ++ } else { ++ /* ++ * if dentry is corrupted, the %pd handler may still ++ * crash, but it's unlikely that we reach here with a ++ * corrupted struct page ++ */ ++ pr_warn("mapping->aops:%ps dentry name:\"%pd\"\n", ++ a_ops, &dentry); ++ } + } ++out_mapping: + BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS + 1); + + pr_warn("%sflags: %#lx(%pGp)%s\n", type, page->flags, 
&page->flags, +-- +2.25.1 + diff --git a/queue-5.7/mm-slub-fix-stack-overruns-with-slub_stats.patch b/queue-5.7/mm-slub-fix-stack-overruns-with-slub_stats.patch new file mode 100644 index 00000000000..859c4ef11b7 --- /dev/null +++ b/queue-5.7/mm-slub-fix-stack-overruns-with-slub_stats.patch @@ -0,0 +1,90 @@ +From 1e0dc7359392d6722e92a9b7f6ceb6ba715a3f58 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 1 Jun 2020 21:45:57 -0700 +Subject: mm/slub: fix stack overruns with SLUB_STATS + +From: Qian Cai + +[ Upstream commit a68ee0573991e90af2f1785db309206408bad3e5 ] + +There is no need to copy SLUB_STATS items from root memcg cache to new +memcg cache copies. Doing so could result in stack overruns because the +store function only accepts 0 to clear the stat and returns an error for +everything else while the show method would print out the whole stat. + +Then, the mismatch of the lengths returns from show and store methods +happens in memcg_propagate_slab_attrs(): + + else if (root_cache->max_attr_size < ARRAY_SIZE(mbuf)) + buf = mbuf; + +max_attr_size is only 2 from slab_attr_store(), then, it uses mbuf[64] +in show_stat() later where a bounch of sprintf() would overrun the stack +variable. Fix it by always allocating a page of buffer to be used in +show_stat() if SLUB_STATS=y which should only be used for debug purpose. + + # echo 1 > /sys/kernel/slab/fs_cache/shrink + BUG: KASAN: stack-out-of-bounds in number+0x421/0x6e0 + Write of size 1 at addr ffffc900256cfde0 by task kworker/76:0/53251 + + Hardware name: HPE ProLiant DL385 Gen10/ProLiant DL385 Gen10, BIOS A40 07/10/2019 + Workqueue: memcg_kmem_cache memcg_kmem_cache_create_func + Call Trace: + number+0x421/0x6e0 + vsnprintf+0x451/0x8e0 + sprintf+0x9e/0xd0 + show_stat+0x124/0x1d0 + alloc_slowpath_show+0x13/0x20 + __kmem_cache_create+0x47a/0x6b0 + + addr ffffc900256cfde0 is located in stack of task kworker/76:0/53251 at offset 0 in frame: + process_one_work+0x0/0xb90 + + this frame has 1 object: + [32, 72) 'lockdep_map' + + Memory state around the buggy address: + ffffc900256cfc80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 + ffffc900256cfd00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 + >ffffc900256cfd80: 00 00 00 00 00 00 00 00 00 00 00 00 f1 f1 f1 f1 + ^ + ffffc900256cfe00: 00 00 00 00 00 f2 f2 f2 00 00 00 00 00 00 00 00 + ffffc900256cfe80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 + ================================================================== + Kernel panic - not syncing: stack-protector: Kernel stack is corrupted in: __kmem_cache_create+0x6ac/0x6b0 + Workqueue: memcg_kmem_cache memcg_kmem_cache_create_func + Call Trace: + __kmem_cache_create+0x6ac/0x6b0 + +Fixes: 107dab5c92d5 ("slub: slub-specific propagation changes") +Signed-off-by: Qian Cai +Signed-off-by: Andrew Morton +Cc: Glauber Costa +Cc: Christoph Lameter +Cc: Pekka Enberg +Cc: David Rientjes +Cc: Joonsoo Kim +Link: http://lkml.kernel.org/r/20200429222356.4322-1-cai@lca.pw +Signed-off-by: Linus Torvalds +Signed-off-by: Sasha Levin +--- + mm/slub.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/mm/slub.c b/mm/slub.c +index 63f372366ec59..660f4324c0972 100644 +--- a/mm/slub.c ++++ b/mm/slub.c +@@ -5681,7 +5681,8 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s) + */ + if (buffer) + buf = buffer; +- else if (root_cache->max_attr_size < ARRAY_SIZE(mbuf)) ++ else if (root_cache->max_attr_size < ARRAY_SIZE(mbuf) && ++ !IS_ENABLED(CONFIG_SLUB_STATS)) + buf = mbuf; + else { + buffer = (char *) 
get_zeroed_page(GFP_KERNEL); +-- +2.25.1 + diff --git a/queue-5.7/mm-slub.c-fix-corrupted-freechain-in-deactivate_slab.patch b/queue-5.7/mm-slub.c-fix-corrupted-freechain-in-deactivate_slab.patch new file mode 100644 index 00000000000..e50f0f29c70 --- /dev/null +++ b/queue-5.7/mm-slub.c-fix-corrupted-freechain-in-deactivate_slab.patch @@ -0,0 +1,115 @@ +From 8e1c4be9a4c25e4bbeda3ed8146345d3314fa410 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 1 Jun 2020 21:45:47 -0700 +Subject: mm/slub.c: fix corrupted freechain in deactivate_slab() + +From: Dongli Zhang + +[ Upstream commit 52f23478081ae0dcdb95d1650ea1e7d52d586829 ] + +The slub_debug is able to fix the corrupted slab freelist/page. +However, alloc_debug_processing() only checks the validity of current +and next freepointer during allocation path. As a result, once some +objects have their freepointers corrupted, deactivate_slab() may lead to +page fault. + +Below is from a test kernel module when 'slub_debug=PUF,kmalloc-128 +slub_nomerge'. The test kernel corrupts the freepointer of one free +object on purpose. Unfortunately, deactivate_slab() does not detect it +when iterating the freechain. + + BUG: unable to handle page fault for address: 00000000123456f8 + #PF: supervisor read access in kernel mode + #PF: error_code(0x0000) - not-present page + PGD 0 P4D 0 + Oops: 0000 [#1] SMP PTI + ... ... + RIP: 0010:deactivate_slab.isra.92+0xed/0x490 + ... ... + Call Trace: + ___slab_alloc+0x536/0x570 + __slab_alloc+0x17/0x30 + __kmalloc+0x1d9/0x200 + ext4_htree_store_dirent+0x30/0xf0 + htree_dirblock_to_tree+0xcb/0x1c0 + ext4_htree_fill_tree+0x1bc/0x2d0 + ext4_readdir+0x54f/0x920 + iterate_dir+0x88/0x190 + __x64_sys_getdents+0xa6/0x140 + do_syscall_64+0x49/0x170 + entry_SYSCALL_64_after_hwframe+0x44/0xa9 + +Therefore, this patch adds extra consistency check in deactivate_slab(). +Once an object's freepointer is corrupted, all following objects +starting at this object are isolated. + +[akpm@linux-foundation.org: fix build with CONFIG_SLAB_DEBUG=n] +Signed-off-by: Dongli Zhang +Signed-off-by: Andrew Morton +Cc: Joe Jin +Cc: Christoph Lameter +Cc: Pekka Enberg +Cc: David Rientjes +Cc: Joonsoo Kim +Link: http://lkml.kernel.org/r/20200331031450.12182-1-dongli.zhang@oracle.com +Signed-off-by: Linus Torvalds +Signed-off-by: Sasha Levin +--- + mm/slub.c | 27 +++++++++++++++++++++++++++ + 1 file changed, 27 insertions(+) + +diff --git a/mm/slub.c b/mm/slub.c +index 63bd39c476431..63f372366ec59 100644 +--- a/mm/slub.c ++++ b/mm/slub.c +@@ -679,6 +679,20 @@ static void slab_fix(struct kmem_cache *s, char *fmt, ...) 
+ va_end(args); + } + ++static bool freelist_corrupted(struct kmem_cache *s, struct page *page, ++ void *freelist, void *nextfree) ++{ ++ if ((s->flags & SLAB_CONSISTENCY_CHECKS) && ++ !check_valid_pointer(s, page, nextfree)) { ++ object_err(s, page, freelist, "Freechain corrupt"); ++ freelist = NULL; ++ slab_fix(s, "Isolate corrupted freechain"); ++ return true; ++ } ++ ++ return false; ++} ++ + static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) + { + unsigned int off; /* Offset of last byte */ +@@ -1410,6 +1424,11 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node, + static inline void dec_slabs_node(struct kmem_cache *s, int node, + int objects) {} + ++static bool freelist_corrupted(struct kmem_cache *s, struct page *page, ++ void *freelist, void *nextfree) ++{ ++ return false; ++} + #endif /* CONFIG_SLUB_DEBUG */ + + /* +@@ -2093,6 +2112,14 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page, + void *prior; + unsigned long counters; + ++ /* ++ * If 'nextfree' is invalid, it is possible that the object at ++ * 'freelist' is already corrupted. So isolate all objects ++ * starting at 'freelist'. ++ */ ++ if (freelist_corrupted(s, page, freelist, nextfree)) ++ break; ++ + do { + prior = page->freelist; + counters = page->counters; +-- +2.25.1 + diff --git a/queue-5.7/nvme-fix-possible-deadlock-when-i-o-is-blocked.patch b/queue-5.7/nvme-fix-possible-deadlock-when-i-o-is-blocked.patch new file mode 100644 index 00000000000..1ed45547471 --- /dev/null +++ b/queue-5.7/nvme-fix-possible-deadlock-when-i-o-is-blocked.patch @@ -0,0 +1,124 @@ +From 887bad9523741466ddcf7a4f4a7235d0d08437d4 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 24 Jun 2020 01:53:08 -0700 +Subject: nvme: fix possible deadlock when I/O is blocked + +From: Sagi Grimberg + +[ Upstream commit 3b4b19721ec652ad2c4fe51dfbe5124212b5f581 ] + +Revert fab7772bfbcf ("nvme-multipath: revalidate nvme_ns_head gendisk +in nvme_validate_ns") + +When adding a new namespace to the head disk (via nvme_mpath_set_live) +we will see partition scan which triggers I/O on the mpath device node. +This process will usually be triggered from the scan_work which holds +the scan_lock. If I/O blocks (if we got ana change currently have only +available paths but none are accessible) this can deadlock on the head +disk bd_mutex as both partition scan I/O takes it, and head disk revalidation +takes it to check for resize (also triggered from scan_work on a different +path). See trace [1]. + +The mpath disk revalidation was originally added to detect online disk +size change, but this is no longer needed since commit cb224c3af4df +("nvme: Convert to use set_capacity_revalidate_and_notify") which already +updates resize info without unnecessarily revalidating the disk (the +mpath disk doesn't even implement .revalidate_disk fop). + +[1]: +-- +kernel: INFO: task kworker/u65:9:494 blocked for more than 241 seconds. +kernel: Tainted: G OE 5.3.5-050305-generic #201910071830 +kernel: "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. 
+kernel: kworker/u65:9 D 0 494 2 0x80004000 +kernel: Workqueue: nvme-wq nvme_scan_work [nvme_core] +kernel: Call Trace: +kernel: __schedule+0x2b9/0x6c0 +kernel: schedule+0x42/0xb0 +kernel: schedule_preempt_disabled+0xe/0x10 +kernel: __mutex_lock.isra.0+0x182/0x4f0 +kernel: __mutex_lock_slowpath+0x13/0x20 +kernel: mutex_lock+0x2e/0x40 +kernel: revalidate_disk+0x63/0xa0 +kernel: __nvme_revalidate_disk+0xfe/0x110 [nvme_core] +kernel: nvme_revalidate_disk+0xa4/0x160 [nvme_core] +kernel: ? evict+0x14c/0x1b0 +kernel: revalidate_disk+0x2b/0xa0 +kernel: nvme_validate_ns+0x49/0x940 [nvme_core] +kernel: ? blk_mq_free_request+0xd2/0x100 +kernel: ? __nvme_submit_sync_cmd+0xbe/0x1e0 [nvme_core] +kernel: nvme_scan_work+0x24f/0x380 [nvme_core] +kernel: process_one_work+0x1db/0x380 +kernel: worker_thread+0x249/0x400 +kernel: kthread+0x104/0x140 +kernel: ? process_one_work+0x380/0x380 +kernel: ? kthread_park+0x80/0x80 +kernel: ret_from_fork+0x1f/0x40 +... +kernel: INFO: task kworker/u65:1:2630 blocked for more than 241 seconds. +kernel: Tainted: G OE 5.3.5-050305-generic #201910071830 +kernel: "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. +kernel: kworker/u65:1 D 0 2630 2 0x80004000 +kernel: Workqueue: nvme-wq nvme_scan_work [nvme_core] +kernel: Call Trace: +kernel: __schedule+0x2b9/0x6c0 +kernel: schedule+0x42/0xb0 +kernel: io_schedule+0x16/0x40 +kernel: do_read_cache_page+0x438/0x830 +kernel: ? __switch_to_asm+0x34/0x70 +kernel: ? file_fdatawait_range+0x30/0x30 +kernel: read_cache_page+0x12/0x20 +kernel: read_dev_sector+0x27/0xc0 +kernel: read_lba+0xc1/0x220 +kernel: ? kmem_cache_alloc_trace+0x19c/0x230 +kernel: efi_partition+0x1e6/0x708 +kernel: ? vsnprintf+0x39e/0x4e0 +kernel: ? snprintf+0x49/0x60 +kernel: check_partition+0x154/0x244 +kernel: rescan_partitions+0xae/0x280 +kernel: __blkdev_get+0x40f/0x560 +kernel: blkdev_get+0x3d/0x140 +kernel: __device_add_disk+0x388/0x480 +kernel: device_add_disk+0x13/0x20 +kernel: nvme_mpath_set_live+0x119/0x140 [nvme_core] +kernel: nvme_update_ns_ana_state+0x5c/0x60 [nvme_core] +kernel: nvme_set_ns_ana_state+0x1e/0x30 [nvme_core] +kernel: nvme_parse_ana_log+0xa1/0x180 [nvme_core] +kernel: ? nvme_update_ns_ana_state+0x60/0x60 [nvme_core] +kernel: nvme_mpath_add_disk+0x47/0x90 [nvme_core] +kernel: nvme_validate_ns+0x396/0x940 [nvme_core] +kernel: ? blk_mq_free_request+0xd2/0x100 +kernel: nvme_scan_work+0x24f/0x380 [nvme_core] +kernel: process_one_work+0x1db/0x380 +kernel: worker_thread+0x249/0x400 +kernel: kthread+0x104/0x140 +kernel: ? process_one_work+0x380/0x380 +kernel: ? 
kthread_park+0x80/0x80 +kernel: ret_from_fork+0x1f/0x40 +-- + +Fixes: fab7772bfbcf ("nvme-multipath: revalidate nvme_ns_head gendisk +in nvme_validate_ns") +Signed-off-by: Anton Eidelman +Signed-off-by: Sagi Grimberg +Signed-off-by: Christoph Hellwig +Signed-off-by: Sasha Levin +--- + drivers/nvme/host/core.c | 1 - + 1 file changed, 1 deletion(-) + +diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c +index 887139f8fa53b..85ce6c682849e 100644 +--- a/drivers/nvme/host/core.c ++++ b/drivers/nvme/host/core.c +@@ -1910,7 +1910,6 @@ static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) + if (ns->head->disk) { + nvme_update_disk_info(ns->head->disk, ns, id); + blk_queue_stack_limits(ns->head->disk->queue, ns->queue); +- revalidate_disk(ns->head->disk); + } + #endif + } +-- +2.25.1 + diff --git a/queue-5.7/nvme-multipath-fix-bogus-request-queue-reference-put.patch b/queue-5.7/nvme-multipath-fix-bogus-request-queue-reference-put.patch new file mode 100644 index 00000000000..c855d3ce7aa --- /dev/null +++ b/queue-5.7/nvme-multipath-fix-bogus-request-queue-reference-put.patch @@ -0,0 +1,84 @@ +From 7a92228148b921abe3ffd8966dfd49413aafea67 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 24 Jun 2020 01:53:12 -0700 +Subject: nvme-multipath: fix bogus request queue reference put + +From: Sagi Grimberg + +[ Upstream commit c31244669f57963b6ce133a5555b118fc50aec95 ] + +The mpath disk node takes a reference on the request mpath +request queue when adding live path to the mpath gendisk. +However if we connected to an inaccessible path device_add_disk +is not called, so if we disconnect and remove the mpath gendisk +we endup putting an reference on the request queue that was +never taken [1]. + +Fix that to check if we ever added a live path (using +NVME_NS_HEAD_HAS_DISK flag) and if not, clear the disk->queue +reference. + +[1]: +------------[ cut here ]------------ +refcount_t: underflow; use-after-free. 
+WARNING: CPU: 1 PID: 1372 at lib/refcount.c:28 refcount_warn_saturate+0xa6/0xf0 +CPU: 1 PID: 1372 Comm: nvme Tainted: G O 5.7.0-rc2+ #3 +Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.13.0-1ubuntu1 04/01/2014 +RIP: 0010:refcount_warn_saturate+0xa6/0xf0 +RSP: 0018:ffffb29e8053bdc0 EFLAGS: 00010282 +RAX: 0000000000000000 RBX: ffff8b7a2f4fc060 RCX: 0000000000000007 +RDX: 0000000000000007 RSI: 0000000000000092 RDI: ffff8b7a3ec99980 +RBP: ffff8b7a2f4fc000 R08: 00000000000002e1 R09: 0000000000000004 +R10: 0000000000000000 R11: 0000000000000001 R12: 0000000000000000 +R13: fffffffffffffff2 R14: ffffb29e8053bf08 R15: ffff8b7a320e2da0 +FS: 00007f135d4ca800(0000) GS:ffff8b7a3ec80000(0000) knlGS:0000000000000000 +CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 +CR2: 00005651178c0c30 CR3: 000000003b650005 CR4: 0000000000360ee0 +DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 +DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 +Call Trace: + disk_release+0xa2/0xc0 + device_release+0x28/0x80 + kobject_put+0xa5/0x1b0 + nvme_put_ns_head+0x26/0x70 [nvme_core] + nvme_put_ns+0x30/0x60 [nvme_core] + nvme_remove_namespaces+0x9b/0xe0 [nvme_core] + nvme_do_delete_ctrl+0x43/0x5c [nvme_core] + nvme_sysfs_delete.cold+0x8/0xd [nvme_core] + kernfs_fop_write+0xc1/0x1a0 + vfs_write+0xb6/0x1a0 + ksys_write+0x5f/0xe0 + do_syscall_64+0x52/0x1a0 + entry_SYSCALL_64_after_hwframe+0x44/0xa9 + +Reported-by: Anton Eidelman +Tested-by: Anton Eidelman +Signed-off-by: Sagi Grimberg +Signed-off-by: Christoph Hellwig +Signed-off-by: Sasha Levin +--- + drivers/nvme/host/multipath.c | 8 ++++++++ + 1 file changed, 8 insertions(+) + +diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c +index d1cb65698288b..03bc3aba09871 100644 +--- a/drivers/nvme/host/multipath.c ++++ b/drivers/nvme/host/multipath.c +@@ -691,6 +691,14 @@ void nvme_mpath_remove_disk(struct nvme_ns_head *head) + kblockd_schedule_work(&head->requeue_work); + flush_work(&head->requeue_work); + blk_cleanup_queue(head->disk->queue); ++ if (!test_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) { ++ /* ++ * if device_add_disk wasn't called, prevent ++ * disk release to put a bogus reference on the ++ * request queue ++ */ ++ head->disk->queue = NULL; ++ } + put_disk(head->disk); + } + +-- +2.25.1 + diff --git a/queue-5.7/nvme-multipath-fix-deadlock-between-ana_work-and-sca.patch b/queue-5.7/nvme-multipath-fix-deadlock-between-ana_work-and-sca.patch new file mode 100644 index 00000000000..5a970f1c311 --- /dev/null +++ b/queue-5.7/nvme-multipath-fix-deadlock-between-ana_work-and-sca.patch @@ -0,0 +1,134 @@ +From a78bc3d6bec05bec167b99ed2fdf2c05e57c98fa Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 24 Jun 2020 01:53:09 -0700 +Subject: nvme-multipath: fix deadlock between ana_work and scan_work + +From: Anton Eidelman + +[ Upstream commit 489dd102a2c7c94d783a35f9412eb085b8da1aa4 ] + +When scan_work calls nvme_mpath_add_disk() this holds ana_lock +and invokes nvme_parse_ana_log(), which may issue IO +in device_add_disk() and hang waiting for an accessible path. +While nvme_mpath_set_live() only called when nvme_state_is_live(), +a transition may cause NVME_SC_ANA_TRANSITION and requeue the IO. + +In order to recover and complete the IO ana_work on the same ctrl +should be able to update the path state and remove NVME_NS_ANA_PENDING. + +The deadlock occurs because scan_work keeps holding ana_lock, +so ana_work hangs [1]. 
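+
+In short: scan_work holds ana_lock and blocks on path IO, that IO only
+completes once ana_work updates the path state, and ana_work blocks on
+ana_lock. The shape of the fix below, condensed into a sketch (copy the
+matching group descriptor under the lock, act on it after dropping it):
+
+	struct nvme_ana_group_desc desc = {
+		.grpid = id->anagrpid,
+	};
+
+	mutex_lock(&ns->ctrl->ana_lock);
+	/* only copies the descriptor; issues no IO under ana_lock */
+	nvme_parse_ana_log(ns->ctrl, &desc, nvme_lookup_ana_group_desc);
+	mutex_unlock(&ns->ctrl->ana_lock);
+	if (desc.state)
+		/* may block on IO in device_add_disk(); lock is dropped */
+		nvme_update_ns_ana_state(&desc, ns);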
+ +Fix: +Now nvme_mpath_add_disk() uses nvme_parse_ana_log() to obtain a copy +of the ANA group desc, and then calls nvme_update_ns_ana_state() without +holding ana_lock. + +[1]: +kernel: Workqueue: nvme-wq nvme_scan_work [nvme_core] +kernel: Call Trace: +kernel: __schedule+0x2b9/0x6c0 +kernel: schedule+0x42/0xb0 +kernel: io_schedule+0x16/0x40 +kernel: do_read_cache_page+0x438/0x830 +kernel: read_cache_page+0x12/0x20 +kernel: read_dev_sector+0x27/0xc0 +kernel: read_lba+0xc1/0x220 +kernel: efi_partition+0x1e6/0x708 +kernel: check_partition+0x154/0x244 +kernel: rescan_partitions+0xae/0x280 +kernel: __blkdev_get+0x40f/0x560 +kernel: blkdev_get+0x3d/0x140 +kernel: __device_add_disk+0x388/0x480 +kernel: device_add_disk+0x13/0x20 +kernel: nvme_mpath_set_live+0x119/0x140 [nvme_core] +kernel: nvme_update_ns_ana_state+0x5c/0x60 [nvme_core] +kernel: nvme_set_ns_ana_state+0x1e/0x30 [nvme_core] +kernel: nvme_parse_ana_log+0xa1/0x180 [nvme_core] +kernel: nvme_mpath_add_disk+0x47/0x90 [nvme_core] +kernel: nvme_validate_ns+0x396/0x940 [nvme_core] +kernel: nvme_scan_work+0x24f/0x380 [nvme_core] +kernel: process_one_work+0x1db/0x380 +kernel: worker_thread+0x249/0x400 +kernel: kthread+0x104/0x140 + +kernel: Workqueue: nvme-wq nvme_ana_work [nvme_core] +kernel: Call Trace: +kernel: __schedule+0x2b9/0x6c0 +kernel: schedule+0x42/0xb0 +kernel: schedule_preempt_disabled+0xe/0x10 +kernel: __mutex_lock.isra.0+0x182/0x4f0 +kernel: ? __switch_to_asm+0x34/0x70 +kernel: ? select_task_rq_fair+0x1aa/0x5c0 +kernel: ? kvm_sched_clock_read+0x11/0x20 +kernel: ? sched_clock+0x9/0x10 +kernel: __mutex_lock_slowpath+0x13/0x20 +kernel: mutex_lock+0x2e/0x40 +kernel: nvme_read_ana_log+0x3a/0x100 [nvme_core] +kernel: nvme_ana_work+0x15/0x20 [nvme_core] +kernel: process_one_work+0x1db/0x380 +kernel: worker_thread+0x4d/0x400 +kernel: kthread+0x104/0x140 +kernel: ? process_one_work+0x380/0x380 +kernel: ? 
kthread_park+0x80/0x80 +kernel: ret_from_fork+0x35/0x40 + +Fixes: 0d0b660f214d ("nvme: add ANA support") +Signed-off-by: Anton Eidelman +Signed-off-by: Sagi Grimberg +Signed-off-by: Christoph Hellwig +Signed-off-by: Sasha Levin +--- + drivers/nvme/host/multipath.c | 24 ++++++++++++++++-------- + 1 file changed, 16 insertions(+), 8 deletions(-) + +diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c +index 91a8b1ce5a3a2..f4287d8550a9f 100644 +--- a/drivers/nvme/host/multipath.c ++++ b/drivers/nvme/host/multipath.c +@@ -639,26 +639,34 @@ static ssize_t ana_state_show(struct device *dev, struct device_attribute *attr, + } + DEVICE_ATTR_RO(ana_state); + +-static int nvme_set_ns_ana_state(struct nvme_ctrl *ctrl, ++static int nvme_lookup_ana_group_desc(struct nvme_ctrl *ctrl, + struct nvme_ana_group_desc *desc, void *data) + { +- struct nvme_ns *ns = data; ++ struct nvme_ana_group_desc *dst = data; + +- if (ns->ana_grpid == le32_to_cpu(desc->grpid)) { +- nvme_update_ns_ana_state(desc, ns); +- return -ENXIO; /* just break out of the loop */ +- } ++ if (desc->grpid != dst->grpid) ++ return 0; + +- return 0; ++ *dst = *desc; ++ return -ENXIO; /* just break out of the loop */ + } + + void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id) + { + if (nvme_ctrl_use_ana(ns->ctrl)) { ++ struct nvme_ana_group_desc desc = { ++ .grpid = id->anagrpid, ++ .state = 0, ++ }; ++ + mutex_lock(&ns->ctrl->ana_lock); + ns->ana_grpid = le32_to_cpu(id->anagrpid); +- nvme_parse_ana_log(ns->ctrl, ns, nvme_set_ns_ana_state); ++ nvme_parse_ana_log(ns->ctrl, &desc, nvme_lookup_ana_group_desc); + mutex_unlock(&ns->ctrl->ana_lock); ++ if (desc.state) { ++ /* found the group desc: update */ ++ nvme_update_ns_ana_state(&desc, ns); ++ } + } else { + ns->ana_state = NVME_ANA_OPTIMIZED; + nvme_mpath_set_live(ns); +-- +2.25.1 + diff --git a/queue-5.7/nvme-multipath-fix-deadlock-due-to-head-lock.patch b/queue-5.7/nvme-multipath-fix-deadlock-due-to-head-lock.patch new file mode 100644 index 00000000000..42ae4b066ac --- /dev/null +++ b/queue-5.7/nvme-multipath-fix-deadlock-due-to-head-lock.patch @@ -0,0 +1,124 @@ +From d10f0b086190b4e1bbd3e0ee93e02bd9235e3df1 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 24 Jun 2020 01:53:11 -0700 +Subject: nvme-multipath: fix deadlock due to head->lock + +From: Anton Eidelman + +[ Upstream commit d8a22f85609fadb46ba699e0136cc3ebdeebff79 ] + +In the following scenario scan_work and ana_work will deadlock: + +When scan_work calls nvme_mpath_add_disk() this holds ana_lock +and invokes nvme_parse_ana_log(), which may issue IO +in device_add_disk() and hang waiting for an accessible path. + +While nvme_mpath_set_live() only called when nvme_state_is_live(), +a transition may cause NVME_SC_ANA_TRANSITION and requeue the IO. + +Since nvme_mpath_set_live() holds ns->head->lock, an ana_work on +ANY ctrl will not be able to complete nvme_mpath_set_live() +on the same ns->head, which is required in order to update +the new accessible path and remove NVME_NS_ANA_PENDING.. +Therefore IO never completes: deadlock [1]. + +Fix: +Move device_add_disk out of the head->lock and protect it with an +atomic test_and_set for a new NVME_NS_HEAD_HAS_DISK bit. + +[1]: +kernel: INFO: task kworker/u8:2:160 blocked for more than 120 seconds. +kernel: Tainted: G OE 5.3.5-050305-generic #201910071830 +kernel: "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. 
+kernel: kworker/u8:2 D 0 160 2 0x80004000 +kernel: Workqueue: nvme-wq nvme_ana_work [nvme_core] +kernel: Call Trace: +kernel: __schedule+0x2b9/0x6c0 +kernel: schedule+0x42/0xb0 +kernel: schedule_preempt_disabled+0xe/0x10 +kernel: __mutex_lock.isra.0+0x182/0x4f0 +kernel: __mutex_lock_slowpath+0x13/0x20 +kernel: mutex_lock+0x2e/0x40 +kernel: nvme_update_ns_ana_state+0x22/0x60 [nvme_core] +kernel: nvme_update_ana_state+0xca/0xe0 [nvme_core] +kernel: nvme_parse_ana_log+0xa1/0x180 [nvme_core] +kernel: nvme_read_ana_log+0x76/0x100 [nvme_core] +kernel: nvme_ana_work+0x15/0x20 [nvme_core] +kernel: process_one_work+0x1db/0x380 +kernel: worker_thread+0x4d/0x400 +kernel: kthread+0x104/0x140 +kernel: ret_from_fork+0x35/0x40 +kernel: INFO: task kworker/u8:4:439 blocked for more than 120 seconds. +kernel: Tainted: G OE 5.3.5-050305-generic #201910071830 +kernel: "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. +kernel: kworker/u8:4 D 0 439 2 0x80004000 +kernel: Workqueue: nvme-wq nvme_scan_work [nvme_core] +kernel: Call Trace: +kernel: __schedule+0x2b9/0x6c0 +kernel: schedule+0x42/0xb0 +kernel: io_schedule+0x16/0x40 +kernel: do_read_cache_page+0x438/0x830 +kernel: read_cache_page+0x12/0x20 +kernel: read_dev_sector+0x27/0xc0 +kernel: read_lba+0xc1/0x220 +kernel: efi_partition+0x1e6/0x708 +kernel: check_partition+0x154/0x244 +kernel: rescan_partitions+0xae/0x280 +kernel: __blkdev_get+0x40f/0x560 +kernel: blkdev_get+0x3d/0x140 +kernel: __device_add_disk+0x388/0x480 +kernel: device_add_disk+0x13/0x20 +kernel: nvme_mpath_set_live+0x119/0x140 [nvme_core] +kernel: nvme_update_ns_ana_state+0x5c/0x60 [nvme_core] +kernel: nvme_mpath_add_disk+0xbe/0x100 [nvme_core] +kernel: nvme_validate_ns+0x396/0x940 [nvme_core] +kernel: nvme_scan_work+0x256/0x390 [nvme_core] +kernel: process_one_work+0x1db/0x380 +kernel: worker_thread+0x4d/0x400 +kernel: kthread+0x104/0x140 +kernel: ret_from_fork+0x35/0x40 + +Fixes: 0d0b660f214d ("nvme: add ANA support") +Signed-off-by: Anton Eidelman +Signed-off-by: Sagi Grimberg +Signed-off-by: Christoph Hellwig +Signed-off-by: Sasha Levin +--- + drivers/nvme/host/multipath.c | 4 ++-- + drivers/nvme/host/nvme.h | 2 ++ + 2 files changed, 4 insertions(+), 2 deletions(-) + +diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c +index f4287d8550a9f..d1cb65698288b 100644 +--- a/drivers/nvme/host/multipath.c ++++ b/drivers/nvme/host/multipath.c +@@ -413,11 +413,11 @@ static void nvme_mpath_set_live(struct nvme_ns *ns) + if (!head->disk) + return; + +- mutex_lock(&head->lock); +- if (!(head->disk->flags & GENHD_FL_UP)) ++ if (!test_and_set_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) + device_add_disk(&head->subsys->dev, head->disk, + nvme_ns_id_attr_groups); + ++ mutex_lock(&head->lock); + if (nvme_path_is_optimized(ns)) { + int node, srcu_idx; + +diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h +index 2e04a36296d95..719342600be62 100644 +--- a/drivers/nvme/host/nvme.h ++++ b/drivers/nvme/host/nvme.h +@@ -359,6 +359,8 @@ struct nvme_ns_head { + spinlock_t requeue_lock; + struct work_struct requeue_work; + struct mutex lock; ++ unsigned long flags; ++#define NVME_NSHEAD_DISK_LIVE 0 + struct nvme_ns __rcu *current_path[]; + #endif + }; +-- +2.25.1 + diff --git a/queue-5.7/nvme-multipath-set-bdi-capabilities-once.patch b/queue-5.7/nvme-multipath-set-bdi-capabilities-once.patch new file mode 100644 index 00000000000..939da1001f2 --- /dev/null +++ b/queue-5.7/nvme-multipath-set-bdi-capabilities-once.patch @@ -0,0 +1,70 @@ +From 
8e16bee27179da297be319641e3f3d39830b6fc0 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 9 Apr 2020 09:09:04 -0700 +Subject: nvme-multipath: set bdi capabilities once + +From: Keith Busch + +[ Upstream commit b2ce4d90690bd29ce5b554e203cd03682dd59697 ] + +The queues' backing device info capabilities don't change with each +namespace revalidation. Set it only when each path's request_queue +is initially added to a multipath queue. + +Signed-off-by: Keith Busch +Reviewed-by: Sagi Grimberg +Signed-off-by: Christoph Hellwig +Signed-off-by: Jens Axboe +Signed-off-by: Sasha Levin +--- + drivers/nvme/host/core.c | 7 ------- + drivers/nvme/host/multipath.c | 8 ++++++++ + 2 files changed, 8 insertions(+), 7 deletions(-) + +diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c +index 7b4cbe2c69541..887139f8fa53b 100644 +--- a/drivers/nvme/host/core.c ++++ b/drivers/nvme/host/core.c +@@ -1910,13 +1910,6 @@ static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) + if (ns->head->disk) { + nvme_update_disk_info(ns->head->disk, ns, id); + blk_queue_stack_limits(ns->head->disk->queue, ns->queue); +- if (bdi_cap_stable_pages_required(ns->queue->backing_dev_info)) { +- struct backing_dev_info *info = +- ns->head->disk->queue->backing_dev_info; +- +- info->capabilities |= BDI_CAP_STABLE_WRITES; +- } +- + revalidate_disk(ns->head->disk); + } + #endif +diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c +index 17f172cf456ad..91a8b1ce5a3a2 100644 +--- a/drivers/nvme/host/multipath.c ++++ b/drivers/nvme/host/multipath.c +@@ -3,6 +3,7 @@ + * Copyright (c) 2017-2018 Christoph Hellwig. + */ + ++#include + #include + #include + #include "nvme.h" +@@ -662,6 +663,13 @@ void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id) + ns->ana_state = NVME_ANA_OPTIMIZED; + nvme_mpath_set_live(ns); + } ++ ++ if (bdi_cap_stable_pages_required(ns->queue->backing_dev_info)) { ++ struct backing_dev_info *info = ++ ns->head->disk->queue->backing_dev_info; ++ ++ info->capabilities |= BDI_CAP_STABLE_WRITES; ++ } + } + + void nvme_mpath_remove_disk(struct nvme_ns_head *head) +-- +2.25.1 + diff --git a/queue-5.7/powerpc-book3s64-kvm-fix-secondary-page-table-walk-w.patch b/queue-5.7/powerpc-book3s64-kvm-fix-secondary-page-table-walk-w.patch new file mode 100644 index 00000000000..e14cd0c9da0 --- /dev/null +++ b/queue-5.7/powerpc-book3s64-kvm-fix-secondary-page-table-walk-w.patch @@ -0,0 +1,129 @@ +From 715060350a8a13be53857cdbbc06ba460da8e4d2 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 28 May 2020 13:34:56 +0530 +Subject: powerpc/book3s64/kvm: Fix secondary page table walk warning during + migration + +From: Aneesh Kumar K.V + +[ Upstream commit bf8036a4098d1548cdccf9ed5c523ef4e83e3c68 ] + +This patch fixes the below warning reported during migration: + + find_kvm_secondary_pte called with kvm mmu_lock not held + CPU: 23 PID: 5341 Comm: qemu-system-ppc Tainted: G W 5.7.0-rc5-kvm-00211-g9ccf10d6d088 #432 + NIP: c008000000fe848c LR: c008000000fe8488 CTR: 0000000000000000 + REGS: c000001e19f077e0 TRAP: 0700 Tainted: G W (5.7.0-rc5-kvm-00211-g9ccf10d6d088) + MSR: 9000000000029033 CR: 42222422 XER: 20040000 + CFAR: c00000000012f5ac IRQMASK: 0 + GPR00: c008000000fe8488 c000001e19f07a70 c008000000ffe200 0000000000000039 + GPR04: 0000000000000001 c000001ffc8b4900 0000000000018840 0000000000000007 + GPR08: 0000000000000003 0000000000000001 0000000000000007 0000000000000001 + GPR12: 0000000000002000 c000001fff6d9400 000000011f884678 00007fff70b70000 + GPR16: 
00007fff7137cb90 00007fff7dcb4410 0000000000000001 0000000000000000 + GPR20: 000000000ffe0000 0000000000000000 0000000000000001 0000000000000000 + GPR24: 8000000000000000 0000000000000001 c000001e1f67e600 c000001e1fd82410 + GPR28: 0000000000001000 c000001e2e410000 0000000000000fff 0000000000000ffe + NIP [c008000000fe848c] kvmppc_hv_get_dirty_log_radix+0x2e4/0x340 [kvm_hv] + LR [c008000000fe8488] kvmppc_hv_get_dirty_log_radix+0x2e0/0x340 [kvm_hv] + Call Trace: + [c000001e19f07a70] [c008000000fe8488] kvmppc_hv_get_dirty_log_radix+0x2e0/0x340 [kvm_hv] (unreliable) + [c000001e19f07b50] [c008000000fd42e4] kvm_vm_ioctl_get_dirty_log_hv+0x33c/0x3c0 [kvm_hv] + [c000001e19f07be0] [c008000000eea878] kvm_vm_ioctl_get_dirty_log+0x30/0x50 [kvm] + [c000001e19f07c00] [c008000000edc818] kvm_vm_ioctl+0x2b0/0xc00 [kvm] + [c000001e19f07d50] [c00000000046e148] ksys_ioctl+0xf8/0x150 + [c000001e19f07da0] [c00000000046e1c8] sys_ioctl+0x28/0x80 + [c000001e19f07dc0] [c00000000003652c] system_call_exception+0x16c/0x240 + [c000001e19f07e20] [c00000000000d070] system_call_common+0xf0/0x278 + Instruction dump: + 7d3a512a 4200ffd0 7ffefb78 4bfffdc4 60000000 3c820000 e8848468 3c620000 + e86384a8 38840010 4800673d e8410018 <0fe00000> 4bfffdd4 60000000 60000000 + +Reported-by: Paul Mackerras +Signed-off-by: Aneesh Kumar K.V +Signed-off-by: Michael Ellerman +Link: https://lore.kernel.org/r/20200528080456.87797-1-aneesh.kumar@linux.ibm.com +Signed-off-by: Sasha Levin +--- + arch/powerpc/include/asm/kvm_book3s_64.h | 10 +++++++ + arch/powerpc/kvm/book3s_64_mmu_radix.c | 35 ++++++++++++++++++++---- + 2 files changed, 39 insertions(+), 6 deletions(-) + +diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h +index 2c2635967d6e0..0431db7b82af7 100644 +--- a/arch/powerpc/include/asm/kvm_book3s_64.h ++++ b/arch/powerpc/include/asm/kvm_book3s_64.h +@@ -635,6 +635,16 @@ extern void kvmhv_remove_nest_rmap_range(struct kvm *kvm, + unsigned long gpa, unsigned long hpa, + unsigned long nbytes); + ++static inline pte_t * ++find_kvm_secondary_pte_unlocked(struct kvm *kvm, unsigned long ea, ++ unsigned *hshift) ++{ ++ pte_t *pte; ++ ++ pte = __find_linux_pte(kvm->arch.pgtable, ea, NULL, hshift); ++ return pte; ++} ++ + static inline pte_t *find_kvm_secondary_pte(struct kvm *kvm, unsigned long ea, + unsigned *hshift) + { +diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c +index e9b3622405b1d..d4e532a63f08e 100644 +--- a/arch/powerpc/kvm/book3s_64_mmu_radix.c ++++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c +@@ -1052,7 +1052,7 @@ static int kvm_radix_test_clear_dirty(struct kvm *kvm, + { + unsigned long gfn = memslot->base_gfn + pagenum; + unsigned long gpa = gfn << PAGE_SHIFT; +- pte_t *ptep; ++ pte_t *ptep, pte; + unsigned int shift; + int ret = 0; + unsigned long old, *rmapp; +@@ -1060,12 +1060,35 @@ static int kvm_radix_test_clear_dirty(struct kvm *kvm, + if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE) + return ret; + +- ptep = find_kvm_secondary_pte(kvm, gpa, &shift); +- if (ptep && pte_present(*ptep) && pte_dirty(*ptep)) { +- ret = 1; +- if (shift) +- ret = 1 << (shift - PAGE_SHIFT); ++ /* ++ * For performance reasons we don't hold kvm->mmu_lock while walking the ++ * partition scoped table. 
++ */ ++ ptep = find_kvm_secondary_pte_unlocked(kvm, gpa, &shift); ++ if (!ptep) ++ return 0; ++ ++ pte = READ_ONCE(*ptep); ++ if (pte_present(pte) && pte_dirty(pte)) { + spin_lock(&kvm->mmu_lock); ++ /* ++ * Recheck the pte again ++ */ ++ if (pte_val(pte) != pte_val(*ptep)) { ++ /* ++ * We have KVM_MEM_LOG_DIRTY_PAGES enabled. Hence we can ++ * only find PAGE_SIZE pte entries here. We can continue ++ * to use the pte addr returned by above page table ++ * walk. ++ */ ++ if (!pte_present(*ptep) || !pte_dirty(*ptep)) { ++ spin_unlock(&kvm->mmu_lock); ++ return 0; ++ } ++ } ++ ++ ret = 1; ++ VM_BUG_ON(shift); + old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_DIRTY, 0, + gpa, shift); + kvmppc_radix_tlbie_page(kvm, gpa, shift, kvm->arch.lpid); +-- +2.25.1 + diff --git a/queue-5.7/powerpc-kvm-book3s-add-helper-to-walk-partition-scop.patch b/queue-5.7/powerpc-kvm-book3s-add-helper-to-walk-partition-scop.patch new file mode 100644 index 00000000000..f256d14cfbb --- /dev/null +++ b/queue-5.7/powerpc-kvm-book3s-add-helper-to-walk-partition-scop.patch @@ -0,0 +1,125 @@ +From 68c2647311131bdcce3df8eddb7fbc8e0c4147ed Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 5 May 2020 12:47:16 +0530 +Subject: powerpc/kvm/book3s: Add helper to walk partition scoped linux page + table. + +From: Aneesh Kumar K.V + +[ Upstream commit 4b99412ed6972cc77c1f16009e1d00323fcef9ab ] + +The locking rules for walking partition scoped table is different from process +scoped table. Hence add a helper for secondary linux page table walk and also +add check whether we are holding the right locks. + +Signed-off-by: Aneesh Kumar K.V +Signed-off-by: Michael Ellerman +Link: https://lore.kernel.org/r/20200505071729.54912-10-aneesh.kumar@linux.ibm.com +Signed-off-by: Sasha Levin +--- + arch/powerpc/include/asm/kvm_book3s_64.h | 13 +++++++++++++ + arch/powerpc/kvm/book3s_64_mmu_radix.c | 12 ++++++------ + arch/powerpc/kvm/book3s_hv_nested.c | 2 +- + 3 files changed, 20 insertions(+), 7 deletions(-) + +diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h +index 04b2b927bb5ae..2c2635967d6e0 100644 +--- a/arch/powerpc/include/asm/kvm_book3s_64.h ++++ b/arch/powerpc/include/asm/kvm_book3s_64.h +@@ -14,6 +14,7 @@ + #include + #include + #include ++#include + + #ifdef CONFIG_PPC_PSERIES + static inline bool kvmhv_on_pseries(void) +@@ -634,6 +635,18 @@ extern void kvmhv_remove_nest_rmap_range(struct kvm *kvm, + unsigned long gpa, unsigned long hpa, + unsigned long nbytes); + ++static inline pte_t *find_kvm_secondary_pte(struct kvm *kvm, unsigned long ea, ++ unsigned *hshift) ++{ ++ pte_t *pte; ++ ++ VM_WARN(!spin_is_locked(&kvm->mmu_lock), ++ "%s called with kvm mmu_lock not held \n", __func__); ++ pte = __find_linux_pte(kvm->arch.pgtable, ea, NULL, hshift); ++ ++ return pte; ++} ++ + #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ + + #endif /* __ASM_KVM_BOOK3S_64_H__ */ +diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c +index bc6c1aa3d0e92..e9b3622405b1d 100644 +--- a/arch/powerpc/kvm/book3s_64_mmu_radix.c ++++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c +@@ -993,11 +993,11 @@ int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, + return 0; + } + +- ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift); ++ ptep = find_kvm_secondary_pte(kvm, gpa, &shift); + if (ptep && pte_present(*ptep)) + kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot, + kvm->arch.lpid); +- return 0; ++ return 0; + } + + /* Called with kvm->mmu_lock held */ +@@ 
-1013,7 +1013,7 @@ int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, + if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE) + return ref; + +- ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift); ++ ptep = find_kvm_secondary_pte(kvm, gpa, &shift); + if (ptep && pte_present(*ptep) && pte_young(*ptep)) { + old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_ACCESSED, 0, + gpa, shift); +@@ -1040,7 +1040,7 @@ int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, + if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE) + return ref; + +- ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift); ++ ptep = find_kvm_secondary_pte(kvm, gpa, &shift); + if (ptep && pte_present(*ptep) && pte_young(*ptep)) + ref = 1; + return ref; +@@ -1060,7 +1060,7 @@ static int kvm_radix_test_clear_dirty(struct kvm *kvm, + if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE) + return ret; + +- ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift); ++ ptep = find_kvm_secondary_pte(kvm, gpa, &shift); + if (ptep && pte_present(*ptep) && pte_dirty(*ptep)) { + ret = 1; + if (shift) +@@ -1121,7 +1121,7 @@ void kvmppc_radix_flush_memslot(struct kvm *kvm, + gpa = memslot->base_gfn << PAGE_SHIFT; + spin_lock(&kvm->mmu_lock); + for (n = memslot->npages; n; --n) { +- ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift); ++ ptep = find_kvm_secondary_pte(kvm, gpa, &shift); + if (ptep && pte_present(*ptep)) + kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot, + kvm->arch.lpid); +diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c +index dc97e5be76f61..7f1fc5db13eab 100644 +--- a/arch/powerpc/kvm/book3s_hv_nested.c ++++ b/arch/powerpc/kvm/book3s_hv_nested.c +@@ -1362,7 +1362,7 @@ static long int __kvmhv_nested_page_fault(struct kvm_run *run, + /* See if can find translation in our partition scoped tables for L1 */ + pte = __pte(0); + spin_lock(&kvm->mmu_lock); +- pte_p = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift); ++ pte_p = find_kvm_secondary_pte(kvm, gpa, &shift); + if (!shift) + shift = PAGE_SHIFT; + if (pte_p) +-- +2.25.1 + diff --git a/queue-5.7/rxrpc-fix-race-between-incoming-ack-parser-and-retra.patch b/queue-5.7/rxrpc-fix-race-between-incoming-ack-parser-and-retra.patch new file mode 100644 index 00000000000..780c78b06b0 --- /dev/null +++ b/queue-5.7/rxrpc-fix-race-between-incoming-ack-parser-and-retra.patch @@ -0,0 +1,104 @@ +From 60f551918cd97f8d5f48bca2cb08cf0da61dd9ca Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 11 Jun 2020 21:57:00 +0100 +Subject: rxrpc: Fix race between incoming ACK parser and retransmitter + +From: David Howells + +[ Upstream commit 2ad6691d988c0c611362ddc2aad89e0fb50e3261 ] + +There's a race between the retransmission code and the received ACK parser. +The problem is that the retransmission loop has to drop the lock under +which it is iterating through the transmission buffer in order to transmit +a packet, but whilst the lock is dropped, the ACK parser can crank the Tx +window round and discard the packets from the buffer. + +The retransmission code then updated the annotations for the wrong packet +and a later retransmission thought it had to retransmit a packet that +wasn't there, leading to a NULL pointer dereference. + +Fix this by: + + (1) Moving the annotation change to before we drop the lock prior to + transmission. This means we can't vary the annotation depending on + the outcome of the transmission, but that's fine - we'll retransmit + again later if it failed now. 
+ + (2) Skipping the packet if the skb pointer is NULL. + +The following oops was seen: + + BUG: kernel NULL pointer dereference, address: 000000000000002d + Workqueue: krxrpcd rxrpc_process_call + RIP: 0010:rxrpc_get_skb+0x14/0x8a + ... + Call Trace: + rxrpc_resend+0x331/0x41e + ? get_vtime_delta+0x13/0x20 + rxrpc_process_call+0x3c0/0x4ac + process_one_work+0x18f/0x27f + worker_thread+0x1a3/0x247 + ? create_worker+0x17d/0x17d + kthread+0xe6/0xeb + ? kthread_delayed_work_timer_fn+0x83/0x83 + ret_from_fork+0x1f/0x30 + +Fixes: 248f219cb8bc ("rxrpc: Rewrite the data and ack handling code") +Signed-off-by: David Howells +Signed-off-by: David S. Miller +Signed-off-by: Sasha Levin +--- + net/rxrpc/call_event.c | 29 +++++++++++------------------ + 1 file changed, 11 insertions(+), 18 deletions(-) + +diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c +index 2a65ac41055f5..985fb89202d0c 100644 +--- a/net/rxrpc/call_event.c ++++ b/net/rxrpc/call_event.c +@@ -248,7 +248,18 @@ static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j) + if (anno_type != RXRPC_TX_ANNO_RETRANS) + continue; + ++ /* We need to reset the retransmission state, but we need to do ++ * so before we drop the lock as a new ACK/NAK may come in and ++ * confuse things ++ */ ++ annotation &= ~RXRPC_TX_ANNO_MASK; ++ annotation |= RXRPC_TX_ANNO_RESENT; ++ call->rxtx_annotations[ix] = annotation; ++ + skb = call->rxtx_buffer[ix]; ++ if (!skb) ++ continue; ++ + rxrpc_get_skb(skb, rxrpc_skb_got); + spin_unlock_bh(&call->lock); + +@@ -262,24 +273,6 @@ static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j) + + rxrpc_free_skb(skb, rxrpc_skb_freed); + spin_lock_bh(&call->lock); +- +- /* We need to clear the retransmit state, but there are two +- * things we need to be aware of: A new ACK/NAK might have been +- * received and the packet might have been hard-ACK'd (in which +- * case it will no longer be in the buffer). +- */ +- if (after(seq, call->tx_hard_ack)) { +- annotation = call->rxtx_annotations[ix]; +- anno_type = annotation & RXRPC_TX_ANNO_MASK; +- if (anno_type == RXRPC_TX_ANNO_RETRANS || +- anno_type == RXRPC_TX_ANNO_NAK) { +- annotation &= ~RXRPC_TX_ANNO_MASK; +- annotation |= RXRPC_TX_ANNO_UNACK; +- } +- annotation |= RXRPC_TX_ANNO_RESENT; +- call->rxtx_annotations[ix] = annotation; +- } +- + if (after(call->tx_hard_ack, seq)) + seq = call->tx_hard_ack; + } +-- +2.25.1 + diff --git a/queue-5.7/s390-debug-avoid-kernel-warning-on-too-large-number-.patch b/queue-5.7/s390-debug-avoid-kernel-warning-on-too-large-number-.patch new file mode 100644 index 00000000000..f3dd5f43407 --- /dev/null +++ b/queue-5.7/s390-debug-avoid-kernel-warning-on-too-large-number-.patch @@ -0,0 +1,41 @@ +From 3f022741052cd78f4ad6856fcf7930c4c0c6615c Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 31 Mar 2020 05:57:23 -0400 +Subject: s390/debug: avoid kernel warning on too large number of pages + +From: Christian Borntraeger + +[ Upstream commit 827c4913923e0b441ba07ba4cc41e01181102303 ] + +When specifying insanely large debug buffers a kernel warning is +printed. The debug code does handle the error gracefully, though. +Instead of duplicating the check let us silence the warning to +avoid crashes when panic_on_warn is used. 
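+
+As a rough illustration of the pattern (assumed names, not the actual
+s390 debug code): an allocation whose failure the caller already
+handles can pass __GFP_NOWARN, so an oversized, user-supplied size
+fails quietly instead of tripping the allocator's WARN:
+
+    #include <linux/slab.h>
+
+    static void **alloc_area(int pages_per_area)
+    {
+            /*
+             * pages_per_area is user-controlled and may be huge;
+             * __GFP_NOWARN turns the expected allocation failure
+             * into a plain NULL return instead of a WARN splat,
+             * which would be fatal under panic_on_warn.
+             */
+            return kmalloc_array(pages_per_area, sizeof(void *),
+                                 GFP_KERNEL | __GFP_NOWARN);
+    }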
+ +Signed-off-by: Christian Borntraeger +Reviewed-by: Heiko Carstens +Signed-off-by: Heiko Carstens +Signed-off-by: Sasha Levin +--- + arch/s390/kernel/debug.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/arch/s390/kernel/debug.c b/arch/s390/kernel/debug.c +index 6d321f5f101d6..7184d55d87aae 100644 +--- a/arch/s390/kernel/debug.c ++++ b/arch/s390/kernel/debug.c +@@ -198,9 +198,10 @@ static debug_entry_t ***debug_areas_alloc(int pages_per_area, int nr_areas) + if (!areas) + goto fail_malloc_areas; + for (i = 0; i < nr_areas; i++) { ++ /* GFP_NOWARN to avoid user triggerable WARN, we handle fails */ + areas[i] = kmalloc_array(pages_per_area, + sizeof(debug_entry_t *), +- GFP_KERNEL); ++ GFP_KERNEL | __GFP_NOWARN); + if (!areas[i]) + goto fail_malloc_areas2; + for (j = 0; j < pages_per_area; j++) { +-- +2.25.1 + diff --git a/queue-5.7/sched-debug-make-sd-flags-sysctl-read-only.patch b/queue-5.7/sched-debug-make-sd-flags-sysctl-read-only.patch new file mode 100644 index 00000000000..9ff23b95eb8 --- /dev/null +++ b/queue-5.7/sched-debug-make-sd-flags-sysctl-read-only.patch @@ -0,0 +1,48 @@ +From 063606d9898dc86bbbf7e0a61c30a0b0d05c8c4b Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 15 Apr 2020 22:05:05 +0100 +Subject: sched/debug: Make sd->flags sysctl read-only + +From: Valentin Schneider + +[ Upstream commit 9818427c6270a9ce8c52c8621026fe9cebae0f92 ] + +Writing to the sysctl of a sched_domain->flags directly updates the value of +the field, and goes nowhere near update_top_cache_domain(). This means that +the cached domain pointers can end up containing stale data (e.g. the +domain pointed to doesn't have the relevant flag set anymore). + +Explicit domain walks that check for flags will be affected by +the write, but this won't be in sync with the cached pointers which will +still point to the domains that were cached at the last sched_domain +build. + +In other words, writing to this interface is playing a dangerous game. It +could be made to trigger an update of the cached sched_domain pointers when +written to, but this does not seem to be worth the trouble. Make it +read-only. 
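+
+A hypothetical sketch of the hazard being closed (illustrative types
+and names, not scheduler code): a pointer cached during a flags walk
+is never refreshed when the flag word is later overwritten directly,
+so explicit flag checks and the cached pointer end up describing two
+different topologies:
+
+    struct domain {
+            int flags;
+            struct domain *parent;
+    };
+
+    static struct domain *cached_llc;       /* set at domain build */
+
+    static void cache_domains(struct domain *sd)
+    {
+            for (; sd; sd = sd->parent)
+                    if (sd->flags & 0x1)    /* e.g. "shares cache" */
+                            cached_llc = sd;
+    }
+
+    /*
+     * Clearing bit 0 of sd->flags behind the scheduler's back leaves
+     * cached_llc pointing at a domain that no longer carries the
+     * flag; making the sysctl file 0444 rules such writes out.
+     */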
+ +Signed-off-by: Valentin Schneider +Signed-off-by: Peter Zijlstra (Intel) +Link: https://lkml.kernel.org/r/20200415210512.805-3-valentin.schneider@arm.com +Signed-off-by: Sasha Levin +--- + kernel/sched/debug.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c +index 239970b991c03..0f4aaad236a9d 100644 +--- a/kernel/sched/debug.c ++++ b/kernel/sched/debug.c +@@ -258,7 +258,7 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd) + set_table_entry(&table[2], "busy_factor", &sd->busy_factor, sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[3], "imbalance_pct", &sd->imbalance_pct, sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[4], "cache_nice_tries", &sd->cache_nice_tries, sizeof(int), 0644, proc_dointvec_minmax); +- set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0644, proc_dointvec_minmax); ++ set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0444, proc_dointvec_minmax); + set_table_entry(&table[6], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax); + set_table_entry(&table[7], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring); + /* &table[8] is terminator */ +-- +2.25.1 + diff --git a/queue-5.7/seg6-fix-seg6_validate_srh-to-avoid-slab-out-of-boun.patch b/queue-5.7/seg6-fix-seg6_validate_srh-to-avoid-slab-out-of-boun.patch new file mode 100644 index 00000000000..5eb4dff6741 --- /dev/null +++ b/queue-5.7/seg6-fix-seg6_validate_srh-to-avoid-slab-out-of-boun.patch @@ -0,0 +1,169 @@ +From 2fee62416154243ada38b173bf0d55dfcf5a14a1 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 3 Jun 2020 06:54:42 +0000 +Subject: seg6: fix seg6_validate_srh() to avoid slab-out-of-bounds +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Ahmed Abdelsalam + +[ Upstream commit bb986a50421a11bf31a81afb15b9b8f45a4a3a11 ] + +The seg6_validate_srh() is used to validate SRH for three cases: + +case1: SRH of data-plane SRv6 packets to be processed by the Linux kernel. +Case2: SRH of the netlink message received from user-space (iproute2) +Case3: SRH injected into packets through setsockopt + +In case1, the SRH can be encoded in the Reduced way (i.e., first SID is +carried in DA only and not represented as SID in the SRH) and the +seg6_validate_srh() now handles this case correctly. + +In case2 and case3, the SRH shouldn’t be encoded in the Reduced way +otherwise we lose the first segment (i.e., the first hop). + +The current implementation of the seg6_validate_srh() allow SRH of case2 +and case3 to be encoded in the Reduced way. This leads a slab-out-of-bounds +problem. + +This patch verifies SRH of case1, case2 and case3. Allowing case1 to be +reduced while preventing SRH of case2 and case3 from being reduced . + +Reported-by: syzbot+e8c028b62439eac42073@syzkaller.appspotmail.com +Reported-by: YueHaibing +Fixes: 0cb7498f234e ("seg6: fix SRH processing to comply with RFC8754") +Signed-off-by: Ahmed Abdelsalam +Signed-off-by: David S. 
Miller +Signed-off-by: Sasha Levin +--- + include/net/seg6.h | 2 +- + net/core/filter.c | 2 +- + net/ipv6/ipv6_sockglue.c | 2 +- + net/ipv6/seg6.c | 16 ++++++++++------ + net/ipv6/seg6_iptunnel.c | 2 +- + net/ipv6/seg6_local.c | 6 +++--- + 6 files changed, 17 insertions(+), 13 deletions(-) + +diff --git a/include/net/seg6.h b/include/net/seg6.h +index 640724b352731..9d19c15e8545c 100644 +--- a/include/net/seg6.h ++++ b/include/net/seg6.h +@@ -57,7 +57,7 @@ extern void seg6_iptunnel_exit(void); + extern int seg6_local_init(void); + extern void seg6_local_exit(void); + +-extern bool seg6_validate_srh(struct ipv6_sr_hdr *srh, int len); ++extern bool seg6_validate_srh(struct ipv6_sr_hdr *srh, int len, bool reduced); + extern int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, + int proto); + extern int seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh); +diff --git a/net/core/filter.c b/net/core/filter.c +index 9512a9772d691..45fa65a289833 100644 +--- a/net/core/filter.c ++++ b/net/core/filter.c +@@ -4920,7 +4920,7 @@ static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len + int err; + struct ipv6_sr_hdr *srh = (struct ipv6_sr_hdr *)hdr; + +- if (!seg6_validate_srh(srh, len)) ++ if (!seg6_validate_srh(srh, len, false)) + return -EINVAL; + + switch (type) { +diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c +index 5af97b4f5df30..ff187fd2083ff 100644 +--- a/net/ipv6/ipv6_sockglue.c ++++ b/net/ipv6/ipv6_sockglue.c +@@ -458,7 +458,7 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, + struct ipv6_sr_hdr *srh = (struct ipv6_sr_hdr *) + opt->srcrt; + +- if (!seg6_validate_srh(srh, optlen)) ++ if (!seg6_validate_srh(srh, optlen, false)) + goto sticky_done; + break; + } +diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c +index 37b434293bda3..d2f8138e5a73a 100644 +--- a/net/ipv6/seg6.c ++++ b/net/ipv6/seg6.c +@@ -25,7 +25,7 @@ + #include + #endif + +-bool seg6_validate_srh(struct ipv6_sr_hdr *srh, int len) ++bool seg6_validate_srh(struct ipv6_sr_hdr *srh, int len, bool reduced) + { + unsigned int tlv_offset; + int max_last_entry; +@@ -37,13 +37,17 @@ bool seg6_validate_srh(struct ipv6_sr_hdr *srh, int len) + if (((srh->hdrlen + 1) << 3) != len) + return false; + +- max_last_entry = (srh->hdrlen / 2) - 1; +- +- if (srh->first_segment > max_last_entry) ++ if (!reduced && srh->segments_left > srh->first_segment) { + return false; ++ } else { ++ max_last_entry = (srh->hdrlen / 2) - 1; + +- if (srh->segments_left > srh->first_segment + 1) +- return false; ++ if (srh->first_segment > max_last_entry) ++ return false; ++ ++ if (srh->segments_left > srh->first_segment + 1) ++ return false; ++ } + + tlv_offset = sizeof(*srh) + ((srh->first_segment + 1) << 4); + +diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c +index c7cbfeae94f5e..e0e9f48ab14fe 100644 +--- a/net/ipv6/seg6_iptunnel.c ++++ b/net/ipv6/seg6_iptunnel.c +@@ -426,7 +426,7 @@ static int seg6_build_state(struct net *net, struct nlattr *nla, + } + + /* verify that SRH is consistent */ +- if (!seg6_validate_srh(tuninfo->srh, tuninfo_len - sizeof(*tuninfo))) ++ if (!seg6_validate_srh(tuninfo->srh, tuninfo_len - sizeof(*tuninfo), false)) + return -EINVAL; + + newts = lwtunnel_state_alloc(tuninfo_len + sizeof(*slwt)); +diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c +index 52493423f3299..eba23279912df 100644 +--- a/net/ipv6/seg6_local.c ++++ b/net/ipv6/seg6_local.c +@@ -87,7 +87,7 @@ static struct ipv6_sr_hdr *get_srh(struct sk_buff *skb) + 
*/ + srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); + +- if (!seg6_validate_srh(srh, len)) ++ if (!seg6_validate_srh(srh, len, true)) + return NULL; + + return srh; +@@ -495,7 +495,7 @@ bool seg6_bpf_has_valid_srh(struct sk_buff *skb) + return false; + + srh->hdrlen = (u8)(srh_state->hdrlen >> 3); +- if (!seg6_validate_srh(srh, (srh->hdrlen + 1) << 3)) ++ if (!seg6_validate_srh(srh, (srh->hdrlen + 1) << 3, true)) + return false; + + srh_state->valid = true; +@@ -670,7 +670,7 @@ static int parse_nla_srh(struct nlattr **attrs, struct seg6_local_lwt *slwt) + if (len < sizeof(*srh) + sizeof(struct in6_addr)) + return -EINVAL; + +- if (!seg6_validate_srh(srh, len)) ++ if (!seg6_validate_srh(srh, len, false)) + return -EINVAL; + + slwt->srh = kmemdup(srh, len, GFP_KERNEL); +-- +2.25.1 + diff --git a/queue-5.7/series b/queue-5.7/series index a3da2fbfb46..d63d36da25c 100644 --- a/queue-5.7/series +++ b/queue-5.7/series @@ -8,3 +8,33 @@ btrfs-fix-race-between-block-group-removal-and-block.patch mm-fix-swap-cache-node-allocation-mask.patch drm-amd-display-fix-incorrectly-pruned-modes-with-de.patch drm-amd-display-fix-ineffective-setting-of-max-bpc-p.patch +seg6-fix-seg6_validate_srh-to-avoid-slab-out-of-boun.patch +tipc-add-test-for-nagle-algorithm-effectiveness.patch +tipc-fix-kernel-warning-in-tipc_msg_append.patch +usbnet-smsc95xx-fix-use-after-free-after-removal.patch +tipc-fix-null-pointer-dereference-in-__tipc_sendstre.patch +drm-i915-gt-mark-timeline-cacheline-as-destroyed-aft.patch +drm-amdgpu-disable-ras-query-and-iject-during-gpu-re.patch +drm-amdgpu-fix-non-pointer-dereference-for-non-ras-s.patch +drm-amdgpu-fix-kernel-page-fault-issue-by-ras-recove.patch +sched-debug-make-sd-flags-sysctl-read-only.patch +soc-ti-omap-prm-use-atomic-iopoll-instead-of-sleepin.patch +powerpc-kvm-book3s-add-helper-to-walk-partition-scop.patch +powerpc-book3s64-kvm-fix-secondary-page-table-walk-w.patch +mm-slub.c-fix-corrupted-freechain-in-deactivate_slab.patch +mm-slub-fix-stack-overruns-with-slub_stats.patch +mm-dump_page-do-not-crash-with-invalid-mapping-point.patch +io_uring-fix-sq-io-poll-with-unsupported-opcodes.patch +rxrpc-fix-race-between-incoming-ack-parser-and-retra.patch +usb-usbtest-fix-missing-kfree-dev-buf-in-usbtest_dis.patch +tools-lib-traceevent-add-append-function-helper-for-.patch +tools-lib-traceevent-handle-__attribute__-user-in-fi.patch +s390-debug-avoid-kernel-warning-on-too-large-number-.patch +io_uring-fix-io_sq_thread-no-schedule-when-busy.patch +nvme-multipath-set-bdi-capabilities-once.patch +nvme-fix-possible-deadlock-when-i-o-is-blocked.patch +nvme-multipath-fix-deadlock-between-ana_work-and-sca.patch +nvme-multipath-fix-deadlock-due-to-head-lock.patch +nvme-multipath-fix-bogus-request-queue-reference-put.patch +io_uring-fix-current-mm-null-dereference-on-exit.patch +kgdb-avoid-suspicious-rcu-usage-warning.patch diff --git a/queue-5.7/soc-ti-omap-prm-use-atomic-iopoll-instead-of-sleepin.patch b/queue-5.7/soc-ti-omap-prm-use-atomic-iopoll-instead-of-sleepin.patch new file mode 100644 index 00000000000..1d9c369f65e --- /dev/null +++ b/queue-5.7/soc-ti-omap-prm-use-atomic-iopoll-instead-of-sleepin.patch @@ -0,0 +1,45 @@ +From 1c988ce76f672c8249dd8c22c939b4d04b72c55f Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 14 May 2020 10:37:18 +0300 +Subject: soc: ti: omap-prm: use atomic iopoll instead of sleeping one + +From: Tero Kristo + +[ Upstream commit 98ece19f247159a51003796ede7112fef2df5d7f ] + +The reset handling APIs for omap-prm can be invoked PM runtime which +runs in 
atomic context. For this to work properly, switch to atomic +iopoll version instead of the current which can sleep. Otherwise, +this throws a "BUG: scheduling while atomic" warning. Issue is seen +rather easily when CONFIG_PREEMPT is enabled. + +Signed-off-by: Tero Kristo +Acked-by: Santosh Shilimkar +Signed-off-by: Tony Lindgren +Signed-off-by: Sasha Levin +--- + drivers/soc/ti/omap_prm.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/drivers/soc/ti/omap_prm.c b/drivers/soc/ti/omap_prm.c +index 96c6f777519c0..c9b3f9ebf0bbf 100644 +--- a/drivers/soc/ti/omap_prm.c ++++ b/drivers/soc/ti/omap_prm.c +@@ -256,10 +256,10 @@ static int omap_reset_deassert(struct reset_controller_dev *rcdev, + goto exit; + + /* wait for the status to be set */ +- ret = readl_relaxed_poll_timeout(reset->prm->base + +- reset->prm->data->rstst, +- v, v & BIT(st_bit), 1, +- OMAP_RESET_MAX_WAIT); ++ ret = readl_relaxed_poll_timeout_atomic(reset->prm->base + ++ reset->prm->data->rstst, ++ v, v & BIT(st_bit), 1, ++ OMAP_RESET_MAX_WAIT); + if (ret) + pr_err("%s: timedout waiting for %s:%lu\n", __func__, + reset->prm->data->name, id); +-- +2.25.1 + diff --git a/queue-5.7/tipc-add-test-for-nagle-algorithm-effectiveness.patch b/queue-5.7/tipc-add-test-for-nagle-algorithm-effectiveness.patch new file mode 100644 index 00000000000..f1b859b1c16 --- /dev/null +++ b/queue-5.7/tipc-add-test-for-nagle-algorithm-effectiveness.patch @@ -0,0 +1,282 @@ +From afcf3e9b57000d467f2a99e77ebdc09eca724d64 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 26 May 2020 16:38:38 +0700 +Subject: tipc: add test for Nagle algorithm effectiveness + +From: Tuong Lien + +[ Upstream commit 0a3e060f340dbe232ffa290c40f879b7f7db595b ] + +When streaming in Nagle mode, we try to bundle small messages from user +as many as possible if there is one outstanding buffer, i.e. not ACK-ed +by the receiving side, which helps boost up the overall throughput. So, +the algorithm's effectiveness really depends on when Nagle ACK comes or +what the specific network latency (RTT) is, compared to the user's +message sending rate. + +In a bad case, the user's sending rate is low or the network latency is +small, there will not be many bundles, so making a Nagle ACK or waiting +for it is not meaningful. +For example: a user sends its messages every 100ms and the RTT is 50ms, +then for each messages, we require one Nagle ACK but then there is only +one user message sent without any bundles. + +In a better case, even if we have a few bundles (e.g. the RTT = 300ms), +but now the user sends messages in medium size, then there will not be +any difference at all, that says 3 x 1000-byte data messages if bundled +will still result in 3 bundles with MTU = 1500. + +When Nagle is ineffective, the delay in user message sending is clearly +wasted instead of sending directly. + +Besides, adding Nagle ACKs will consume some processor load on both the +sending and receiving sides. + +This commit adds a test on the effectiveness of the Nagle algorithm for +an individual connection in the network on which it actually runs. +Particularly, upon receipt of a Nagle ACK we will compare the number of +bundles in the backlog queue to the number of user messages which would +be sent directly without Nagle. If the ratio is good (e.g. >= 2), Nagle +mode will be kept for further message sending. Otherwise, we will leave +Nagle and put a 'penalty' on the connection, so it will have to spend +more 'one-way' messages before being able to re-enter Nagle. 
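+
+Condensed into a sketch (mirroring the fields this patch adds, not a
+drop-in function), the test run on receipt of a Nagle ACK is roughly:
+
+    #define NAGLE_START_INIT 4
+    #define NAGLE_START_MAX  1024
+
+    /* illustrative subset of struct tipc_sock */
+    struct tsk_sketch {
+            unsigned int oneway;            /* one-way msgs seen */
+            unsigned int nagle_start;       /* msgs before Nagle */
+            unsigned short msg_acc;         /* user msgs queued */
+            unsigned short pkt_cnt;         /* packets sent */
+    };
+
+    static void nagle_ack(struct tsk_sketch *tsk)
+    {
+            if (!tsk->pkt_cnt || tsk->msg_acc / tsk->pkt_cnt < 2) {
+                    /* bad ratio: leave Nagle and double the number
+                     * of one-way messages needed to re-enter it
+                     */
+                    tsk->oneway = 0;
+                    if (tsk->nagle_start < NAGLE_START_MAX)
+                            tsk->nagle_start *= 2;
+            } else {
+                    /* >= 2 user messages per packet: keep Nagle */
+                    tsk->nagle_start = NAGLE_START_INIT;
+            }
+            tsk->msg_acc = 0;
+            tsk->pkt_cnt = 0;
+    }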
+ +In addition, the 'ack-required' bit is only set when really needed that +the number of Nagle ACKs will be reduced during Nagle mode. + +Testing with benchmark showed that with the patch, there was not much +difference in throughput for small messages since the tool continuously +sends messages without a break, so Nagle would still take in effect. + +Acked-by: Ying Xue +Acked-by: Jon Maloy +Signed-off-by: Tuong Lien +Signed-off-by: David S. Miller +Signed-off-by: Sasha Levin +--- + net/tipc/msg.c | 3 --- + net/tipc/msg.h | 14 +++++++++-- + net/tipc/socket.c | 64 ++++++++++++++++++++++++++++++++++++++--------- + 3 files changed, 64 insertions(+), 17 deletions(-) + +diff --git a/net/tipc/msg.c b/net/tipc/msg.c +index 3ad411884e6c0..93966321f8929 100644 +--- a/net/tipc/msg.c ++++ b/net/tipc/msg.c +@@ -235,9 +235,6 @@ int tipc_msg_append(struct tipc_msg *_hdr, struct msghdr *m, int dlen, + msg_set_size(hdr, MIN_H_SIZE); + __skb_queue_tail(txq, skb); + total += 1; +- if (prev) +- msg_set_ack_required(buf_msg(prev), 0); +- msg_set_ack_required(hdr, 1); + } + hdr = buf_msg(skb); + curr = msg_blocks(hdr); +diff --git a/net/tipc/msg.h b/net/tipc/msg.h +index 871feadbbc191..a4e2029170b1b 100644 +--- a/net/tipc/msg.h ++++ b/net/tipc/msg.h +@@ -321,9 +321,19 @@ static inline int msg_ack_required(struct tipc_msg *m) + return msg_bits(m, 0, 18, 1); + } + +-static inline void msg_set_ack_required(struct tipc_msg *m, u32 d) ++static inline void msg_set_ack_required(struct tipc_msg *m) + { +- msg_set_bits(m, 0, 18, 1, d); ++ msg_set_bits(m, 0, 18, 1, 1); ++} ++ ++static inline int msg_nagle_ack(struct tipc_msg *m) ++{ ++ return msg_bits(m, 0, 18, 1); ++} ++ ++static inline void msg_set_nagle_ack(struct tipc_msg *m) ++{ ++ msg_set_bits(m, 0, 18, 1, 1); + } + + static inline bool msg_is_rcast(struct tipc_msg *m) +diff --git a/net/tipc/socket.c b/net/tipc/socket.c +index e370ad0edd768..d6b67d07d22ec 100644 +--- a/net/tipc/socket.c ++++ b/net/tipc/socket.c +@@ -48,6 +48,8 @@ + #include "group.h" + #include "trace.h" + ++#define NAGLE_START_INIT 4 ++#define NAGLE_START_MAX 1024 + #define CONN_TIMEOUT_DEFAULT 8000 /* default connect timeout = 8s */ + #define CONN_PROBING_INTV msecs_to_jiffies(3600000) /* [ms] => 1 h */ + #define TIPC_FWD_MSG 1 +@@ -119,7 +121,10 @@ struct tipc_sock { + struct rcu_head rcu; + struct tipc_group *group; + u32 oneway; ++ u32 nagle_start; + u16 snd_backlog; ++ u16 msg_acc; ++ u16 pkt_cnt; + bool expect_ack; + bool nodelay; + bool group_is_open; +@@ -143,7 +148,7 @@ static int tipc_sk_insert(struct tipc_sock *tsk); + static void tipc_sk_remove(struct tipc_sock *tsk); + static int __tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dsz); + static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dsz); +-static void tipc_sk_push_backlog(struct tipc_sock *tsk); ++static void tipc_sk_push_backlog(struct tipc_sock *tsk, bool nagle_ack); + + static const struct proto_ops packet_ops; + static const struct proto_ops stream_ops; +@@ -474,6 +479,7 @@ static int tipc_sk_create(struct net *net, struct socket *sock, + tsk = tipc_sk(sk); + tsk->max_pkt = MAX_PKT_DEFAULT; + tsk->maxnagle = 0; ++ tsk->nagle_start = NAGLE_START_INIT; + INIT_LIST_HEAD(&tsk->publications); + INIT_LIST_HEAD(&tsk->cong_links); + msg = &tsk->phdr; +@@ -541,7 +547,7 @@ static void __tipc_shutdown(struct socket *sock, int error) + !tsk_conn_cong(tsk))); + + /* Push out delayed messages if in Nagle mode */ +- tipc_sk_push_backlog(tsk); ++ tipc_sk_push_backlog(tsk, false); + /* Remove pending SYN 
*/ + __skb_queue_purge(&sk->sk_write_queue); + +@@ -1252,14 +1258,37 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq, + /* tipc_sk_push_backlog(): send accumulated buffers in socket write queue + * when socket is in Nagle mode + */ +-static void tipc_sk_push_backlog(struct tipc_sock *tsk) ++static void tipc_sk_push_backlog(struct tipc_sock *tsk, bool nagle_ack) + { + struct sk_buff_head *txq = &tsk->sk.sk_write_queue; ++ struct sk_buff *skb = skb_peek_tail(txq); + struct net *net = sock_net(&tsk->sk); + u32 dnode = tsk_peer_node(tsk); +- struct sk_buff *skb = skb_peek(txq); + int rc; + ++ if (nagle_ack) { ++ tsk->pkt_cnt += skb_queue_len(txq); ++ if (!tsk->pkt_cnt || tsk->msg_acc / tsk->pkt_cnt < 2) { ++ tsk->oneway = 0; ++ if (tsk->nagle_start < NAGLE_START_MAX) ++ tsk->nagle_start *= 2; ++ tsk->expect_ack = false; ++ pr_debug("tsk %10u: bad nagle %u -> %u, next start %u!\n", ++ tsk->portid, tsk->msg_acc, tsk->pkt_cnt, ++ tsk->nagle_start); ++ } else { ++ tsk->nagle_start = NAGLE_START_INIT; ++ if (skb) { ++ msg_set_ack_required(buf_msg(skb)); ++ tsk->expect_ack = true; ++ } else { ++ tsk->expect_ack = false; ++ } ++ } ++ tsk->msg_acc = 0; ++ tsk->pkt_cnt = 0; ++ } ++ + if (!skb || tsk->cong_link_cnt) + return; + +@@ -1267,9 +1296,10 @@ static void tipc_sk_push_backlog(struct tipc_sock *tsk) + if (msg_is_syn(buf_msg(skb))) + return; + ++ if (tsk->msg_acc) ++ tsk->pkt_cnt += skb_queue_len(txq); + tsk->snt_unacked += tsk->snd_backlog; + tsk->snd_backlog = 0; +- tsk->expect_ack = true; + rc = tipc_node_xmit(net, txq, dnode, tsk->portid); + if (rc == -ELINKCONG) + tsk->cong_link_cnt = 1; +@@ -1322,8 +1352,7 @@ static void tipc_sk_conn_proto_rcv(struct tipc_sock *tsk, struct sk_buff *skb, + return; + } else if (mtyp == CONN_ACK) { + was_cong = tsk_conn_cong(tsk); +- tsk->expect_ack = false; +- tipc_sk_push_backlog(tsk); ++ tipc_sk_push_backlog(tsk, msg_nagle_ack(hdr)); + tsk->snt_unacked -= msg_conn_ack(hdr); + if (tsk->peer_caps & TIPC_BLOCK_FLOWCTL) + tsk->snd_win = msg_adv_win(hdr); +@@ -1516,6 +1545,7 @@ static int __tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dlen) + struct tipc_sock *tsk = tipc_sk(sk); + struct tipc_msg *hdr = &tsk->phdr; + struct net *net = sock_net(sk); ++ struct sk_buff *skb; + u32 dnode = tsk_peer_node(tsk); + int maxnagle = tsk->maxnagle; + int maxpkt = tsk->max_pkt; +@@ -1544,17 +1574,25 @@ static int __tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dlen) + break; + send = min_t(size_t, dlen - sent, TIPC_MAX_USER_MSG_SIZE); + blocks = tsk->snd_backlog; +- if (tsk->oneway++ >= 4 && send <= maxnagle) { ++ if (tsk->oneway++ >= tsk->nagle_start && send <= maxnagle) { + rc = tipc_msg_append(hdr, m, send, maxnagle, txq); + if (unlikely(rc < 0)) + break; + blocks += rc; ++ tsk->msg_acc++; + if (blocks <= 64 && tsk->expect_ack) { + tsk->snd_backlog = blocks; + sent += send; + break; ++ } else if (blocks > 64) { ++ tsk->pkt_cnt += skb_queue_len(txq); ++ } else { ++ skb = skb_peek_tail(txq); ++ msg_set_ack_required(buf_msg(skb)); ++ tsk->expect_ack = true; ++ tsk->msg_acc = 0; ++ tsk->pkt_cnt = 0; + } +- tsk->expect_ack = true; + } else { + rc = tipc_msg_build(hdr, m, sent, send, maxpkt, txq); + if (unlikely(rc != send)) +@@ -2091,7 +2129,7 @@ static void tipc_sk_proto_rcv(struct sock *sk, + smp_wmb(); + tsk->cong_link_cnt--; + wakeup = true; +- tipc_sk_push_backlog(tsk); ++ tipc_sk_push_backlog(tsk, false); + break; + case GROUP_PROTOCOL: + tipc_group_proto_rcv(grp, &wakeup, hdr, inputq, xmitq); +@@ -2180,7 +2218,7 @@ 
static bool tipc_sk_filter_connect(struct tipc_sock *tsk, struct sk_buff *skb, + return false; + case TIPC_ESTABLISHED: + if (!skb_queue_empty(&sk->sk_write_queue)) +- tipc_sk_push_backlog(tsk); ++ tipc_sk_push_backlog(tsk, false); + /* Accept only connection-based messages sent by peer */ + if (likely(con_msg && !err && pport == oport && + pnode == onode)) { +@@ -2188,8 +2226,10 @@ static bool tipc_sk_filter_connect(struct tipc_sock *tsk, struct sk_buff *skb, + struct sk_buff *skb; + + skb = tipc_sk_build_ack(tsk); +- if (skb) ++ if (skb) { ++ msg_set_nagle_ack(buf_msg(skb)); + __skb_queue_tail(xmitq, skb); ++ } + } + return true; + } +-- +2.25.1 + diff --git a/queue-5.7/tipc-fix-kernel-warning-in-tipc_msg_append.patch b/queue-5.7/tipc-fix-kernel-warning-in-tipc_msg_append.patch new file mode 100644 index 00000000000..300ffb4b9d5 --- /dev/null +++ b/queue-5.7/tipc-fix-kernel-warning-in-tipc_msg_append.patch @@ -0,0 +1,80 @@ +From 188dba631a2392fe38321d576a8e5d0d098545bf Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 11 Jun 2020 17:07:35 +0700 +Subject: tipc: fix kernel WARNING in tipc_msg_append() + +From: Tuong Lien + +[ Upstream commit c9aa81faf19115fc2e732e7f210b37bb316987ff ] + +syzbot found the following issue: + +WARNING: CPU: 0 PID: 6808 at include/linux/thread_info.h:150 check_copy_size include/linux/thread_info.h:150 [inline] +WARNING: CPU: 0 PID: 6808 at include/linux/thread_info.h:150 copy_from_iter include/linux/uio.h:144 [inline] +WARNING: CPU: 0 PID: 6808 at include/linux/thread_info.h:150 tipc_msg_append+0x49a/0x5e0 net/tipc/msg.c:242 +Kernel panic - not syncing: panic_on_warn set ... + +This happens after commit 5e9eeccc58f3 ("tipc: fix NULL pointer +dereference in streaming") that tried to build at least one buffer even +when the message data length is zero... However, it now exposes another +bug that the 'mss' can be zero and the 'cpy' will be negative, thus the +above kernel WARNING will appear! +The zero value of 'mss' is never expected because it means Nagle is not +enabled for the socket (actually the socket type was 'SOCK_SEQPACKET'), +so the function 'tipc_msg_append()' must not be called at all. But that +was in this particular case since the message data length was zero, and +the 'send <= maxnagle' check became true. + +We resolve the issue by explicitly checking if Nagle is enabled for the +socket, i.e. 'maxnagle != 0' before calling the 'tipc_msg_append()'. We +also reinforce the function to against such a negative values if any. + +Reported-by: syzbot+75139a7d2605236b0b7f@syzkaller.appspotmail.com +Fixes: c0bceb97db9e ("tipc: add smart nagle feature") +Acked-by: Jon Maloy +Signed-off-by: Tuong Lien +Signed-off-by: David S. 
Miller +Signed-off-by: Sasha Levin +--- + net/tipc/msg.c | 4 ++-- + net/tipc/socket.c | 3 ++- + 2 files changed, 4 insertions(+), 3 deletions(-) + +diff --git a/net/tipc/msg.c b/net/tipc/msg.c +index 93966321f8929..560d7a4c0ffff 100644 +--- a/net/tipc/msg.c ++++ b/net/tipc/msg.c +@@ -239,14 +239,14 @@ int tipc_msg_append(struct tipc_msg *_hdr, struct msghdr *m, int dlen, + hdr = buf_msg(skb); + curr = msg_blocks(hdr); + mlen = msg_size(hdr); +- cpy = min_t(int, rem, mss - mlen); ++ cpy = min_t(size_t, rem, mss - mlen); + if (cpy != copy_from_iter(skb->data + mlen, cpy, &m->msg_iter)) + return -EFAULT; + msg_set_size(hdr, mlen + cpy); + skb_put(skb, cpy); + rem -= cpy; + total += msg_blocks(hdr) - curr; +- } while (rem); ++ } while (rem > 0); + return total - accounted; + } + +diff --git a/net/tipc/socket.c b/net/tipc/socket.c +index d6b67d07d22ec..62fc871a8d673 100644 +--- a/net/tipc/socket.c ++++ b/net/tipc/socket.c +@@ -1574,7 +1574,8 @@ static int __tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dlen) + break; + send = min_t(size_t, dlen - sent, TIPC_MAX_USER_MSG_SIZE); + blocks = tsk->snd_backlog; +- if (tsk->oneway++ >= tsk->nagle_start && send <= maxnagle) { ++ if (tsk->oneway++ >= tsk->nagle_start && maxnagle && ++ send <= maxnagle) { + rc = tipc_msg_append(hdr, m, send, maxnagle, txq); + if (unlikely(rc < 0)) + break; +-- +2.25.1 + diff --git a/queue-5.7/tipc-fix-null-pointer-dereference-in-__tipc_sendstre.patch b/queue-5.7/tipc-fix-null-pointer-dereference-in-__tipc_sendstre.patch new file mode 100644 index 00000000000..1346fbc47c6 --- /dev/null +++ b/queue-5.7/tipc-fix-null-pointer-dereference-in-__tipc_sendstre.patch @@ -0,0 +1,44 @@ +From ed1a8378cb5dee5efa09e36fe297b4e4c31faa94 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 28 May 2020 22:34:07 +0800 +Subject: tipc: Fix NULL pointer dereference in __tipc_sendstream() + +From: YueHaibing + +[ Upstream commit 4c21daae3dbc9f8536cc18e6e53627821fa2c90c ] + +tipc_sendstream() may send zero length packet, then tipc_msg_append() +do not alloc skb, skb_peek_tail() will get NULL, msg_set_ack_required +will trigger NULL pointer dereference. + +Reported-by: syzbot+8eac6d030e7807c21d32@syzkaller.appspotmail.com +Fixes: 0a3e060f340d ("tipc: add test for Nagle algorithm effectiveness") +Signed-off-by: YueHaibing +Signed-off-by: David S. 
Miller +Signed-off-by: Sasha Levin +--- + net/tipc/socket.c | 8 ++++++-- + 1 file changed, 6 insertions(+), 2 deletions(-) + +diff --git a/net/tipc/socket.c b/net/tipc/socket.c +index 62fc871a8d673..f02f2abf6e3c0 100644 +--- a/net/tipc/socket.c ++++ b/net/tipc/socket.c +@@ -1589,8 +1589,12 @@ static int __tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dlen) + tsk->pkt_cnt += skb_queue_len(txq); + } else { + skb = skb_peek_tail(txq); +- msg_set_ack_required(buf_msg(skb)); +- tsk->expect_ack = true; ++ if (skb) { ++ msg_set_ack_required(buf_msg(skb)); ++ tsk->expect_ack = true; ++ } else { ++ tsk->expect_ack = false; ++ } + tsk->msg_acc = 0; + tsk->pkt_cnt = 0; + } +-- +2.25.1 + diff --git a/queue-5.7/tools-lib-traceevent-add-append-function-helper-for-.patch b/queue-5.7/tools-lib-traceevent-add-append-function-helper-for-.patch new file mode 100644 index 00000000000..0acf9ff1d76 --- /dev/null +++ b/queue-5.7/tools-lib-traceevent-add-append-function-helper-for-.patch @@ -0,0 +1,243 @@ +From c51c0cfe5f766b271052a2b6d003ca9d41ce4701 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 24 Mar 2020 16:08:46 -0400 +Subject: tools lib traceevent: Add append() function helper for appending + strings + +From: Steven Rostedt (VMware) + +[ Upstream commit 27d4d336f2872193e90ee5450559e1699fae0f6d ] + +There's several locations that open code realloc and strcat() to append +text to strings. Add an append() function that takes a delimiter and a +string to append to another string. + +Signed-off-by: Steven Rostedt (VMware) +Cc: Andrew Morton +Cc: Jaewon Lim +Cc: Jiri Olsa +Cc: Kees Kook +Cc: linux-mm@kvack.org +Cc: linux-trace-devel@vger.kernel.org +Cc: Namhyung Kim +Cc: Vlastimil Babka +Link: http://lore.kernel.org/lkml/20200324200956.515118403@goodmis.org +Signed-off-by: Arnaldo Carvalho de Melo +Signed-off-by: Sasha Levin +--- + tools/lib/traceevent/event-parse.c | 98 ++++++++++++------------------ + 1 file changed, 40 insertions(+), 58 deletions(-) + +diff --git a/tools/lib/traceevent/event-parse.c b/tools/lib/traceevent/event-parse.c +index e1bd2a93c6db8..eec96c31ea9e5 100644 +--- a/tools/lib/traceevent/event-parse.c ++++ b/tools/lib/traceevent/event-parse.c +@@ -1425,6 +1425,19 @@ static unsigned int type_size(const char *name) + return 0; + } + ++static int append(char **buf, const char *delim, const char *str) ++{ ++ char *new_buf; ++ ++ new_buf = realloc(*buf, strlen(*buf) + strlen(delim) + strlen(str) + 1); ++ if (!new_buf) ++ return -1; ++ strcat(new_buf, delim); ++ strcat(new_buf, str); ++ *buf = new_buf; ++ return 0; ++} ++ + static int event_read_fields(struct tep_event *event, struct tep_format_field **fields) + { + struct tep_format_field *field = NULL; +@@ -1432,6 +1445,7 @@ static int event_read_fields(struct tep_event *event, struct tep_format_field ** + char *token; + char *last_token; + int count = 0; ++ int ret; + + do { + unsigned int size_dynamic = 0; +@@ -1490,24 +1504,15 @@ static int event_read_fields(struct tep_event *event, struct tep_format_field ** + field->flags |= TEP_FIELD_IS_POINTER; + + if (field->type) { +- char *new_type; +- new_type = realloc(field->type, +- strlen(field->type) + +- strlen(last_token) + 2); +- if (!new_type) { +- free(last_token); +- goto fail; +- } +- field->type = new_type; +- strcat(field->type, " "); +- strcat(field->type, last_token); ++ ret = append(&field->type, " ", last_token); + free(last_token); ++ if (ret < 0) ++ goto fail; + } else + field->type = last_token; + last_token = token; + continue; + } +- + break; + } + +@@ 
-1523,8 +1528,6 @@ static int event_read_fields(struct tep_event *event, struct tep_format_field ** + if (strcmp(token, "[") == 0) { + enum tep_event_type last_type = type; + char *brackets = token; +- char *new_brackets; +- int len; + + field->flags |= TEP_FIELD_IS_ARRAY; + +@@ -1536,29 +1539,27 @@ static int event_read_fields(struct tep_event *event, struct tep_format_field ** + field->arraylen = 0; + + while (strcmp(token, "]") != 0) { ++ const char *delim; ++ + if (last_type == TEP_EVENT_ITEM && + type == TEP_EVENT_ITEM) +- len = 2; ++ delim = " "; + else +- len = 1; ++ delim = ""; ++ + last_type = type; + +- new_brackets = realloc(brackets, +- strlen(brackets) + +- strlen(token) + len); +- if (!new_brackets) { ++ ret = append(&brackets, delim, token); ++ if (ret < 0) { + free(brackets); + goto fail; + } +- brackets = new_brackets; +- if (len == 2) +- strcat(brackets, " "); +- strcat(brackets, token); + /* We only care about the last token */ + field->arraylen = strtoul(token, NULL, 0); + free_token(token); + type = read_token(&token); + if (type == TEP_EVENT_NONE) { ++ free(brackets); + do_warning_event(event, "failed to find token"); + goto fail; + } +@@ -1566,13 +1567,11 @@ static int event_read_fields(struct tep_event *event, struct tep_format_field ** + + free_token(token); + +- new_brackets = realloc(brackets, strlen(brackets) + 2); +- if (!new_brackets) { ++ ret = append(&brackets, "", "]"); ++ if (ret < 0) { + free(brackets); + goto fail; + } +- brackets = new_brackets; +- strcat(brackets, "]"); + + /* add brackets to type */ + +@@ -1582,34 +1581,23 @@ static int event_read_fields(struct tep_event *event, struct tep_format_field ** + * the format: type [] item; + */ + if (type == TEP_EVENT_ITEM) { +- char *new_type; +- new_type = realloc(field->type, +- strlen(field->type) + +- strlen(field->name) + +- strlen(brackets) + 2); +- if (!new_type) { ++ ret = append(&field->type, " ", field->name); ++ if (ret < 0) { + free(brackets); + goto fail; + } +- field->type = new_type; +- strcat(field->type, " "); +- strcat(field->type, field->name); ++ ret = append(&field->type, "", brackets); ++ + size_dynamic = type_size(field->name); + free_token(field->name); +- strcat(field->type, brackets); + field->name = field->alias = token; + type = read_token(&token); + } else { +- char *new_type; +- new_type = realloc(field->type, +- strlen(field->type) + +- strlen(brackets) + 1); +- if (!new_type) { ++ ret = append(&field->type, "", brackets); ++ if (ret < 0) { + free(brackets); + goto fail; + } +- field->type = new_type; +- strcat(field->type, brackets); + } + free(brackets); + } +@@ -2046,19 +2034,16 @@ process_op(struct tep_event *event, struct tep_print_arg *arg, char **tok) + /* could just be a type pointer */ + if ((strcmp(arg->op.op, "*") == 0) && + type == TEP_EVENT_DELIM && (strcmp(token, ")") == 0)) { +- char *new_atom; ++ int ret; + + if (left->type != TEP_PRINT_ATOM) { + do_warning_event(event, "bad pointer type"); + goto out_free; + } +- new_atom = realloc(left->atom.atom, +- strlen(left->atom.atom) + 3); +- if (!new_atom) ++ ret = append(&left->atom.atom, " ", "*"); ++ if (ret < 0) + goto out_warn_free; + +- left->atom.atom = new_atom; +- strcat(left->atom.atom, " *"); + free(arg->op.op); + *arg = *left; + free(left); +@@ -3151,18 +3136,15 @@ process_arg_token(struct tep_event *event, struct tep_print_arg *arg, + } + /* atoms can be more than one token long */ + while (type == TEP_EVENT_ITEM) { +- char *new_atom; +- new_atom = realloc(atom, +- strlen(atom) + strlen(token) + 2); +- 
if (!new_atom) { ++ int ret; ++ ++ ret = append(&atom, " ", token); ++ if (ret < 0) { + free(atom); + *tok = NULL; + free_token(token); + return TEP_EVENT_ERROR; + } +- atom = new_atom; +- strcat(atom, " "); +- strcat(atom, token); + free_token(token); + type = read_token_item(&token); + } +-- +2.25.1 + diff --git a/queue-5.7/tools-lib-traceevent-handle-__attribute__-user-in-fi.patch b/queue-5.7/tools-lib-traceevent-handle-__attribute__-user-in-fi.patch new file mode 100644 index 00000000000..a6f1edc3e7d --- /dev/null +++ b/queue-5.7/tools-lib-traceevent-handle-__attribute__-user-in-fi.patch @@ -0,0 +1,98 @@ +From 21d1f87463af7f12d314b5fdd491dc6e0604f65e Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 24 Mar 2020 16:08:47 -0400 +Subject: tools lib traceevent: Handle __attribute__((user)) in field names + +From: Steven Rostedt (VMware) + +[ Upstream commit 74621d929d944529a5e2878a84f48bfa6fb69a66 ] + +Commit c61f13eaa1ee1 ("gcc-plugins: Add structleak for more stack +initialization") added "__attribute__((user))" to the user when +stackleak detector is enabled. This now appears in the field format of +system call trace events for system calls that have user buffers. The +"__attribute__((user))" breaks the parsing in libtraceevent. That needs +to be handled. + +Signed-off-by: Steven Rostedt (VMware) +Cc: Andrew Morton +Cc: Jaewon Kim +Cc: Jiri Olsa +Cc: Kees Kook +Cc: Namhyung Kim +Cc: Vlastimil Babka +Cc: linux-mm@kvack.org +Cc: linux-trace-devel@vger.kernel.org +Link: http://lore.kernel.org/lkml/20200324200956.663647256@goodmis.org +Signed-off-by: Arnaldo Carvalho de Melo +Signed-off-by: Sasha Levin +--- + tools/lib/traceevent/event-parse.c | 39 +++++++++++++++++++++++++++++- + 1 file changed, 38 insertions(+), 1 deletion(-) + +diff --git a/tools/lib/traceevent/event-parse.c b/tools/lib/traceevent/event-parse.c +index eec96c31ea9e5..010e60d5a0817 100644 +--- a/tools/lib/traceevent/event-parse.c ++++ b/tools/lib/traceevent/event-parse.c +@@ -1444,6 +1444,7 @@ static int event_read_fields(struct tep_event *event, struct tep_format_field ** + enum tep_event_type type; + char *token; + char *last_token; ++ char *delim = " "; + int count = 0; + int ret; + +@@ -1504,13 +1505,49 @@ static int event_read_fields(struct tep_event *event, struct tep_format_field ** + field->flags |= TEP_FIELD_IS_POINTER; + + if (field->type) { +- ret = append(&field->type, " ", last_token); ++ ret = append(&field->type, delim, last_token); + free(last_token); + if (ret < 0) + goto fail; + } else + field->type = last_token; + last_token = token; ++ delim = " "; ++ continue; ++ } ++ ++ /* Handle __attribute__((user)) */ ++ if ((type == TEP_EVENT_DELIM) && ++ strcmp("__attribute__", last_token) == 0 && ++ token[0] == '(') { ++ int depth = 1; ++ int ret; ++ ++ ret = append(&field->type, " ", last_token); ++ ret |= append(&field->type, "", "("); ++ if (ret < 0) ++ goto fail; ++ ++ delim = " "; ++ while ((type = read_token(&token)) != TEP_EVENT_NONE) { ++ if (type == TEP_EVENT_DELIM) { ++ if (token[0] == '(') ++ depth++; ++ else if (token[0] == ')') ++ depth--; ++ if (!depth) ++ break; ++ ret = append(&field->type, "", token); ++ delim = ""; ++ } else { ++ ret = append(&field->type, delim, token); ++ delim = " "; ++ } ++ if (ret < 0) ++ goto fail; ++ free(last_token); ++ last_token = token; ++ } + continue; + } + break; +-- +2.25.1 + diff --git a/queue-5.7/usb-usbtest-fix-missing-kfree-dev-buf-in-usbtest_dis.patch b/queue-5.7/usb-usbtest-fix-missing-kfree-dev-buf-in-usbtest_dis.patch new file mode 100644 index 
diff --git a/queue-5.7/usb-usbtest-fix-missing-kfree-dev-buf-in-usbtest_dis.patch b/queue-5.7/usb-usbtest-fix-missing-kfree-dev-buf-in-usbtest_dis.patch
new file mode 100644
index 00000000000..310b6701e3b
--- /dev/null
+++ b/queue-5.7/usb-usbtest-fix-missing-kfree-dev-buf-in-usbtest_dis.patch
@@ -0,0 +1,69 @@
+From 5ac3bf12c59f90852a2546e5f39915615364a1a4 Mon Sep 17 00:00:00 2001
+From: Sasha Levin
+Date: Fri, 12 Jun 2020 11:52:10 +0800
+Subject: usb: usbtest: fix missing kfree(dev->buf) in usbtest_disconnect
+
+From: Zqiang
+
+[ Upstream commit 28ebeb8db77035e058a510ce9bd17c2b9a009dba ]
+
+BUG: memory leak
+unreferenced object 0xffff888055046e00 (size 256):
+ comm "kworker/2:9", pid 2570, jiffies 4294942129 (age 1095.500s)
+ hex dump (first 32 bytes):
+ 00 70 04 55 80 88 ff ff 18 bb 5a 81 ff ff ff ff .p.U......Z.....
+ f5 96 78 81 ff ff ff ff 37 de 8e 81 ff ff ff ff ..x.....7.......
+ backtrace:
+ [<00000000d121dccf>] kmemleak_alloc_recursive
+include/linux/kmemleak.h:43 [inline]
+ [<00000000d121dccf>] slab_post_alloc_hook mm/slab.h:586 [inline]
+ [<00000000d121dccf>] slab_alloc_node mm/slub.c:2786 [inline]
+ [<00000000d121dccf>] slab_alloc mm/slub.c:2794 [inline]
+ [<00000000d121dccf>] kmem_cache_alloc_trace+0x15e/0x2d0 mm/slub.c:2811
+ [<000000005c3c3381>] kmalloc include/linux/slab.h:555 [inline]
+ [<000000005c3c3381>] usbtest_probe+0x286/0x19d0
+drivers/usb/misc/usbtest.c:2790
+ [<000000001cec6910>] usb_probe_interface+0x2bd/0x870
+drivers/usb/core/driver.c:361
+ [<000000007806c118>] really_probe+0x48d/0x8f0 drivers/base/dd.c:551
+ [<00000000a3308c3e>] driver_probe_device+0xfc/0x2a0 drivers/base/dd.c:724
+ [<000000003ef66004>] __device_attach_driver+0x1b6/0x240
+drivers/base/dd.c:831
+ [<00000000eee53e97>] bus_for_each_drv+0x14e/0x1e0 drivers/base/bus.c:431
+ [<00000000bb0648d0>] __device_attach+0x1f9/0x350 drivers/base/dd.c:897
+ [<00000000838b324a>] device_initial_probe+0x1a/0x20 drivers/base/dd.c:944
+ [<0000000030d501c1>] bus_probe_device+0x1e1/0x280 drivers/base/bus.c:491
+ [<000000005bd7adef>] device_add+0x131d/0x1c40 drivers/base/core.c:2504
+ [<00000000a0937814>] usb_set_configuration+0xe84/0x1ab0
+drivers/usb/core/message.c:2030
+ [<00000000e3934741>] generic_probe+0x6a/0xe0 drivers/usb/core/generic.c:210
+ [<0000000098ade0f1>] usb_probe_device+0x90/0xd0
+drivers/usb/core/driver.c:266
+ [<000000007806c118>] really_probe+0x48d/0x8f0 drivers/base/dd.c:551
+ [<00000000a3308c3e>] driver_probe_device+0xfc/0x2a0 drivers/base/dd.c:724
+
+Acked-by: Alan Stern
+Reported-by: Kyungtae Kim
+Signed-off-by: Zqiang
+Link: https://lore.kernel.org/r/20200612035210.20494-1-qiang.zhang@windriver.com
+Signed-off-by: Greg Kroah-Hartman
+Signed-off-by: Sasha Levin
+---
+ drivers/usb/misc/usbtest.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+diff --git a/drivers/usb/misc/usbtest.c b/drivers/usb/misc/usbtest.c
+index 98ada1a3425c6..bae88893ee8e3 100644
+--- a/drivers/usb/misc/usbtest.c
++++ b/drivers/usb/misc/usbtest.c
+@@ -2873,6 +2873,7 @@ static void usbtest_disconnect(struct usb_interface *intf)
+
+ usb_set_intfdata(intf, NULL);
+ dev_dbg(&intf->dev, "disconnect\n");
++ kfree(dev->buf);
+ kfree(dev);
+ }
+
+--
+2.25.1
+
diff --git a/queue-5.7/usbnet-smsc95xx-fix-use-after-free-after-removal.patch b/queue-5.7/usbnet-smsc95xx-fix-use-after-free-after-removal.patch
new file mode 100644
index 00000000000..d5963ba5b8f
--- /dev/null
+++ b/queue-5.7/usbnet-smsc95xx-fix-use-after-free-after-removal.patch
@@ -0,0 +1,49 @@
+From f4bdf789d33e4cc05c955a5ce8d549b1abf97364 Mon Sep 17 00:00:00 2001
+From: Sasha Levin
+Date: Sun, 21 Jun 2020 13:43:26 +0300
+Subject: usbnet: smsc95xx: Fix use-after-free after removal
+
+From: Tuomas Tynkkynen
+
+[ Upstream commit b835a71ef64a61383c414d6bf2896d2c0161deca ]
+
+Syzbot reports a use-after-free in workqueue context:
+
+BUG: KASAN: use-after-free in mutex_unlock+0x19/0x40 kernel/locking/mutex.c:737
+ mutex_unlock+0x19/0x40 kernel/locking/mutex.c:737
+ __smsc95xx_mdio_read drivers/net/usb/smsc95xx.c:217 [inline]
+ smsc95xx_mdio_read+0x583/0x870 drivers/net/usb/smsc95xx.c:278
+ check_carrier+0xd1/0x2e0 drivers/net/usb/smsc95xx.c:644
+ process_one_work+0x777/0xf90 kernel/workqueue.c:2274
+ worker_thread+0xa8f/0x1430 kernel/workqueue.c:2420
+ kthread+0x2df/0x300 kernel/kthread.c:255
+
+It looks like smsc95xx_unbind() is freeing the structures that are
+still in use by the concurrently running workqueue callback. Thus switch
+to using cancel_delayed_work_sync() to ensure the work callback really
+is no longer active.
+
+Reported-by: syzbot+29dc7d4ae19b703ff947@syzkaller.appspotmail.com
+Signed-off-by: Tuomas Tynkkynen
+Signed-off-by: David S. Miller
+Signed-off-by: Sasha Levin
+---
+ drivers/net/usb/smsc95xx.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/drivers/net/usb/smsc95xx.c b/drivers/net/usb/smsc95xx.c
+index 355be77f42418..3cf4dc3433f91 100644
+--- a/drivers/net/usb/smsc95xx.c
++++ b/drivers/net/usb/smsc95xx.c
+@@ -1324,7 +1324,7 @@ static void smsc95xx_unbind(struct usbnet *dev, struct usb_interface *intf)
+ struct smsc95xx_priv *pdata = (struct smsc95xx_priv *)(dev->data[0]);
+
+ if (pdata) {
+- cancel_delayed_work(&pdata->carrier_check);
++ cancel_delayed_work_sync(&pdata->carrier_check);
+ netif_dbg(dev, ifdown, dev->net, "free pdata\n");
+ kfree(pdata);
+ pdata = NULL;
+--
+2.25.1
+
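The smsc95xx change is an instance of a general teardown rule: cancel_delayed_work() only removes a work item that has not started yet, so a callback already in flight can keep touching the private data after it is freed, which is exactly the KASAN report above. cancel_delayed_work_sync() also waits for a running callback to return. A sketch of the pattern with made-up names, not the driver's actual code:

	/* Quiesce the work item first, then free the data it uses. */
	static void example_unbind(struct example_priv *priv)
	{
		/*
		 * Blocks until any in-flight callback returns. Must not be
		 * called from the callback itself, or while holding a lock
		 * the callback takes, or it can deadlock.
		 */
		cancel_delayed_work_sync(&priv->check_work);
		kfree(priv); /* nothing can still be using priv */
	}

The usbtest fix just above follows the same discipline from the other side: whatever probe() allocates needs exactly one matching kfree() on the disconnect path, taken only after no asynchronous user can still reach it.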