From: Greg Kroah-Hartman Date: Mon, 11 Nov 2019 05:46:11 +0000 (+0100) Subject: 5.3-stable patches X-Git-Tag: v4.4.201~45 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=94c2a109543a214158f6713f7d47bb1f5f84455b;p=thirdparty%2Fkernel%2Fstable-queue.git 5.3-stable patches added patches: alsa-bebob-fix-to-detect-configured-source-of-sampling-clock-for-focusrite-saffire-pro-i-o-series.patch alsa-hda-ca0132-fix-possible-workqueue-stall.patch alsa-timer-fix-incorrectly-assigned-timer-instance.patch mm-memcontrol-fix-network-errors-from-failing-__gfp_atomic-charges.patch mm-memcontrol-fix-null-ptr-deref-in-percpu-stats-flush.patch mm-meminit-recalculate-pcpu-batch-and-high-limits-after-init-completes.patch mm-thp-handle-page-cache-thp-correctly-in-pagetranscompoundmap.patch mm-vmstat-hide-proc-pagetypeinfo-from-normal-users.patch --- diff --git a/queue-5.3/alsa-bebob-fix-to-detect-configured-source-of-sampling-clock-for-focusrite-saffire-pro-i-o-series.patch b/queue-5.3/alsa-bebob-fix-to-detect-configured-source-of-sampling-clock-for-focusrite-saffire-pro-i-o-series.patch new file mode 100644 index 00000000000..f0e5835d307 --- /dev/null +++ b/queue-5.3/alsa-bebob-fix-to-detect-configured-source-of-sampling-clock-for-focusrite-saffire-pro-i-o-series.patch @@ -0,0 +1,50 @@ +From 706ad6746a66546daf96d4e4a95e46faf6cf689a Mon Sep 17 00:00:00 2001 +From: Takashi Sakamoto +Date: Sun, 3 Nov 2019 00:09:20 +0900 +Subject: ALSA: bebob: fix to detect configured source of sampling clock for Focusrite Saffire Pro i/o series + +From: Takashi Sakamoto + +commit 706ad6746a66546daf96d4e4a95e46faf6cf689a upstream. + +For Focusrite Saffire Pro i/o, the lowest 8 bits of register represents +configured source of sampling clock. The next lowest 8 bits represents +whether the configured source is actually detected or not just after +the register is changed for the source. + +Current implementation evaluates whole the register to detect configured +source. 
This results in failure due to the next lowest 8 bits when the +source is connected in advance. + +This commit fixes the bug. + +Fixes: 25784ec2d034 ("ALSA: bebob: Add support for Focusrite Saffire/SaffirePro series") +Cc: # v3.16+ +Signed-off-by: Takashi Sakamoto +Link: https://lore.kernel.org/r/20191102150920.20367-1-o-takashi@sakamocchi.jp +Signed-off-by: Takashi Iwai +Signed-off-by: Greg Kroah-Hartman + +--- + sound/firewire/bebob/bebob_focusrite.c | 3 +++ + 1 file changed, 3 insertions(+) + +--- a/sound/firewire/bebob/bebob_focusrite.c ++++ b/sound/firewire/bebob/bebob_focusrite.c +@@ -27,6 +27,8 @@ + #define SAFFIRE_CLOCK_SOURCE_SPDIF 1 + + /* clock sources as returned from register of Saffire Pro 10 and 26 */ ++#define SAFFIREPRO_CLOCK_SOURCE_SELECT_MASK 0x000000ff ++#define SAFFIREPRO_CLOCK_SOURCE_DETECT_MASK 0x0000ff00 + #define SAFFIREPRO_CLOCK_SOURCE_INTERNAL 0 + #define SAFFIREPRO_CLOCK_SOURCE_SKIP 1 /* never used on hardware */ + #define SAFFIREPRO_CLOCK_SOURCE_SPDIF 2 +@@ -189,6 +191,7 @@ saffirepro_both_clk_src_get(struct snd_b + map = saffirepro_clk_maps[1]; + + /* In a case that this driver cannot handle the value of register. */ ++ value &= SAFFIREPRO_CLOCK_SOURCE_SELECT_MASK; + if (value >= SAFFIREPRO_CLOCK_SOURCE_COUNT || map[value] < 0) { + err = -EIO; + goto end; diff --git a/queue-5.3/alsa-hda-ca0132-fix-possible-workqueue-stall.patch b/queue-5.3/alsa-hda-ca0132-fix-possible-workqueue-stall.patch new file mode 100644 index 00000000000..001613a25ff --- /dev/null +++ b/queue-5.3/alsa-hda-ca0132-fix-possible-workqueue-stall.patch @@ -0,0 +1,41 @@ +From 15c2b3cc09a31620914955cb2a89c277c18ee999 Mon Sep 17 00:00:00 2001 +From: Takashi Iwai +Date: Tue, 5 Nov 2019 14:43:16 +0100 +Subject: ALSA: hda/ca0132 - Fix possible workqueue stall + +From: Takashi Iwai + +commit 15c2b3cc09a31620914955cb2a89c277c18ee999 upstream. 
+ +The unsolicited event handler for the headphone jack on CA0132 codec +driver tries to reschedule another delayed work with +cancel_delayed_work_sync(). It's not a good idea, unfortunately, +especially after we changed the work queue to the standard global +one; this may lead to a stall because both works are using the same +global queue. + +Fix it by dropping the _sync and calling cancel_delayed_work() +instead. + +Fixes: 993884f6a26c ("ALSA: hda/ca0132 - Delay HP amp turnon.") +BugLink: https://bugzilla.suse.com/show_bug.cgi?id=1155836 +Cc: +Link: https://lore.kernel.org/r/20191105134316.19294-1-tiwai@suse.de +Signed-off-by: Takashi Iwai +Signed-off-by: Greg Kroah-Hartman + +--- + sound/pci/hda/patch_ca0132.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/sound/pci/hda/patch_ca0132.c ++++ b/sound/pci/hda/patch_ca0132.c +@@ -7604,7 +7604,7 @@ static void hp_callback(struct hda_codec + /* Delay enabling the HP amp, to let the mic-detection + * state machine run. + */ +- cancel_delayed_work_sync(&spec->unsol_hp_work); ++ cancel_delayed_work(&spec->unsol_hp_work); + schedule_delayed_work(&spec->unsol_hp_work, msecs_to_jiffies(500)); + tbl = snd_hda_jack_tbl_get(codec, cb->nid); + if (tbl) diff --git a/queue-5.3/alsa-timer-fix-incorrectly-assigned-timer-instance.patch b/queue-5.3/alsa-timer-fix-incorrectly-assigned-timer-instance.patch new file mode 100644 index 00000000000..46749bceaf2 --- /dev/null +++ b/queue-5.3/alsa-timer-fix-incorrectly-assigned-timer-instance.patch @@ -0,0 +1,61 @@ +From e7af6307a8a54f0b873960b32b6a644f2d0fbd97 Mon Sep 17 00:00:00 2001 +From: Takashi Iwai +Date: Wed, 6 Nov 2019 17:55:47 +0100 +Subject: ALSA: timer: Fix incorrectly assigned timer instance + +From: Takashi Iwai + +commit e7af6307a8a54f0b873960b32b6a644f2d0fbd97 upstream. 
+ +The clean up commit 41672c0c24a6 ("ALSA: timer: Simplify error path in +snd_timer_open()") unified the error handling code paths with the +standard goto, but it introduced a subtle bug: the timer instance is +stored in snd_timer_open() incorrectly even if it returns an error. +This may eventually lead to UAF, as spotted by fuzzer. + +The culprit is the snd_timer_open() code checks the +SNDRV_TIMER_IFLG_EXCLUSIVE flag with the common variable timeri. +This variable is supposed to be the newly created instance, but we +(ab-)used it for a temporary check before the actual creation of a +timer instance. After that point, there is another check for the max +number of instances, and it bails out if over the threshold. Before +the refactoring above, it worked fine because the code returned +directly from that point. After the refactoring, however, it jumps to +the unified error path that stores the timeri variable in return -- +even if it returns an error. Unfortunately this stored value is kept +in the caller side (snd_timer_user_tselect()) in tu->timeri. This +causes inconsistency later, as if the timer was successfully +assigned. + +In this patch, we fix it by not re-using timeri variable but a +temporary variable for testing the exclusive connection, so timeri +remains NULL at that point. 
+ +Fixes: 41672c0c24a6 ("ALSA: timer: Simplify error path in snd_timer_open()") +Reported-and-tested-by: Tristan Madani +Cc: +Link: https://lore.kernel.org/r/20191106165547.23518-1-tiwai@suse.de +Signed-off-by: Takashi Iwai +Signed-off-by: Greg Kroah-Hartman + +--- + sound/core/timer.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +--- a/sound/core/timer.c ++++ b/sound/core/timer.c +@@ -284,11 +284,11 @@ int snd_timer_open(struct snd_timer_inst + goto unlock; + } + if (!list_empty(&timer->open_list_head)) { +- timeri = list_entry(timer->open_list_head.next, ++ struct snd_timer_instance *t = ++ list_entry(timer->open_list_head.next, + struct snd_timer_instance, open_list); +- if (timeri->flags & SNDRV_TIMER_IFLG_EXCLUSIVE) { ++ if (t->flags & SNDRV_TIMER_IFLG_EXCLUSIVE) { + err = -EBUSY; +- timeri = NULL; + goto unlock; + } + } diff --git a/queue-5.3/mm-memcontrol-fix-network-errors-from-failing-__gfp_atomic-charges.patch b/queue-5.3/mm-memcontrol-fix-network-errors-from-failing-__gfp_atomic-charges.patch new file mode 100644 index 00000000000..dbe42360521 --- /dev/null +++ b/queue-5.3/mm-memcontrol-fix-network-errors-from-failing-__gfp_atomic-charges.patch @@ -0,0 +1,100 @@ +From 869712fd3de5a90b7ba23ae1272278cddc66b37b Mon Sep 17 00:00:00 2001 +From: Johannes Weiner +Date: Tue, 5 Nov 2019 21:17:13 -0800 +Subject: mm: memcontrol: fix network errors from failing __GFP_ATOMIC charges + +From: Johannes Weiner + +commit 869712fd3de5a90b7ba23ae1272278cddc66b37b upstream. 
+ +While upgrading from 4.16 to 5.2, we noticed these allocation errors in +the log of the new kernel: + + SLUB: Unable to allocate memory on node -1, gfp=0xa20(GFP_ATOMIC) + cache: tw_sock_TCPv6(960:helper-logs), object size: 232, buffer size: 240, default order: 1, min order: 0 + node 0: slabs: 5, objs: 170, free: 0 + + slab_out_of_memory+1 + ___slab_alloc+969 + __slab_alloc+14 + kmem_cache_alloc+346 + inet_twsk_alloc+60 + tcp_time_wait+46 + tcp_fin+206 + tcp_data_queue+2034 + tcp_rcv_state_process+784 + tcp_v6_do_rcv+405 + __release_sock+118 + tcp_close+385 + inet_release+46 + __sock_release+55 + sock_close+17 + __fput+170 + task_work_run+127 + exit_to_usermode_loop+191 + do_syscall_64+212 + entry_SYSCALL_64_after_hwframe+68 + +accompanied by an increase in machines going completely radio silent +under memory pressure. + +One thing that changed since 4.16 is e699e2c6a654 ("net, mm: account +sock objects to kmemcg"), which made these slab caches subject to cgroup +memory accounting and control. + +The problem with that is that cgroups, unlike the page allocator, do not +maintain dedicated atomic reserves. As a cgroup's usage hovers at its +limit, atomic allocations - such as done during network rx - can fail +consistently for extended periods of time. The kernel is not able to +operate under these conditions. + +We don't want to revert the culprit patch, because it indeed tracks a +potentially substantial amount of memory used by a cgroup. + +We also don't want to implement dedicated atomic reserves for cgroups. +There is no point in keeping a fixed margin of unused bytes in the +cgroup's memory budget to accommodate a consumer that is impossible to +predict - we'd be wasting memory and get into configuration headaches, +not unlike what we have going with min_free_kbytes. We do this for +physical mem because we have to, but cgroups are an accounting game. 
+ +Instead, account these privileged allocations to the cgroup, but let +them bypass the configured limit if they have to. This way, we get the +benefits of accounting the consumed memory and have it exert pressure on +the rest of the cgroup, but like with the page allocator, we shift the +burden of reclaiming on behalf of atomic allocations onto the regular +allocations that can block. + +Link: http://lkml.kernel.org/r/20191022233708.365764-1-hannes@cmpxchg.org +Fixes: e699e2c6a654 ("net, mm: account sock objects to kmemcg") +Signed-off-by: Johannes Weiner +Reviewed-by: Shakeel Butt +Cc: Suleiman Souhlal +Cc: Michal Hocko +Cc: [4.18+] +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/memcontrol.c | 9 +++++++++ + 1 file changed, 9 insertions(+) + +--- a/mm/memcontrol.c ++++ b/mm/memcontrol.c +@@ -2408,6 +2408,15 @@ retry: + } + + /* ++ * Memcg doesn't have a dedicated reserve for atomic ++ * allocations. But like the global atomic pool, we need to ++ * put the burden of reclaim on regular allocation requests ++ * and let these go through as privileged allocations. ++ */ ++ if (gfp_mask & __GFP_ATOMIC) ++ goto force; ++ ++ /* + * Unlike in global OOM situations, memcg is not in a physical + * memory shortage. Allow dying and OOM-killed tasks to + * bypass the last charges so that they can exit quickly and diff --git a/queue-5.3/mm-memcontrol-fix-null-ptr-deref-in-percpu-stats-flush.patch b/queue-5.3/mm-memcontrol-fix-null-ptr-deref-in-percpu-stats-flush.patch new file mode 100644 index 00000000000..7ca2e43ee18 --- /dev/null +++ b/queue-5.3/mm-memcontrol-fix-null-ptr-deref-in-percpu-stats-flush.patch @@ -0,0 +1,99 @@ +From 7961eee3978475fd9e8626137f88595b1ca05856 Mon Sep 17 00:00:00 2001 +From: Shakeel Butt +Date: Tue, 5 Nov 2019 21:16:21 -0800 +Subject: mm: memcontrol: fix NULL-ptr deref in percpu stats flush + +From: Shakeel Butt + +commit 7961eee3978475fd9e8626137f88595b1ca05856 upstream. 
+ +__mem_cgroup_free() can be called on the failure path in +mem_cgroup_alloc(). However memcg_flush_percpu_vmstats() and +memcg_flush_percpu_vmevents() which are called from __mem_cgroup_free() +access the fields of memcg which can potentially be null if called from +failure path from mem_cgroup_alloc(). Indeed syzbot has reported the +following crash: + + kasan: CONFIG_KASAN_INLINE enabled + kasan: GPF could be caused by NULL-ptr deref or user memory access + general protection fault: 0000 [#1] PREEMPT SMP KASAN + CPU: 0 PID: 30393 Comm: syz-executor.1 Not tainted 5.4.0-rc2+ #0 + Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 + RIP: 0010:memcg_flush_percpu_vmstats+0x4ae/0x930 mm/memcontrol.c:3436 + Code: 05 41 89 c0 41 0f b6 04 24 41 38 c7 7c 08 84 c0 0f 85 5d 03 00 00 44 3b 05 33 d5 12 08 0f 83 e2 00 00 00 4c 89 f0 48 c1 e8 03 <42> 80 3c 28 00 0f 85 91 03 00 00 48 8b 85 10 fe ff ff 48 8b b0 90 + RSP: 0018:ffff888095c27980 EFLAGS: 00010206 + RAX: 0000000000000012 RBX: ffff888095c27b28 RCX: ffffc90008192000 + RDX: 0000000000040000 RSI: ffffffff8340fae7 RDI: 0000000000000007 + RBP: ffff888095c27be0 R08: 0000000000000000 R09: ffffed1013f0da33 + R10: ffffed1013f0da32 R11: ffff88809f86d197 R12: fffffbfff138b760 + R13: dffffc0000000000 R14: 0000000000000090 R15: 0000000000000007 + FS: 00007f5027170700(0000) GS:ffff8880ae800000(0000) knlGS:0000000000000000 + CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 + CR2: 0000000000710158 CR3: 00000000a7b18000 CR4: 00000000001406f0 + DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 + DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 + Call Trace: + __mem_cgroup_free+0x1a/0x190 mm/memcontrol.c:5021 + mem_cgroup_free mm/memcontrol.c:5033 [inline] + mem_cgroup_css_alloc+0x3a1/0x1ae0 mm/memcontrol.c:5160 + css_create kernel/cgroup/cgroup.c:5156 [inline] + cgroup_apply_control_enable+0x44d/0xc40 kernel/cgroup/cgroup.c:3119 + cgroup_mkdir+0x899/0x11b0 
kernel/cgroup/cgroup.c:5401 + kernfs_iop_mkdir+0x14d/0x1d0 fs/kernfs/dir.c:1124 + vfs_mkdir+0x42e/0x670 fs/namei.c:3807 + do_mkdirat+0x234/0x2a0 fs/namei.c:3830 + __do_sys_mkdir fs/namei.c:3846 [inline] + __se_sys_mkdir fs/namei.c:3844 [inline] + __x64_sys_mkdir+0x5c/0x80 fs/namei.c:3844 + do_syscall_64+0xfa/0x760 arch/x86/entry/common.c:290 + entry_SYSCALL_64_after_hwframe+0x49/0xbe + +Fixing this by moving the flush to mem_cgroup_free as there is no need +to flush anything if we see failure in mem_cgroup_alloc(). + +Link: http://lkml.kernel.org/r/20191018165231.249872-1-shakeelb@google.com +Fixes: bb65f89b7d3d ("mm: memcontrol: flush percpu vmevents before releasing memcg") +Fixes: c350a99ea2b1 ("mm: memcontrol: flush percpu vmstats before releasing memcg") +Signed-off-by: Shakeel Butt +Reported-by: syzbot+515d5bcfe179cdf049b2@syzkaller.appspotmail.com +Reviewed-by: Roman Gushchin +Cc: Michal Hocko +Cc: Johannes Weiner +Cc: Vladimir Davydov +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/memcontrol.c | 12 ++++++------ + 1 file changed, 6 insertions(+), 6 deletions(-) + +--- a/mm/memcontrol.c ++++ b/mm/memcontrol.c +@@ -4763,12 +4763,6 @@ static void __mem_cgroup_free(struct mem + { + int node; + +- /* +- * Flush percpu vmstats and vmevents to guarantee the value correctness +- * on parent's and all ancestor levels. +- */ +- memcg_flush_percpu_vmstats(memcg, false); +- memcg_flush_percpu_vmevents(memcg); + for_each_node(node) + free_mem_cgroup_per_node_info(memcg, node); + free_percpu(memcg->vmstats_percpu); +@@ -4779,6 +4773,12 @@ static void __mem_cgroup_free(struct mem + static void mem_cgroup_free(struct mem_cgroup *memcg) + { + memcg_wb_domain_exit(memcg); ++ /* ++ * Flush percpu vmstats and vmevents to guarantee the value correctness ++ * on parent's and all ancestor levels. 
++ */ ++ memcg_flush_percpu_vmstats(memcg, false); ++ memcg_flush_percpu_vmevents(memcg); + __mem_cgroup_free(memcg); + } + diff --git a/queue-5.3/mm-meminit-recalculate-pcpu-batch-and-high-limits-after-init-completes.patch b/queue-5.3/mm-meminit-recalculate-pcpu-batch-and-high-limits-after-init-completes.patch new file mode 100644 index 00000000000..85c10cd9f53 --- /dev/null +++ b/queue-5.3/mm-meminit-recalculate-pcpu-batch-and-high-limits-after-init-completes.patch @@ -0,0 +1,120 @@ +From 3e8fc0075e24338b1117cdff6a79477427b8dbed Mon Sep 17 00:00:00 2001 +From: Mel Gorman +Date: Tue, 5 Nov 2019 21:16:27 -0800 +Subject: mm, meminit: recalculate pcpu batch and high limits after init completes + +From: Mel Gorman + +commit 3e8fc0075e24338b1117cdff6a79477427b8dbed upstream. + +Deferred memory initialisation updates zone->managed_pages during the +initialisation phase but before that finishes, the per-cpu page +allocator (pcpu) calculates the number of pages allocated/freed in +batches as well as the maximum number of pages allowed on a per-cpu +list. As zone->managed_pages is not up to date yet, the pcpu +initialisation calculates inappropriately low batch and high values. + +This increases zone lock contention quite severely in some cases with +the degree of severity depending on how many CPUs share a local zone and +the size of the zone. A private report indicated that kernel build +times were excessive with extremely high system CPU usage. A perf +profile indicated that a large chunk of time was lost on zone->lock +contention. + +This patch recalculates the pcpu batch and high values after deferred +initialisation completes for every populated zone in the system. It was +tested on a 2-socket AMD EPYC 2 machine using a kernel compilation +workload -- allmodconfig and all available CPUs. + +mmtests configuration: config-workload-kernbench-max Configuration was +modified to build on a fresh XFS partition. 
+ +kernbench + 5.4.0-rc3 5.4.0-rc3 + vanilla resetpcpu-v2 +Amean user-256 13249.50 ( 0.00%) 16401.31 * -23.79%* +Amean syst-256 14760.30 ( 0.00%) 4448.39 * 69.86%* +Amean elsp-256 162.42 ( 0.00%) 119.13 * 26.65%* +Stddev user-256 42.97 ( 0.00%) 19.15 ( 55.43%) +Stddev syst-256 336.87 ( 0.00%) 6.71 ( 98.01%) +Stddev elsp-256 2.46 ( 0.00%) 0.39 ( 84.03%) + + 5.4.0-rc3 5.4.0-rc3 + vanilla resetpcpu-v2 +Duration User 39766.24 49221.79 +Duration System 44298.10 13361.67 +Duration Elapsed 519.11 388.87 + +The patch reduces system CPU usage by 69.86% and total build time by +26.65%. The variance of system CPU usage is also much reduced. + +Before the patch, the breakdown of batch and high values over all zones +was: + + 256 batch: 1 + 256 batch: 63 + 512 batch: 7 + 256 high: 0 + 256 high: 378 + 512 high: 42 + +512 pcpu pagesets had a batch limit of 7 and a high limit of 42. After +the patch: + + 256 batch: 1 + 768 batch: 63 + 256 high: 0 + 768 high: 378 + +[mgorman@techsingularity.net: fix merge/linkage snafu] + Link: http://lkml.kernel.org/r/20191023084705.GD3016@techsingularity.net Link: http://lkml.kernel.org/r/20191021094808.28824-2-mgorman@techsingularity.net +Signed-off-by: Mel Gorman +Acked-by: Michal Hocko +Acked-by: Vlastimil Babka +Acked-by: David Hildenbrand +Cc: Matt Fleming +Cc: Thomas Gleixner +Cc: Borislav Petkov +Cc: Qian Cai +Cc: [4.1+] +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/page_alloc.c | 10 ++++++++-- + 1 file changed, 8 insertions(+), 2 deletions(-) + +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -1947,6 +1947,14 @@ void __init page_alloc_init_late(void) + wait_for_completion(&pgdat_init_all_done_comp); + + /* ++ * The number of managed pages has changed due to the initialisation ++ * so the pcpu batch and high limits needs to be updated or the limits ++ * will be artificially small. 
++ */ ++ for_each_populated_zone(zone) ++ zone_pcp_update(zone); ++ ++ /* + * We initialized the rest of the deferred pages. Permanently disable + * on-demand struct page initialization. + */ +@@ -8479,7 +8487,6 @@ void free_contig_range(unsigned long pfn + WARN(count != 0, "%d pages are still in use!\n", count); + } + +-#ifdef CONFIG_MEMORY_HOTPLUG + /* + * The zone indicated has a new number of managed_pages; batch sizes and percpu + * page high values need to be recalulated. +@@ -8493,7 +8500,6 @@ void __meminit zone_pcp_update(struct zo + per_cpu_ptr(zone->pageset, cpu)); + mutex_unlock(&pcp_batch_high_lock); + } +-#endif + + void zone_pcp_reset(struct zone *zone) + { diff --git a/queue-5.3/mm-thp-handle-page-cache-thp-correctly-in-pagetranscompoundmap.patch b/queue-5.3/mm-thp-handle-page-cache-thp-correctly-in-pagetranscompoundmap.patch new file mode 100644 index 00000000000..0501a034c3d --- /dev/null +++ b/queue-5.3/mm-thp-handle-page-cache-thp-correctly-in-pagetranscompoundmap.patch @@ -0,0 +1,145 @@ +From 169226f7e0d275c1879551f37484ef6683579a5c Mon Sep 17 00:00:00 2001 +From: Yang Shi +Date: Tue, 5 Nov 2019 21:16:30 -0800 +Subject: mm: thp: handle page cache THP correctly in PageTransCompoundMap + +From: Yang Shi + +commit 169226f7e0d275c1879551f37484ef6683579a5c upstream. + +We have a usecase to use tmpfs as QEMU memory backend and we would like +to take the advantage of THP as well. But, our test shows the EPT is +not PMD mapped even though the underlying THP are PMD mapped on host. +The number showed by /sys/kernel/debug/kvm/largepage is much less than +the number of PMD mapped shmem pages as the below: + + 7f2778200000-7f2878200000 rw-s 00000000 00:14 262232 /dev/shm/qemu_back_mem.mem.Hz2hSf (deleted) + Size: 4194304 kB + [snip] + AnonHugePages: 0 kB + ShmemPmdMapped: 579584 kB + [snip] + Locked: 0 kB + + cat /sys/kernel/debug/kvm/largepages + 12 + +And some benchmarks do worse than with anonymous THPs. 
+ +By digging into the code we figured out that commit 127393fbe597 ("mm: +thp: kvm: fix memory corruption in KVM with THP enabled") checks if +there is a single PTE mapping on the page for anonymous THP when setting +up EPT map. But the _mapcount < 0 check doesn't work for page cache THP +since every subpage of page cache THP would get _mapcount inc'ed once it +is PMD mapped, so PageTransCompoundMap() always returns false for page +cache THP. This would prevent KVM from setting up PMD mapped EPT entry. + +So we need handle page cache THP correctly. However, when page cache +THP's PMD gets split, kernel just remove the map instead of setting up +PTE map like what anonymous THP does. Before KVM calls get_user_pages() +the subpages may get PTE mapped even though it is still a THP since the +page cache THP may be mapped by other processes at the mean time. + +Checking its _mapcount and whether the THP has PTE mapped or not. +Although this may report some false negative cases (PTE mapped by other +processes), it looks not trivial to make this accurate. + +With this fix /sys/kernel/debug/kvm/largepage would show reasonable +pages are PMD mapped by EPT as the below: + + 7fbeaee00000-7fbfaee00000 rw-s 00000000 00:14 275464 /dev/shm/qemu_back_mem.mem.SKUvat (deleted) + Size: 4194304 kB + [snip] + AnonHugePages: 0 kB + ShmemPmdMapped: 557056 kB + [snip] + Locked: 0 kB + + cat /sys/kernel/debug/kvm/largepages + 271 + +And the benchmarks are as same as anonymous THPs. + +[yang.shi@linux.alibaba.com: v4] + Link: http://lkml.kernel.org/r/1571865575-42913-1-git-send-email-yang.shi@linux.alibaba.com +Link: http://lkml.kernel.org/r/1571769577-89735-1-git-send-email-yang.shi@linux.alibaba.com +Fixes: dd78fedde4b9 ("rmap: support file thp") +Signed-off-by: Yang Shi +Reported-by: Gang Deng +Tested-by: Gang Deng +Suggested-by: Hugh Dickins +Acked-by: Kirill A. 
Shutemov +Cc: Andrea Arcangeli +Cc: Matthew Wilcox +Cc: [4.8+] +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + include/linux/mm.h | 5 ----- + include/linux/mm_types.h | 5 +++++ + include/linux/page-flags.h | 20 ++++++++++++++++++-- + 3 files changed, 23 insertions(+), 7 deletions(-) + +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -695,11 +695,6 @@ static inline void *kvcalloc(size_t n, s + + extern void kvfree(const void *addr); + +-static inline atomic_t *compound_mapcount_ptr(struct page *page) +-{ +- return &page[1].compound_mapcount; +-} +- + static inline int compound_mapcount(struct page *page) + { + VM_BUG_ON_PAGE(!PageCompound(page), page); +--- a/include/linux/mm_types.h ++++ b/include/linux/mm_types.h +@@ -221,6 +221,11 @@ struct page { + #endif + } _struct_page_alignment; + ++static inline atomic_t *compound_mapcount_ptr(struct page *page) ++{ ++ return &page[1].compound_mapcount; ++} ++ + /* + * Used for sizing the vmemmap region on some architectures + */ +--- a/include/linux/page-flags.h ++++ b/include/linux/page-flags.h +@@ -622,12 +622,28 @@ static inline int PageTransCompound(stru + * + * Unlike PageTransCompound, this is safe to be called only while + * split_huge_pmd() cannot run from under us, like if protected by the +- * MMU notifier, otherwise it may result in page->_mapcount < 0 false ++ * MMU notifier, otherwise it may result in page->_mapcount check false + * positives. ++ * ++ * We have to treat page cache THP differently since every subpage of it ++ * would get _mapcount inc'ed once it is PMD mapped. But, it may be PTE ++ * mapped in the current process so comparing subpage's _mapcount to ++ * compound_mapcount to filter out PTE mapped case. 
+ */ + static inline int PageTransCompoundMap(struct page *page) + { +- return PageTransCompound(page) && atomic_read(&page->_mapcount) < 0; ++ struct page *head; ++ ++ if (!PageTransCompound(page)) ++ return 0; ++ ++ if (PageAnon(page)) ++ return atomic_read(&page->_mapcount) < 0; ++ ++ head = compound_head(page); ++ /* File THP is PMD mapped and not PTE mapped */ ++ return atomic_read(&page->_mapcount) == ++ atomic_read(compound_mapcount_ptr(head)); + } + + /* diff --git a/queue-5.3/mm-vmstat-hide-proc-pagetypeinfo-from-normal-users.patch b/queue-5.3/mm-vmstat-hide-proc-pagetypeinfo-from-normal-users.patch new file mode 100644 index 00000000000..dbc134649c0 --- /dev/null +++ b/queue-5.3/mm-vmstat-hide-proc-pagetypeinfo-from-normal-users.patch @@ -0,0 +1,57 @@ +From abaed0112c1db08be15a784a2c5c8a8b3063cdd3 Mon Sep 17 00:00:00 2001 +From: Michal Hocko +Date: Tue, 5 Nov 2019 21:16:40 -0800 +Subject: mm, vmstat: hide /proc/pagetypeinfo from normal users + +From: Michal Hocko + +commit abaed0112c1db08be15a784a2c5c8a8b3063cdd3 upstream. + +/proc/pagetypeinfo is a debugging tool to examine internal page +allocator state wrt to fragmentation. It is not very useful for any +other use so normal users really do not need to read this file. + +Waiman Long has noticed that reading this file can have negative side +effects because zone->lock is necessary for gathering data and that a) +interferes with the page allocator and its users and b) can lead to hard +lockups on large machines which have very long free_list. + +Reduce both issues by simply not exporting the file to regular users. 
+ +Link: http://lkml.kernel.org/r/20191025072610.18526-2-mhocko@kernel.org +Fixes: 467c996c1e19 ("Print out statistics in relation to fragmentation avoidance to /proc/pagetypeinfo") +Signed-off-by: Michal Hocko +Reported-by: Waiman Long +Acked-by: Mel Gorman +Acked-by: Vlastimil Babka +Acked-by: Waiman Long +Acked-by: Rafael Aquini +Acked-by: David Rientjes +Reviewed-by: Andrew Morton +Cc: David Hildenbrand +Cc: Johannes Weiner +Cc: Roman Gushchin +Cc: Konstantin Khlebnikov +Cc: Jann Horn +Cc: Song Liu +Cc: Greg Kroah-Hartman +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/vmstat.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/mm/vmstat.c ++++ b/mm/vmstat.c +@@ -1970,7 +1970,7 @@ void __init init_mm_internals(void) + #endif + #ifdef CONFIG_PROC_FS + proc_create_seq("buddyinfo", 0444, NULL, &fragmentation_op); +- proc_create_seq("pagetypeinfo", 0444, NULL, &pagetypeinfo_op); ++ proc_create_seq("pagetypeinfo", 0400, NULL, &pagetypeinfo_op); + proc_create_seq("vmstat", 0444, NULL, &vmstat_op); + proc_create_seq("zoneinfo", 0444, NULL, &zoneinfo_op); + #endif diff --git a/queue-5.3/series b/queue-5.3/series index 9413dbc7d4e..d5f69fddd88 100644 --- a/queue-5.3/series +++ b/queue-5.3/series @@ -22,3 +22,11 @@ net-sched-prevent-duplicate-flower-rules-from-tcf_proto-destroy-race.patch net-smc-fix-ethernet-interface-refcounting.patch vsock-virtio-fix-sock-refcnt-holding-during-the-shutdown.patch r8169-fix-page-read-in-r8168g_mdio_read.patch +alsa-timer-fix-incorrectly-assigned-timer-instance.patch +alsa-bebob-fix-to-detect-configured-source-of-sampling-clock-for-focusrite-saffire-pro-i-o-series.patch +alsa-hda-ca0132-fix-possible-workqueue-stall.patch +mm-memcontrol-fix-null-ptr-deref-in-percpu-stats-flush.patch +mm-memcontrol-fix-network-errors-from-failing-__gfp_atomic-charges.patch +mm-meminit-recalculate-pcpu-batch-and-high-limits-after-init-completes.patch 
+mm-thp-handle-page-cache-thp-correctly-in-pagetranscompoundmap.patch +mm-vmstat-hide-proc-pagetypeinfo-from-normal-users.patch