5.3-stable patches
author Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Mon, 11 Nov 2019 05:46:11 +0000 (06:46 +0100)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Mon, 11 Nov 2019 05:46:11 +0000 (06:46 +0100)
added patches:
alsa-bebob-fix-to-detect-configured-source-of-sampling-clock-for-focusrite-saffire-pro-i-o-series.patch
alsa-hda-ca0132-fix-possible-workqueue-stall.patch
alsa-timer-fix-incorrectly-assigned-timer-instance.patch
mm-memcontrol-fix-network-errors-from-failing-__gfp_atomic-charges.patch
mm-memcontrol-fix-null-ptr-deref-in-percpu-stats-flush.patch
mm-meminit-recalculate-pcpu-batch-and-high-limits-after-init-completes.patch
mm-thp-handle-page-cache-thp-correctly-in-pagetranscompoundmap.patch
mm-vmstat-hide-proc-pagetypeinfo-from-normal-users.patch

queue-5.3/alsa-bebob-fix-to-detect-configured-source-of-sampling-clock-for-focusrite-saffire-pro-i-o-series.patch [new file with mode: 0644]
queue-5.3/alsa-hda-ca0132-fix-possible-workqueue-stall.patch [new file with mode: 0644]
queue-5.3/alsa-timer-fix-incorrectly-assigned-timer-instance.patch [new file with mode: 0644]
queue-5.3/mm-memcontrol-fix-network-errors-from-failing-__gfp_atomic-charges.patch [new file with mode: 0644]
queue-5.3/mm-memcontrol-fix-null-ptr-deref-in-percpu-stats-flush.patch [new file with mode: 0644]
queue-5.3/mm-meminit-recalculate-pcpu-batch-and-high-limits-after-init-completes.patch [new file with mode: 0644]
queue-5.3/mm-thp-handle-page-cache-thp-correctly-in-pagetranscompoundmap.patch [new file with mode: 0644]
queue-5.3/mm-vmstat-hide-proc-pagetypeinfo-from-normal-users.patch [new file with mode: 0644]
queue-5.3/series

diff --git a/queue-5.3/alsa-bebob-fix-to-detect-configured-source-of-sampling-clock-for-focusrite-saffire-pro-i-o-series.patch b/queue-5.3/alsa-bebob-fix-to-detect-configured-source-of-sampling-clock-for-focusrite-saffire-pro-i-o-series.patch
new file mode 100644 (file)
index 0000000..f0e5835
--- /dev/null
@@ -0,0 +1,50 @@
+From 706ad6746a66546daf96d4e4a95e46faf6cf689a Mon Sep 17 00:00:00 2001
+From: Takashi Sakamoto <o-takashi@sakamocchi.jp>
+Date: Sun, 3 Nov 2019 00:09:20 +0900
+Subject: ALSA: bebob: fix to detect configured source of sampling clock for Focusrite Saffire Pro i/o series
+
+From: Takashi Sakamoto <o-takashi@sakamocchi.jp>
+
+commit 706ad6746a66546daf96d4e4a95e46faf6cf689a upstream.
+
+For Focusrite Saffire Pro i/o, the lowest 8 bits of the register represent
+the configured source of the sampling clock. The next lowest 8 bits
+represent whether the configured source is actually detected just after
+the register is changed for the source.
+
+The current implementation evaluates the whole register to detect the
+configured source. This results in failures due to the next lowest 8 bits
+when the source is connected in advance.
+
+This commit fixes the bug.
+
+Fixes: 25784ec2d034 ("ALSA: bebob: Add support for Focusrite Saffire/SaffirePro series")
+Cc: <stable@vger.kernel.org> # v3.16+
+Signed-off-by: Takashi Sakamoto <o-takashi@sakamocchi.jp>
+Link: https://lore.kernel.org/r/20191102150920.20367-1-o-takashi@sakamocchi.jp
+Signed-off-by: Takashi Iwai <tiwai@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ sound/firewire/bebob/bebob_focusrite.c |    3 +++
+ 1 file changed, 3 insertions(+)
+
+--- a/sound/firewire/bebob/bebob_focusrite.c
++++ b/sound/firewire/bebob/bebob_focusrite.c
+@@ -27,6 +27,8 @@
+ #define SAFFIRE_CLOCK_SOURCE_SPDIF            1
+ /* clock sources as returned from register of Saffire Pro 10 and 26 */
++#define SAFFIREPRO_CLOCK_SOURCE_SELECT_MASK   0x000000ff
++#define SAFFIREPRO_CLOCK_SOURCE_DETECT_MASK   0x0000ff00
+ #define SAFFIREPRO_CLOCK_SOURCE_INTERNAL      0
+ #define SAFFIREPRO_CLOCK_SOURCE_SKIP          1 /* never used on hardware */
+ #define SAFFIREPRO_CLOCK_SOURCE_SPDIF         2
+@@ -189,6 +191,7 @@ saffirepro_both_clk_src_get(struct snd_b
+               map = saffirepro_clk_maps[1];
+       /* In a case that this driver cannot handle the value of register. */
++      value &= SAFFIREPRO_CLOCK_SOURCE_SELECT_MASK;
+       if (value >= SAFFIREPRO_CLOCK_SOURCE_COUNT || map[value] < 0) {
+               err = -EIO;
+               goto end;
diff --git a/queue-5.3/alsa-hda-ca0132-fix-possible-workqueue-stall.patch b/queue-5.3/alsa-hda-ca0132-fix-possible-workqueue-stall.patch
new file mode 100644 (file)
index 0000000..001613a
--- /dev/null
@@ -0,0 +1,41 @@
+From 15c2b3cc09a31620914955cb2a89c277c18ee999 Mon Sep 17 00:00:00 2001
+From: Takashi Iwai <tiwai@suse.de>
+Date: Tue, 5 Nov 2019 14:43:16 +0100
+Subject: ALSA: hda/ca0132 - Fix possible workqueue stall
+
+From: Takashi Iwai <tiwai@suse.de>
+
+commit 15c2b3cc09a31620914955cb2a89c277c18ee999 upstream.
+
+The unsolicited event handler for the headphone jack in the CA0132 codec
+driver tries to reschedule another delayed work with
+cancel_delayed_work_sync().  Unfortunately, that is not a good idea,
+especially after we changed the work queue to the standard global
+one; this may lead to a stall because both works use the same
+global queue.
+
+Fix it by dropping the _sync and calling cancel_delayed_work()
+instead.
+
+Fixes: 993884f6a26c ("ALSA: hda/ca0132 - Delay HP amp turnon.")
+BugLink: https://bugzilla.suse.com/show_bug.cgi?id=1155836
+Cc: <stable@vger.kernel.org>
+Link: https://lore.kernel.org/r/20191105134316.19294-1-tiwai@suse.de
+Signed-off-by: Takashi Iwai <tiwai@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ sound/pci/hda/patch_ca0132.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/sound/pci/hda/patch_ca0132.c
++++ b/sound/pci/hda/patch_ca0132.c
+@@ -7604,7 +7604,7 @@ static void hp_callback(struct hda_codec
+       /* Delay enabling the HP amp, to let the mic-detection
+        * state machine run.
+        */
+-      cancel_delayed_work_sync(&spec->unsol_hp_work);
++      cancel_delayed_work(&spec->unsol_hp_work);
+       schedule_delayed_work(&spec->unsol_hp_work, msecs_to_jiffies(500));
+       tbl = snd_hda_jack_tbl_get(codec, cb->nid);
+       if (tbl)
diff --git a/queue-5.3/alsa-timer-fix-incorrectly-assigned-timer-instance.patch b/queue-5.3/alsa-timer-fix-incorrectly-assigned-timer-instance.patch
new file mode 100644 (file)
index 0000000..46749bc
--- /dev/null
@@ -0,0 +1,61 @@
+From e7af6307a8a54f0b873960b32b6a644f2d0fbd97 Mon Sep 17 00:00:00 2001
+From: Takashi Iwai <tiwai@suse.de>
+Date: Wed, 6 Nov 2019 17:55:47 +0100
+Subject: ALSA: timer: Fix incorrectly assigned timer instance
+
+From: Takashi Iwai <tiwai@suse.de>
+
+commit e7af6307a8a54f0b873960b32b6a644f2d0fbd97 upstream.
+
+The cleanup commit 41672c0c24a6 ("ALSA: timer: Simplify error path in
+snd_timer_open()") unified the error handling code paths with the
+standard goto, but it introduced a subtle bug: the timer instance is
+stored in snd_timer_open() incorrectly even if it returns an error.
+This may eventually lead to a UAF, as spotted by a fuzzer.
+
+The culprit is that the snd_timer_open() code checks the
+SNDRV_TIMER_IFLG_EXCLUSIVE flag with the common variable timeri.
+This variable is supposed to be the newly created instance, but we
+(ab-)used it for a temporary check before the actual creation of a
+timer instance.  After that point, there is another check for the max
+number of instances, and it bails out if over the threshold.  Before
+the refactoring above, it worked fine because the code returned
+directly from that point.  After the refactoring, however, it jumps to
+the unified error path that stores the timeri variable in return --
+even if it returns an error.  Unfortunately this stored value is kept
+on the caller side (snd_timer_user_tselect()) in tu->timeri.  This
+causes inconsistency later, as if the timer was successfully
+assigned.
+
+In this patch, we fix it by using a temporary variable for testing the
+exclusive connection instead of re-using the timeri variable, so timeri
+remains NULL at that point.
+
+Fixes: 41672c0c24a6 ("ALSA: timer: Simplify error path in snd_timer_open()")
+Reported-and-tested-by: Tristan Madani <tristmd@gmail.com>
+Cc: <stable@vger.kernel.org>
+Link: https://lore.kernel.org/r/20191106165547.23518-1-tiwai@suse.de
+Signed-off-by: Takashi Iwai <tiwai@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ sound/core/timer.c |    6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+--- a/sound/core/timer.c
++++ b/sound/core/timer.c
+@@ -284,11 +284,11 @@ int snd_timer_open(struct snd_timer_inst
+               goto unlock;
+       }
+       if (!list_empty(&timer->open_list_head)) {
+-              timeri = list_entry(timer->open_list_head.next,
++              struct snd_timer_instance *t =
++                      list_entry(timer->open_list_head.next,
+                                   struct snd_timer_instance, open_list);
+-              if (timeri->flags & SNDRV_TIMER_IFLG_EXCLUSIVE) {
++              if (t->flags & SNDRV_TIMER_IFLG_EXCLUSIVE) {
+                       err = -EBUSY;
+-                      timeri = NULL;
+                       goto unlock;
+               }
+       }
diff --git a/queue-5.3/mm-memcontrol-fix-network-errors-from-failing-__gfp_atomic-charges.patch b/queue-5.3/mm-memcontrol-fix-network-errors-from-failing-__gfp_atomic-charges.patch
new file mode 100644 (file)
index 0000000..dbe4236
--- /dev/null
@@ -0,0 +1,100 @@
+From 869712fd3de5a90b7ba23ae1272278cddc66b37b Mon Sep 17 00:00:00 2001
+From: Johannes Weiner <hannes@cmpxchg.org>
+Date: Tue, 5 Nov 2019 21:17:13 -0800
+Subject: mm: memcontrol: fix network errors from failing __GFP_ATOMIC charges
+
+From: Johannes Weiner <hannes@cmpxchg.org>
+
+commit 869712fd3de5a90b7ba23ae1272278cddc66b37b upstream.
+
+While upgrading from 4.16 to 5.2, we noticed these allocation errors in
+the log of the new kernel:
+
+  SLUB: Unable to allocate memory on node -1, gfp=0xa20(GFP_ATOMIC)
+    cache: tw_sock_TCPv6(960:helper-logs), object size: 232, buffer size: 240, default order: 1, min order: 0
+    node 0: slabs: 5, objs: 170, free: 0
+
+        slab_out_of_memory+1
+        ___slab_alloc+969
+        __slab_alloc+14
+        kmem_cache_alloc+346
+        inet_twsk_alloc+60
+        tcp_time_wait+46
+        tcp_fin+206
+        tcp_data_queue+2034
+        tcp_rcv_state_process+784
+        tcp_v6_do_rcv+405
+        __release_sock+118
+        tcp_close+385
+        inet_release+46
+        __sock_release+55
+        sock_close+17
+        __fput+170
+        task_work_run+127
+        exit_to_usermode_loop+191
+        do_syscall_64+212
+        entry_SYSCALL_64_after_hwframe+68
+
+accompanied by an increase in machines going completely radio silent
+under memory pressure.
+
+One thing that changed since 4.16 is e699e2c6a654 ("net, mm: account
+sock objects to kmemcg"), which made these slab caches subject to cgroup
+memory accounting and control.
+
+The problem with that is that cgroups, unlike the page allocator, do not
+maintain dedicated atomic reserves.  As a cgroup's usage hovers at its
+limit, atomic allocations - such as those done during network rx - can fail
+consistently for extended periods of time.  The kernel is not able to
+operate under these conditions.
+
+We don't want to revert the culprit patch, because it indeed tracks a
+potentially substantial amount of memory used by a cgroup.
+
+We also don't want to implement dedicated atomic reserves for cgroups.
+There is no point in keeping a fixed margin of unused bytes in the
+cgroup's memory budget to accommodate a consumer that is impossible to
+predict - we'd be wasting memory and get into configuration headaches,
+not unlike what we have going with min_free_kbytes.  We do this for
+physical mem because we have to, but cgroups are an accounting game.
+
+Instead, account these privileged allocations to the cgroup, but let
+them bypass the configured limit if they have to.  This way, we get the
+benefits of accounting the consumed memory and have it exert pressure on
+the rest of the cgroup, but like with the page allocator, we shift the
+burden of reclaiming on behalf of atomic allocations onto the regular
+allocations that can block.
+
+Link: http://lkml.kernel.org/r/20191022233708.365764-1-hannes@cmpxchg.org
+Fixes: e699e2c6a654 ("net, mm: account sock objects to kmemcg")
+Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
+Reviewed-by: Shakeel Butt <shakeelb@google.com>
+Cc: Suleiman Souhlal <suleiman@google.com>
+Cc: Michal Hocko <mhocko@kernel.org>
+Cc: <stable@vger.kernel.org>   [4.18+]
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/memcontrol.c |    9 +++++++++
+ 1 file changed, 9 insertions(+)
+
+--- a/mm/memcontrol.c
++++ b/mm/memcontrol.c
+@@ -2408,6 +2408,15 @@ retry:
+       }
+       /*
++       * Memcg doesn't have a dedicated reserve for atomic
++       * allocations. But like the global atomic pool, we need to
++       * put the burden of reclaim on regular allocation requests
++       * and let these go through as privileged allocations.
++       */
++      if (gfp_mask & __GFP_ATOMIC)
++              goto force;
++
++      /*
+        * Unlike in global OOM situations, memcg is not in a physical
+        * memory shortage.  Allow dying and OOM-killed tasks to
+        * bypass the last charges so that they can exit quickly and
diff --git a/queue-5.3/mm-memcontrol-fix-null-ptr-deref-in-percpu-stats-flush.patch b/queue-5.3/mm-memcontrol-fix-null-ptr-deref-in-percpu-stats-flush.patch
new file mode 100644 (file)
index 0000000..7ca2e43
--- /dev/null
@@ -0,0 +1,99 @@
+From 7961eee3978475fd9e8626137f88595b1ca05856 Mon Sep 17 00:00:00 2001
+From: Shakeel Butt <shakeelb@google.com>
+Date: Tue, 5 Nov 2019 21:16:21 -0800
+Subject: mm: memcontrol: fix NULL-ptr deref in percpu stats flush
+
+From: Shakeel Butt <shakeelb@google.com>
+
+commit 7961eee3978475fd9e8626137f88595b1ca05856 upstream.
+
+__mem_cgroup_free() can be called on the failure path in
+mem_cgroup_alloc().  However, memcg_flush_percpu_vmstats() and
+memcg_flush_percpu_vmevents(), which are called from __mem_cgroup_free(),
+access fields of memcg which can potentially be NULL if called from the
+failure path of mem_cgroup_alloc().  Indeed syzbot has reported the
+following crash:
+
+       kasan: CONFIG_KASAN_INLINE enabled
+       kasan: GPF could be caused by NULL-ptr deref or user memory access
+       general protection fault: 0000 [#1] PREEMPT SMP KASAN
+       CPU: 0 PID: 30393 Comm: syz-executor.1 Not tainted 5.4.0-rc2+ #0
+       Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
+       RIP: 0010:memcg_flush_percpu_vmstats+0x4ae/0x930 mm/memcontrol.c:3436
+       Code: 05 41 89 c0 41 0f b6 04 24 41 38 c7 7c 08 84 c0 0f 85 5d 03 00 00 44 3b 05 33 d5 12 08 0f 83 e2 00 00 00 4c 89 f0 48 c1 e8 03 <42> 80 3c 28 00 0f 85 91 03 00 00 48 8b 85 10 fe ff ff 48 8b b0 90
+       RSP: 0018:ffff888095c27980 EFLAGS: 00010206
+       RAX: 0000000000000012 RBX: ffff888095c27b28 RCX: ffffc90008192000
+       RDX: 0000000000040000 RSI: ffffffff8340fae7 RDI: 0000000000000007
+       RBP: ffff888095c27be0 R08: 0000000000000000 R09: ffffed1013f0da33
+       R10: ffffed1013f0da32 R11: ffff88809f86d197 R12: fffffbfff138b760
+       R13: dffffc0000000000 R14: 0000000000000090 R15: 0000000000000007
+       FS:  00007f5027170700(0000) GS:ffff8880ae800000(0000) knlGS:0000000000000000
+       CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+       CR2: 0000000000710158 CR3: 00000000a7b18000 CR4: 00000000001406f0
+       DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
+       DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
+       Call Trace:
+       __mem_cgroup_free+0x1a/0x190 mm/memcontrol.c:5021
+       mem_cgroup_free mm/memcontrol.c:5033 [inline]
+       mem_cgroup_css_alloc+0x3a1/0x1ae0 mm/memcontrol.c:5160
+       css_create kernel/cgroup/cgroup.c:5156 [inline]
+       cgroup_apply_control_enable+0x44d/0xc40 kernel/cgroup/cgroup.c:3119
+       cgroup_mkdir+0x899/0x11b0 kernel/cgroup/cgroup.c:5401
+       kernfs_iop_mkdir+0x14d/0x1d0 fs/kernfs/dir.c:1124
+       vfs_mkdir+0x42e/0x670 fs/namei.c:3807
+       do_mkdirat+0x234/0x2a0 fs/namei.c:3830
+       __do_sys_mkdir fs/namei.c:3846 [inline]
+       __se_sys_mkdir fs/namei.c:3844 [inline]
+       __x64_sys_mkdir+0x5c/0x80 fs/namei.c:3844
+       do_syscall_64+0xfa/0x760 arch/x86/entry/common.c:290
+       entry_SYSCALL_64_after_hwframe+0x49/0xbe
+
+Fix this by moving the flush to mem_cgroup_free() as there is no need
+to flush anything if we see a failure in mem_cgroup_alloc().
+
+Link: http://lkml.kernel.org/r/20191018165231.249872-1-shakeelb@google.com
+Fixes: bb65f89b7d3d ("mm: memcontrol: flush percpu vmevents before releasing memcg")
+Fixes: c350a99ea2b1 ("mm: memcontrol: flush percpu vmstats before releasing memcg")
+Signed-off-by: Shakeel Butt <shakeelb@google.com>
+Reported-by: syzbot+515d5bcfe179cdf049b2@syzkaller.appspotmail.com
+Reviewed-by: Roman Gushchin <guro@fb.com>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/memcontrol.c |   12 ++++++------
+ 1 file changed, 6 insertions(+), 6 deletions(-)
+
+--- a/mm/memcontrol.c
++++ b/mm/memcontrol.c
+@@ -4763,12 +4763,6 @@ static void __mem_cgroup_free(struct mem
+ {
+       int node;
+-      /*
+-       * Flush percpu vmstats and vmevents to guarantee the value correctness
+-       * on parent's and all ancestor levels.
+-       */
+-      memcg_flush_percpu_vmstats(memcg, false);
+-      memcg_flush_percpu_vmevents(memcg);
+       for_each_node(node)
+               free_mem_cgroup_per_node_info(memcg, node);
+       free_percpu(memcg->vmstats_percpu);
+@@ -4779,6 +4773,12 @@ static void __mem_cgroup_free(struct mem
+ static void mem_cgroup_free(struct mem_cgroup *memcg)
+ {
+       memcg_wb_domain_exit(memcg);
++      /*
++       * Flush percpu vmstats and vmevents to guarantee the value correctness
++       * on parent's and all ancestor levels.
++       */
++      memcg_flush_percpu_vmstats(memcg, false);
++      memcg_flush_percpu_vmevents(memcg);
+       __mem_cgroup_free(memcg);
+ }
diff --git a/queue-5.3/mm-meminit-recalculate-pcpu-batch-and-high-limits-after-init-completes.patch b/queue-5.3/mm-meminit-recalculate-pcpu-batch-and-high-limits-after-init-completes.patch
new file mode 100644 (file)
index 0000000..85c10cd
--- /dev/null
@@ -0,0 +1,120 @@
+From 3e8fc0075e24338b1117cdff6a79477427b8dbed Mon Sep 17 00:00:00 2001
+From: Mel Gorman <mgorman@techsingularity.net>
+Date: Tue, 5 Nov 2019 21:16:27 -0800
+Subject: mm, meminit: recalculate pcpu batch and high limits after init completes
+
+From: Mel Gorman <mgorman@techsingularity.net>
+
+commit 3e8fc0075e24338b1117cdff6a79477427b8dbed upstream.
+
+Deferred memory initialisation updates zone->managed_pages during the
+initialisation phase but before that finishes, the per-cpu page
+allocator (pcpu) calculates the number of pages allocated/freed in
+batches as well as the maximum number of pages allowed on a per-cpu
+list.  As zone->managed_pages is not up to date yet, the pcpu
+initialisation calculates inappropriately low batch and high values.
+
+This increases zone lock contention quite severely in some cases with
+the degree of severity depending on how many CPUs share a local zone and
+the size of the zone.  A private report indicated that kernel build
+times were excessive with extremely high system CPU usage.  A perf
+profile indicated that a large chunk of time was lost on zone->lock
+contention.
+
+This patch recalculates the pcpu batch and high values after deferred
+initialisation completes for every populated zone in the system.  It was
+tested on a 2-socket AMD EPYC 2 machine using a kernel compilation
+workload -- allmodconfig and all available CPUs.
+
+mmtests configuration: config-workload-kernbench-max.  The configuration
+was modified to build on a fresh XFS partition.
+
+kernbench
+                                5.4.0-rc3              5.4.0-rc3
+                                  vanilla           resetpcpu-v2
+Amean     user-256    13249.50 (   0.00%)    16401.31 * -23.79%*
+Amean     syst-256    14760.30 (   0.00%)     4448.39 *  69.86%*
+Amean     elsp-256      162.42 (   0.00%)      119.13 *  26.65%*
+Stddev    user-256       42.97 (   0.00%)       19.15 (  55.43%)
+Stddev    syst-256      336.87 (   0.00%)        6.71 (  98.01%)
+Stddev    elsp-256        2.46 (   0.00%)        0.39 (  84.03%)
+
+                   5.4.0-rc3    5.4.0-rc3
+                     vanilla resetpcpu-v2
+Duration User       39766.24     49221.79
+Duration System     44298.10     13361.67
+Duration Elapsed      519.11       388.87
+
+The patch reduces system CPU usage by 69.86% and total build time by
+26.65%.  The variance of system CPU usage is also much reduced.
+
+Before the patch, the breakdown of batch and high values over all zones
+was:
+
+    256               batch: 1
+    256               batch: 63
+    512               batch: 7
+    256               high:  0
+    256               high:  378
+    512               high:  42
+
+512 pcpu pagesets had a batch limit of 7 and a high limit of 42.  After
+the patch:
+
+    256               batch: 1
+    768               batch: 63
+    256               high:  0
+    768               high:  378
+
+[mgorman@techsingularity.net: fix merge/linkage snafu]
+  Link: http://lkml.kernel.org/r/20191023084705.GD3016@techsingularity.netLink: http://lkml.kernel.org/r/20191021094808.28824-2-mgorman@techsingularity.net
+Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
+Acked-by: Michal Hocko <mhocko@suse.com>
+Acked-by: Vlastimil Babka <vbabka@suse.cz>
+Acked-by: David Hildenbrand <david@redhat.com>
+Cc: Matt Fleming <matt@codeblueprint.co.uk>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Qian Cai <cai@lca.pw>
+Cc: <stable@vger.kernel.org>   [4.1+]
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/page_alloc.c |   10 ++++++++--
+ 1 file changed, 8 insertions(+), 2 deletions(-)
+
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -1947,6 +1947,14 @@ void __init page_alloc_init_late(void)
+       wait_for_completion(&pgdat_init_all_done_comp);
+       /*
++       * The number of managed pages has changed due to the initialisation
++       * so the pcpu batch and high limits needs to be updated or the limits
++       * will be artificially small.
++       */
++      for_each_populated_zone(zone)
++              zone_pcp_update(zone);
++
++      /*
+        * We initialized the rest of the deferred pages.  Permanently disable
+        * on-demand struct page initialization.
+        */
+@@ -8479,7 +8487,6 @@ void free_contig_range(unsigned long pfn
+       WARN(count != 0, "%d pages are still in use!\n", count);
+ }
+-#ifdef CONFIG_MEMORY_HOTPLUG
+ /*
+  * The zone indicated has a new number of managed_pages; batch sizes and percpu
+  * page high values need to be recalulated.
+@@ -8493,7 +8500,6 @@ void __meminit zone_pcp_update(struct zo
+                               per_cpu_ptr(zone->pageset, cpu));
+       mutex_unlock(&pcp_batch_high_lock);
+ }
+-#endif
+ void zone_pcp_reset(struct zone *zone)
+ {
diff --git a/queue-5.3/mm-thp-handle-page-cache-thp-correctly-in-pagetranscompoundmap.patch b/queue-5.3/mm-thp-handle-page-cache-thp-correctly-in-pagetranscompoundmap.patch
new file mode 100644 (file)
index 0000000..0501a03
--- /dev/null
@@ -0,0 +1,145 @@
+From 169226f7e0d275c1879551f37484ef6683579a5c Mon Sep 17 00:00:00 2001
+From: Yang Shi <yang.shi@linux.alibaba.com>
+Date: Tue, 5 Nov 2019 21:16:30 -0800
+Subject: mm: thp: handle page cache THP correctly in PageTransCompoundMap
+
+From: Yang Shi <yang.shi@linux.alibaba.com>
+
+commit 169226f7e0d275c1879551f37484ef6683579a5c upstream.
+
+We have a use case that uses tmpfs as the QEMU memory backend and we
+would like to take advantage of THP as well.  But our test shows the EPT
+is not PMD mapped even though the underlying THPs are PMD mapped on the
+host.  The number shown by /sys/kernel/debug/kvm/largepages is much less
+than the number of PMD mapped shmem pages, as shown below:
+
+  7f2778200000-7f2878200000 rw-s 00000000 00:14 262232 /dev/shm/qemu_back_mem.mem.Hz2hSf (deleted)
+  Size:            4194304 kB
+  [snip]
+  AnonHugePages:         0 kB
+  ShmemPmdMapped:   579584 kB
+  [snip]
+  Locked:                0 kB
+
+  cat /sys/kernel/debug/kvm/largepages
+  12
+
+And some benchmarks do worse than with anonymous THPs.
+
+By digging into the code we figured out that commit 127393fbe597 ("mm:
+thp: kvm: fix memory corruption in KVM with THP enabled") checks if
+there is a single PTE mapping on the page for anonymous THP when setting
+up the EPT map.  But the _mapcount < 0 check doesn't work for page cache
+THP since every subpage of a page cache THP gets _mapcount inc'ed once it
+is PMD mapped, so PageTransCompoundMap() always returns false for page
+cache THP.  This would prevent KVM from setting up a PMD mapped EPT entry.
+
+So we need to handle page cache THP correctly.  However, when a page cache
+THP's PMD gets split, the kernel just removes the map instead of setting up
+a PTE map like anonymous THP does.  Before KVM calls get_user_pages(),
+the subpages may get PTE mapped even though it is still a THP, since the
+page cache THP may be mapped by other processes in the meantime.
+
+So check both its _mapcount and whether the THP has been PTE mapped.
+Although this may report some false negative cases (PTE mapped by other
+processes), it does not look trivial to make this accurate.
+
+With this fix, /sys/kernel/debug/kvm/largepages shows that a reasonable
+number of pages are PMD mapped by EPT, as shown below:
+
+  7fbeaee00000-7fbfaee00000 rw-s 00000000 00:14 275464 /dev/shm/qemu_back_mem.mem.SKUvat (deleted)
+  Size:            4194304 kB
+  [snip]
+  AnonHugePages:         0 kB
+  ShmemPmdMapped:   557056 kB
+  [snip]
+  Locked:                0 kB
+
+  cat /sys/kernel/debug/kvm/largepages
+  271
+
+And the benchmarks are the same as with anonymous THPs.
+
+[yang.shi@linux.alibaba.com: v4]
+  Link: http://lkml.kernel.org/r/1571865575-42913-1-git-send-email-yang.shi@linux.alibaba.com
+Link: http://lkml.kernel.org/r/1571769577-89735-1-git-send-email-yang.shi@linux.alibaba.com
+Fixes: dd78fedde4b9 ("rmap: support file thp")
+Signed-off-by: Yang Shi <yang.shi@linux.alibaba.com>
+Reported-by: Gang Deng <gavin.dg@linux.alibaba.com>
+Tested-by: Gang Deng <gavin.dg@linux.alibaba.com>
+Suggested-by: Hugh Dickins <hughd@google.com>
+Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+Cc: Andrea Arcangeli <aarcange@redhat.com>
+Cc: Matthew Wilcox <willy@infradead.org>
+Cc: <stable@vger.kernel.org>   [4.8+]
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/mm.h         |    5 -----
+ include/linux/mm_types.h   |    5 +++++
+ include/linux/page-flags.h |   20 ++++++++++++++++++--
+ 3 files changed, 23 insertions(+), 7 deletions(-)
+
+--- a/include/linux/mm.h
++++ b/include/linux/mm.h
+@@ -695,11 +695,6 @@ static inline void *kvcalloc(size_t n, s
+ extern void kvfree(const void *addr);
+-static inline atomic_t *compound_mapcount_ptr(struct page *page)
+-{
+-      return &page[1].compound_mapcount;
+-}
+-
+ static inline int compound_mapcount(struct page *page)
+ {
+       VM_BUG_ON_PAGE(!PageCompound(page), page);
+--- a/include/linux/mm_types.h
++++ b/include/linux/mm_types.h
+@@ -221,6 +221,11 @@ struct page {
+ #endif
+ } _struct_page_alignment;
++static inline atomic_t *compound_mapcount_ptr(struct page *page)
++{
++      return &page[1].compound_mapcount;
++}
++
+ /*
+  * Used for sizing the vmemmap region on some architectures
+  */
+--- a/include/linux/page-flags.h
++++ b/include/linux/page-flags.h
+@@ -622,12 +622,28 @@ static inline int PageTransCompound(stru
+  *
+  * Unlike PageTransCompound, this is safe to be called only while
+  * split_huge_pmd() cannot run from under us, like if protected by the
+- * MMU notifier, otherwise it may result in page->_mapcount < 0 false
++ * MMU notifier, otherwise it may result in page->_mapcount check false
+  * positives.
++ *
++ * We have to treat page cache THP differently since every subpage of it
++ * would get _mapcount inc'ed once it is PMD mapped.  But, it may be PTE
++ * mapped in the current process so comparing subpage's _mapcount to
++ * compound_mapcount to filter out PTE mapped case.
+  */
+ static inline int PageTransCompoundMap(struct page *page)
+ {
+-      return PageTransCompound(page) && atomic_read(&page->_mapcount) < 0;
++      struct page *head;
++
++      if (!PageTransCompound(page))
++              return 0;
++
++      if (PageAnon(page))
++              return atomic_read(&page->_mapcount) < 0;
++
++      head = compound_head(page);
++      /* File THP is PMD mapped and not PTE mapped */
++      return atomic_read(&page->_mapcount) ==
++             atomic_read(compound_mapcount_ptr(head));
+ }
+ /*
diff --git a/queue-5.3/mm-vmstat-hide-proc-pagetypeinfo-from-normal-users.patch b/queue-5.3/mm-vmstat-hide-proc-pagetypeinfo-from-normal-users.patch
new file mode 100644 (file)
index 0000000..dbc1346
--- /dev/null
@@ -0,0 +1,57 @@
+From abaed0112c1db08be15a784a2c5c8a8b3063cdd3 Mon Sep 17 00:00:00 2001
+From: Michal Hocko <mhocko@suse.com>
+Date: Tue, 5 Nov 2019 21:16:40 -0800
+Subject: mm, vmstat: hide /proc/pagetypeinfo from normal users
+
+From: Michal Hocko <mhocko@suse.com>
+
+commit abaed0112c1db08be15a784a2c5c8a8b3063cdd3 upstream.
+
+/proc/pagetypeinfo is a debugging tool to examine internal page
+allocator state with respect to fragmentation.  It is not very useful
+for anything else, so normal users really do not need to read this file.
+
+Waiman Long has noticed that reading this file can have negative side
+effects because zone->lock is necessary for gathering data and that a)
+interferes with the page allocator and its users and b) can lead to hard
+lockups on large machines which have very long free_list.
+
+Reduce both issues by simply not exporting the file to regular users.
+
+Link: http://lkml.kernel.org/r/20191025072610.18526-2-mhocko@kernel.org
+Fixes: 467c996c1e19 ("Print out statistics in relation to fragmentation avoidance to /proc/pagetypeinfo")
+Signed-off-by: Michal Hocko <mhocko@suse.com>
+Reported-by: Waiman Long <longman@redhat.com>
+Acked-by: Mel Gorman <mgorman@suse.de>
+Acked-by: Vlastimil Babka <vbabka@suse.cz>
+Acked-by: Waiman Long <longman@redhat.com>
+Acked-by: Rafael Aquini <aquini@redhat.com>
+Acked-by: David Rientjes <rientjes@google.com>
+Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
+Cc: David Hildenbrand <david@redhat.com>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Roman Gushchin <guro@fb.com>
+Cc: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
+Cc: Jann Horn <jannh@google.com>
+Cc: Song Liu <songliubraving@fb.com>
+Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/vmstat.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/mm/vmstat.c
++++ b/mm/vmstat.c
+@@ -1970,7 +1970,7 @@ void __init init_mm_internals(void)
+ #endif
+ #ifdef CONFIG_PROC_FS
+       proc_create_seq("buddyinfo", 0444, NULL, &fragmentation_op);
+-      proc_create_seq("pagetypeinfo", 0444, NULL, &pagetypeinfo_op);
++      proc_create_seq("pagetypeinfo", 0400, NULL, &pagetypeinfo_op);
+       proc_create_seq("vmstat", 0444, NULL, &vmstat_op);
+       proc_create_seq("zoneinfo", 0444, NULL, &zoneinfo_op);
+ #endif
diff --git a/queue-5.3/series b/queue-5.3/series
index 9413dbc7d4ead499a7f4d2f8df901ffbcee1acb1..d5f69fddd88ab68ebf9a077a56389225c454d36c 100644 (file)
@@ -22,3 +22,11 @@ net-sched-prevent-duplicate-flower-rules-from-tcf_proto-destroy-race.patch
 net-smc-fix-ethernet-interface-refcounting.patch
 vsock-virtio-fix-sock-refcnt-holding-during-the-shutdown.patch
 r8169-fix-page-read-in-r8168g_mdio_read.patch
+alsa-timer-fix-incorrectly-assigned-timer-instance.patch
+alsa-bebob-fix-to-detect-configured-source-of-sampling-clock-for-focusrite-saffire-pro-i-o-series.patch
+alsa-hda-ca0132-fix-possible-workqueue-stall.patch
+mm-memcontrol-fix-null-ptr-deref-in-percpu-stats-flush.patch
+mm-memcontrol-fix-network-errors-from-failing-__gfp_atomic-charges.patch
+mm-meminit-recalculate-pcpu-batch-and-high-limits-after-init-completes.patch
+mm-thp-handle-page-cache-thp-correctly-in-pagetranscompoundmap.patch
+mm-vmstat-hide-proc-pagetypeinfo-from-normal-users.patch