From: Greg Kroah-Hartman Date: Sat, 19 Dec 2020 12:52:51 +0000 (+0100) Subject: 5.4-stable patches X-Git-Tag: v5.4.85~2 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=f186a5812b8fc9e4efe163f4b4129d9622bf8c57;p=thirdparty%2Fkernel%2Fstable-queue.git 5.4-stable patches added patches: kvm-mmu-fix-spte-encoding-of-mmio-generation-upper-half.patch membarrier-explicitly-sync-remote-cores-when-sync_core-is-requested.patch revert-selftests-ftrace-check-for-do_sys_openat2-in-user-memory-test.patch x86-resctrl-fix-incorrect-local-bandwidth-when-mba_sc-is-enabled.patch x86-resctrl-remove-unused-struct-mbm_state-chunks_bw.patch --- diff --git a/queue-5.4/kvm-mmu-fix-spte-encoding-of-mmio-generation-upper-half.patch b/queue-5.4/kvm-mmu-fix-spte-encoding-of-mmio-generation-upper-half.patch new file mode 100644 index 00000000000..b4142a8a599 --- /dev/null +++ b/queue-5.4/kvm-mmu-fix-spte-encoding-of-mmio-generation-upper-half.patch @@ -0,0 +1,124 @@ +From 34c0f6f2695a2db81e09a3ab7bdb2853f45d4d3d Mon Sep 17 00:00:00 2001 +From: "Maciej S. Szmigiero" +Date: Sat, 5 Dec 2020 01:48:08 +0100 +Subject: KVM: mmu: Fix SPTE encoding of MMIO generation upper half + +From: Maciej S. Szmigiero + +commit 34c0f6f2695a2db81e09a3ab7bdb2853f45d4d3d upstream. + +Commit cae7ed3c2cb0 ("KVM: x86: Refactor the MMIO SPTE generation handling") +cleaned up the computation of MMIO generation SPTE masks, however it +introduced a bug how the upper part was encoded: +SPTE bits 52-61 were supposed to contain bits 10-19 of the current +generation number, however a missing shift encoded bits 1-10 there instead +(mostly duplicating the lower part of the encoded generation number that +then consisted of bits 1-9). + +In the meantime, the upper part was shrunk by one bit and moved by +subsequent commits to become an upper half of the encoded generation number +(bits 9-17 of bits 0-17 encoded in a SPTE). 
+ +In addition to the above, commit 56871d444bc4 ("KVM: x86: fix overlap between SPTE_MMIO_MASK and generation") +has changed the SPTE bit range assigned to encode the generation number and +the total number of bits encoded but did not update them in the comment +attached to their defines, nor in the KVM MMU doc. +Let's do it here, too, since it is too trivial a thing to warrant a separate +commit. + +Fixes: cae7ed3c2cb0 ("KVM: x86: Refactor the MMIO SPTE generation handling") +Signed-off-by: Maciej S. Szmigiero +Message-Id: <156700708db2a5296c5ed7a8b9ac71f1e9765c85.1607129096.git.maciej.szmigiero@oracle.com> +Cc: stable@vger.kernel.org +[Reorganize macros so that everything is computed from the bit ranges. - Paolo] +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman + + +--- + Documentation/virt/kvm/mmu.txt | 2 +- + arch/x86/kvm/mmu.c | 29 ++++++++++++++++++++--------- + 2 files changed, 21 insertions(+), 10 deletions(-) + +--- a/Documentation/virt/kvm/mmu.txt ++++ b/Documentation/virt/kvm/mmu.txt +@@ -420,7 +420,7 @@ If the generation number of the spte doe + number, it will ignore the cached MMIO information and handle the page + fault through the slow path. + +-Since only 19 bits are used to store generation-number on mmio spte, all ++Since only 18 bits are used to store generation-number on mmio spte, all + pages are zapped when there is an overflow. 
+ + Unfortunately, a single memory access might access kvm_memslots(kvm) multiple +--- a/arch/x86/kvm/mmu.c ++++ b/arch/x86/kvm/mmu.c +@@ -407,11 +407,11 @@ static inline bool is_access_track_spte( + } + + /* +- * Due to limited space in PTEs, the MMIO generation is a 19 bit subset of ++ * Due to limited space in PTEs, the MMIO generation is a 18 bit subset of + * the memslots generation and is derived as follows: + * + * Bits 0-8 of the MMIO generation are propagated to spte bits 3-11 +- * Bits 9-18 of the MMIO generation are propagated to spte bits 52-61 ++ * Bits 9-17 of the MMIO generation are propagated to spte bits 54-62 + * + * The KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS flag is intentionally not included in + * the MMIO generation number, as doing so would require stealing a bit from +@@ -420,18 +420,29 @@ static inline bool is_access_track_spte( + * requires a full MMU zap). The flag is instead explicitly queried when + * checking for MMIO spte cache hits. + */ +-#define MMIO_SPTE_GEN_MASK GENMASK_ULL(17, 0) + + #define MMIO_SPTE_GEN_LOW_START 3 + #define MMIO_SPTE_GEN_LOW_END 11 +-#define MMIO_SPTE_GEN_LOW_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_END, \ +- MMIO_SPTE_GEN_LOW_START) + + #define MMIO_SPTE_GEN_HIGH_START PT64_SECOND_AVAIL_BITS_SHIFT + #define MMIO_SPTE_GEN_HIGH_END 62 ++ ++#define MMIO_SPTE_GEN_LOW_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_END, \ ++ MMIO_SPTE_GEN_LOW_START) + #define MMIO_SPTE_GEN_HIGH_MASK GENMASK_ULL(MMIO_SPTE_GEN_HIGH_END, \ + MMIO_SPTE_GEN_HIGH_START) + ++#define MMIO_SPTE_GEN_LOW_BITS (MMIO_SPTE_GEN_LOW_END - MMIO_SPTE_GEN_LOW_START + 1) ++#define MMIO_SPTE_GEN_HIGH_BITS (MMIO_SPTE_GEN_HIGH_END - MMIO_SPTE_GEN_HIGH_START + 1) ++ ++/* remember to adjust the comment above as well if you change these */ ++static_assert(MMIO_SPTE_GEN_LOW_BITS == 9 && MMIO_SPTE_GEN_HIGH_BITS == 9); ++ ++#define MMIO_SPTE_GEN_LOW_SHIFT (MMIO_SPTE_GEN_LOW_START - 0) ++#define MMIO_SPTE_GEN_HIGH_SHIFT (MMIO_SPTE_GEN_HIGH_START - MMIO_SPTE_GEN_LOW_BITS) ++ 
++#define MMIO_SPTE_GEN_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_BITS + MMIO_SPTE_GEN_HIGH_BITS - 1, 0) ++ + static u64 generation_mmio_spte_mask(u64 gen) + { + u64 mask; +@@ -439,8 +450,8 @@ static u64 generation_mmio_spte_mask(u64 + WARN_ON(gen & ~MMIO_SPTE_GEN_MASK); + BUILD_BUG_ON((MMIO_SPTE_GEN_HIGH_MASK | MMIO_SPTE_GEN_LOW_MASK) & SPTE_SPECIAL_MASK); + +- mask = (gen << MMIO_SPTE_GEN_LOW_START) & MMIO_SPTE_GEN_LOW_MASK; +- mask |= (gen << MMIO_SPTE_GEN_HIGH_START) & MMIO_SPTE_GEN_HIGH_MASK; ++ mask = (gen << MMIO_SPTE_GEN_LOW_SHIFT) & MMIO_SPTE_GEN_LOW_MASK; ++ mask |= (gen << MMIO_SPTE_GEN_HIGH_SHIFT) & MMIO_SPTE_GEN_HIGH_MASK; + return mask; + } + +@@ -448,8 +459,8 @@ static u64 get_mmio_spte_generation(u64 + { + u64 gen; + +- gen = (spte & MMIO_SPTE_GEN_LOW_MASK) >> MMIO_SPTE_GEN_LOW_START; +- gen |= (spte & MMIO_SPTE_GEN_HIGH_MASK) >> MMIO_SPTE_GEN_HIGH_START; ++ gen = (spte & MMIO_SPTE_GEN_LOW_MASK) >> MMIO_SPTE_GEN_LOW_SHIFT; ++ gen |= (spte & MMIO_SPTE_GEN_HIGH_MASK) >> MMIO_SPTE_GEN_HIGH_SHIFT; + return gen; + } + diff --git a/queue-5.4/membarrier-explicitly-sync-remote-cores-when-sync_core-is-requested.patch b/queue-5.4/membarrier-explicitly-sync-remote-cores-when-sync_core-is-requested.patch new file mode 100644 index 00000000000..d97656a3336 --- /dev/null +++ b/queue-5.4/membarrier-explicitly-sync-remote-cores-when-sync_core-is-requested.patch @@ -0,0 +1,88 @@ +From 758c9373d84168dc7d039cf85a0e920046b17b41 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski +Date: Thu, 3 Dec 2020 21:07:05 -0800 +Subject: membarrier: Explicitly sync remote cores when SYNC_CORE is requested + +From: Andy Lutomirski + +commit 758c9373d84168dc7d039cf85a0e920046b17b41 upstream. + +membarrier() does not explicitly sync_core() remote CPUs; instead, it +relies on the assumption that an IPI will result in a core sync. On x86, +this may be true in practice, but it's not architecturally reliable. 
In +particular, the SDM and APM do not appear to guarantee that interrupt +delivery is serializing. While IRET does serialize, IPI return can +schedule, thereby switching to another task in the same mm that was +sleeping in a syscall. The new task could then SYSRET back to usermode +without ever executing IRET. + +Make this more robust by explicitly calling sync_core_before_usermode() +on remote cores. (This also helps people who search the kernel tree for +instances of sync_core() and sync_core_before_usermode() -- one might be +surprised that the core membarrier code doesn't currently show up in a +such a search.) + +Fixes: 70216e18e519 ("membarrier: Provide core serializing command, *_SYNC_CORE") +Signed-off-by: Andy Lutomirski +Signed-off-by: Thomas Gleixner +Reviewed-by: Mathieu Desnoyers +Cc: stable@vger.kernel.org +Link: https://lore.kernel.org/r/776b448d5f7bd6b12690707f5ed67bcda7f1d427.1607058304.git.luto@kernel.org +Signed-off-by: Greg Kroah-Hartman + + +--- + kernel/sched/membarrier.c | 21 ++++++++++++++++++++- + 1 file changed, 20 insertions(+), 1 deletion(-) + +--- a/kernel/sched/membarrier.c ++++ b/kernel/sched/membarrier.c +@@ -30,6 +30,23 @@ static void ipi_mb(void *info) + smp_mb(); /* IPIs should be serializing but paranoid. */ + } + ++static void ipi_sync_core(void *info) ++{ ++ /* ++ * The smp_mb() in membarrier after all the IPIs is supposed to ++ * ensure that memory on remote CPUs that occur before the IPI ++ * become visible to membarrier()'s caller -- see scenario B in ++ * the big comment at the top of this file. ++ * ++ * A sync_core() would provide this guarantee, but ++ * sync_core_before_usermode() might end up being deferred until ++ * after membarrier()'s smp_mb(). ++ */ ++ smp_mb(); /* IPIs should be serializing but paranoid. 
*/ ++ ++ sync_core_before_usermode(); ++} ++ + static void ipi_sync_rq_state(void *info) + { + struct mm_struct *mm = (struct mm_struct *) info; +@@ -134,6 +151,7 @@ static int membarrier_private_expedited( + int cpu; + cpumask_var_t tmpmask; + struct mm_struct *mm = current->mm; ++ smp_call_func_t ipi_func = ipi_mb; + + if (flags & MEMBARRIER_FLAG_SYNC_CORE) { + if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE)) +@@ -141,6 +159,7 @@ static int membarrier_private_expedited( + if (!(atomic_read(&mm->membarrier_state) & + MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY)) + return -EPERM; ++ ipi_func = ipi_sync_core; + } else { + if (!(atomic_read(&mm->membarrier_state) & + MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY)) +@@ -181,7 +200,7 @@ static int membarrier_private_expedited( + rcu_read_unlock(); + + preempt_disable(); +- smp_call_function_many(tmpmask, ipi_mb, NULL, 1); ++ smp_call_function_many(tmpmask, ipi_func, NULL, 1); + preempt_enable(); + + free_cpumask_var(tmpmask); diff --git a/queue-5.4/revert-selftests-ftrace-check-for-do_sys_openat2-in-user-memory-test.patch b/queue-5.4/revert-selftests-ftrace-check-for-do_sys_openat2-in-user-memory-test.patch new file mode 100644 index 00000000000..60c3b77e8aa --- /dev/null +++ b/queue-5.4/revert-selftests-ftrace-check-for-do_sys_openat2-in-user-memory-test.patch @@ -0,0 +1,40 @@ +From kamal@canonical.com Sat Dec 19 13:38:59 2020 +From: Kamal Mostafa +Date: Wed, 16 Dec 2020 10:13:53 -0800 +Subject: Revert "selftests/ftrace: check for do_sys_openat2 in user-memory test" +To: Greg Kroah-Hartman , Sasha Levin +Cc: Kamal Mostafa , stable@vger.kernel.org +Message-ID: <20201216181353.30321-1-kamal@canonical.com> + +From: Kamal Mostafa + +This reverts commit 9110e2f2633dc9383a3a4711a0067094f6948783. + +This commit is not suitable for 5.4-stable because the openat2 system +call does not exist in v5.4. 
+ +Signed-off-by: Kamal Mostafa +Signed-off-by: Greg Kroah-Hartman +--- + tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_user.tc | 4 ---- + 1 file changed, 4 deletions(-) + +--- a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_user.tc ++++ b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_user.tc +@@ -11,16 +11,12 @@ grep -A10 "fetcharg:" README | grep -q ' + :;: "user-memory access syntax and ustring working on user memory";: + echo 'p:myevent do_sys_open path=+0($arg2):ustring path2=+u0($arg2):string' \ + > kprobe_events +-echo 'p:myevent2 do_sys_openat2 path=+0($arg2):ustring path2=+u0($arg2):string' \ +- >> kprobe_events + + grep myevent kprobe_events | \ + grep -q 'path=+0($arg2):ustring path2=+u0($arg2):string' + echo 1 > events/kprobes/myevent/enable +-echo 1 > events/kprobes/myevent2/enable + echo > /dev/null + echo 0 > events/kprobes/myevent/enable +-echo 0 > events/kprobes/myevent2/enable + + grep myevent trace | grep -q 'path="/dev/null" path2="/dev/null"' + diff --git a/queue-5.4/series b/queue-5.4/series index 1f00f282e8d..c4bd52b6d89 100644 --- a/queue-5.4/series +++ b/queue-5.4/series @@ -27,3 +27,8 @@ usb-uas-introduce-a-quirk-to-set-no_write_same.patch usb-sisusbvga-make-console-support-depend-on-broken.patch alsa-pcm-oss-fix-potential-out-of-bounds-shift.patch serial-8250_omap-avoid-fifo-corruption-caused-by-mdr1-access.patch +kvm-mmu-fix-spte-encoding-of-mmio-generation-upper-half.patch +revert-selftests-ftrace-check-for-do_sys_openat2-in-user-memory-test.patch +membarrier-explicitly-sync-remote-cores-when-sync_core-is-requested.patch +x86-resctrl-remove-unused-struct-mbm_state-chunks_bw.patch +x86-resctrl-fix-incorrect-local-bandwidth-when-mba_sc-is-enabled.patch diff --git a/queue-5.4/x86-resctrl-fix-incorrect-local-bandwidth-when-mba_sc-is-enabled.patch b/queue-5.4/x86-resctrl-fix-incorrect-local-bandwidth-when-mba_sc-is-enabled.patch new file mode 100644 index 00000000000..e56dd3110d8 --- /dev/null +++ 
b/queue-5.4/x86-resctrl-fix-incorrect-local-bandwidth-when-mba_sc-is-enabled.patch @@ -0,0 +1,118 @@ +From foo@baz Sat Dec 19 01:50:24 PM CET 2020 +From: Xiaochen Shen +Date: Fri, 4 Dec 2020 14:27:59 +0800 +Subject: x86/resctrl: Fix incorrect local bandwidth when mba_sc is enabled + +From: Xiaochen Shen + +commit 06c5fe9b12dde1b62821f302f177c972bb1c81f9 upstream + +The MBA software controller (mba_sc) is a feedback loop which +periodically reads MBM counters and tries to restrict the bandwidth +below a user-specified value. It tags along the MBM counter overflow +handler to do the updates with 1s interval in mbm_update() and +update_mba_bw(). + +The purpose of mbm_update() is to periodically read the MBM counters to +make sure that the hardware counter doesn't wrap around more than once +between user samplings. mbm_update() calls __mon_event_count() for local +bandwidth updating when mba_sc is not enabled, but calls mbm_bw_count() +instead when mba_sc is enabled. __mon_event_count() will not be called +for local bandwidth updating in MBM counter overflow handler, but it is +still called when reading MBM local bandwidth counter file +'mbm_local_bytes', the call path is as below: + + rdtgroup_mondata_show() + mon_event_read() + mon_event_count() + __mon_event_count() + +In __mon_event_count(), m->chunks is updated by delta chunks which is +calculated from previous MSR value (m->prev_msr) and current MSR value. +When mba_sc is enabled, m->chunks is also updated in mbm_update() by +mistake by the delta chunks which is calculated from m->prev_bw_msr +instead of m->prev_msr. But m->chunks is not used in update_mba_bw() in +the mba_sc feedback loop. + +When reading MBM local bandwidth counter file, m->chunks was changed +unexpectedly by mbm_bw_count(). As a result, the incorrect local +bandwidth counter which calculated from incorrect m->chunks is shown to +the user. 
+ +Fix this by removing incorrect m->chunks updating in mbm_bw_count() in +MBM counter overflow handler, and always calling __mon_event_count() in +mbm_update() to make sure that the hardware local bandwidth counter +doesn't wrap around. + +Test steps: + # Run workload with aggressive memory bandwidth (e.g., 10 GB/s) + git clone https://github.com/intel/intel-cmt-cat && cd intel-cmt-cat + && make + ./tools/membw/membw -c 0 -b 10000 --read + + # Enable MBA software controller + mount -t resctrl resctrl -o mba_MBps /sys/fs/resctrl + + # Create control group c1 + mkdir /sys/fs/resctrl/c1 + + # Set MB throttle to 6 GB/s + echo "MB:0=6000;1=6000" > /sys/fs/resctrl/c1/schemata + + # Write PID of the workload to tasks file + echo `pidof membw` > /sys/fs/resctrl/c1/tasks + + # Read local bytes counters twice with 1s interval, the calculated + # local bandwidth is not as expected (approaching to 6 GB/s): + local_1=`cat /sys/fs/resctrl/c1/mon_data/mon_L3_00/mbm_local_bytes` + sleep 1 + local_2=`cat /sys/fs/resctrl/c1/mon_data/mon_L3_00/mbm_local_bytes` + echo "local b/w (bytes/s):" `expr $local_2 - $local_1` + +Before fix: + local b/w (bytes/s): 11076796416 + +After fix: + local b/w (bytes/s): 5465014272 + +Fixes: ba0f26d8529c (x86/intel_rdt/mba_sc: Prepare for feedback loop) +Signed-off-by: Xiaochen Shen +Signed-off-by: Borislav Petkov +Reviewed-by: Tony Luck +Cc: +Link: https://lkml.kernel.org/r/1607063279-19437-1-git-send-email-xiaochen.shen@intel.com +[sudip: adjust context] +Signed-off-by: Sudip Mukherjee +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kernel/cpu/resctrl/monitor.c | 6 ++---- + 1 file changed, 2 insertions(+), 4 deletions(-) + +--- a/arch/x86/kernel/cpu/resctrl/monitor.c ++++ b/arch/x86/kernel/cpu/resctrl/monitor.c +@@ -280,7 +280,6 @@ static void mbm_bw_count(u32 rmid, struc + return; + + chunks = mbm_overflow_count(m->prev_bw_msr, tval); +- m->chunks += chunks; + cur_bw = (chunks * r->mon_scale) >> 20; + + if (m->delta_comp) +@@ -450,15 +449,14 @@ 
static void mbm_update(struct rdt_domain + } + if (is_mbm_local_enabled()) { + rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID; ++ __mon_event_count(rmid, &rr); + + /* + * Call the MBA software controller only for the + * control groups and when user has enabled + * the software controller explicitly. + */ +- if (!is_mba_sc(NULL)) +- __mon_event_count(rmid, &rr); +- else ++ if (is_mba_sc(NULL)) + mbm_bw_count(rmid, &rr); + } + } diff --git a/queue-5.4/x86-resctrl-remove-unused-struct-mbm_state-chunks_bw.patch b/queue-5.4/x86-resctrl-remove-unused-struct-mbm_state-chunks_bw.patch new file mode 100644 index 00000000000..3f4cec1f8af --- /dev/null +++ b/queue-5.4/x86-resctrl-remove-unused-struct-mbm_state-chunks_bw.patch @@ -0,0 +1,54 @@ +From foo@baz Sat Dec 19 01:50:12 PM CET 2020 +From: James Morse +Date: Wed, 8 Jul 2020 16:39:20 +0000 +Subject: x86/resctrl: Remove unused struct mbm_state::chunks_bw + +From: James Morse + +commit abe8f12b44250d02937665033a8b750c1bfeb26e upstream + +Nothing reads struct mbm_states's chunks_bw value, its a copy of +chunks. Remove it. + +Signed-off-by: James Morse +Signed-off-by: Borislav Petkov +Reviewed-by: Reinette Chatre +Link: https://lkml.kernel.org/r/20200708163929.2783-2-james.morse@arm.com +[sudip: adjust context] +Signed-off-by: Sudip Mukherjee +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kernel/cpu/resctrl/internal.h | 2 -- + arch/x86/kernel/cpu/resctrl/monitor.c | 3 +-- + 2 files changed, 1 insertion(+), 4 deletions(-) + +--- a/arch/x86/kernel/cpu/resctrl/internal.h ++++ b/arch/x86/kernel/cpu/resctrl/internal.h +@@ -276,7 +276,6 @@ struct rftype { + * struct mbm_state - status for each MBM counter in each domain + * @chunks: Total data moved (multiply by rdt_group.mon_scale to get bytes) + * @prev_msr Value of IA32_QM_CTR for this RMID last time we read it +- * @chunks_bw Total local data moved. 
Used for bandwidth calculation + * @prev_bw_msr:Value of previous IA32_QM_CTR for bandwidth counting + * @prev_bw The most recent bandwidth in MBps + * @delta_bw Difference between the current and previous bandwidth +@@ -285,7 +284,6 @@ struct rftype { + struct mbm_state { + u64 chunks; + u64 prev_msr; +- u64 chunks_bw; + u64 prev_bw_msr; + u32 prev_bw; + u32 delta_bw; +--- a/arch/x86/kernel/cpu/resctrl/monitor.c ++++ b/arch/x86/kernel/cpu/resctrl/monitor.c +@@ -280,8 +280,7 @@ static void mbm_bw_count(u32 rmid, struc + return; + + chunks = mbm_overflow_count(m->prev_bw_msr, tval); +- m->chunks_bw += chunks; +- m->chunks = m->chunks_bw; ++ m->chunks += chunks; + cur_bw = (chunks * r->mon_scale) >> 20; + + if (m->delta_comp)