--- /dev/null
+From 96cfe2c0fd23ea7c2368d14f769d287e7ae1082e Mon Sep 17 00:00:00 2001
+From: Suren Baghdasaryan <surenb@google.com>
+Date: Fri, 12 Mar 2021 21:08:06 -0800
+Subject: mm/madvise: replace ptrace attach requirement for process_madvise
+
+From: Suren Baghdasaryan <surenb@google.com>
+
+commit 96cfe2c0fd23ea7c2368d14f769d287e7ae1082e upstream.
+
+process_madvise currently requires ptrace attach capability.
+PTRACE_MODE_ATTACH gives one process complete control over another
+process. It effectively removes the security boundary between the two
+processes (in one direction). Granting ptrace attach capability even to a
+system process is considered dangerous since it creates an attack surface.
+This severely limits the usage of this API.
+
+The operations process_madvise can perform do not affect the correctness
+of the operation of the target process; they only affect where the data is
+physically located (and therefore, how fast it can be accessed). What we
+want is the ability for one process to influence another process in order
+to optimize performance across the entire system while leaving the
+security boundary intact.
+
+Replace PTRACE_MODE_ATTACH with a combination of PTRACE_MODE_READ and
+CAP_SYS_NICE: PTRACE_MODE_READ prevents leaking ASLR metadata, and
+CAP_SYS_NICE gates influencing process performance.
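+
+As an illustration (not part of this patch): with CAP_SYS_NICE and
+PTRACE_MODE_READ access to the target, a userspace caller could issue
+hints roughly like the sketch below. It assumes headers that define
+__NR_process_madvise and MADV_COLD, and a pidfd for the target
+(e.g. from pidfd_open(2)); hint_cold is a made-up helper name.
+
+  #define _GNU_SOURCE
+  #include <sys/mman.h>
+  #include <sys/syscall.h>
+  #include <sys/uio.h>
+  #include <unistd.h>
+
+  /* Hint that a range of the target's memory is cold. */
+  static long hint_cold(int pidfd, void *addr, size_t len)
+  {
+          struct iovec vec = { .iov_base = addr, .iov_len = len };
+
+          /* flags must be 0; only non-destructive hints are accepted */
+          return syscall(__NR_process_madvise, pidfd, &vec, 1,
+                         MADV_COLD, 0);
+  }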
+
+Link: https://lkml.kernel.org/r/20210303185807.2160264-1-surenb@google.com
+Signed-off-by: Suren Baghdasaryan <surenb@google.com>
+Reviewed-by: Kees Cook <keescook@chromium.org>
+Acked-by: Minchan Kim <minchan@kernel.org>
+Acked-by: David Rientjes <rientjes@google.com>
+Cc: Jann Horn <jannh@google.com>
+Cc: Jeff Vander Stoep <jeffv@google.com>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: Shakeel Butt <shakeelb@google.com>
+Cc: Tim Murray <timmurray@google.com>
+Cc: Florian Weimer <fweimer@redhat.com>
+Cc: Oleg Nesterov <oleg@redhat.com>
+Cc: James Morris <jmorris@namei.org>
+Cc: <stable@vger.kernel.org> [5.10+]
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/madvise.c | 13 ++++++++++++-
+ 1 file changed, 12 insertions(+), 1 deletion(-)
+
+--- a/mm/madvise.c
++++ b/mm/madvise.c
+@@ -1202,12 +1202,22 @@ SYSCALL_DEFINE5(process_madvise, int, pi
+ goto release_task;
+ }
+
+- mm = mm_access(task, PTRACE_MODE_ATTACH_FSCREDS);
++ /* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */
++ mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
+ if (IS_ERR_OR_NULL(mm)) {
+ ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
+ goto release_task;
+ }
+
++ /*
++ * Require CAP_SYS_NICE for influencing process performance. Note that
++ * only non-destructive hints are currently supported.
++ */
++ if (!capable(CAP_SYS_NICE)) {
++ ret = -EPERM;
++ goto release_mm;
++ }
++
+ total_len = iov_iter_count(&iter);
+
+ while (iov_iter_count(&iter)) {
+@@ -1222,6 +1232,7 @@ SYSCALL_DEFINE5(process_madvise, int, pi
+ if (ret == 0)
+ ret = total_len - iov_iter_count(&iter);
+
++release_mm:
+ mmput(mm);
+ release_task:
+ put_task_struct(task);
+++ /dev/null
-From e1baddf8475b06cc56f4bafecf9a32a124343d9f Mon Sep 17 00:00:00 2001
-From: Zhou Guanghui <zhouguanghui1@huawei.com>
-Date: Fri, 12 Mar 2021 21:08:33 -0800
-Subject: mm/memcg: set memcg when splitting page
-
-From: Zhou Guanghui <zhouguanghui1@huawei.com>
-
-commit e1baddf8475b06cc56f4bafecf9a32a124343d9f upstream.
-
-As described in the split_page() comment, for the non-compound high order
-page, the sub-pages must be freed individually. If the memcg of the first
-page is valid, the tail pages cannot be uncharged when they are freed.
-
-For example, when alloc_pages_exact is used to allocate 1MB of contiguous
-physical memory, 2MB is charged (kmemcg is enabled and __GFP_ACCOUNT is
-set). When make_alloc_exact frees the unused 1MB and free_pages_exact
-frees the requested 1MB, only 4KB (one page) is actually uncharged.
-
-Therefore, the memcg of the tail page needs to be set when splitting a
-page.
-
-Michel:
-
-There are at least two explicit users of __GFP_ACCOUNT with
-alloc_pages_exact added recently. See 7efe8ef274024 ("KVM: arm64:
-Allocate stage-2 pgd pages with GFP_KERNEL_ACCOUNT") and c419621873713
-("KVM: s390: Add memcg accounting to KVM allocations"), so this is not
-just a theoretical issue.
-
-Link: https://lkml.kernel.org/r/20210304074053.65527-3-zhouguanghui1@huawei.com
-Signed-off-by: Zhou Guanghui <zhouguanghui1@huawei.com>
-Acked-by: Johannes Weiner <hannes@cmpxchg.org>
-Reviewed-by: Zi Yan <ziy@nvidia.com>
-Reviewed-by: Shakeel Butt <shakeelb@google.com>
-Acked-by: Michal Hocko <mhocko@suse.com>
-Cc: Hanjun Guo <guohanjun@huawei.com>
-Cc: Hugh Dickins <hughd@google.com>
-Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
-Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
-Cc: Nicholas Piggin <npiggin@gmail.com>
-Cc: Rui Xiang <rui.xiang@huawei.com>
-Cc: Tianhong Ding <dingtianhong@huawei.com>
-Cc: Weilong Chen <chenweilong@huawei.com>
-Cc: <stable@vger.kernel.org>
-Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
-Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
----
- mm/page_alloc.c | 1 +
- 1 file changed, 1 insertion(+)
-
---- a/mm/page_alloc.c
-+++ b/mm/page_alloc.c
-@@ -3272,6 +3272,7 @@ void split_page(struct page *page, unsig
- for (i = 1; i < (1 << order); i++)
- set_page_refcounted(page + i);
- split_page_owner(page, 1 << order);
-+ split_page_memcg(page, 1 << order);
- }
- EXPORT_SYMBOL_GPL(split_page);
-
--- /dev/null
+From b6be002bcd1dd1dedb926abf3c90c794eacb77dc Mon Sep 17 00:00:00 2001
+From: Thomas Gleixner <tglx@linutronix.de>
+Date: Mon, 2 Nov 2020 12:53:16 -0800
+Subject: x86/entry: Move nmi entry/exit into common code
+
+From: Thomas Gleixner <tglx@linutronix.de>
+
+commit b6be002bcd1dd1dedb926abf3c90c794eacb77dc upstream.
+
+Lockdep state handling on NMI enter and exit is nothing specific to x86; it
+is no different on other architectures. The extra state type is also
+unnecessary: irqentry_state_t can carry the required information as well.
+
+Move it to common code and extend irqentry_state_t to carry lockdep state.
+
+[ Ira: Make exit_rcu and lockdep a union as they are mutually exclusive
+ between the IRQ and NMI exceptions, and add kernel documentation for
+ struct irqentry_state_t ]
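+
+The resulting usage pattern for an NMI-type entry point is sketched below
+(illustrative only: exc_foo and handle_foo are made-up names; the real
+conversions are in the diff that follows):
+
+  DEFINE_IDTENTRY_RAW(exc_foo)
+  {
+          irqentry_state_t irq_state = irqentry_nmi_enter(regs);
+
+          instrumentation_begin();
+          handle_foo(regs);       /* instrumentable handler body */
+          instrumentation_end();
+
+          irqentry_nmi_exit(regs, irq_state);
+  }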
+
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Ira Weiny <ira.weiny@intel.com>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Link: https://lore.kernel.org/r/20201102205320.1458656-7-ira.weiny@intel.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/entry/common.c | 34 ----------------------------------
+ arch/x86/include/asm/idtentry.h | 3 ---
+ arch/x86/kernel/cpu/mce/core.c | 6 +++---
+ arch/x86/kernel/nmi.c | 6 +++---
+ arch/x86/kernel/traps.c | 13 +++++++------
+ include/linux/entry-common.h | 39 ++++++++++++++++++++++++++++++++++++++-
+ kernel/entry/common.c | 36 ++++++++++++++++++++++++++++++++++++
+ 7 files changed, 87 insertions(+), 50 deletions(-)
+
+--- a/arch/x86/entry/common.c
++++ b/arch/x86/entry/common.c
+@@ -213,40 +213,6 @@ SYSCALL_DEFINE0(ni_syscall)
+ return -ENOSYS;
+ }
+
+-noinstr bool idtentry_enter_nmi(struct pt_regs *regs)
+-{
+- bool irq_state = lockdep_hardirqs_enabled();
+-
+- __nmi_enter();
+- lockdep_hardirqs_off(CALLER_ADDR0);
+- lockdep_hardirq_enter();
+- rcu_nmi_enter();
+-
+- instrumentation_begin();
+- trace_hardirqs_off_finish();
+- ftrace_nmi_enter();
+- instrumentation_end();
+-
+- return irq_state;
+-}
+-
+-noinstr void idtentry_exit_nmi(struct pt_regs *regs, bool restore)
+-{
+- instrumentation_begin();
+- ftrace_nmi_exit();
+- if (restore) {
+- trace_hardirqs_on_prepare();
+- lockdep_hardirqs_on_prepare(CALLER_ADDR0);
+- }
+- instrumentation_end();
+-
+- rcu_nmi_exit();
+- lockdep_hardirq_exit();
+- if (restore)
+- lockdep_hardirqs_on(CALLER_ADDR0);
+- __nmi_exit();
+-}
+-
+ #ifdef CONFIG_XEN_PV
+ #ifndef CONFIG_PREEMPTION
+ /*
+--- a/arch/x86/include/asm/idtentry.h
++++ b/arch/x86/include/asm/idtentry.h
+@@ -11,9 +11,6 @@
+
+ #include <asm/irq_stack.h>
+
+-bool idtentry_enter_nmi(struct pt_regs *regs);
+-void idtentry_exit_nmi(struct pt_regs *regs, bool irq_state);
+-
+ /**
+ * DECLARE_IDTENTRY - Declare functions for simple IDT entry points
+ * No error code pushed by hardware
+--- a/arch/x86/kernel/cpu/mce/core.c
++++ b/arch/x86/kernel/cpu/mce/core.c
+@@ -1986,7 +1986,7 @@ void (*machine_check_vector)(struct pt_r
+
+ static __always_inline void exc_machine_check_kernel(struct pt_regs *regs)
+ {
+- bool irq_state;
++ irqentry_state_t irq_state;
+
+ WARN_ON_ONCE(user_mode(regs));
+
+@@ -1998,7 +1998,7 @@ static __always_inline void exc_machine_
+ mce_check_crashing_cpu())
+ return;
+
+- irq_state = idtentry_enter_nmi(regs);
++ irq_state = irqentry_nmi_enter(regs);
+ /*
+ * The call targets are marked noinstr, but objtool can't figure
+ * that out because it's an indirect call. Annotate it.
+@@ -2009,7 +2009,7 @@ static __always_inline void exc_machine_
+ if (regs->flags & X86_EFLAGS_IF)
+ trace_hardirqs_on_prepare();
+ instrumentation_end();
+- idtentry_exit_nmi(regs, irq_state);
++ irqentry_nmi_exit(regs, irq_state);
+ }
+
+ static __always_inline void exc_machine_check_user(struct pt_regs *regs)
+--- a/arch/x86/kernel/nmi.c
++++ b/arch/x86/kernel/nmi.c
+@@ -475,7 +475,7 @@ static DEFINE_PER_CPU(unsigned long, nmi
+
+ DEFINE_IDTENTRY_RAW(exc_nmi)
+ {
+- bool irq_state;
++ irqentry_state_t irq_state;
+
+ /*
+ * Re-enable NMIs right here when running as an SEV-ES guest. This might
+@@ -502,14 +502,14 @@ nmi_restart:
+
+ this_cpu_write(nmi_dr7, local_db_save());
+
+- irq_state = idtentry_enter_nmi(regs);
++ irq_state = irqentry_nmi_enter(regs);
+
+ inc_irq_stat(__nmi_count);
+
+ if (!ignore_nmis)
+ default_do_nmi(regs);
+
+- idtentry_exit_nmi(regs, irq_state);
++ irqentry_nmi_exit(regs, irq_state);
+
+ local_db_restore(this_cpu_read(nmi_dr7));
+
+--- a/arch/x86/kernel/traps.c
++++ b/arch/x86/kernel/traps.c
+@@ -406,7 +406,7 @@ DEFINE_IDTENTRY_DF(exc_double_fault)
+ }
+ #endif
+
+- idtentry_enter_nmi(regs);
++ irqentry_nmi_enter(regs);
+ instrumentation_begin();
+ notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV);
+
+@@ -652,12 +652,13 @@ DEFINE_IDTENTRY_RAW(exc_int3)
+ instrumentation_end();
+ irqentry_exit_to_user_mode(regs);
+ } else {
+- bool irq_state = idtentry_enter_nmi(regs);
++ irqentry_state_t irq_state = irqentry_nmi_enter(regs);
++
+ instrumentation_begin();
+ if (!do_int3(regs))
+ die("int3", regs, 0);
+ instrumentation_end();
+- idtentry_exit_nmi(regs, irq_state);
++ irqentry_nmi_exit(regs, irq_state);
+ }
+ }
+
+@@ -851,7 +852,7 @@ static __always_inline void exc_debug_ke
+ * includes the entry stack is excluded for everything.
+ */
+ unsigned long dr7 = local_db_save();
+- bool irq_state = idtentry_enter_nmi(regs);
++ irqentry_state_t irq_state = irqentry_nmi_enter(regs);
+ instrumentation_begin();
+
+ /*
+@@ -908,7 +909,7 @@ static __always_inline void exc_debug_ke
+ regs->flags &= ~X86_EFLAGS_TF;
+ out:
+ instrumentation_end();
+- idtentry_exit_nmi(regs, irq_state);
++ irqentry_nmi_exit(regs, irq_state);
+
+ local_db_restore(dr7);
+ }
+@@ -926,7 +927,7 @@ static __always_inline void exc_debug_us
+
+ /*
+ * NB: We can't easily clear DR7 here because
+- * idtentry_exit_to_usermode() can invoke ptrace, schedule, access
++ * irqentry_exit_to_user_mode() can invoke ptrace, schedule, access
+ * user memory, etc. This means that a recursive #DB is possible. If
+ * this happens, that #DB will hit exc_debug_kernel() and clear DR7.
+ * Since we're not on the IST stack right now, everything will be
+--- a/include/linux/entry-common.h
++++ b/include/linux/entry-common.h
+@@ -341,8 +341,26 @@ void irqentry_enter_from_user_mode(struc
+ void irqentry_exit_to_user_mode(struct pt_regs *regs);
+
+ #ifndef irqentry_state
++/**
++ * struct irqentry_state - Opaque object for exception state storage
++ * @exit_rcu: Used exclusively in the irqentry_*() calls; signals whether the
++ * exit path has to invoke rcu_irq_exit().
++ * @lockdep: Used exclusively in the irqentry_nmi_*() calls; ensures that
++ * lockdep state is restored correctly on exit from NMI.
++ *
++ * This opaque object is filled in by the irqentry_*_enter() functions and
++ * must be passed back into the corresponding irqentry_*_exit() functions
++ * when the exception is complete.
++ *
++ * Callers of irqentry_*_[enter|exit]() must consider this structure opaque
++ * and all members private. Descriptions of the members are provided to aid in
++ * the maintenance of the irqentry_*() functions.
++ */
+ typedef struct irqentry_state {
+- bool exit_rcu;
++ union {
++ bool exit_rcu;
++ bool lockdep;
++ };
+ } irqentry_state_t;
+ #endif
+
+@@ -402,4 +420,23 @@ void irqentry_exit_cond_resched(void);
+ */
+ void noinstr irqentry_exit(struct pt_regs *regs, irqentry_state_t state);
+
++/**
++ * irqentry_nmi_enter - Handle NMI entry
++ * @regs: Pointer to current's pt_regs
++ *
++ * Similar to irqentry_enter() but taking care of the NMI constraints.
++ */
++irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs);
++
++/**
++ * irqentry_nmi_exit - Handle return from NMI handling
++ * @regs: Pointer to pt_regs (NMI entry regs)
++ * @irq_state: Return value from matching call to irqentry_nmi_enter()
++ *
++ * Last action before returning to the low level assembly code.
++ *
++ * Counterpart to irqentry_nmi_enter().
++ */
++void noinstr irqentry_nmi_exit(struct pt_regs *regs, irqentry_state_t irq_state);
++
+ #endif
+--- a/kernel/entry/common.c
++++ b/kernel/entry/common.c
+@@ -397,3 +397,39 @@ noinstr void irqentry_exit(struct pt_reg
+ rcu_irq_exit();
+ }
+ }
++
++irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs)
++{
++ irqentry_state_t irq_state;
++
++ irq_state.lockdep = lockdep_hardirqs_enabled();
++
++ __nmi_enter();
++ lockdep_hardirqs_off(CALLER_ADDR0);
++ lockdep_hardirq_enter();
++ rcu_nmi_enter();
++
++ instrumentation_begin();
++ trace_hardirqs_off_finish();
++ ftrace_nmi_enter();
++ instrumentation_end();
++
++ return irq_state;
++}
++
++void noinstr irqentry_nmi_exit(struct pt_regs *regs, irqentry_state_t irq_state)
++{
++ instrumentation_begin();
++ ftrace_nmi_exit();
++ if (irq_state.lockdep) {
++ trace_hardirqs_on_prepare();
++ lockdep_hardirqs_on_prepare(CALLER_ADDR0);
++ }
++ instrumentation_end();
++
++ rcu_nmi_exit();
++ lockdep_hardirq_exit();
++ if (irq_state.lockdep)
++ lockdep_hardirqs_on(CALLER_ADDR0);
++ __nmi_exit();
++}