git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
5.2-stable patches
author     Greg Kroah-Hartman <gregkh@linuxfoundation.org>
           Fri, 2 Aug 2019 08:49:30 +0000 (10:49 +0200)
committer  Greg Kroah-Hartman <gregkh@linuxfoundation.org>
           Fri, 2 Aug 2019 08:49:30 +0000 (10:49 +0200)
added patches:
ceph-hold-i_ceph_lock-when-removing-caps-for-freeing-inode.patch
drivers-pps-pps.c-clear-offset-flags-in-pps_setparams-ioctl.patch
fix-allyesconfig-output.patch
proc-pid-cmdline-add-back-the-setproctitle-special-case.patch
proc-pid-cmdline-remove-all-the-special-cases.patch
sched-fair-don-t-free-p-numa_faults-with-concurrent-readers.patch
sched-fair-use-rcu-accessors-consistently-for-numa_group.patch

queue-5.2/ceph-hold-i_ceph_lock-when-removing-caps-for-freeing-inode.patch [new file with mode: 0644]
queue-5.2/drivers-pps-pps.c-clear-offset-flags-in-pps_setparams-ioctl.patch [new file with mode: 0644]
queue-5.2/fix-allyesconfig-output.patch [new file with mode: 0644]
queue-5.2/proc-pid-cmdline-add-back-the-setproctitle-special-case.patch [new file with mode: 0644]
queue-5.2/proc-pid-cmdline-remove-all-the-special-cases.patch [new file with mode: 0644]
queue-5.2/sched-fair-don-t-free-p-numa_faults-with-concurrent-readers.patch [new file with mode: 0644]
queue-5.2/sched-fair-use-rcu-accessors-consistently-for-numa_group.patch [new file with mode: 0644]
queue-5.2/series

diff --git a/queue-5.2/ceph-hold-i_ceph_lock-when-removing-caps-for-freeing-inode.patch b/queue-5.2/ceph-hold-i_ceph_lock-when-removing-caps-for-freeing-inode.patch
new file mode 100644 (file)
index 0000000..b79778a
--- /dev/null
@@ -0,0 +1,74 @@
+From d6e47819721ae2d9d090058ad5570a66f3c42e39 Mon Sep 17 00:00:00 2001
+From: "Yan, Zheng" <zyan@redhat.com>
+Date: Thu, 23 May 2019 11:01:37 +0800
+Subject: ceph: hold i_ceph_lock when removing caps for freeing inode
+
+From: Yan, Zheng <zyan@redhat.com>
+
+commit d6e47819721ae2d9d090058ad5570a66f3c42e39 upstream.
+
+ceph_d_revalidate(..., LOOKUP_RCU) may call __ceph_caps_issued_mask()
+on a freeing inode.
+
+Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
+Reviewed-by: Jeff Layton <jlayton@redhat.com>
+Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/ceph/caps.c  |   10 ++++++----
+ fs/ceph/inode.c |    2 +-
+ fs/ceph/super.h |    2 +-
+ 3 files changed, 8 insertions(+), 6 deletions(-)
+
+--- a/fs/ceph/caps.c
++++ b/fs/ceph/caps.c
+@@ -1263,20 +1263,22 @@ static int send_cap_msg(struct cap_msg_a
+ }
+ /*
+- * Queue cap releases when an inode is dropped from our cache.  Since
+- * inode is about to be destroyed, there is no need for i_ceph_lock.
++ * Queue cap releases when an inode is dropped from our cache.
+  */
+-void __ceph_remove_caps(struct inode *inode)
++void __ceph_remove_caps(struct ceph_inode_info *ci)
+ {
+-      struct ceph_inode_info *ci = ceph_inode(inode);
+       struct rb_node *p;
++      /* lock i_ceph_lock, because ceph_d_revalidate(..., LOOKUP_RCU)
++       * may call __ceph_caps_issued_mask() on a freeing inode. */
++      spin_lock(&ci->i_ceph_lock);
+       p = rb_first(&ci->i_caps);
+       while (p) {
+               struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node);
+               p = rb_next(p);
+               __ceph_remove_cap(cap, true);
+       }
++      spin_unlock(&ci->i_ceph_lock);
+ }
+ /*
+--- a/fs/ceph/inode.c
++++ b/fs/ceph/inode.c
+@@ -536,7 +536,7 @@ void ceph_evict_inode(struct inode *inod
+       ceph_fscache_unregister_inode_cookie(ci);
+-      __ceph_remove_caps(inode);
++      __ceph_remove_caps(ci);
+       if (__ceph_has_any_quota(ci))
+               ceph_adjust_quota_realms_count(inode, false);
+--- a/fs/ceph/super.h
++++ b/fs/ceph/super.h
+@@ -1000,7 +1000,7 @@ extern void ceph_add_cap(struct inode *i
+                        unsigned cap, unsigned seq, u64 realmino, int flags,
+                        struct ceph_cap **new_cap);
+ extern void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release);
+-extern void __ceph_remove_caps(struct inode* inode);
++extern void __ceph_remove_caps(struct ceph_inode_info *ci);
+ extern void ceph_put_cap(struct ceph_mds_client *mdsc,
+                        struct ceph_cap *cap);
+ extern int ceph_is_any_caps(struct inode *inode);
diff --git a/queue-5.2/drivers-pps-pps.c-clear-offset-flags-in-pps_setparams-ioctl.patch b/queue-5.2/drivers-pps-pps.c-clear-offset-flags-in-pps_setparams-ioctl.patch
new file mode 100644 (file)
index 0000000..45de119
--- /dev/null
@@ -0,0 +1,50 @@
+From 5515e9a6273b8c02034466bcbd717ac9f53dab99 Mon Sep 17 00:00:00 2001
+From: Miroslav Lichvar <mlichvar@redhat.com>
+Date: Tue, 16 Jul 2019 16:30:09 -0700
+Subject: drivers/pps/pps.c: clear offset flags in PPS_SETPARAMS ioctl
+
+From: Miroslav Lichvar <mlichvar@redhat.com>
+
+commit 5515e9a6273b8c02034466bcbd717ac9f53dab99 upstream.
+
+The PPS assert/clear offset corrections are set by the PPS_SETPARAMS
+ioctl in the pps_ktime structs, which also contain flags.  The flags are
+not initialized by applications (using the timepps.h header) and they
+are not used by the kernel for anything except returning them back in
+the PPS_GETPARAMS ioctl.
+
+Set the flags to zero to make it clear they are unused and avoid leaking
+uninitialized data of the PPS_SETPARAMS caller to other applications
+that have a read access to the PPS device.
+
+Link: http://lkml.kernel.org/r/20190702092251.24303-1-mlichvar@redhat.com
+Signed-off-by: Miroslav Lichvar <mlichvar@redhat.com>
+Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
+Acked-by: Rodolfo Giometti <giometti@enneenne.com>
+Cc: Greg KH <greg@kroah.com>
+Cc: Dan Carpenter <dan.carpenter@oracle.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/pps/pps.c |    8 ++++++++
+ 1 file changed, 8 insertions(+)
+
+--- a/drivers/pps/pps.c
++++ b/drivers/pps/pps.c
+@@ -152,6 +152,14 @@ static long pps_cdev_ioctl(struct file *
+                       pps->params.mode |= PPS_CANWAIT;
+               pps->params.api_version = PPS_API_VERS;
++              /*
++               * Clear unused fields of pps_kparams to avoid leaking
++               * uninitialized data of the PPS_SETPARAMS caller via
++               * PPS_GETPARAMS
++               */
++              pps->params.assert_off_tu.flags = 0;
++              pps->params.clear_off_tu.flags = 0;
++
+               spin_unlock_irq(&pps->lock);
+               break;
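
To illustrate the leak from the caller's side: applications built against timepps.h fill a struct pps_kparams on the stack and never touch the pps_ktime flags fields, so whatever garbage is there is handed to PPS_SETPARAMS and, before this fix, echoed back to any reader via PPS_GETPARAMS. A minimal hedged userspace sketch follows; the /dev/pps0 path and the CAPTUREASSERT/OFFSETASSERT mode bits are assumptions about the device, not part of the patch.

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/pps.h>

int main(void)
{
	struct pps_kparams params;		/* deliberately not zeroed */
	int fd = open("/dev/pps0", O_RDWR);	/* assumed device path */

	if (fd < 0) {
		perror("open");
		return 1;
	}

	params.api_version = PPS_API_VERS;
	params.mode = PPS_CAPTUREASSERT | PPS_OFFSETASSERT;
	params.assert_off_tu.sec = 0;
	params.assert_off_tu.nsec = 500000;	/* 0.5 ms assert offset */
	/* params.assert_off_tu.flags and clear_off_tu left uninitialized,
	 * exactly the pattern the kernel now compensates for */

	if (ioctl(fd, PPS_SETPARAMS, &params) < 0)
		perror("PPS_SETPARAMS");
	return 0;
}
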
diff --git a/queue-5.2/fix-allyesconfig-output.patch b/queue-5.2/fix-allyesconfig-output.patch
new file mode 100644 (file)
index 0000000..d896011
--- /dev/null
@@ -0,0 +1,51 @@
+From 1b496469d0c020e09124e03e66a81421c21272a7 Mon Sep 17 00:00:00 2001
+From: Yoshinori Sato <ysato@users.sourceforge.jp>
+Date: Sun, 21 Apr 2019 22:53:58 +0900
+Subject: Fix allyesconfig output.
+
+From: Yoshinori Sato <ysato@users.sourceforge.jp>
+
+commit 1b496469d0c020e09124e03e66a81421c21272a7 upstream.
+
+Conflict JCore-SoC and SolutionEngine 7619.
+
+Signed-off-by: Yoshinori Sato <ysato@users.sourceforge.jp>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/sh/boards/Kconfig |   14 +++-----------
+ 1 file changed, 3 insertions(+), 11 deletions(-)
+
+--- a/arch/sh/boards/Kconfig
++++ b/arch/sh/boards/Kconfig
+@@ -8,27 +8,19 @@ config SH_ALPHA_BOARD
+       bool
+ config SH_DEVICE_TREE
+-      bool "Board Described by Device Tree"
++      bool
+       select OF
+       select OF_EARLY_FLATTREE
+       select TIMER_OF
+       select COMMON_CLK
+       select GENERIC_CALIBRATE_DELAY
+-      help
+-        Select Board Described by Device Tree to build a kernel that
+-        does not hard-code any board-specific knowledge but instead uses
+-        a device tree blob provided by the boot-loader. You must enable
+-        drivers for any hardware you want to use separately. At this
+-        time, only boards based on the open-hardware J-Core processors
+-        have sufficient driver coverage to use this option; do not
+-        select it if you are using original SuperH hardware.
+ config SH_JCORE_SOC
+       bool "J-Core SoC"
+-      depends on SH_DEVICE_TREE && (CPU_SH2 || CPU_J2)
++      select SH_DEVICE_TREE
+       select CLKSRC_JCORE_PIT
+       select JCORE_AIC
+-      default y if CPU_J2
++      depends on CPU_J2
+       help
+         Select this option to include drivers core components of the
+         J-Core SoC, including interrupt controllers and timers.
diff --git a/queue-5.2/proc-pid-cmdline-add-back-the-setproctitle-special-case.patch b/queue-5.2/proc-pid-cmdline-add-back-the-setproctitle-special-case.patch
new file mode 100644 (file)
index 0000000..73a67d1
--- /dev/null
@@ -0,0 +1,147 @@
+From d26d0cd97c88eb1a5704b42e41ab443406807810 Mon Sep 17 00:00:00 2001
+From: Linus Torvalds <torvalds@linux-foundation.org>
+Date: Sat, 13 Jul 2019 14:27:14 -0700
+Subject: /proc/<pid>/cmdline: add back the setproctitle() special case
+
+From: Linus Torvalds <torvalds@linux-foundation.org>
+
+commit d26d0cd97c88eb1a5704b42e41ab443406807810 upstream.
+
+This makes the setproctitle() special case very explicit indeed, and
+handles it with a separate helper function entirely.  In the process, it
+re-instates the original semantics of simply stopping at the first NUL
+character when the original last NUL character is no longer there.
+
+[ The original semantics can still be seen in mm/util.c: get_cmdline()
+  that is limited to a fixed-size buffer ]
+
+This makes the logic about when we use the string lengths etc much more
+obvious, and makes it easier to see what we do and what the two very
+different cases are.
+
+Note that even when we allow walking past the end of the argument array
+(because the setproctitle() might have overwritten and overflowed the
+original argv[] strings), we only allow it when it overflows into the
+environment region if it is immediately adjacent.
+
+[ Fixed for missing 'count' checks noted by Alexey Izbyshev ]
+
+Link: https://lore.kernel.org/lkml/alpine.LNX.2.21.1904052326230.3249@kich.toxcorp.com/
+Fixes: 5ab827189965 ("fs/proc: simplify and clarify get_mm_cmdline() function")
+Cc: Jakub Jankowski <shasta@toxcorp.com>
+Cc: Alexey Dobriyan <adobriyan@gmail.com>
+Cc: Alexey Izbyshev <izbyshev@ispras.ru>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/proc/base.c |   81 ++++++++++++++++++++++++++++++++++++++++++++++++++++++---
+ 1 file changed, 77 insertions(+), 4 deletions(-)
+
+--- a/fs/proc/base.c
++++ b/fs/proc/base.c
+@@ -209,12 +209,53 @@ static int proc_root_link(struct dentry
+       return result;
+ }
++/*
++ * If the user used setproctitle(), we just get the string from
++ * user space at arg_start, and limit it to a maximum of one page.
++ */
++static ssize_t get_mm_proctitle(struct mm_struct *mm, char __user *buf,
++                              size_t count, unsigned long pos,
++                              unsigned long arg_start)
++{
++      char *page;
++      int ret, got;
++
++      if (pos >= PAGE_SIZE)
++              return 0;
++
++      page = (char *)__get_free_page(GFP_KERNEL);
++      if (!page)
++              return -ENOMEM;
++
++      ret = 0;
++      got = access_remote_vm(mm, arg_start, page, PAGE_SIZE, FOLL_ANON);
++      if (got > 0) {
++              int len = strnlen(page, got);
++
++              /* Include the NUL character if it was found */
++              if (len < got)
++                      len++;
++
++              if (len > pos) {
++                      len -= pos;
++                      if (len > count)
++                              len = count;
++                      len -= copy_to_user(buf, page+pos, len);
++                      if (!len)
++                              len = -EFAULT;
++                      ret = len;
++              }
++      }
++      free_page((unsigned long)page);
++      return ret;
++}
++
+ static ssize_t get_mm_cmdline(struct mm_struct *mm, char __user *buf,
+                             size_t count, loff_t *ppos)
+ {
+-      unsigned long arg_start, arg_end;
++      unsigned long arg_start, arg_end, env_start, env_end;
+       unsigned long pos, len;
+-      char *page;
++      char *page, c;
+       /* Check if process spawned far enough to have cmdline. */
+       if (!mm->env_end)
+@@ -223,14 +264,46 @@ static ssize_t get_mm_cmdline(struct mm_
+       spin_lock(&mm->arg_lock);
+       arg_start = mm->arg_start;
+       arg_end = mm->arg_end;
++      env_start = mm->env_start;
++      env_end = mm->env_end;
+       spin_unlock(&mm->arg_lock);
+       if (arg_start >= arg_end)
+               return 0;
++      /*
++       * We allow setproctitle() to overwrite the argument
++       * strings, and overflow past the original end. But
++       * only when it overflows into the environment area.
++       */
++      if (env_start != arg_end || env_end < env_start)
++              env_start = env_end = arg_end;
++      len = env_end - arg_start;
++
+       /* We're not going to care if "*ppos" has high bits set */
+-      /* .. but we do check the result is in the proper range */
+-      pos = arg_start + *ppos;
++      pos = *ppos;
++      if (pos >= len)
++              return 0;
++      if (count > len - pos)
++              count = len - pos;
++      if (!count)
++              return 0;
++
++      /*
++       * Magical special case: if the argv[] end byte is not
++       * zero, the user has overwritten it with setproctitle(3).
++       *
++       * Possible future enhancement: do this only once when
++       * pos is 0, and set a flag in the 'struct file'.
++       */
++      if (access_remote_vm(mm, arg_end-1, &c, 1, FOLL_ANON) == 1 && c)
++              return get_mm_proctitle(mm, buf, count, pos, arg_start);
++
++      /*
++       * For the non-setproctitle() case we limit things strictly
++       * to the [arg_start, arg_end[ range.
++       */
++      pos += arg_start;
+       if (pos < arg_start || pos >= arg_end)
+               return 0;
+       if (count > arg_end - pos)
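
For context, the special case restored above exists because userspace "setproctitle()" implementations rewrite the argv strings of the running process in place. Below is a minimal hedged sketch of that behaviour; the title string and the pause() call are illustrative assumptions, not part of the patch. When such a rewrite also consumes the original terminating NUL at arg_end-1, the get_mm_proctitle() path above takes over and /proc/<pid>/cmdline stops at the first NUL of the new title.

#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(int argc, char *argv[])
{
	/* The argv[] strings sit contiguously in the process image, so
	 * setproctitle()-style code treats the whole region as one buffer. */
	char *start = argv[0];
	char *end = argv[argc - 1] + strlen(argv[argc - 1]) + 1;
	size_t avail = (size_t)(end - start);
	const char *title = "my-new-title";

	memset(start, 0, avail);
	strncpy(start, title, avail - 1);

	printf("rewrote argv area of pid %d; inspect /proc/%d/cmdline\n",
	       getpid(), getpid());
	pause();	/* keep the process alive so the title can be observed */
	return 0;
}
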
diff --git a/queue-5.2/proc-pid-cmdline-remove-all-the-special-cases.patch b/queue-5.2/proc-pid-cmdline-remove-all-the-special-cases.patch
new file mode 100644 (file)
index 0000000..129fb6b
--- /dev/null
@@ -0,0 +1,135 @@
+From 3d712546d8ba9f25cdf080d79f90482aa4231ed4 Mon Sep 17 00:00:00 2001
+From: Linus Torvalds <torvalds@linux-foundation.org>
+Date: Sat, 13 Jul 2019 13:40:13 -0700
+Subject: /proc/<pid>/cmdline: remove all the special cases
+
+From: Linus Torvalds <torvalds@linux-foundation.org>
+
+commit 3d712546d8ba9f25cdf080d79f90482aa4231ed4 upstream.
+
+Start off with a clean slate that only reads exactly from arg_start to
+arg_end, without any oddities.  This simplifies the code and in the
+process removes the case that caused us to potentially leak an
+uninitialized byte from the temporary kernel buffer.
+
+Note that in order to start from scratch with an understandable base,
+this simplifies things _too_ much, and removes all the legacy logic to
+handle setproctitle() having changed the argument strings.
+
+We'll add back those special cases very differently in the next commit.
+
+Link: https://lore.kernel.org/lkml/20190712160913.17727-1-izbyshev@ispras.ru/
+Fixes: f5b65348fd77 ("proc: fix missing final NUL in get_mm_cmdline() rewrite")
+Cc: Alexey Izbyshev <izbyshev@ispras.ru>
+Cc: Alexey Dobriyan <adobriyan@gmail.com>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/proc/base.c |   71 ++++++---------------------------------------------------
+ 1 file changed, 8 insertions(+), 63 deletions(-)
+
+--- a/fs/proc/base.c
++++ b/fs/proc/base.c
+@@ -212,7 +212,7 @@ static int proc_root_link(struct dentry
+ static ssize_t get_mm_cmdline(struct mm_struct *mm, char __user *buf,
+                             size_t count, loff_t *ppos)
+ {
+-      unsigned long arg_start, arg_end, env_start, env_end;
++      unsigned long arg_start, arg_end;
+       unsigned long pos, len;
+       char *page;
+@@ -223,36 +223,18 @@ static ssize_t get_mm_cmdline(struct mm_
+       spin_lock(&mm->arg_lock);
+       arg_start = mm->arg_start;
+       arg_end = mm->arg_end;
+-      env_start = mm->env_start;
+-      env_end = mm->env_end;
+       spin_unlock(&mm->arg_lock);
+       if (arg_start >= arg_end)
+               return 0;
+-      /*
+-       * We have traditionally allowed the user to re-write
+-       * the argument strings and overflow the end result
+-       * into the environment section. But only do that if
+-       * the environment area is contiguous to the arguments.
+-       */
+-      if (env_start != arg_end || env_start >= env_end)
+-              env_start = env_end = arg_end;
+-
+-      /* .. and limit it to a maximum of one page of slop */
+-      if (env_end >= arg_end + PAGE_SIZE)
+-              env_end = arg_end + PAGE_SIZE - 1;
+-
+       /* We're not going to care if "*ppos" has high bits set */
+-      pos = arg_start + *ppos;
+-
+       /* .. but we do check the result is in the proper range */
+-      if (pos < arg_start || pos >= env_end)
++      pos = arg_start + *ppos;
++      if (pos < arg_start || pos >= arg_end)
+               return 0;
+-
+-      /* .. and we never go past env_end */
+-      if (env_end - pos < count)
+-              count = env_end - pos;
++      if (count > arg_end - pos)
++              count = arg_end - pos;
+       page = (char *)__get_free_page(GFP_KERNEL);
+       if (!page)
+@@ -262,48 +244,11 @@ static ssize_t get_mm_cmdline(struct mm_
+       while (count) {
+               int got;
+               size_t size = min_t(size_t, PAGE_SIZE, count);
+-              long offset;
+-
+-              /*
+-               * Are we already starting past the official end?
+-               * We always include the last byte that is *supposed*
+-               * to be NUL
+-               */
+-              offset = (pos >= arg_end) ? pos - arg_end + 1 : 0;
+-              got = access_remote_vm(mm, pos - offset, page, size + offset, FOLL_ANON);
+-              if (got <= offset)
++              got = access_remote_vm(mm, pos, page, size, FOLL_ANON);
++              if (got <= 0)
+                       break;
+-              got -= offset;
+-
+-              /* Don't walk past a NUL character once you hit arg_end */
+-              if (pos + got >= arg_end) {
+-                      int n = 0;
+-
+-                      /*
+-                       * If we started before 'arg_end' but ended up
+-                       * at or after it, we start the NUL character
+-                       * check at arg_end-1 (where we expect the normal
+-                       * EOF to be).
+-                       *
+-                       * NOTE! This is smaller than 'got', because
+-                       * pos + got >= arg_end
+-                       */
+-                      if (pos < arg_end)
+-                              n = arg_end - pos - 1;
+-
+-                      /* Cut off at first NUL after 'n' */
+-                      got = n + strnlen(page+n, offset+got-n);
+-                      if (got < offset)
+-                              break;
+-                      got -= offset;
+-
+-                      /* Include the NUL if it existed */
+-                      if (got < size)
+-                              got++;
+-              }
+-
+-              got -= copy_to_user(buf, page+offset, got);
++              got -= copy_to_user(buf, page, got);
+               if (unlikely(!got)) {
+                       if (!len)
+                               len = -EFAULT;
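
What this simplified read path hands back is just the NUL-separated argument strings in [arg_start, arg_end). A small hedged consumer sketch, reading /proc/self/cmdline and rendering each NUL as a newline, purely for illustration and not part of the patch:

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/self/cmdline", "rb");
	char buf[4096];
	size_t n, i;

	if (!f)
		return 1;
	n = fread(buf, 1, sizeof(buf), f);
	fclose(f);

	for (i = 0; i < n; i++)
		putchar(buf[i] ? buf[i] : '\n');	/* NUL separators -> newlines */
	return 0;
}
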
diff --git a/queue-5.2/sched-fair-don-t-free-p-numa_faults-with-concurrent-readers.patch b/queue-5.2/sched-fair-don-t-free-p-numa_faults-with-concurrent-readers.patch
new file mode 100644 (file)
index 0000000..41c9870
--- /dev/null
@@ -0,0 +1,131 @@
+From 16d51a590a8ce3befb1308e0e7ab77f3b661af33 Mon Sep 17 00:00:00 2001
+From: Jann Horn <jannh@google.com>
+Date: Tue, 16 Jul 2019 17:20:45 +0200
+Subject: sched/fair: Don't free p->numa_faults with concurrent readers
+
+From: Jann Horn <jannh@google.com>
+
+commit 16d51a590a8ce3befb1308e0e7ab77f3b661af33 upstream.
+
+When going through execve(), zero out the NUMA fault statistics instead of
+freeing them.
+
+During execve, the task is reachable through procfs and the scheduler. A
+concurrent /proc/*/sched reader can read data from a freed ->numa_faults
+allocation (confirmed by KASAN) and write it back to userspace.
+I believe that it would also be possible for a use-after-free read to occur
+through a race between a NUMA fault and execve(): task_numa_fault() can
+lead to task_numa_compare(), which invokes task_weight() on the currently
+running task of a different CPU.
+
+Another way to fix this would be to make ->numa_faults RCU-managed or add
+extra locking, but it seems easier to wipe the NUMA fault statistics on
+execve.
+
+Signed-off-by: Jann Horn <jannh@google.com>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Petr Mladek <pmladek@suse.com>
+Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: Will Deacon <will@kernel.org>
+Fixes: 82727018b0d3 ("sched/numa: Call task_numa_free() from do_execve()")
+Link: https://lkml.kernel.org/r/20190716152047.14424-1-jannh@google.com
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/exec.c                            |    2 +-
+ include/linux/sched/numa_balancing.h |    4 ++--
+ kernel/fork.c                        |    2 +-
+ kernel/sched/fair.c                  |   24 ++++++++++++++++++++----
+ 4 files changed, 24 insertions(+), 8 deletions(-)
+
+--- a/fs/exec.c
++++ b/fs/exec.c
+@@ -1828,7 +1828,7 @@ static int __do_execve_file(int fd, stru
+       membarrier_execve(current);
+       rseq_execve(current);
+       acct_update_integrals(current);
+-      task_numa_free(current);
++      task_numa_free(current, false);
+       free_bprm(bprm);
+       kfree(pathbuf);
+       if (filename)
+--- a/include/linux/sched/numa_balancing.h
++++ b/include/linux/sched/numa_balancing.h
+@@ -19,7 +19,7 @@
+ extern void task_numa_fault(int last_node, int node, int pages, int flags);
+ extern pid_t task_numa_group_id(struct task_struct *p);
+ extern void set_numabalancing_state(bool enabled);
+-extern void task_numa_free(struct task_struct *p);
++extern void task_numa_free(struct task_struct *p, bool final);
+ extern bool should_numa_migrate_memory(struct task_struct *p, struct page *page,
+                                       int src_nid, int dst_cpu);
+ #else
+@@ -34,7 +34,7 @@ static inline pid_t task_numa_group_id(s
+ static inline void set_numabalancing_state(bool enabled)
+ {
+ }
+-static inline void task_numa_free(struct task_struct *p)
++static inline void task_numa_free(struct task_struct *p, bool final)
+ {
+ }
+ static inline bool should_numa_migrate_memory(struct task_struct *p,
+--- a/kernel/fork.c
++++ b/kernel/fork.c
+@@ -727,7 +727,7 @@ void __put_task_struct(struct task_struc
+       WARN_ON(tsk == current);
+       cgroup_free(tsk);
+-      task_numa_free(tsk);
++      task_numa_free(tsk, true);
+       security_task_free(tsk);
+       exit_creds(tsk);
+       delayacct_tsk_free(tsk);
+--- a/kernel/sched/fair.c
++++ b/kernel/sched/fair.c
+@@ -2336,13 +2336,23 @@ no_join:
+       return;
+ }
+-void task_numa_free(struct task_struct *p)
++/*
++ * Get rid of NUMA statistics associated with a task (either current or dead).
++ * If @final is set, the task is dead and has reached refcount zero, so we can
++ * safely free all relevant data structures. Otherwise, there might be
++ * concurrent reads from places like load balancing and procfs, and we should
++ * reset the data back to default state without freeing ->numa_faults.
++ */
++void task_numa_free(struct task_struct *p, bool final)
+ {
+       struct numa_group *grp = p->numa_group;
+-      void *numa_faults = p->numa_faults;
++      unsigned long *numa_faults = p->numa_faults;
+       unsigned long flags;
+       int i;
++      if (!numa_faults)
++              return;
++
+       if (grp) {
+               spin_lock_irqsave(&grp->lock, flags);
+               for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
+@@ -2355,8 +2365,14 @@ void task_numa_free(struct task_struct *
+               put_numa_group(grp);
+       }
+-      p->numa_faults = NULL;
+-      kfree(numa_faults);
++      if (final) {
++              p->numa_faults = NULL;
++              kfree(numa_faults);
++      } else {
++              p->total_numa_faults = 0;
++              for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
++                      numa_faults[i] = 0;
++      }
+ }
+ /*
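
The reader half of the race described in the changelog can be sketched from userspace: keep re-reading /proc/<pid>/sched while the target task repeatedly calls execve(). This is only an illustrative skeleton, assuming CONFIG_SCHED_DEBUG and CONFIG_NUMA_BALANCING are enabled and the target has populated ->numa_faults; it is not a guaranteed reproducer.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char *argv[])
{
	char path[64], buf[8192];

	if (argc != 2) {
		fprintf(stderr, "usage: %s <pid>\n", argv[0]);
		return 1;
	}
	snprintf(path, sizeof(path), "/proc/%s/sched", argv[1]);

	for (;;) {
		int fd = open(path, O_RDONLY);

		if (fd < 0)
			break;		/* target is gone */
		while (read(fd, buf, sizeof(buf)) > 0)
			;		/* contents are irrelevant, the access is the point */
		close(fd);
	}
	return 0;
}
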
diff --git a/queue-5.2/sched-fair-use-rcu-accessors-consistently-for-numa_group.patch b/queue-5.2/sched-fair-use-rcu-accessors-consistently-for-numa_group.patch
new file mode 100644 (file)
index 0000000..1d7d863
--- /dev/null
@@ -0,0 +1,386 @@
+From cb361d8cdef69990f6b4504dc1fd9a594d983c97 Mon Sep 17 00:00:00 2001
+From: Jann Horn <jannh@google.com>
+Date: Tue, 16 Jul 2019 17:20:47 +0200
+Subject: sched/fair: Use RCU accessors consistently for ->numa_group
+
+From: Jann Horn <jannh@google.com>
+
+commit cb361d8cdef69990f6b4504dc1fd9a594d983c97 upstream.
+
+The old code used RCU annotations and accessors inconsistently for
+->numa_group, which can lead to use-after-frees and NULL dereferences.
+
+Let all accesses to ->numa_group use proper RCU helpers to prevent such
+issues.
+
+Signed-off-by: Jann Horn <jannh@google.com>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Petr Mladek <pmladek@suse.com>
+Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: Will Deacon <will@kernel.org>
+Fixes: 8c8a743c5087 ("sched/numa: Use {cpu, pid} to create task groups for shared faults")
+Link: https://lkml.kernel.org/r/20190716152047.14424-3-jannh@google.com
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/sched.h |   10 +++-
+ kernel/sched/fair.c   |  120 +++++++++++++++++++++++++++++++++-----------------
+ 2 files changed, 90 insertions(+), 40 deletions(-)
+
+--- a/include/linux/sched.h
++++ b/include/linux/sched.h
+@@ -1026,7 +1026,15 @@ struct task_struct {
+       u64                             last_sum_exec_runtime;
+       struct callback_head            numa_work;
+-      struct numa_group               *numa_group;
++      /*
++       * This pointer is only modified for current in syscall and
++       * pagefault context (and for tasks being destroyed), so it can be read
++       * from any of the following contexts:
++       *  - RCU read-side critical section
++       *  - current->numa_group from everywhere
++       *  - task's runqueue locked, task not running
++       */
++      struct numa_group __rcu         *numa_group;
+       /*
+        * numa_faults is an array split into four regions:
+--- a/kernel/sched/fair.c
++++ b/kernel/sched/fair.c
+@@ -1067,6 +1067,21 @@ struct numa_group {
+       unsigned long faults[0];
+ };
++/*
++ * For functions that can be called in multiple contexts that permit reading
++ * ->numa_group (see struct task_struct for locking rules).
++ */
++static struct numa_group *deref_task_numa_group(struct task_struct *p)
++{
++      return rcu_dereference_check(p->numa_group, p == current ||
++              (lockdep_is_held(&task_rq(p)->lock) && !READ_ONCE(p->on_cpu)));
++}
++
++static struct numa_group *deref_curr_numa_group(struct task_struct *p)
++{
++      return rcu_dereference_protected(p->numa_group, p == current);
++}
++
+ static inline unsigned long group_faults_priv(struct numa_group *ng);
+ static inline unsigned long group_faults_shared(struct numa_group *ng);
+@@ -1110,10 +1125,12 @@ static unsigned int task_scan_start(stru
+ {
+       unsigned long smin = task_scan_min(p);
+       unsigned long period = smin;
++      struct numa_group *ng;
+       /* Scale the maximum scan period with the amount of shared memory. */
+-      if (p->numa_group) {
+-              struct numa_group *ng = p->numa_group;
++      rcu_read_lock();
++      ng = rcu_dereference(p->numa_group);
++      if (ng) {
+               unsigned long shared = group_faults_shared(ng);
+               unsigned long private = group_faults_priv(ng);
+@@ -1121,6 +1138,7 @@ static unsigned int task_scan_start(stru
+               period *= shared + 1;
+               period /= private + shared + 1;
+       }
++      rcu_read_unlock();
+       return max(smin, period);
+ }
+@@ -1129,13 +1147,14 @@ static unsigned int task_scan_max(struct
+ {
+       unsigned long smin = task_scan_min(p);
+       unsigned long smax;
++      struct numa_group *ng;
+       /* Watch for min being lower than max due to floor calculations */
+       smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
+       /* Scale the maximum scan period with the amount of shared memory. */
+-      if (p->numa_group) {
+-              struct numa_group *ng = p->numa_group;
++      ng = deref_curr_numa_group(p);
++      if (ng) {
+               unsigned long shared = group_faults_shared(ng);
+               unsigned long private = group_faults_priv(ng);
+               unsigned long period = smax;
+@@ -1167,7 +1186,7 @@ void init_numa_balancing(unsigned long c
+       p->numa_scan_period             = sysctl_numa_balancing_scan_delay;
+       p->numa_work.next               = &p->numa_work;
+       p->numa_faults                  = NULL;
+-      p->numa_group                   = NULL;
++      RCU_INIT_POINTER(p->numa_group, NULL);
+       p->last_task_numa_placement     = 0;
+       p->last_sum_exec_runtime        = 0;
+@@ -1214,7 +1233,16 @@ static void account_numa_dequeue(struct
+ pid_t task_numa_group_id(struct task_struct *p)
+ {
+-      return p->numa_group ? p->numa_group->gid : 0;
++      struct numa_group *ng;
++      pid_t gid = 0;
++
++      rcu_read_lock();
++      ng = rcu_dereference(p->numa_group);
++      if (ng)
++              gid = ng->gid;
++      rcu_read_unlock();
++
++      return gid;
+ }
+ /*
+@@ -1239,11 +1267,13 @@ static inline unsigned long task_faults(
+ static inline unsigned long group_faults(struct task_struct *p, int nid)
+ {
+-      if (!p->numa_group)
++      struct numa_group *ng = deref_task_numa_group(p);
++
++      if (!ng)
+               return 0;
+-      return p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
+-              p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 1)];
++      return ng->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
++              ng->faults[task_faults_idx(NUMA_MEM, nid, 1)];
+ }
+ static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
+@@ -1381,12 +1411,13 @@ static inline unsigned long task_weight(
+ static inline unsigned long group_weight(struct task_struct *p, int nid,
+                                        int dist)
+ {
++      struct numa_group *ng = deref_task_numa_group(p);
+       unsigned long faults, total_faults;
+-      if (!p->numa_group)
++      if (!ng)
+               return 0;
+-      total_faults = p->numa_group->total_faults;
++      total_faults = ng->total_faults;
+       if (!total_faults)
+               return 0;
+@@ -1400,7 +1431,7 @@ static inline unsigned long group_weight
+ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
+                               int src_nid, int dst_cpu)
+ {
+-      struct numa_group *ng = p->numa_group;
++      struct numa_group *ng = deref_curr_numa_group(p);
+       int dst_nid = cpu_to_node(dst_cpu);
+       int last_cpupid, this_cpupid;
+@@ -1583,13 +1614,14 @@ static bool load_too_imbalanced(long src
+ static void task_numa_compare(struct task_numa_env *env,
+                             long taskimp, long groupimp, bool maymove)
+ {
++      struct numa_group *cur_ng, *p_ng = deref_curr_numa_group(env->p);
+       struct rq *dst_rq = cpu_rq(env->dst_cpu);
++      long imp = p_ng ? groupimp : taskimp;
+       struct task_struct *cur;
+       long src_load, dst_load;
+-      long load;
+-      long imp = env->p->numa_group ? groupimp : taskimp;
+-      long moveimp = imp;
+       int dist = env->dist;
++      long moveimp = imp;
++      long load;
+       if (READ_ONCE(dst_rq->numa_migrate_on))
+               return;
+@@ -1628,21 +1660,22 @@ static void task_numa_compare(struct tas
+        * If dst and source tasks are in the same NUMA group, or not
+        * in any group then look only at task weights.
+        */
+-      if (cur->numa_group == env->p->numa_group) {
++      cur_ng = rcu_dereference(cur->numa_group);
++      if (cur_ng == p_ng) {
+               imp = taskimp + task_weight(cur, env->src_nid, dist) -
+                     task_weight(cur, env->dst_nid, dist);
+               /*
+                * Add some hysteresis to prevent swapping the
+                * tasks within a group over tiny differences.
+                */
+-              if (cur->numa_group)
++              if (cur_ng)
+                       imp -= imp / 16;
+       } else {
+               /*
+                * Compare the group weights. If a task is all by itself
+                * (not part of a group), use the task weight instead.
+                */
+-              if (cur->numa_group && env->p->numa_group)
++              if (cur_ng && p_ng)
+                       imp += group_weight(cur, env->src_nid, dist) -
+                              group_weight(cur, env->dst_nid, dist);
+               else
+@@ -1740,11 +1773,12 @@ static int task_numa_migrate(struct task
+               .best_imp = 0,
+               .best_cpu = -1,
+       };
++      unsigned long taskweight, groupweight;
+       struct sched_domain *sd;
++      long taskimp, groupimp;
++      struct numa_group *ng;
+       struct rq *best_rq;
+-      unsigned long taskweight, groupweight;
+       int nid, ret, dist;
+-      long taskimp, groupimp;
+       /*
+        * Pick the lowest SD_NUMA domain, as that would have the smallest
+@@ -1790,7 +1824,8 @@ static int task_numa_migrate(struct task
+        *   multiple NUMA nodes; in order to better consolidate the group,
+        *   we need to check other locations.
+        */
+-      if (env.best_cpu == -1 || (p->numa_group && p->numa_group->active_nodes > 1)) {
++      ng = deref_curr_numa_group(p);
++      if (env.best_cpu == -1 || (ng && ng->active_nodes > 1)) {
+               for_each_online_node(nid) {
+                       if (nid == env.src_nid || nid == p->numa_preferred_nid)
+                               continue;
+@@ -1823,7 +1858,7 @@ static int task_numa_migrate(struct task
+        * A task that migrated to a second choice node will be better off
+        * trying for a better one later. Do not set the preferred node here.
+        */
+-      if (p->numa_group) {
++      if (ng) {
+               if (env.best_cpu == -1)
+                       nid = env.src_nid;
+               else
+@@ -2118,6 +2153,7 @@ static void task_numa_placement(struct t
+       unsigned long total_faults;
+       u64 runtime, period;
+       spinlock_t *group_lock = NULL;
++      struct numa_group *ng;
+       /*
+        * The p->mm->numa_scan_seq field gets updated without
+@@ -2135,8 +2171,9 @@ static void task_numa_placement(struct t
+       runtime = numa_get_avg_runtime(p, &period);
+       /* If the task is part of a group prevent parallel updates to group stats */
+-      if (p->numa_group) {
+-              group_lock = &p->numa_group->lock;
++      ng = deref_curr_numa_group(p);
++      if (ng) {
++              group_lock = &ng->lock;
+               spin_lock_irq(group_lock);
+       }
+@@ -2177,7 +2214,7 @@ static void task_numa_placement(struct t
+                       p->numa_faults[cpu_idx] += f_diff;
+                       faults += p->numa_faults[mem_idx];
+                       p->total_numa_faults += diff;
+-                      if (p->numa_group) {
++                      if (ng) {
+                               /*
+                                * safe because we can only change our own group
+                                *
+@@ -2185,14 +2222,14 @@ static void task_numa_placement(struct t
+                                * nid and priv in a specific region because it
+                                * is at the beginning of the numa_faults array.
+                                */
+-                              p->numa_group->faults[mem_idx] += diff;
+-                              p->numa_group->faults_cpu[mem_idx] += f_diff;
+-                              p->numa_group->total_faults += diff;
+-                              group_faults += p->numa_group->faults[mem_idx];
++                              ng->faults[mem_idx] += diff;
++                              ng->faults_cpu[mem_idx] += f_diff;
++                              ng->total_faults += diff;
++                              group_faults += ng->faults[mem_idx];
+                       }
+               }
+-              if (!p->numa_group) {
++              if (!ng) {
+                       if (faults > max_faults) {
+                               max_faults = faults;
+                               max_nid = nid;
+@@ -2203,8 +2240,8 @@ static void task_numa_placement(struct t
+               }
+       }
+-      if (p->numa_group) {
+-              numa_group_count_active_nodes(p->numa_group);
++      if (ng) {
++              numa_group_count_active_nodes(ng);
+               spin_unlock_irq(group_lock);
+               max_nid = preferred_group_nid(p, max_nid);
+       }
+@@ -2238,7 +2275,7 @@ static void task_numa_group(struct task_
+       int cpu = cpupid_to_cpu(cpupid);
+       int i;
+-      if (unlikely(!p->numa_group)) {
++      if (unlikely(!deref_curr_numa_group(p))) {
+               unsigned int size = sizeof(struct numa_group) +
+                                   4*nr_node_ids*sizeof(unsigned long);
+@@ -2274,7 +2311,7 @@ static void task_numa_group(struct task_
+       if (!grp)
+               goto no_join;
+-      my_grp = p->numa_group;
++      my_grp = deref_curr_numa_group(p);
+       if (grp == my_grp)
+               goto no_join;
+@@ -2345,7 +2382,8 @@ no_join:
+  */
+ void task_numa_free(struct task_struct *p, bool final)
+ {
+-      struct numa_group *grp = p->numa_group;
++      /* safe: p either is current or is being freed by current */
++      struct numa_group *grp = rcu_dereference_raw(p->numa_group);
+       unsigned long *numa_faults = p->numa_faults;
+       unsigned long flags;
+       int i;
+@@ -2425,7 +2463,7 @@ void task_numa_fault(int last_cpupid, in
+        * actively using should be counted as local. This allows the
+        * scan rate to slow down when a workload has settled down.
+        */
+-      ng = p->numa_group;
++      ng = deref_curr_numa_group(p);
+       if (!priv && !local && ng && ng->active_nodes > 1 &&
+                               numa_is_active_node(cpu_node, ng) &&
+                               numa_is_active_node(mem_node, ng))
+@@ -10724,18 +10762,22 @@ void show_numa_stats(struct task_struct
+ {
+       int node;
+       unsigned long tsf = 0, tpf = 0, gsf = 0, gpf = 0;
++      struct numa_group *ng;
++      rcu_read_lock();
++      ng = rcu_dereference(p->numa_group);
+       for_each_online_node(node) {
+               if (p->numa_faults) {
+                       tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)];
+                       tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)];
+               }
+-              if (p->numa_group) {
+-                      gsf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 0)],
+-                      gpf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 1)];
++              if (ng) {
++                      gsf = ng->faults[task_faults_idx(NUMA_MEM, node, 0)],
++                      gpf = ng->faults[task_faults_idx(NUMA_MEM, node, 1)];
+               }
+               print_numa_stats(m, node, tsf, tpf, gsf, gpf);
+       }
++      rcu_read_unlock();
+ }
+ #endif /* CONFIG_NUMA_BALANCING */
+ #endif /* CONFIG_SCHED_DEBUG */
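
The rule this patch enforces, taking one rcu_dereference() snapshot of ->numa_group under the proper protection and using only that snapshot, can be illustrated outside the kernel. The sketch below uses C11 atomics as a stand-in for the RCU accessors; the struct and field names are invented, and unlike rcu_read_lock() this gives no protection against the object being freed, so it only models the NULL-check-versus-use half of the problem.

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct group { long total_faults; };

static _Atomic(struct group *) shared_group;

/* Racy style the patch removes: the shared pointer is re-loaded on
 * every access, so it can become NULL between the check and the use. */
static long total_faults_racy(void)
{
	if (atomic_load(&shared_group))
		return atomic_load(&shared_group)->total_faults;
	return 0;
}

/* Style the patch converges on: dereference once, then use the local
 * snapshot (the kernel additionally holds rcu_read_lock() or the
 * runqueue lock so the snapshot cannot be freed while in use). */
static long total_faults_snapshot(void)
{
	struct group *g = atomic_load(&shared_group);

	return g ? g->total_faults : 0;
}

int main(void)
{
	struct group *g = calloc(1, sizeof(*g));

	if (!g)
		return 1;
	g->total_faults = 42;
	atomic_store(&shared_group, g);
	printf("%ld %ld\n", total_faults_racy(), total_faults_snapshot());
	free(g);
	return 0;
}
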
diff --git a/queue-5.2/series b/queue-5.2/series
index e00c3b4da561481a6f3372af6c1f023b7a97a748..fbb8279c22542df0cd8a2d0dbcf710308127c8ef 100644 (file)
--- a/queue-5.2/series
@@ -11,3 +11,10 @@ media-radio-raremono-change-devm_k-alloc-to-k-alloc.patch
 xfrm-policy-fix-bydst-hlist-corruption-on-hash-rebuild.patch
 nvme-fix-multipath-crash-when-ana-is-deactivated.patch
 bluetooth-hci_uart-check-for-missing-tty-operations.patch
+sched-fair-don-t-free-p-numa_faults-with-concurrent-readers.patch
+sched-fair-use-rcu-accessors-consistently-for-numa_group.patch
+proc-pid-cmdline-remove-all-the-special-cases.patch
+proc-pid-cmdline-add-back-the-setproctitle-special-case.patch
+drivers-pps-pps.c-clear-offset-flags-in-pps_setparams-ioctl.patch
+fix-allyesconfig-output.patch
+ceph-hold-i_ceph_lock-when-removing-caps-for-freeing-inode.patch