fixes for 4.9

author Sasha Levin <sashal@kernel.org>

Tue, 3 Sep 2019 04:56:24 +0000 (00:56 -0400)

committer Sasha Levin <sashal@kernel.org>

Tue, 3 Sep 2019 04:56:24 +0000 (00:56 -0400)
author Sasha Levin <sashal@kernel.org>
Tue, 3 Sep 2019 04:56:24 +0000 (00:56 -0400)
committer Sasha Levin <sashal@kernel.org>
Tue, 3 Sep 2019 04:56:24 +0000 (00:56 -0400)
diff --git a/queue-4.9/mm-zsmalloc.c-fix-race-condition-in-zs_destroy_pool.patch b/queue-4.9/mm-zsmalloc.c-fix-race-condition-in-zs_destroy_pool.patch

new file mode 100644 (file)

index 0000000..760c859
--- /dev/null
+++ b/queue-4.9/mm-zsmalloc.c-fix-race-condition-in-zs_destroy_pool.patch
@@ -0,0 +1,173 @@
+From 1d30127334c751070832ef2138717186cb765fa7 Mon Sep 17 00:00:00 2001
+From: Henry Burns <henryburns@google.com>
+Date: Sat, 24 Aug 2019 17:55:06 -0700
+Subject: mm/zsmalloc.c: fix race condition in zs_destroy_pool
+
+[ Upstream commit 701d678599d0c1623aaf4139c03eea260a75b027 ]
+
+In zs_destroy_pool() we call flush_work(&pool->free_work).  However, we
+have no guarantee that migration isn't happening in the background at
+that time.
+
+Since migration can't directly free pages, it relies on free_work being
+scheduled to free the pages.  But there's nothing preventing an
+in-progress migrate from queuing the work *after*
+zs_unregister_migration() has called flush_work().  Which would mean
+pages still pointing at the inode when we free it.
+
+Since we know at destroy time all objects should be free, no new
+migrations can come in (since zs_page_isolate() fails for fully-free
+zspages).  This means it is sufficient to track a "# isolated zspages"
+count by class, and have the destroy logic ensure all such pages have
+drained before proceeding.  Keeping that state under the class spinlock
+keeps the logic straightforward.
+
+In this case a memory leak could lead to an eventual crash if compaction
+hits the leaked page.  This crash would only occur if people are
+changing their zswap backend at runtime (which eventually starts
+destruction).
+
+Link: http://lkml.kernel.org/r/20190809181751.219326-2-henryburns@google.com
+Fixes: 48b4800a1c6a ("zsmalloc: page migration support")
+Signed-off-by: Henry Burns <henryburns@google.com>
+Reviewed-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
+Cc: Henry Burns <henrywolfeburns@gmail.com>
+Cc: Minchan Kim <minchan@kernel.org>
+Cc: Shakeel Butt <shakeelb@google.com>
+Cc: Jonathan Adams <jwadams@google.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ mm/zsmalloc.c | 61 +++++++++++++++++++++++++++++++++++++++++++++++++--
+ 1 file changed, 59 insertions(+), 2 deletions(-)
+
+diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
+index f624cc2d91d98..ad8a34bd15ca7 100644
+--- a/mm/zsmalloc.c
++++ b/mm/zsmalloc.c
+@@ -52,6 +52,7 @@
+ #include <linux/zpool.h>
+ #include <linux/mount.h>
+ #include <linux/migrate.h>
++#include <linux/wait.h>
+ #include <linux/pagemap.h>
+ 
+ #define ZSPAGE_MAGIC  0x58
+@@ -265,6 +266,10 @@ struct zs_pool {
+ #ifdef CONFIG_COMPACTION
+       struct inode *inode;
+       struct work_struct free_work;
++      /* A wait queue for when migration races with async_free_zspage() */
++      wait_queue_head_t migration_wait;
++      atomic_long_t isolated_pages;
++      bool destroying;
+ #endif
+ };
+ 
+@@ -1951,6 +1956,19 @@ static void putback_zspage_deferred(struct zs_pool *pool,
+ 
+ }
+ 
++static inline void zs_pool_dec_isolated(struct zs_pool *pool)
++{
++      VM_BUG_ON(atomic_long_read(&pool->isolated_pages) <= 0);
++      atomic_long_dec(&pool->isolated_pages);
++      /*
++       * There's no possibility of racing, since wait_for_isolated_drain()
++       * checks the isolated count under &class->lock after enqueuing
++       * on migration_wait.
++       */
++      if (atomic_long_read(&pool->isolated_pages) == 0 && pool->destroying)
++              wake_up_all(&pool->migration_wait);
++}
++
+ static void replace_sub_page(struct size_class *class, struct zspage *zspage,
+                               struct page *newpage, struct page *oldpage)
+ {
+@@ -2020,6 +2038,7 @@ bool zs_page_isolate(struct page *page, isolate_mode_t mode)
+        */
+       if (!list_empty(&zspage->list) && !is_zspage_isolated(zspage)) {
+               get_zspage_mapping(zspage, &class_idx, &fullness);
++              atomic_long_inc(&pool->isolated_pages);
+               remove_zspage(class, zspage, fullness);
+       }
+ 
+@@ -2108,8 +2127,16 @@ int zs_page_migrate(struct address_space *mapping, struct page *newpage,
+        * Page migration is done so let's putback isolated zspage to
+        * the list if @page is final isolated subpage in the zspage.
+        */
+-      if (!is_zspage_isolated(zspage))
++      if (!is_zspage_isolated(zspage)) {
++              /*
++               * We cannot race with zs_destroy_pool() here because we wait
++               * for isolation to hit zero before we start destroying.
++               * Also, we ensure that everyone can see pool->destroying before
++               * we start waiting.
++               */
+               putback_zspage_deferred(pool, class, zspage);
++              zs_pool_dec_isolated(pool);
++      }
+ 
+       reset_page(page);
+       put_page(page);
+@@ -2161,8 +2188,8 @@ void zs_page_putback(struct page *page)
+                * so let's defer.
+                */
+               putback_zspage_deferred(pool, class, zspage);
++              zs_pool_dec_isolated(pool);
+       }
+-
+       spin_unlock(&class->lock);
+ }
+ 
+@@ -2185,8 +2212,36 @@ static int zs_register_migration(struct zs_pool *pool)
+       return 0;
+ }
+ 
++static bool pool_isolated_are_drained(struct zs_pool *pool)
++{
++      return atomic_long_read(&pool->isolated_pages) == 0;
++}
++
+/* Wait until the pool's isolated zspage count drops to zero. */
++static void wait_for_isolated_drain(struct zs_pool *pool)
++{
++
++      /*
++       * We're in the process of destroying the pool, so there are no
++       * active allocations. zs_page_isolate() fails for completely free
++       * zspages, so we need only wait for the zs_pool's isolated
++       * count to hit zero.
++       */
++      wait_event(pool->migration_wait,
++                 pool_isolated_are_drained(pool));
++}
++
+ static void zs_unregister_migration(struct zs_pool *pool)
+ {
++      pool->destroying = true;
++      /*
++       * We need a memory barrier here to ensure global visibility of
+       * pool->destroying. Thus pool->isolated_pages will either be 0 in which
++       * case we don't care, or it will be > 0 and pool->destroying will
++       * ensure that we wake up once isolation hits 0.
++       */
++      smp_mb();
++      wait_for_isolated_drain(pool); /* This can block */
+       flush_work(&pool->free_work);
+       iput(pool->inode);
+ }
+@@ -2433,6 +2488,8 @@ struct zs_pool *zs_create_pool(const char *name)
+       if (!pool->name)
+               goto err;
+ 
++      init_waitqueue_head(&pool->migration_wait);
++
+       if (create_cache(pool))
+               goto err;
+ 
+-- 
+2.20.1
+
diff --git a/queue-4.9/series b/queue-4.9/series

index 8514de9d81ca982a665d6a50f3fab7a673765aa3..4a8e52073aeba43993859ba2ef50c6ea206544ea 100644 (file)
--- a/queue-4.9/series
+++ b/queue-4.9/series
@@ -61,3 +61,5 @@ alsa-seq-fix-potential-concurrent-access-to-the-deleted-pool.patch
  kvm-x86-don-t-update-rip-or-do-single-step-on-faulting-emulation.patch
  x86-apic-do-not-initialize-ldr-and-dfr-for-bigsmp.patch
  x86-apic-include-the-ldr-when-clearing-out-apic-registers.patch
+uprobes-x86-fix-detection-of-32-bit-user-mode.patch
+mm-zsmalloc.c-fix-race-condition-in-zs_destroy_pool.patch
diff --git a/queue-4.9/uprobes-x86-fix-detection-of-32-bit-user-mode.patch b/queue-4.9/uprobes-x86-fix-detection-of-32-bit-user-mode.patch

new file mode 100644 (file)

index 0000000..f0308ab
--- /dev/null
+++ b/queue-4.9/uprobes-x86-fix-detection-of-32-bit-user-mode.patch
@@ -0,0 +1,130 @@
+From c932a2cd326caac68cd394e7203c616214b93a67 Mon Sep 17 00:00:00 2001
+From: Sebastian Mayr <me@sam.st>
+Date: Sun, 28 Jul 2019 17:26:17 +0200
+Subject: uprobes/x86: Fix detection of 32-bit user mode
+
+[ Upstream commit 9212ec7d8357ea630031e89d0d399c761421c83b ]
+
+32-bit processes running on a 64-bit kernel are not always detected
+correctly, causing the process to crash when uretprobes are installed.
+
+The reason for the crash is that in_ia32_syscall() is used to determine the
+process's mode, which only works correctly when called from a syscall.
+
+In the case of uretprobes, however, the function is called from an exception
+and always returns 'false' on a 64-bit kernel. In consequence this leads to
+corruption of the process's return address.
+
+Fix this by using user_64bit_mode() instead of in_ia32_syscall(), which
+is correct in any situation.
+
+[ tglx: Add a comment and the following historical info ]
+
+This should have been detected by the rename which happened in commit
+
+  abfb9498ee13 ("x86/entry: Rename is_{ia32,x32}_task() to in_{ia32,x32}_syscall()")
+
+which states in the changelog:
+
+    The is_ia32_task()/is_x32_task() function names are a big misnomer: they
+    suggests that the compat-ness of a system call is a task property, which
+    is not true, the compatness of a system call purely depends on how it
+    was invoked through the system call layer.
+    .....
+
+and then it went and blindly renamed every call site.
+
+Sadly enough this was already mentioned here:
+
+   8faaed1b9f50 ("uprobes/x86: Introduce sizeof_long(), cleanup adjust_ret_addr() and
+arch_uretprobe_hijack_return_addr()")
+
+where the changelog says:
+
+    TODO: is_ia32_task() is not what we actually want, TS_COMPAT does
+    not necessarily mean 32bit. Fortunately syscall-like insns can't be
+    probed so it actually works, but it would be better to rename and
+    use is_ia32_frame().
+
+and goes all the way back to:
+
+    0326f5a94dde ("uprobes/core: Handle breakpoint and singlestep exceptions")
+
+Oh well. 7+ years until someone actually tried a uretprobe on a 32bit
+process on a 64bit kernel....
+
+Fixes: 0326f5a94dde ("uprobes/core: Handle breakpoint and singlestep exceptions")
+Signed-off-by: Sebastian Mayr <me@sam.st>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Cc: Masami Hiramatsu <mhiramat@kernel.org>
+Cc: Dmitry Safonov <dsafonov@virtuozzo.com>
+Cc: Oleg Nesterov <oleg@redhat.com>
+Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
+Cc: stable@vger.kernel.org
+Link: https://lkml.kernel.org/r/20190728152617.7308-1-me@sam.st
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kernel/uprobes.c | 17 ++++++++++-------
+ 1 file changed, 10 insertions(+), 7 deletions(-)
+
+diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
+index e78a6b1db74b0..e35466afe989d 100644
+--- a/arch/x86/kernel/uprobes.c
++++ b/arch/x86/kernel/uprobes.c
+@@ -514,9 +514,12 @@ struct uprobe_xol_ops {
+       void    (*abort)(struct arch_uprobe *, struct pt_regs *);
+ };
+ 
+-static inline int sizeof_long(void)
++static inline int sizeof_long(struct pt_regs *regs)
+ {
+-      return in_ia32_syscall() ? 4 : 8;
++      /*
++       * Check registers for mode as in_xxx_syscall() does not apply here.
++       */
++      return user_64bit_mode(regs) ? 8 : 4;
+ }
+ 
+ static int default_pre_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
+@@ -527,9 +530,9 @@ static int default_pre_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
+ 
+ static int push_ret_address(struct pt_regs *regs, unsigned long ip)
+ {
+-      unsigned long new_sp = regs->sp - sizeof_long();
++      unsigned long new_sp = regs->sp - sizeof_long(regs);
+ 
+-      if (copy_to_user((void __user *)new_sp, &ip, sizeof_long()))
++      if (copy_to_user((void __user *)new_sp, &ip, sizeof_long(regs)))
+               return -EFAULT;
+ 
+       regs->sp = new_sp;
+@@ -562,7 +565,7 @@ static int default_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs
+               long correction = utask->vaddr - utask->xol_vaddr;
+               regs->ip += correction;
+       } else if (auprobe->defparam.fixups & UPROBE_FIX_CALL) {
+-              regs->sp += sizeof_long(); /* Pop incorrect return address */
++              regs->sp += sizeof_long(regs); /* Pop incorrect return address */
+               if (push_ret_address(regs, utask->vaddr + auprobe->defparam.ilen))
+                       return -ERESTART;
+       }
+@@ -671,7 +674,7 @@ static int branch_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
+        * "call" insn was executed out-of-line. Just restore ->sp and restart.
+        * We could also restore ->ip and try to call branch_emulate_op() again.
+        */
+-      regs->sp += sizeof_long();
++      regs->sp += sizeof_long(regs);
+       return -ERESTART;
+ }
+ 
+@@ -962,7 +965,7 @@ bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs)
+ unsigned long
+ arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs *regs)
+ {
+-      int rasize = sizeof_long(), nleft;
++      int rasize = sizeof_long(regs), nleft;
+       unsigned long orig_ret_vaddr = 0; /* clear high bits for 32-bit apps */
+ 
+       if (copy_from_user(&orig_ret_vaddr, (void __user *)regs->sp, rasize))
+-- 
+2.20.1
+
author	Sasha Levin <sashal@kernel.org>
	Tue, 3 Sep 2019 04:56:24 +0000 (00:56 -0400)
committer	Sasha Levin <sashal@kernel.org>
	Tue, 3 Sep 2019 04:56:24 +0000 (00:56 -0400)
queue-4.9/mm-zsmalloc.c-fix-race-condition-in-zs_destroy_pool.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/series		patch \| blob \| blame \| history
queue-4.9/uprobes-x86-fix-detection-of-32-bit-user-mode.patch	[new file with mode: 0644]	patch \| blob