From: Sasha Levin
Date: Tue, 3 Sep 2019 04:56:24 +0000 (-0400)
Subject: fixes for 4.9
X-Git-Tag: v4.4.191~39
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=9a73642fed72d62e9e9d1fd7c218a7a79a695a19;p=thirdparty%2Fkernel%2Fstable-queue.git

fixes for 4.9

Signed-off-by: Sasha Levin
---

diff --git a/queue-4.9/mm-zsmalloc.c-fix-race-condition-in-zs_destroy_pool.patch b/queue-4.9/mm-zsmalloc.c-fix-race-condition-in-zs_destroy_pool.patch
new file mode 100644
index 00000000000..760c8592968
--- /dev/null
+++ b/queue-4.9/mm-zsmalloc.c-fix-race-condition-in-zs_destroy_pool.patch
@@ -0,0 +1,173 @@
+From 1d30127334c751070832ef2138717186cb765fa7 Mon Sep 17 00:00:00 2001
+From: Henry Burns
+Date: Sat, 24 Aug 2019 17:55:06 -0700
+Subject: mm/zsmalloc.c: fix race condition in zs_destroy_pool
+
+[ Upstream commit 701d678599d0c1623aaf4139c03eea260a75b027 ]
+
+In zs_destroy_pool() we call flush_work(&pool->free_work). However, we
+have no guarantee that migration isn't happening in the background at
+that time.
+
+Since migration can't directly free pages, it relies on free_work being
+scheduled to free the pages. But there's nothing preventing an
+in-progress migrate from queuing the work *after*
+zs_unregister_migration() has called flush_work(), which would leave
+pages still pointing at the inode when we free it.
+
+Since we know at destroy time all objects should be free, no new
+migrations can come in (since zs_page_isolate() fails for fully-free
+zspages). This means it is sufficient to track a "# isolated zspages"
+count by class, and have the destroy logic ensure all such pages have
+drained before proceeding. Keeping that state under the class spinlock
+keeps the logic straightforward.
+
+In this case a memory leak could lead to an eventual crash if compaction
+hits the leaked page. This crash would only occur if people are
+changing their zswap backend at runtime (which eventually starts
+destruction).
+
+Link: http://lkml.kernel.org/r/20190809181751.219326-2-henryburns@google.com
+Fixes: 48b4800a1c6a ("zsmalloc: page migration support")
+Signed-off-by: Henry Burns
+Reviewed-by: Sergey Senozhatsky
+Cc: Henry Burns
+Cc: Minchan Kim
+Cc: Shakeel Butt
+Cc: Jonathan Adams
+Cc: 
+Signed-off-by: Andrew Morton
+Signed-off-by: Linus Torvalds
+Signed-off-by: Sasha Levin
+---
+ mm/zsmalloc.c | 61 +++++++++++++++++++++++++++++++++++++++++++++++++--
+ 1 file changed, 59 insertions(+), 2 deletions(-)
+
+diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
+index f624cc2d91d98..ad8a34bd15ca7 100644
+--- a/mm/zsmalloc.c
++++ b/mm/zsmalloc.c
+@@ -52,6 +52,7 @@
+ #include <linux/zpool.h>
+ #include <linux/mount.h>
+ #include <linux/migrate.h>
++#include <linux/wait.h>
+ #include <linux/pagemap.h>
+ 
+ #define ZSPAGE_MAGIC	0x58
+@@ -265,6 +266,10 @@ struct zs_pool {
+ #ifdef CONFIG_COMPACTION
+ 	struct inode *inode;
+ 	struct work_struct free_work;
++	/* A wait queue for when migration races with async_free_zspage() */
++	wait_queue_head_t migration_wait;
++	atomic_long_t isolated_pages;
++	bool destroying;
+ #endif
+ };
+ 
+@@ -1951,6 +1956,19 @@ static void putback_zspage_deferred(struct zs_pool *pool,
+ 
+ }
+ 
++static inline void zs_pool_dec_isolated(struct zs_pool *pool)
++{
++	VM_BUG_ON(atomic_long_read(&pool->isolated_pages) <= 0);
++	atomic_long_dec(&pool->isolated_pages);
++	/*
++	 * There's no possibility of racing, since wait_for_isolated_drain()
++	 * checks the isolated count under &class->lock after enqueuing
++	 * on migration_wait.
++	 */
++	if (atomic_long_read(&pool->isolated_pages) == 0 && pool->destroying)
++		wake_up_all(&pool->migration_wait);
++}
++
+ static void replace_sub_page(struct size_class *class, struct zspage *zspage,
+ 				struct page *newpage, struct page *oldpage)
+ {
+@@ -2020,6 +2038,7 @@ bool zs_page_isolate(struct page *page, isolate_mode_t mode)
+ 	 */
+ 	if (!list_empty(&zspage->list) && !is_zspage_isolated(zspage)) {
+ 		get_zspage_mapping(zspage, &class_idx, &fullness);
++		atomic_long_inc(&pool->isolated_pages);
+ 		remove_zspage(class, zspage, fullness);
+ 	}
+ 
+@@ -2108,8 +2127,16 @@ int zs_page_migrate(struct address_space *mapping, struct page *newpage,
+ 	 * Page migration is done so let's putback isolated zspage to
+ 	 * the list if @page is final isolated subpage in the zspage.
+ 	 */
+-	if (!is_zspage_isolated(zspage))
++	if (!is_zspage_isolated(zspage)) {
++		/*
++		 * We cannot race with zs_destroy_pool() here because we wait
++		 * for isolation to hit zero before we start destroying.
++		 * Also, we ensure that everyone can see pool->destroying before
++		 * we start waiting.
++		 */
+ 		putback_zspage_deferred(pool, class, zspage);
++		zs_pool_dec_isolated(pool);
++	}
+ 
+ 	reset_page(page);
+ 	put_page(page);
+@@ -2161,8 +2188,8 @@ void zs_page_putback(struct page *page)
+ 	 * so let's defer.
+ 	 */
+ 	putback_zspage_deferred(pool, class, zspage);
++	zs_pool_dec_isolated(pool);
+ 	}
+-
+ 	spin_unlock(&class->lock);
+ }
+ 
+@@ -2185,8 +2212,36 @@ static int zs_register_migration(struct zs_pool *pool)
+ 	return 0;
+ }
+ 
++static bool pool_isolated_are_drained(struct zs_pool *pool)
++{
++	return atomic_long_read(&pool->isolated_pages) == 0;
++}
++
++/* Function for resolving migration */
++static void wait_for_isolated_drain(struct zs_pool *pool)
++{
++
++	/*
++	 * We're in the process of destroying the pool, so there are no
++	 * active allocations. zs_page_isolate() fails for completely free
++	 * zspages, so we need only wait for the zs_pool's isolated
++	 * count to hit zero.
++	 */
++	wait_event(pool->migration_wait,
++		   pool_isolated_are_drained(pool));
++}
++
+ static void zs_unregister_migration(struct zs_pool *pool)
+ {
++	pool->destroying = true;
++	/*
++	 * We need a memory barrier here to ensure global visibility of
++	 * pool->destroying. Thus pool->isolated pages will either be 0 in which
++	 * case we don't care, or it will be > 0 and pool->destroying will
++	 * ensure that we wake up once isolation hits 0.
++	 */
++	smp_mb();
++	wait_for_isolated_drain(pool); /* This can block */
+ 	flush_work(&pool->free_work);
+ 	iput(pool->inode);
+ }
+@@ -2433,6 +2488,8 @@ struct zs_pool *zs_create_pool(const char *name)
+ 	if (!pool->name)
+ 		goto err;
+ 
++	init_waitqueue_head(&pool->migration_wait);
++
+ 	if (create_cache(pool))
+ 		goto err;
+ 
+-- 
+2.20.1
+
diff --git a/queue-4.9/series b/queue-4.9/series
index 8514de9d81c..4a8e52073ae 100644
--- a/queue-4.9/series
+++ b/queue-4.9/series
@@ -61,3 +61,5 @@ alsa-seq-fix-potential-concurrent-access-to-the-deleted-pool.patch
 kvm-x86-don-t-update-rip-or-do-single-step-on-faulting-emulation.patch
 x86-apic-do-not-initialize-ldr-and-dfr-for-bigsmp.patch
 x86-apic-include-the-ldr-when-clearing-out-apic-registers.patch
+uprobes-x86-fix-detection-of-32-bit-user-mode.patch
+mm-zsmalloc.c-fix-race-condition-in-zs_destroy_pool.patch
diff --git a/queue-4.9/uprobes-x86-fix-detection-of-32-bit-user-mode.patch b/queue-4.9/uprobes-x86-fix-detection-of-32-bit-user-mode.patch
new file mode 100644
index 00000000000..f0308ab8259
--- /dev/null
+++ b/queue-4.9/uprobes-x86-fix-detection-of-32-bit-user-mode.patch
@@ -0,0 +1,130 @@
+From c932a2cd326caac68cd394e7203c616214b93a67 Mon Sep 17 00:00:00 2001
+From: Sebastian Mayr
+Date: Sun, 28 Jul 2019 17:26:17 +0200
+Subject: uprobes/x86: Fix detection of 32-bit user mode
+
+[ Upstream commit 9212ec7d8357ea630031e89d0d399c761421c83b ]
+
+32-bit processes running on a 64-bit kernel are not always detected
+correctly, causing the process to crash when uretprobes are installed.
+
+The reason for the crash is that in_ia32_syscall() is used to determine the
+process's mode, which only works correctly when called from a syscall.
+
+In the case of uretprobes, however, the function is called from an exception
+and always returns 'false' on a 64-bit kernel. In consequence this leads to
+corruption of the process's return address.
+
+Fix this by using user_64bit_mode() instead of in_ia32_syscall(), which
+is correct in any situation.
+
+[ tglx: Add a comment and the following historical info ]
+
+This should have been detected by the rename which happened in commit
+
+  abfb9498ee13 ("x86/entry: Rename is_{ia32,x32}_task() to in_{ia32,x32}_syscall()")
+
+which states in the changelog:
+
+    The is_ia32_task()/is_x32_task() function names are a big misnomer: they
+    suggests that the compat-ness of a system call is a task property, which
+    is not true, the compatness of a system call purely depends on how it
+    was invoked through the system call layer.
+    .....
+
+and then it went and blindly renamed every call site.
+
+Sadly enough this was already mentioned here:
+
+   8faaed1b9f50 ("uprobes/x86: Introduce sizeof_long(), cleanup adjust_ret_addr() and
+arch_uretprobe_hijack_return_addr()")
+
+where the changelog says:
+
+    TODO: is_ia32_task() is not what we actually want, TS_COMPAT does
+    not necessarily mean 32bit. Fortunately syscall-like insns can't be
+    probed so it actually works, but it would be better to rename and
+    use is_ia32_frame().
+
+and goes all the way back to:
+
+  0326f5a94dde ("uprobes/core: Handle breakpoint and singlestep exceptions")
+
+Oh well. 7+ years until someone actually tried a uretprobe on a 32bit
+process on a 64bit kernel....
+
+Fixes: 0326f5a94dde ("uprobes/core: Handle breakpoint and singlestep exceptions")
+Signed-off-by: Sebastian Mayr
+Signed-off-by: Thomas Gleixner
+Cc: Masami Hiramatsu
+Cc: Dmitry Safonov
+Cc: Oleg Nesterov
+Cc: Srikar Dronamraju
+Cc: stable@vger.kernel.org
+Link: https://lkml.kernel.org/r/20190728152617.7308-1-me@sam.st
+Signed-off-by: Sasha Levin
+---
+ arch/x86/kernel/uprobes.c | 17 ++++++++++-------
+ 1 file changed, 10 insertions(+), 7 deletions(-)
+
+diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
+index e78a6b1db74b0..e35466afe989d 100644
+--- a/arch/x86/kernel/uprobes.c
++++ b/arch/x86/kernel/uprobes.c
+@@ -514,9 +514,12 @@ struct uprobe_xol_ops {
+ 	void	(*abort)(struct arch_uprobe *, struct pt_regs *);
+ };
+ 
+-static inline int sizeof_long(void)
++static inline int sizeof_long(struct pt_regs *regs)
+ {
+-	return in_ia32_syscall() ? 4 : 8;
++	/*
++	 * Check registers for mode as in_xxx_syscall() does not apply here.
++	 */
++	return user_64bit_mode(regs) ? 8 : 4;
+ }
+ 
+ static int default_pre_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
+@@ -527,9 +530,9 @@ static int default_pre_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
+ 
+ static int push_ret_address(struct pt_regs *regs, unsigned long ip)
+ {
+-	unsigned long new_sp = regs->sp - sizeof_long();
++	unsigned long new_sp = regs->sp - sizeof_long(regs);
+ 
+-	if (copy_to_user((void __user *)new_sp, &ip, sizeof_long()))
++	if (copy_to_user((void __user *)new_sp, &ip, sizeof_long(regs)))
+ 		return -EFAULT;
+ 
+ 	regs->sp = new_sp;
+@@ -562,7 +565,7 @@ static int default_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs
+ 		long correction = utask->vaddr - utask->xol_vaddr;
+ 		regs->ip += correction;
+ 	} else if (auprobe->defparam.fixups & UPROBE_FIX_CALL) {
+-		regs->sp += sizeof_long(); /* Pop incorrect return address */
++		regs->sp += sizeof_long(regs); /* Pop incorrect return address */
+ 		if (push_ret_address(regs, utask->vaddr + auprobe->defparam.ilen))
+ 			return -ERESTART;
+ 	}
+@@ -671,7 +674,7 @@ static int branch_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
+ 	 * "call" insn was executed out-of-line. Just restore ->sp and restart.
+ 	 * We could also restore ->ip and try to call branch_emulate_op() again.
+ 	 */
+-	regs->sp += sizeof_long();
++	regs->sp += sizeof_long(regs);
+ 	return -ERESTART;
+ }
+ 
+@@ -962,7 +965,7 @@ bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs)
+ unsigned long
+ arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs *regs)
+ {
+-	int rasize = sizeof_long(), nleft;
++	int rasize = sizeof_long(regs), nleft;
+ 	unsigned long orig_ret_vaddr = 0; /* clear high bits for 32-bit apps */
+ 
+ 	if (copy_from_user(&orig_ret_vaddr, (void __user *)regs->sp, rasize))
+-- 
+2.20.1
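
[ Note: the pattern the zsmalloc fix implements -- count in-flight
isolations, publish a "destroying" flag with a full memory barrier, then
sleep until the count drains to zero -- is easier to see outside the
kernel. The sketch below is a hypothetical userspace rendering using C11
atomics and a pthread condvar in place of atomic_long_t and the kernel
wait queue; the names mirror the patch, but none of this is the actual
zsmalloc code. ]

	#include <pthread.h>
	#include <stdatomic.h>
	#include <stdbool.h>

	struct pool {
		atomic_long isolated_pages;	/* in-flight isolated pages */
		atomic_bool destroying;		/* set once teardown begins */
		pthread_mutex_t lock;		/* plays the role of &class->lock */
		pthread_cond_t drained;		/* plays the role of migration_wait */
	};

	/* Migration side: mirrors zs_pool_dec_isolated(). */
	static void pool_dec_isolated(struct pool *p)
	{
		/* seq_cst atomics double as the patch's smp_mb() */
		if (atomic_fetch_sub(&p->isolated_pages, 1) == 1 &&
		    atomic_load(&p->destroying)) {
			/* take the lock so the wakeup cannot be lost */
			pthread_mutex_lock(&p->lock);
			pthread_cond_broadcast(&p->drained);	/* wake_up_all() */
			pthread_mutex_unlock(&p->lock);
		}
	}

	/* Destroy side: mirrors zs_unregister_migration(). */
	static void pool_wait_for_drain(struct pool *p)
	{
		atomic_store(&p->destroying, true);	/* visible before we sleep */
		pthread_mutex_lock(&p->lock);
		while (atomic_load(&p->isolated_pages) != 0)	/* wait_event() */
			pthread_cond_wait(&p->drained, &p->lock);
		pthread_mutex_unlock(&p->lock);
	}

Because the waiter re-checks the counter under the same mutex the waker
takes before broadcasting, a decrement-to-zero cannot slip in between the
waiter's check and its sleep -- the same lost-wakeup reasoning spelled out
in the comment in zs_pool_dec_isolated() above.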