From 80dbedfcd181ab967ca85fca42cfe0070185e803 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 10 Jan 2019 17:12:14 +0100 Subject: [PATCH] 4.4-stable patches added patches: fork-record-start_time-late.patch hwpoison-memory_hotplug-allow-hwpoisoned-pages-to-be-offlined.patch mm-devm_memremap_pages-kill-mapping-system-ram-support.patch mm-devm_memremap_pages-mark-devm_memremap_pages-export_symbol_gpl.patch sunrpc-fix-cache_head-leak-due-to-queued-request.patch sunrpc-use-svc_net-in-svcauth_gss_-functions.patch --- queue-4.4/fork-record-start_time-late.patch | 78 ++++++++ ...llow-hwpoisoned-pages-to-be-offlined.patch | 169 ++++++++++++++++++ ...ages-kill-mapping-system-ram-support.patch | 60 +++++++ ...evm_memremap_pages-export_symbol_gpl.patch | 56 ++++++ queue-4.4/series | 6 + ...ache_head-leak-due-to-queued-request.patch | 69 +++++++ ...se-svc_net-in-svcauth_gss_-functions.patch | 56 ++++++ 7 files changed, 494 insertions(+) create mode 100644 queue-4.4/fork-record-start_time-late.patch create mode 100644 queue-4.4/hwpoison-memory_hotplug-allow-hwpoisoned-pages-to-be-offlined.patch create mode 100644 queue-4.4/mm-devm_memremap_pages-kill-mapping-system-ram-support.patch create mode 100644 queue-4.4/mm-devm_memremap_pages-mark-devm_memremap_pages-export_symbol_gpl.patch create mode 100644 queue-4.4/sunrpc-fix-cache_head-leak-due-to-queued-request.patch create mode 100644 queue-4.4/sunrpc-use-svc_net-in-svcauth_gss_-functions.patch diff --git a/queue-4.4/fork-record-start_time-late.patch b/queue-4.4/fork-record-start_time-late.patch new file mode 100644 index 00000000000..427eb78feb6 --- /dev/null +++ b/queue-4.4/fork-record-start_time-late.patch @@ -0,0 +1,78 @@ +From 7b55851367136b1efd84d98fea81ba57a98304cf Mon Sep 17 00:00:00 2001 +From: David Herrmann +Date: Tue, 8 Jan 2019 13:58:52 +0100 +Subject: fork: record start_time late + +From: David Herrmann + +commit 7b55851367136b1efd84d98fea81ba57a98304cf upstream. 
+ +This changes the fork(2) syscall to record the process start_time after +initializing the basic task structure but still before making the new +process visible to user-space. + +Technically, we could record the start_time anytime during fork(2). But +this might lead to scenarios where a start_time is recorded long before +a process becomes visible to user-space. For instance, with +userfaultfd(2) and TLS, user-space can delay the execution of fork(2) +for an indefinite amount of time (and will, if this causes network +access, or similar). + +By recording the start_time late, it much closer reflects the point in +time where the process becomes live and can be observed by other +processes. + +Lastly, this makes it much harder for user-space to predict and control +the start_time they get assigned. Previously, user-space could fork a +process and stall it in copy_thread_tls() before its pid is allocated, +but after its start_time is recorded. This can be misused to later-on +cycle through PIDs and resume the stalled fork(2) yielding a process +that has the same pid and start_time as a process that existed before. +This can be used to circumvent security systems that identify processes +by their pid+start_time combination. + +Even though user-space was always aware that start_time recording is +flaky (but several projects are known to still rely on start_time-based +identification), changing the start_time to be recorded late will help +mitigate existing attacks and make it much harder for user-space to +control the start_time a process gets assigned. 
+ +Reported-by: Jann Horn +Signed-off-by: Tom Gundersen +Signed-off-by: David Herrmann +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + kernel/fork.c | 13 +++++++++++-- + 1 file changed, 11 insertions(+), 2 deletions(-) + +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -1411,8 +1411,6 @@ static struct task_struct *copy_process( + + posix_cpu_timers_init(p); + +- p->start_time = ktime_get_ns(); +- p->real_start_time = ktime_get_boot_ns(); + p->io_context = NULL; + p->audit_context = NULL; + cgroup_fork(p); +@@ -1573,6 +1571,17 @@ static struct task_struct *copy_process( + goto bad_fork_free_pid; + + /* ++ * From this point on we must avoid any synchronous user-space ++ * communication until we take the tasklist-lock. In particular, we do ++ * not want user-space to be able to predict the process start-time by ++ * stalling fork(2) after we recorded the start_time but before it is ++ * visible to the system. ++ */ ++ ++ p->start_time = ktime_get_ns(); ++ p->real_start_time = ktime_get_boot_ns(); ++ ++ /* + * Make it visible to the rest of the system, but dont wake it up yet. + * Need tasklist lock for parent etc handling! + */ diff --git a/queue-4.4/hwpoison-memory_hotplug-allow-hwpoisoned-pages-to-be-offlined.patch b/queue-4.4/hwpoison-memory_hotplug-allow-hwpoisoned-pages-to-be-offlined.patch new file mode 100644 index 00000000000..c20cca53111 --- /dev/null +++ b/queue-4.4/hwpoison-memory_hotplug-allow-hwpoisoned-pages-to-be-offlined.patch @@ -0,0 +1,169 @@ +From b15c87263a69272423771118c653e9a1d0672caa Mon Sep 17 00:00:00 2001 +From: Michal Hocko +Date: Fri, 28 Dec 2018 00:38:01 -0800 +Subject: hwpoison, memory_hotplug: allow hwpoisoned pages to be offlined + +From: Michal Hocko + +commit b15c87263a69272423771118c653e9a1d0672caa upstream. + +We have received a bug report that an injected MCE about faulty memory +prevents memory offline to succeed on 4.4 base kernel. 
The underlying +reason was that the HWPoison page has an elevated reference count and the +migration keeps failing. There are two problems with that. First of all +it is dubious to migrate the poisoned page because we know that accessing +that memory is possible to fail. Secondly it doesn't make any sense to +migrate a potentially broken content and preserve the memory corruption +over to a new location. + +Oscar has found out that 4.4 and the current upstream kernels behave +slightly differently with his simply testcase + +=== + +int main(void) +{ + int ret; + int i; + int fd; + char *array = malloc(4096); + char *array_locked = malloc(4096); + + fd = open("/tmp/data", O_RDONLY); + read(fd, array, 4095); + + for (i = 0; i < 4096; i++) + array_locked[i] = 'd'; + + ret = mlock((void *)PAGE_ALIGN((unsigned long)array_locked), sizeof(array_locked)); + if (ret) + perror("mlock"); + + sleep (20); + + ret = madvise((void *)PAGE_ALIGN((unsigned long)array_locked), 4096, MADV_HWPOISON); + if (ret) + perror("madvise"); + + for (i = 0; i < 4096; i++) + array_locked[i] = 'd'; + + return 0; +} +=== + ++ offline this memory. 
+ +In 4.4 kernels he saw the hwpoisoned page to be returned back to the LRU +list +kernel: [] dump_trace+0x59/0x340 +kernel: [] show_stack_log_lvl+0xea/0x170 +kernel: [] show_stack+0x21/0x40 +kernel: [] dump_stack+0x5c/0x7c +kernel: [] warn_slowpath_common+0x81/0xb0 +kernel: [] __pagevec_lru_add_fn+0x14c/0x160 +kernel: [] pagevec_lru_move_fn+0xad/0x100 +kernel: [] __lru_cache_add+0x6c/0xb0 +kernel: [] add_to_page_cache_lru+0x46/0x70 +kernel: [] extent_readpages+0xc3/0x1a0 [btrfs] +kernel: [] __do_page_cache_readahead+0x177/0x200 +kernel: [] ondemand_readahead+0x168/0x2a0 +kernel: [] generic_file_read_iter+0x41f/0x660 +kernel: [] __vfs_read+0xcd/0x140 +kernel: [] vfs_read+0x7a/0x120 +kernel: [] kernel_read+0x3b/0x50 +kernel: [] do_execveat_common.isra.29+0x490/0x6f0 +kernel: [] do_execve+0x28/0x30 +kernel: [] call_usermodehelper_exec_async+0xfb/0x130 +kernel: [] ret_from_fork+0x55/0x80 + +And that latter confuses the hotremove path because an LRU page is +attempted to be migrated and that fails due to an elevated reference +count. It is quite possible that the reuse of the HWPoisoned page is some +kind of fixed race condition but I am not really sure about that. + +With the upstream kernel the failure is slightly different. The page +doesn't seem to have LRU bit set but isolate_movable_page simply fails and +do_migrate_range simply puts all the isolated pages back to LRU and +therefore no progress is made and scan_movable_pages finds same set of +pages over and over again. + +Fix both cases by explicitly checking HWPoisoned pages before we even try +to get reference on the page, try to unmap it if it is still mapped. As +explained by Naoya: + +: Hwpoison code never unmapped those for no big reason because +: Ksm pages never dominate memory, so we simply didn't have strong +: motivation to save the pages. + +Also put WARN_ON(PageLRU) in case there is a race and we can hit LRU +HWPoison pages which shouldn't happen but I couldn't convince myself about +that. 
Naoya has noted the following: + +: Theoretically no such guarantee, because try_to_unmap() doesn't have a +: guarantee of success and then memory_failure() returns immediately +: when hwpoison_user_mappings fails. +: Or the following code (comes after hwpoison_user_mappings block) also +: implies +: that the target page can still have PageLRU flag. +: +: /* +: * Torn down by someone else? +: */ +: if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) { +: action_result(pfn, MF_MSG_TRUNCATED_LRU, MF_IGNORED); +: res = -EBUSY; +: goto out; +: } +: +: So I think it's OK to keep "if (WARN_ON(PageLRU(page)))" block in +: current version of your patch. + +Link: http://lkml.kernel.org/r/20181206120135.14079-1-mhocko@kernel.org +Signed-off-by: Michal Hocko +Reviewed-by: Oscar Salvador +Debugged-by: Oscar Salvador +Tested-by: Oscar Salvador +Acked-by: David Hildenbrand +Acked-by: Naoya Horiguchi +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/memory_hotplug.c | 16 ++++++++++++++++ + 1 file changed, 16 insertions(+) + +--- a/mm/memory_hotplug.c ++++ b/mm/memory_hotplug.c +@@ -32,6 +32,7 @@ + #include + #include + #include ++#include + + #include + +@@ -1471,6 +1472,21 @@ do_migrate_range(unsigned long start_pfn + continue; + } + ++ /* ++ * HWPoison pages have elevated reference counts so the migration would ++ * fail on them. It also doesn't make any sense to migrate them in the ++ * first place. Still try to unmap such a page in case it is still mapped ++ * (e.g. current hwpoison implementation doesn't unmap KSM pages but keep ++ * the unmap as the catch all safety net). 
++ */ ++ if (PageHWPoison(page)) { ++ if (WARN_ON(PageLRU(page))) ++ isolate_lru_page(page); ++ if (page_mapped(page)) ++ try_to_unmap(page, TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS); ++ continue; ++ } ++ + if (!get_page_unless_zero(page)) + continue; + /* diff --git a/queue-4.4/mm-devm_memremap_pages-kill-mapping-system-ram-support.patch b/queue-4.4/mm-devm_memremap_pages-kill-mapping-system-ram-support.patch new file mode 100644 index 00000000000..496d2aa4d99 --- /dev/null +++ b/queue-4.4/mm-devm_memremap_pages-kill-mapping-system-ram-support.patch @@ -0,0 +1,60 @@ +From 06489cfbd915ff36c8e36df27f1c2dc60f97ca56 Mon Sep 17 00:00:00 2001 +From: Dan Williams +Date: Fri, 28 Dec 2018 00:34:54 -0800 +Subject: mm, devm_memremap_pages: kill mapping "System RAM" support +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Dan Williams + +commit 06489cfbd915ff36c8e36df27f1c2dc60f97ca56 upstream. + +Given the fact that devm_memremap_pages() requires a percpu_ref that is +torn down by devm_memremap_pages_release() the current support for mapping +RAM is broken. + +Support for remapping "System RAM" has been broken since the beginning and +there is no existing user of this code path, so just kill the support +and make it an explicit error. + +This cleanup also simplifies a follow-on patch to fix the error path when +setting a devm release action for devm_memremap_pages_release() fails. 
+ +Link: http://lkml.kernel.org/r/154275557997.76910.14689813630968180480.stgit@dwillia2-desk3.amr.corp.intel.com +Signed-off-by: Dan Williams +Reviewed-by: "Jérôme Glisse" +Reviewed-by: Christoph Hellwig +Reviewed-by: Logan Gunthorpe +Cc: Balbir Singh +Cc: Michal Hocko +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + kernel/memremap.c | 9 +++------ + 1 file changed, 3 insertions(+), 6 deletions(-) + +--- a/kernel/memremap.c ++++ b/kernel/memremap.c +@@ -171,15 +171,12 @@ void *devm_memremap_pages(struct device + struct page_map *page_map; + int error, nid; + +- if (is_ram == REGION_MIXED) { +- WARN_ONCE(1, "%s attempted on mixed region %pr\n", +- __func__, res); ++ if (is_ram != REGION_DISJOINT) { ++ WARN_ONCE(1, "%s attempted on %s region %pr\n", __func__, ++ is_ram == REGION_MIXED ? "mixed" : "ram", res); + return ERR_PTR(-ENXIO); + } + +- if (is_ram == REGION_INTERSECTS) +- return __va(res->start); +- + page_map = devres_alloc_node(devm_memremap_pages_release, + sizeof(*page_map), GFP_KERNEL, dev_to_node(dev)); + if (!page_map) diff --git a/queue-4.4/mm-devm_memremap_pages-mark-devm_memremap_pages-export_symbol_gpl.patch b/queue-4.4/mm-devm_memremap_pages-mark-devm_memremap_pages-export_symbol_gpl.patch new file mode 100644 index 00000000000..68901283f25 --- /dev/null +++ b/queue-4.4/mm-devm_memremap_pages-mark-devm_memremap_pages-export_symbol_gpl.patch @@ -0,0 +1,56 @@ +From 808153e1187fa77ac7d7dad261ff476888dcf398 Mon Sep 17 00:00:00 2001 +From: Dan Williams +Date: Fri, 28 Dec 2018 00:34:50 -0800 +Subject: mm, devm_memremap_pages: mark devm_memremap_pages() EXPORT_SYMBOL_GPL +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Dan Williams + +commit 808153e1187fa77ac7d7dad261ff476888dcf398 upstream. 
+ +devm_memremap_pages() is a facility that can create struct page entries +for any arbitrary range and give drivers the ability to subvert core +aspects of page management. + +Specifically the facility is tightly integrated with the kernel's memory +hotplug functionality. It injects an altmap argument deep into the +architecture specific vmemmap implementation to allow allocating from +specific reserved pages, and it has Linux specific assumptions about page +structure reference counting relative to get_user_pages() and +get_user_pages_fast(). It was an oversight and a mistake that this was +not marked EXPORT_SYMBOL_GPL from the outset. + +Again, devm_memremap_pages() exposes and relies upon core kernel internal +assumptions and will continue to evolve along with 'struct page', memory +hotplug, and support for new memory types / topologies. Only an in-kernel +GPL-only driver is expected to keep up with this ongoing evolution. This +interface, and functionality derived from this interface, is not suitable +for kernel-external drivers. 
+ +Link: http://lkml.kernel.org/r/154275557457.76910.16923571232582744134.stgit@dwillia2-desk3.amr.corp.intel.com +Signed-off-by: Dan Williams +Reviewed-by: Christoph Hellwig +Acked-by: Michal Hocko +Cc: "Jérôme Glisse" +Cc: Balbir Singh +Cc: Logan Gunthorpe +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + kernel/memremap.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/kernel/memremap.c ++++ b/kernel/memremap.c +@@ -202,5 +202,5 @@ void *devm_memremap_pages(struct device + devres_add(dev, page_map); + return __va(res->start); + } +-EXPORT_SYMBOL(devm_memremap_pages); ++EXPORT_SYMBOL_GPL(devm_memremap_pages); + #endif /* CONFIG_ZONE_DEVICE */ diff --git a/queue-4.4/series b/queue-4.4/series index a678f3a44dd..550b6da0374 100644 --- a/queue-4.4/series +++ b/queue-4.4/series @@ -61,3 +61,9 @@ xfrm-fix-bucket-count-reported-to-userspace.patch scsi-bnx2fc-fix-null-dereference-in-error-handling.patch input-omap-keypad-fix-idle-configuration-to-not-bloc.patch scsi-zfcp-fix-posting-too-many-status-read-buffers-leading-to-adapter-shutdown.patch +fork-record-start_time-late.patch +hwpoison-memory_hotplug-allow-hwpoisoned-pages-to-be-offlined.patch +mm-devm_memremap_pages-mark-devm_memremap_pages-export_symbol_gpl.patch +mm-devm_memremap_pages-kill-mapping-system-ram-support.patch +sunrpc-fix-cache_head-leak-due-to-queued-request.patch +sunrpc-use-svc_net-in-svcauth_gss_-functions.patch diff --git a/queue-4.4/sunrpc-fix-cache_head-leak-due-to-queued-request.patch b/queue-4.4/sunrpc-fix-cache_head-leak-due-to-queued-request.patch new file mode 100644 index 00000000000..203e0c1e2cf --- /dev/null +++ b/queue-4.4/sunrpc-fix-cache_head-leak-due-to-queued-request.patch @@ -0,0 +1,69 @@ +From 4ecd55ea074217473f94cfee21bb72864d39f8d7 Mon Sep 17 00:00:00 2001 +From: Vasily Averin +Date: Wed, 28 Nov 2018 11:45:57 +0300 +Subject: sunrpc: fix cache_head leak due to queued request + +From: Vasily Averin + 
+commit 4ecd55ea074217473f94cfee21bb72864d39f8d7 upstream. + +After commit d202cce8963d, an expired cache_head can be removed from the +cache_detail's hash. + +However, the expired cache_head may be waiting for a reply from a +previously submitted request. Such a cache_head has an increased +refcounter and therefore it won't be freed after cache_put(freeme). + +Because the cache_head was removed from the hash it cannot be found +during cache_clean() and can be leaked forever, together with stalled +cache_request and other taken resources. + +In our case we noticed it because an entry in the export cache was +holding a reference on a filesystem. + +Fixes: d202cce8963d ("sunrpc: never return expired entries in sunrpc_cache_lookup") +Cc: Pavel Tikhomirov +Cc: stable@kernel.org # 2.6.35 +Signed-off-by: Vasily Averin +Reviewed-by: NeilBrown +Signed-off-by: J. Bruce Fields +Signed-off-by: Greg Kroah-Hartman + +--- + net/sunrpc/cache.c | 10 +++++++++- + 1 file changed, 9 insertions(+), 1 deletion(-) + +--- a/net/sunrpc/cache.c ++++ b/net/sunrpc/cache.c +@@ -54,6 +54,11 @@ static void cache_init(struct cache_head + h->last_refresh = now; + } + ++static void cache_fresh_locked(struct cache_head *head, time_t expiry, ++ struct cache_detail *detail); ++static void cache_fresh_unlocked(struct cache_head *head, ++ struct cache_detail *detail); ++ + struct cache_head *sunrpc_cache_lookup(struct cache_detail *detail, + struct cache_head *key, int hash) + { +@@ -95,6 +100,7 @@ struct cache_head *sunrpc_cache_lookup(s + if (cache_is_expired(detail, tmp)) { + hlist_del_init(&tmp->cache_list); + detail->entries --; ++ cache_fresh_locked(tmp, 0, detail); + freeme = tmp; + break; + } +@@ -110,8 +116,10 @@ struct cache_head *sunrpc_cache_lookup(s + cache_get(new); + write_unlock(&detail->hash_lock); + +- if (freeme) ++ if (freeme) { ++ cache_fresh_unlocked(freeme, detail); + cache_put(freeme, detail); ++ } + return new; + } + EXPORT_SYMBOL_GPL(sunrpc_cache_lookup); diff --git 
a/queue-4.4/sunrpc-use-svc_net-in-svcauth_gss_-functions.patch b/queue-4.4/sunrpc-use-svc_net-in-svcauth_gss_-functions.patch new file mode 100644 index 00000000000..6844b1999eb --- /dev/null +++ b/queue-4.4/sunrpc-use-svc_net-in-svcauth_gss_-functions.patch @@ -0,0 +1,56 @@ +From b8be5674fa9a6f3677865ea93f7803c4212f3e10 Mon Sep 17 00:00:00 2001 +From: Vasily Averin +Date: Mon, 24 Dec 2018 14:44:42 +0300 +Subject: sunrpc: use SVC_NET() in svcauth_gss_* functions + +From: Vasily Averin + +commit b8be5674fa9a6f3677865ea93f7803c4212f3e10 upstream. + +Signed-off-by: Vasily Averin +Cc: stable@vger.kernel.org +Signed-off-by: J. Bruce Fields +Signed-off-by: Greg Kroah-Hartman + +--- + net/sunrpc/auth_gss/svcauth_gss.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +--- a/net/sunrpc/auth_gss/svcauth_gss.c ++++ b/net/sunrpc/auth_gss/svcauth_gss.c +@@ -1105,7 +1105,7 @@ static int svcauth_gss_legacy_init(struc + struct kvec *resv = &rqstp->rq_res.head[0]; + struct rsi *rsip, rsikey; + int ret; +- struct sunrpc_net *sn = net_generic(rqstp->rq_xprt->xpt_net, sunrpc_net_id); ++ struct sunrpc_net *sn = net_generic(SVC_NET(rqstp), sunrpc_net_id); + + memset(&rsikey, 0, sizeof(rsikey)); + ret = gss_read_verf(gc, argv, authp, +@@ -1216,7 +1216,7 @@ static int svcauth_gss_proxy_init(struct + uint64_t handle; + int status; + int ret; +- struct net *net = rqstp->rq_xprt->xpt_net; ++ struct net *net = SVC_NET(rqstp); + struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); + + memset(&ud, 0, sizeof(ud)); +@@ -1406,7 +1406,7 @@ svcauth_gss_accept(struct svc_rqst *rqst + __be32 *rpcstart; + __be32 *reject_stat = resv->iov_base + resv->iov_len; + int ret; +- struct sunrpc_net *sn = net_generic(rqstp->rq_xprt->xpt_net, sunrpc_net_id); ++ struct sunrpc_net *sn = net_generic(SVC_NET(rqstp), sunrpc_net_id); + + dprintk("RPC: svcauth_gss: argv->iov_len = %zd\n", + argv->iov_len); +@@ -1694,7 +1694,7 @@ svcauth_gss_release(struct svc_rqst *rqs + struct rpc_gss_wire_cred *gc 
= &gsd->clcred; + struct xdr_buf *resbuf = &rqstp->rq_res; + int stat = -EINVAL; +- struct sunrpc_net *sn = net_generic(rqstp->rq_xprt->xpt_net, sunrpc_net_id); ++ struct sunrpc_net *sn = net_generic(SVC_NET(rqstp), sunrpc_net_id); + + if (gc->gc_proc != RPC_GSS_PROC_DATA) + goto out; -- 2.47.3