From: Greg Kroah-Hartman
Date: Sun, 16 Sep 2018 13:40:37 +0000 (+0200)
Subject: 4.9-stable patches
X-Git-Tag: v4.18.9~23
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=4d83a26e1210266bd1ab2b678154443ff4c4c29b;p=thirdparty%2Fkernel%2Fstable-queue.git

4.9-stable patches

added patches:
	block-blkcg-use-__gfp_nowarn-for-best-effort-allocations-in-blkcg.patch
	cfq-give-a-chance-for-arming-slice-idle-timer-in-case-of-group_idle.patch
	ib-rxe-do-not-copy-extra-stack-memory-to-skb.patch
	kthread-fix-boot-hang-regression-on-mips-openrisc.patch
	kthread-fix-use-after-free-if-kthread-fork-fails.patch
	locking-osq_lock-fix-osq_lock-queue-corruption.patch
	locking-rwsem-xadd-fix-missed-wakeup-due-to-reordering-of-load.patch
	mm-remove-seemingly-spurious-reclaimability-check-from-laptop_mode-gating.patch
	mm-vmscan-clear-pgdat_writeback-when-zone-is-balanced.patch
	nl80211-fix-null-ptr-dereference-on-invalid-mesh-configuration.patch
	selinux-use-gfp_nowait-in-the-avc-kmem_caches.patch
	staging-rt5208-fix-a-sleep-in-atomic-bug-in-xd_copy_page.patch
	staging-rts5208-fix-read-overflow-in-memcpy.patch
---
diff --git a/queue-4.9/block-blkcg-use-__gfp_nowarn-for-best-effort-allocations-in-blkcg.patch b/queue-4.9/block-blkcg-use-__gfp_nowarn-for-best-effort-allocations-in-blkcg.patch
new file mode 100644
index 00000000000..006509699dd
--- /dev/null
+++ b/queue-4.9/block-blkcg-use-__gfp_nowarn-for-best-effort-allocations-in-blkcg.patch
@@ -0,0 +1,81 @@
+From e00f4f4d0ff7e13b9115428a245b49108d625f09 Mon Sep 17 00:00:00 2001
+From: Tejun Heo
+Date: Mon, 21 Nov 2016 18:03:32 -0500
+Subject: block,blkcg: use __GFP_NOWARN for best-effort allocations in blkcg
+
+From: Tejun Heo
+
+commit e00f4f4d0ff7e13b9115428a245b49108d625f09 upstream.
+
+blkcg allocates some per-cgroup data structures with GFP_NOWAIT and,
+when that fails, falls back to operations which aren't specific to the
+cgroup.  Occasional failures are expected under pressure and falling
+back to non-cgroup operation is the right thing to do.
+
+Unfortunately, I forgot to add __GFP_NOWARN to these allocations and
+these expected failures end up creating a lot of noise.  Add
+__GFP_NOWARN.
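[ A condensed sketch of the pattern the hunks below apply, taken from the
  patch itself: GFP_NOWAIT makes the allocation fail rather than sleep, and
  __GFP_NOWARN suppresses the allocation-failure splat because the caller
  has a sane fallback:

        new_blkg = blkg_alloc(blkcg, q, GFP_NOWAIT | __GFP_NOWARN);
        if (unlikely(!new_blkg)) {
                ret = -ENOMEM;          /* expected under pressure; fall back */
                goto err_put_congested;
        }
]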
+
+Signed-off-by: Tejun Heo
+Reported-by: Marc MERLIN
+Reported-by: Vlastimil Babka
+Signed-off-by: Jens Axboe
+Signed-off-by: Amit Pundir
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ block/blk-cgroup.c  |    9 +++++----
+ block/cfq-iosched.c |    3 ++-
+ 2 files changed, 7 insertions(+), 5 deletions(-)
+
+--- a/block/blk-cgroup.c
++++ b/block/blk-cgroup.c
+@@ -185,7 +185,8 @@ static struct blkcg_gq *blkg_create(stru
+ 	}
+
+ 	wb_congested = wb_congested_get_create(&q->backing_dev_info,
+-					       blkcg->css.id, GFP_NOWAIT);
++					       blkcg->css.id,
++					       GFP_NOWAIT | __GFP_NOWARN);
+ 	if (!wb_congested) {
+ 		ret = -ENOMEM;
+ 		goto err_put_css;
+@@ -193,7 +194,7 @@ static struct blkcg_gq *blkg_create(stru
+
+ 	/* allocate */
+ 	if (!new_blkg) {
+-		new_blkg = blkg_alloc(blkcg, q, GFP_NOWAIT);
++		new_blkg = blkg_alloc(blkcg, q, GFP_NOWAIT | __GFP_NOWARN);
+ 		if (unlikely(!new_blkg)) {
+ 			ret = -ENOMEM;
+ 			goto err_put_congested;
+@@ -1022,7 +1023,7 @@ blkcg_css_alloc(struct cgroup_subsys_sta
+ 	}
+
+ 	spin_lock_init(&blkcg->lock);
+-	INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT);
++	INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT | __GFP_NOWARN);
+ 	INIT_HLIST_HEAD(&blkcg->blkg_list);
+ #ifdef CONFIG_CGROUP_WRITEBACK
+ 	INIT_LIST_HEAD(&blkcg->cgwb_list);
+@@ -1238,7 +1239,7 @@ pd_prealloc:
+ 		if (blkg->pd[pol->plid])
+ 			continue;
+
+-		pd = pol->pd_alloc_fn(GFP_NOWAIT, q->node);
++		pd = pol->pd_alloc_fn(GFP_NOWAIT | __GFP_NOWARN, q->node);
+ 		if (!pd)
+ 			swap(pd, pd_prealloc);
+ 		if (!pd) {
+--- a/block/cfq-iosched.c
++++ b/block/cfq-iosched.c
+@@ -3868,7 +3868,8 @@ cfq_get_queue(struct cfq_data *cfqd, boo
+ 		goto out;
+ 	}
+
+-	cfqq = kmem_cache_alloc_node(cfq_pool, GFP_NOWAIT | __GFP_ZERO,
++	cfqq = kmem_cache_alloc_node(cfq_pool,
++				     GFP_NOWAIT | __GFP_ZERO | __GFP_NOWARN,
+ 				     cfqd->queue->node);
+ 	if (!cfqq) {
+ 		cfqq = &cfqd->oom_cfqq;
diff --git a/queue-4.9/cfq-give-a-chance-for-arming-slice-idle-timer-in-case-of-group_idle.patch b/queue-4.9/cfq-give-a-chance-for-arming-slice-idle-timer-in-case-of-group_idle.patch
new file mode 100644
index 00000000000..f9cdee30501
--- /dev/null
+++ b/queue-4.9/cfq-give-a-chance-for-arming-slice-idle-timer-in-case-of-group_idle.patch
@@ -0,0 +1,72 @@
+From b3193bc0dca9bb69c8ba1ec1a318105c76eb4172 Mon Sep 17 00:00:00 2001
+From: Ritesh Harjani
+Date: Wed, 9 Aug 2017 18:28:32 +0530
+Subject: cfq: Give a chance for arming slice idle timer in case of group_idle
+
+From: Ritesh Harjani
+
+commit b3193bc0dca9bb69c8ba1ec1a318105c76eb4172 upstream.
+
+In the below scenario, blkio cgroups do not work as per their assigned
+weights:
+1. When the underlying device is nonrotational with a single HW queue
+with depth of >= CFQ_HW_QUEUE_MIN
+2. When the use case is forming two blkio cgroups cg1 (weight 1000) &
+cg2 (weight 100) and two processes (file1 and file2) doing sync IO in
+their respective blkio cgroups.
+
+For the above usecase, the result of fio (without this patch):
+file1: (groupid=0, jobs=1): err= 0: pid=685: Thu Jan  1 19:41:49 1970
+  write: IOPS=1315, BW=41.1MiB/s (43.1MB/s)(1024MiB/24906msec)
+<...>
+file2: (groupid=0, jobs=1): err= 0: pid=686: Thu Jan  1 19:41:49 1970
+  write: IOPS=1295, BW=40.5MiB/s (42.5MB/s)(1024MiB/25293msec)
+<...>
+// both processes' BW is equal even though they belong to different
+cgroups with weights of 1000 (cg1) and 100 (cg2)
+
+In the above case (for non-rotational NCQ devices), as soon as a
+request from cg1 completes, even though cg1 was granted the larger
+slice (set_slice=10), the CFQ algorithm expires the group when the
+driver tries to fetch the next request, without providing any idle
+time or weight priority, and schedules another cfq group (in this
+case cg2).  Thus both cfq groups (cg1 & cg2) keep alternating for
+the disk time and the cgroup-weight-based scheduling is lost.
+
+The patch below gives the cfq algorithm (cfq_arm_slice_timer) a
+chance to arm the slice idle timer when group_idle is enabled.
+If group_idle is not required either (including for nonrotational
+NCQ drives), it needs to be explicitly set to 0 from sysfs for
+such cases.
+
+With this patch, the result of fio (for the above usecase):
+file1: (groupid=0, jobs=1): err= 0: pid=690: Thu Jan  1 00:06:08 1970
+  write: IOPS=1706, BW=53.3MiB/s (55.9MB/s)(1024MiB/19197msec)
+<..>
+file2: (groupid=0, jobs=1): err= 0: pid=691: Thu Jan  1 00:06:08 1970
+  write: IOPS=1043, BW=32.6MiB/s (34.2MB/s)(1024MiB/31401msec)
+<..>
+// here each process's BW is as per its respective cgroup's weight.
+
+Signed-off-by: Ritesh Harjani
+Signed-off-by: Jens Axboe
+Signed-off-by: Amit Pundir
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ block/cfq-iosched.c |    3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/block/cfq-iosched.c
++++ b/block/cfq-iosched.c
+@@ -2951,7 +2951,8 @@ static void cfq_arm_slice_timer(struct c
+ 	 * for devices that support queuing, otherwise we still have a problem
+ 	 * with sync vs async workloads.
+ 	 */
+-	if (blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag)
++	if (blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag &&
++	    !cfqd->cfq_group_idle)
+ 		return;
+
+ 	WARN_ON(!RB_EMPTY_ROOT(&cfqq->sort_list));
diff --git a/queue-4.9/ib-rxe-do-not-copy-extra-stack-memory-to-skb.patch b/queue-4.9/ib-rxe-do-not-copy-extra-stack-memory-to-skb.patch
new file mode 100644
index 00000000000..ba10605e0e8
--- /dev/null
+++ b/queue-4.9/ib-rxe-do-not-copy-extra-stack-memory-to-skb.patch
@@ -0,0 +1,62 @@
+From 4c93496f18ce5044d78e4f7f9e018682a4f44b3d Mon Sep 17 00:00:00 2001
+From: Kees Cook
+Date: Wed, 12 Jul 2017 14:36:01 -0700
+Subject: IB/rxe: do not copy extra stack memory to skb
+
+From: Kees Cook
+
+commit 4c93496f18ce5044d78e4f7f9e018682a4f44b3d upstream.
+
+This fixes an over-read condition detected by FORTIFY_SOURCE for this
+line:
+
+	memcpy(SKB_TO_PKT(skb), &ack_pkt, sizeof(skb->cb));
+
+The error was:
+
+  In file included from ./include/linux/bitmap.h:8:0,
+                   from ./include/linux/cpumask.h:11,
+                   from ./include/linux/mm_types_task.h:13,
+                   from ./include/linux/mm_types.h:4,
+                   from ./include/linux/kmemcheck.h:4,
+                   from ./include/linux/skbuff.h:18,
+                   from drivers/infiniband/sw/rxe/rxe_resp.c:34:
+  In function 'memcpy',
+      inlined from 'send_atomic_ack.constprop' at drivers/infiniband/sw/rxe/rxe_resp.c:998:2,
+      inlined from 'acknowledge' at drivers/infiniband/sw/rxe/rxe_resp.c:1026:3,
+      inlined from 'rxe_responder' at drivers/infiniband/sw/rxe/rxe_resp.c:1286:10:
+  ./include/linux/string.h:309:4: error: call to '__read_overflow2' declared with attribute error: detected read beyond size of object passed as 2nd parameter
+    __read_overflow2();
+
+Daniel Micay noted that struct rxe_pkt_info is 32 bytes on 32-bit
+architectures, but skb->cb is still 64.  The memcpy() over-reads 32
+bytes.  This fixes it by zeroing the unused bytes in skb->cb.
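[ A condensed sketch of the fix in the hunk below: copy only the valid
  bytes of the on-stack packet info, then zero the remainder of skb->cb so
  no extra stack memory is copied into the skb:

        memcpy(SKB_TO_PKT(skb), &ack_pkt, sizeof(ack_pkt));
        memset((unsigned char *)SKB_TO_PKT(skb) + sizeof(ack_pkt), 0,
               sizeof(skb->cb) - sizeof(ack_pkt));
]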
+
+Link: http://lkml.kernel.org/r/1497903987-21002-5-git-send-email-keescook@chromium.org
+Signed-off-by: Kees Cook
+Cc: Moni Shoua
+Cc: Doug Ledford
+Cc: Sean Hefty
+Cc: Daniel Micay
+Signed-off-by: Andrew Morton
+Signed-off-by: Linus Torvalds
+Signed-off-by: Amit Pundir
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ drivers/infiniband/sw/rxe/rxe_resp.c |    4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+--- a/drivers/infiniband/sw/rxe/rxe_resp.c
++++ b/drivers/infiniband/sw/rxe/rxe_resp.c
+@@ -978,7 +978,9 @@ static int send_atomic_ack(struct rxe_qp
+ 	free_rd_atomic_resource(qp, res);
+ 	rxe_advance_resp_resource(qp);
+
+-	memcpy(SKB_TO_PKT(skb), &ack_pkt, sizeof(skb->cb));
++	memcpy(SKB_TO_PKT(skb), &ack_pkt, sizeof(ack_pkt));
++	memset((unsigned char *)SKB_TO_PKT(skb) + sizeof(ack_pkt), 0,
++	       sizeof(skb->cb) - sizeof(ack_pkt));
+
+ 	res->type = RXE_ATOMIC_MASK;
+ 	res->atomic.skb = skb;
diff --git a/queue-4.9/kthread-fix-boot-hang-regression-on-mips-openrisc.patch b/queue-4.9/kthread-fix-boot-hang-regression-on-mips-openrisc.patch
new file mode 100644
index 00000000000..153bd229415
--- /dev/null
+++ b/queue-4.9/kthread-fix-boot-hang-regression-on-mips-openrisc.patch
@@ -0,0 +1,73 @@
+From b0f5a8f32e8bbdaae1abb8abe2d3cbafaba57e08 Mon Sep 17 00:00:00 2001
+From: Vegard Nossum
+Date: Mon, 29 May 2017 09:22:07 +0200
+Subject: kthread: fix boot hang (regression) on MIPS/OpenRISC
+
+From: Vegard Nossum
+
+commit b0f5a8f32e8bbdaae1abb8abe2d3cbafaba57e08 upstream.
+
+This fixes a regression in commit 4d6501dce079 where I didn't notice
+that MIPS and OpenRISC were reinitialising p->{set,clear}_child_tid to
+NULL after our initialisation in copy_process().
+
+We can simply get rid of the arch-specific initialisation here since it
+is now always done in copy_process() before hitting copy_thread{,_tls}().
+
+Review notes:
+
+ - As far as I can tell, copy_process() is the only user of
+   copy_thread_tls(), which is the only caller of copy_thread() for
+   architectures that don't implement copy_thread_tls().
+
+ - After this patch, there is no arch-specific code touching
+   p->set_child_tid or p->clear_child_tid whatsoever.
+
+ - It may look like MIPS/OpenRISC wanted to always have these fields be
+   NULL, but that's not true, as copy_process() would unconditionally
+   set them again _after_ calling copy_thread_tls() before commit
+   4d6501dce079.
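[ The whole fix, condensed from the hunks below: the arch-side
  re-initialisation is simply deleted, since copy_process() now sets both
  fields before calling copy_thread{,_tls}():

        -	p->set_child_tid = p->clear_child_tid = NULL;
]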
+
+Fixes: 4d6501dce079c1eb6bf0b1d8f528a5e81770109e ("kthread: Fix use-after-free if kthread fork fails")
+Reported-by: Guenter Roeck
+Tested-by: Guenter Roeck # MIPS only
+Acked-by: Stafford Horne
+Acked-by: Oleg Nesterov
+Cc: Ralf Baechle
+Cc: linux-mips@linux-mips.org
+Cc: Jonas Bonn
+Cc: Stefan Kristiansson
+Cc: openrisc@lists.librecores.org
+Cc: Jamie Iles
+Cc: Thomas Gleixner
+Signed-off-by: Vegard Nossum
+Signed-off-by: Linus Torvalds
+Signed-off-by: Amit Pundir
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ arch/mips/kernel/process.c     |    1 -
+ arch/openrisc/kernel/process.c |    2 --
+ 2 files changed, 3 deletions(-)
+
+--- a/arch/mips/kernel/process.c
++++ b/arch/mips/kernel/process.c
+@@ -118,7 +118,6 @@ int copy_thread(unsigned long clone_flag
+ 	struct thread_info *ti = task_thread_info(p);
+ 	struct pt_regs *childregs, *regs = current_pt_regs();
+ 	unsigned long childksp;
+-	p->set_child_tid = p->clear_child_tid = NULL;
+
+ 	childksp = (unsigned long)task_stack_page(p) + THREAD_SIZE - 32;
+
+--- a/arch/openrisc/kernel/process.c
++++ b/arch/openrisc/kernel/process.c
+@@ -152,8 +152,6 @@ copy_thread(unsigned long clone_flags, u
+
+ 	top_of_kernel_stack = sp;
+
+-	p->set_child_tid = p->clear_child_tid = NULL;
+-
+ 	/* Locate userspace context on stack... */
+ 	sp -= STACK_FRAME_OVERHEAD;	/* redzone */
+ 	sp -= sizeof(struct pt_regs);
diff --git a/queue-4.9/kthread-fix-use-after-free-if-kthread-fork-fails.patch b/queue-4.9/kthread-fix-use-after-free-if-kthread-fork-fails.patch
new file mode 100644
index 00000000000..bead9fcf088
--- /dev/null
+++ b/queue-4.9/kthread-fix-use-after-free-if-kthread-fork-fails.patch
@@ -0,0 +1,94 @@
+From 4d6501dce079c1eb6bf0b1d8f528a5e81770109e Mon Sep 17 00:00:00 2001
+From: Vegard Nossum
+Date: Tue, 9 May 2017 09:39:59 +0200
+Subject: kthread: Fix use-after-free if kthread fork fails
+
+From: Vegard Nossum
+
+commit 4d6501dce079c1eb6bf0b1d8f528a5e81770109e upstream.
+
+If a kthread forks (e.g. usermodehelper since commit 1da5c46fa965) but
+fails in copy_process() between calling dup_task_struct() and setting
+p->set_child_tid, then the value of p->set_child_tid will be inherited
+from the parent and get prematurely freed by free_kthread_struct().
+
+    kthread()
+     - worker_thread()
+        - process_one_work()
+          |  - call_usermodehelper_exec_work()
+          |     - kernel_thread()
+          |        - _do_fork()
+          |           - copy_process()
+          |              - dup_task_struct()
+          |                 - arch_dup_task_struct()
+          |                    - tsk->set_child_tid = current->set_child_tid // implied
+          |              - ...
+          |              - goto bad_fork_*
+          |              - ...
+          |              - free_task(tsk)
+          |                 - free_kthread_struct(tsk)
+          |                    - kfree(tsk->set_child_tid)
+     - ...
+     - schedule()
+        - __schedule()
+           - wq_worker_sleeping()
+              - kthread_data(task)->flags // UAF
+
+The problem started showing up with commit 1da5c46fa965 since it reused
+->set_child_tid for the kthread worker data.
+
+A better long-term solution might be to get rid of the ->set_child_tid
+abuse.  The comment in set_kthread_struct() also looks slightly wrong.
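[ A condensed sketch of the fix in the hunk below: initialise both fields
  in copy_process() before any error path can jump to a bad_fork_* label,
  so a child forked from a kthread never carries the parent's kthread data
  pointer into free_task():

        p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
        p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL;
]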
+
+Debugged-by: Jamie Iles
+Fixes: 1da5c46fa965 ("kthread: Make struct kthread kmalloc'ed")
+Signed-off-by: Vegard Nossum
+Acked-by: Oleg Nesterov
+Cc: Peter Zijlstra
+Cc: Greg Kroah-Hartman
+Cc: Andy Lutomirski
+Cc: Frederic Weisbecker
+Cc: Jamie Iles
+Cc: stable@vger.kernel.org
+Link: http://lkml.kernel.org/r/20170509073959.17858-1-vegard.nossum@oracle.com
+Signed-off-by: Thomas Gleixner
+Signed-off-by: Amit Pundir
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ kernel/fork.c |   17 ++++++++++++-----
+ 1 file changed, 12 insertions(+), 5 deletions(-)
+
+--- a/kernel/fork.c
++++ b/kernel/fork.c
+@@ -1532,6 +1532,18 @@ static __latent_entropy struct task_stru
+ 	if (!p)
+ 		goto fork_out;
+
++	/*
++	 * This _must_ happen before we call free_task(), i.e. before we jump
++	 * to any of the bad_fork_* labels. This is to avoid freeing
++	 * p->set_child_tid which is (ab)used as a kthread's data pointer for
++	 * kernel threads (PF_KTHREAD).
++	 */
++	p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
++	/*
++	 * Clear TID on mm_release()?
++	 */
++	p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL;
++
+ 	ftrace_graph_init_task(p);
+
+ 	rt_mutex_init_task(p);
+@@ -1693,11 +1705,6 @@ static __latent_entropy struct task_stru
+ 		}
+ 	}
+
+-	p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
+-	/*
+-	 * Clear TID on mm_release()?
+-	 */
+-	p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL;
+ #ifdef CONFIG_BLOCK
+ 	p->plug = NULL;
+ #endif
diff --git a/queue-4.9/locking-osq_lock-fix-osq_lock-queue-corruption.patch b/queue-4.9/locking-osq_lock-fix-osq_lock-queue-corruption.patch
new file mode 100644
index 00000000000..5640f102bda
--- /dev/null
+++ b/queue-4.9/locking-osq_lock-fix-osq_lock-queue-corruption.patch
@@ -0,0 +1,132 @@
+From 50972fe78f24f1cd0b9d7bbf1f87d2be9e4f412e Mon Sep 17 00:00:00 2001
+From: Prateek Sood
+Date: Fri, 14 Jul 2017 19:17:56 +0530
+Subject: locking/osq_lock: Fix osq_lock queue corruption
+
+From: Prateek Sood
+
+commit 50972fe78f24f1cd0b9d7bbf1f87d2be9e4f412e upstream.
+
+Fix the ordering of link creation between node->prev and prev->next in
+osq_lock().  Consider a case in which the status of the optimistic spin
+queue is CPU6->CPU2, where CPU6 has acquired the lock.
+
+        tail
+          v
+   ,-. <- ,-.
+   |6|    |2|
+   `-' -> `-'
+
+At this point if CPU0 comes in to acquire osq_lock, it will update the
+tail count.
+
+  CPU2                          CPU0
+  ----------------------------------
+
+               tail
+                 v
+   ,-. <- ,-.   ,-.
+   |6|    |2|   |0|
+   `-' -> `-'   `-'
+
+After the tail count update, if CPU2 starts to unqueue itself from the
+optimistic spin queue, it will find an updated tail count with CPU0 and
+update CPU2 node->next to NULL in osq_wait_next().
+
+  unqueue-A
+
+               tail
+                 v
+   ,-. <- ,-.   ,-.
+   |6|    |2|   |0|
+   `-'    `-'   `-'
+
+  unqueue-B
+
+  ->tail != curr && !node->next
+
+If reordering of the following stores happens, then prev->next, where
+prev is CPU2, would be updated to point to the CPU0 node:
+
+               tail
+                 v
+   ,-. <- ,-.   ,-.
+   |6|    |2|   |0|
+   `-'    `-' -> `-'
+
+  osq_wait_next()
+    node->next <- 0
+    xchg(node->next, NULL)
+
+               tail
+                 v
+   ,-. <- ,-.   ,-.
+   |6|    |2|   |0|
+   `-'    `-'   `-'
+
+  unqueue-C
+
+At this point, if the next instruction
+	WRITE_ONCE(next->prev, prev);
+in the CPU2 path is committed before the update of CPU0 node->prev =
+prev, then CPU0 node->prev will point to the CPU6 node.
+
+               tail
+    v----------.  v
+   ,-. <- ,-.   ,-.
+   |6|    |2|   |0|
+   `-'    `-'   `-'
+      `----------^
+
+At this point, if CPU0's node->prev = prev is committed, CPU0's prev
+changes back to the CPU2 node.  CPU2 node->next is currently NULL,
+
+               tail
+                 v
+   ,-. <- ,-. <- ,-.
+   |6|    |2|    |0|
+   `-'    `-'    `-'
+      `----------^
+
+so if CPU0 gets into the unqueue path of osq_lock it will keep spinning
+in an infinite loop, as the condition prev->next == node will never be
+true.
+
+Signed-off-by: Prateek Sood
+[ Added pictures, rewrote comments. ]
+Signed-off-by: Peter Zijlstra (Intel)
+Cc: Linus Torvalds
+Cc: Peter Zijlstra
+Cc: Thomas Gleixner
+Cc: sramana@codeaurora.org
+Link: http://lkml.kernel.org/r/1500040076-27626-1-git-send-email-prsood@codeaurora.org
+Signed-off-by: Ingo Molnar
+Signed-off-by: Amit Pundir
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ kernel/locking/osq_lock.c |   13 +++++++++++++
+ 1 file changed, 13 insertions(+)
+
+--- a/kernel/locking/osq_lock.c
++++ b/kernel/locking/osq_lock.c
+@@ -104,6 +104,19 @@ bool osq_lock(struct optimistic_spin_que
+
+ 	prev = decode_cpu(old);
+ 	node->prev = prev;
++
++	/*
++	 * osq_lock()			unqueue
++	 *
++	 * node->prev = prev		osq_wait_next()
++	 * WMB				MB
++	 * prev->next = node		next->prev = prev // unqueue-C
++	 *
++	 * Here 'node->prev' and 'next->prev' are the same variable and we need
++	 * to ensure these stores happen in-order to avoid corrupting the list.
++	 */
++	smp_wmb();
++
+ 	WRITE_ONCE(prev->next, node);
+
+ 	/*
diff --git a/queue-4.9/locking-rwsem-xadd-fix-missed-wakeup-due-to-reordering-of-load.patch b/queue-4.9/locking-rwsem-xadd-fix-missed-wakeup-due-to-reordering-of-load.patch
new file mode 100644
index 00000000000..68eede87bf5
--- /dev/null
+++ b/queue-4.9/locking-rwsem-xadd-fix-missed-wakeup-due-to-reordering-of-load.patch
@@ -0,0 +1,94 @@
+From 9c29c31830a4eca724e137a9339137204bbb31be Mon Sep 17 00:00:00 2001
+From: Prateek Sood
+Date: Thu, 7 Sep 2017 20:00:58 +0530
+Subject: locking/rwsem-xadd: Fix missed wakeup due to reordering of load
+
+From: Prateek Sood
+
+commit 9c29c31830a4eca724e137a9339137204bbb31be upstream.
+
+If a spinner is present, there is a chance that the load of
+rwsem_has_spinner() in rwsem_wake() can be reordered with
+respect to the decrement of the rwsem count in __up_write(),
+leading to the wakeup being missed:
+
+  spinning writer                  up_write caller
+  ---------------                  -----------------------
+  [S] osq_unlock()                 [L] osq
+  spin_lock(wait_lock)
+  sem->count=0xFFFFFFFF00000001
+             +0xFFFFFFFF00000000
+  count=sem->count
+                                   MB
+                                   sem->count=0xFFFFFFFE00000001
+                                              -0xFFFFFFFF00000001
+                                   spin_trylock(wait_lock)
+                                   return
+  rwsem_try_write_lock(count)
+  spin_unlock(wait_lock)
+  schedule()
+
+Reordering of atomic_long_sub_return_release() in __up_write() and
+rwsem_has_spinner() in rwsem_wake() can cause the wakeup in the
+up_write() context to be missed.  In the spinning writer, sem->count
+and the local variable count are both 0xFFFFFFFE00000001, which
+results in rwsem_try_write_lock() failing to acquire the rwsem and
+the spinning writer going to sleep in rwsem_down_write_failed().
+
+The smp_rmb() added below makes sure that the spinner state is
+consulted after sem->count is updated in the up_write() context.
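[ A condensed sketch of what the hunk below adds at the top of
  rwsem_wake(); the surrounding control flow is abbreviated here, with
  rwsem_has_spinner() as named in the message above:

        smp_rmb();      /* order the count update against the spinner check */
        if (rwsem_has_spinner(sem))
                return sem;     /* the spinner will take the lock; no wakeup needed */
]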
+
+Signed-off-by: Prateek Sood
+Signed-off-by: Peter Zijlstra (Intel)
+Cc: Linus Torvalds
+Cc: Peter Zijlstra
+Cc: Thomas Gleixner
+Cc: dave@stgolabs.net
+Cc: longman@redhat.com
+Cc: parri.andrea@gmail.com
+Cc: sramana@codeaurora.org
+Link: http://lkml.kernel.org/r/1504794658-15397-1-git-send-email-prsood@codeaurora.org
+Signed-off-by: Ingo Molnar
+Signed-off-by: Amit Pundir
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ kernel/locking/rwsem-xadd.c |   27 +++++++++++++++++++++++++++
+ 1 file changed, 27 insertions(+)
+
+--- a/kernel/locking/rwsem-xadd.c
++++ b/kernel/locking/rwsem-xadd.c
+@@ -574,6 +574,33 @@ struct rw_semaphore *rwsem_wake(struct r
+ 	WAKE_Q(wake_q);
+
+ 	/*
++	 * __rwsem_down_write_failed_common(sem)
++	 *   rwsem_optimistic_spin(sem)
++	 *     osq_unlock(sem->osq)
++	 *   ...
++	 *   atomic_long_add_return(&sem->count)
++	 *
++	 *      - VS -
++	 *
++	 *              __up_write()
++	 *                if (atomic_long_sub_return_release(&sem->count) < 0)
++	 *                  rwsem_wake(sem)
++	 *                    osq_is_locked(&sem->osq)
++	 *
++	 * And __up_write() must observe !osq_is_locked() when it observes the
++	 * atomic_long_add_return() in order to not miss a wakeup.
++	 *
++	 * This boils down to:
++	 *
++	 * [S.rel] X = 1                [RmW] r0 = (Y += 0)
++	 *         MB                         RMB
++	 * [RmW]   Y += 1               [L]   r1 = X
++	 *
++	 * exists (r0=1 /\ r1=0)
++	 */
++	smp_rmb();
++
++	/*
+ 	 * If a spinner is present, it is not necessary to do the wakeup.
+ 	 * Try to do wakeup only if the trylock succeeds to minimize
+ 	 * spinlock contention which may introduce too much delay in the
diff --git a/queue-4.9/mm-remove-seemingly-spurious-reclaimability-check-from-laptop_mode-gating.patch b/queue-4.9/mm-remove-seemingly-spurious-reclaimability-check-from-laptop_mode-gating.patch
new file mode 100644
index 00000000000..35a347e9c4c
--- /dev/null
+++ b/queue-4.9/mm-remove-seemingly-spurious-reclaimability-check-from-laptop_mode-gating.patch
@@ -0,0 +1,44 @@
+From 047d72c30eedcb953222810f1e7dcaae663aa452 Mon Sep 17 00:00:00 2001
+From: Johannes Weiner
+Date: Wed, 3 May 2017 14:51:57 -0700
+Subject: mm: remove seemingly spurious reclaimability check from laptop_mode gating
+
+From: Johannes Weiner
+
+commit 047d72c30eedcb953222810f1e7dcaae663aa452 upstream.
+
+Commit 1d82de618ddd ("mm, vmscan: make kswapd reclaim in terms of
+nodes") allowed laptop_mode=1 to start writing not just when the
+priority drops to DEF_PRIORITY - 2 but also when the node is
+unreclaimable.
+
+That appears to be a spurious change in this patch as I doubt the
+series was tested with laptop_mode, and neither is that particular
+change mentioned in the changelog.  Remove it, it's still recent.
+
+Link: http://lkml.kernel.org/r/20170228214007.5621-4-hannes@cmpxchg.org
+Signed-off-by: Johannes Weiner
+Acked-by: Hillf Danton
+Acked-by: Mel Gorman
+Acked-by: Michal Hocko
+Cc: Jia He
+Signed-off-by: Andrew Morton
+Signed-off-by: Linus Torvalds
+Signed-off-by: Amit Pundir
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ mm/vmscan.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -3301,7 +3301,7 @@ static int balance_pgdat(pg_data_t *pgda
+ 		 * If we're getting trouble reclaiming, start doing writepage
+ 		 * even in laptop mode.
+ 		 */
+-		if (sc.priority < DEF_PRIORITY - 2 || !pgdat_reclaimable(pgdat))
++		if (sc.priority < DEF_PRIORITY - 2)
+ 			sc.may_writepage = 1;
+
+ 		/* Call soft limit reclaim before calling shrink_node. */
diff --git a/queue-4.9/mm-vmscan-clear-pgdat_writeback-when-zone-is-balanced.patch b/queue-4.9/mm-vmscan-clear-pgdat_writeback-when-zone-is-balanced.patch
new file mode 100644
index 00000000000..d1e4202b310
--- /dev/null
+++ b/queue-4.9/mm-vmscan-clear-pgdat_writeback-when-zone-is-balanced.patch
@@ -0,0 +1,52 @@
+From c2f83143f1c67d186520b72b6cefbf0aa07a34ee Mon Sep 17 00:00:00 2001
+From: Mel Gorman
+Date: Fri, 24 Feb 2017 14:59:07 -0800
+Subject: mm, vmscan: clear PGDAT_WRITEBACK when zone is balanced
+
+From: Mel Gorman
+
+commit c2f83143f1c67d186520b72b6cefbf0aa07a34ee upstream.
+
+Hillf Danton pointed out that since commit 1d82de618ddd ("mm, vmscan:
+make kswapd reclaim in terms of nodes") that PGDAT_WRITEBACK is no
+longer cleared.
+
+It was not noticed as triggering it requires pages under writeback to
+cycle twice through the LRU before kswapd gets stalled.
+Historically, such issues tended to occur on small machines writing
+heavily to slow storage such as a USB stick.
+
+Once kswapd stalls, direct reclaim stalls may be higher but due to the
+fact that memory pressure is required, it would not be very noticeable.
+
+Michal Hocko suggested removing the flag entirely but the conservative
+fix is to restore the intended PGDAT_WRITEBACK behaviour and clear the
+flag when a suitable zone is balanced.
+
+Fixes: 1d82de618ddd ("mm, vmscan: make kswapd reclaim in terms of nodes")
+Link: http://lkml.kernel.org/r/20170203203222.gq7hk66yc36lpgtb@suse.de
+Signed-off-by: Mel Gorman
+Acked-by: Johannes Weiner
+Acked-by: Michal Hocko
+Acked-by: Hillf Danton
+Cc: Minchan Kim
+Cc: Rik van Riel
+Signed-off-by: Andrew Morton
+Signed-off-by: Linus Torvalds
+Signed-off-by: Amit Pundir
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ mm/vmscan.c |    1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -3123,6 +3123,7 @@ static bool zone_balanced(struct zone *z
+ 	 */
+ 	clear_bit(PGDAT_CONGESTED, &zone->zone_pgdat->flags);
+ 	clear_bit(PGDAT_DIRTY, &zone->zone_pgdat->flags);
++	clear_bit(PGDAT_WRITEBACK, &zone->zone_pgdat->flags);
+
+ 	return true;
+ }
diff --git a/queue-4.9/nl80211-fix-null-ptr-dereference-on-invalid-mesh-configuration.patch b/queue-4.9/nl80211-fix-null-ptr-dereference-on-invalid-mesh-configuration.patch
new file mode 100644
index 00000000000..52d74d51353
--- /dev/null
+++ b/queue-4.9/nl80211-fix-null-ptr-dereference-on-invalid-mesh-configuration.patch
@@ -0,0 +1,35 @@
+From 265698d7e6132a2d41471135534f4f36ad15b09c Mon Sep 17 00:00:00 2001
+From: Johannes Berg
+Date: Mon, 18 Sep 2017 22:46:36 +0200
+Subject: nl80211: fix null-ptr dereference on invalid mesh configuration
+
+From: Johannes Berg
+
+commit 265698d7e6132a2d41471135534f4f36ad15b09c upstream.
+
+If TX rates are specified during mesh join, the channel must
+also be specified.  Check the channel pointer to avoid a null
+pointer dereference if it isn't.
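[ A condensed sketch of the check the hunk below adds, right before the
  dereference of setup.chandef.chan->band in nl80211_join_mesh():

        if (!setup.chandef.chan)
                return -EINVAL;

        err = validate_beacon_tx_rate(rdev, setup.chandef.chan->band,
                                      &setup.beacon_rate);
]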
+
+Reported-by: Jouni Malinen
+Fixes: 8564e38206de ("cfg80211: add checks for beacon rate, extend to mesh")
+Signed-off-by: Johannes Berg
+Signed-off-by: Amit Pundir
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ net/wireless/nl80211.c |    3 +++
+ 1 file changed, 3 insertions(+)
+
+--- a/net/wireless/nl80211.c
++++ b/net/wireless/nl80211.c
+@@ -9481,6 +9481,9 @@ static int nl80211_join_mesh(struct sk_b
+ 		if (err)
+ 			return err;
+
++		if (!setup.chandef.chan)
++			return -EINVAL;
++
+ 		err = validate_beacon_tx_rate(rdev, setup.chandef.chan->band,
+ 					      &setup.beacon_rate);
+ 		if (err)
diff --git a/queue-4.9/selinux-use-gfp_nowait-in-the-avc-kmem_caches.patch b/queue-4.9/selinux-use-gfp_nowait-in-the-avc-kmem_caches.patch
new file mode 100644
index 00000000000..91237c2477b
--- /dev/null
+++ b/queue-4.9/selinux-use-gfp_nowait-in-the-avc-kmem_caches.patch
@@ -0,0 +1,79 @@
+From 476accbe2f6ef69caeebe99f52a286e12ac35aee Mon Sep 17 00:00:00 2001
+From: Michal Hocko
+Date: Thu, 3 Aug 2017 10:11:52 +0200
+Subject: selinux: use GFP_NOWAIT in the AVC kmem_caches
+
+From: Michal Hocko
+
+commit 476accbe2f6ef69caeebe99f52a286e12ac35aee upstream.
+
+There is a strange __GFP_NOMEMALLOC usage pattern in SELinux,
+specifically GFP_ATOMIC | __GFP_NOMEMALLOC, which doesn't make much
+sense.  GFP_ATOMIC on its own allows access to memory reserves while
+__GFP_NOMEMALLOC dictates we cannot use memory reserves.  Replace this
+with the much more sane GFP_NOWAIT in the AVC code, as we can tolerate
+memory allocation failures in that code.
+
+Signed-off-by: Michal Hocko
+Acked-by: Mel Gorman
+Signed-off-by: Paul Moore
+Signed-off-by: Amit Pundir
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ security/selinux/avc.c |   14 ++++++--------
+ 1 file changed, 6 insertions(+), 8 deletions(-)
+
+--- a/security/selinux/avc.c
++++ b/security/selinux/avc.c
+@@ -348,27 +348,26 @@ static struct avc_xperms_decision_node
+ 	struct avc_xperms_decision_node *xpd_node;
+ 	struct extended_perms_decision *xpd;
+
+-	xpd_node = kmem_cache_zalloc(avc_xperms_decision_cachep,
+-				GFP_ATOMIC | __GFP_NOMEMALLOC);
++	xpd_node = kmem_cache_zalloc(avc_xperms_decision_cachep, GFP_NOWAIT);
+ 	if (!xpd_node)
+ 		return NULL;
+
+ 	xpd = &xpd_node->xpd;
+ 	if (which & XPERMS_ALLOWED) {
+ 		xpd->allowed = kmem_cache_zalloc(avc_xperms_data_cachep,
+-						GFP_ATOMIC | __GFP_NOMEMALLOC);
++						GFP_NOWAIT);
+ 		if (!xpd->allowed)
+ 			goto error;
+ 	}
+ 	if (which & XPERMS_AUDITALLOW) {
+ 		xpd->auditallow = kmem_cache_zalloc(avc_xperms_data_cachep,
+-						GFP_ATOMIC | __GFP_NOMEMALLOC);
++						GFP_NOWAIT);
+ 		if (!xpd->auditallow)
+ 			goto error;
+ 	}
+ 	if (which & XPERMS_DONTAUDIT) {
+ 		xpd->dontaudit = kmem_cache_zalloc(avc_xperms_data_cachep,
+-						GFP_ATOMIC | __GFP_NOMEMALLOC);
++						GFP_NOWAIT);
+ 		if (!xpd->dontaudit)
+ 			goto error;
+ 	}
+@@ -396,8 +395,7 @@ static struct avc_xperms_node *avc_xperm
+ {
+ 	struct avc_xperms_node *xp_node;
+
+-	xp_node = kmem_cache_zalloc(avc_xperms_cachep,
+-				GFP_ATOMIC|__GFP_NOMEMALLOC);
++	xp_node = kmem_cache_zalloc(avc_xperms_cachep, GFP_NOWAIT);
+ 	if (!xp_node)
+ 		return xp_node;
+ 	INIT_LIST_HEAD(&xp_node->xpd_head);
+@@ -550,7 +548,7 @@ static struct avc_node *avc_alloc_node(v
+ {
+ 	struct avc_node *node;
+
+-	node = kmem_cache_zalloc(avc_node_cachep, GFP_ATOMIC|__GFP_NOMEMALLOC);
++	node = kmem_cache_zalloc(avc_node_cachep, GFP_NOWAIT);
+ 	if (!node)
+ 		goto out;
+
diff --git a/queue-4.9/series b/queue-4.9/series
index 9983f4112cc..be9f14fa771 100644
--- a/queue-4.9/series
+++ b/queue-4.9/series
@@ -2,3 +2,16 @@ i2c-xiic-make-the-start-and-the-byte-count-write-atomic.patch
 i2c-i801-fix-dnv-s-smbctrl-register-offset.patch
 kvm-s390-vsie-copy-wrapping-keys-to-right-place.patch
 alsa-hda-fix-cancel_work_sync-stall-from-jackpoll-work.patch
+cfq-give-a-chance-for-arming-slice-idle-timer-in-case-of-group_idle.patch
+kthread-fix-use-after-free-if-kthread-fork-fails.patch
+kthread-fix-boot-hang-regression-on-mips-openrisc.patch
+staging-rt5208-fix-a-sleep-in-atomic-bug-in-xd_copy_page.patch
+staging-rts5208-fix-read-overflow-in-memcpy.patch
+ib-rxe-do-not-copy-extra-stack-memory-to-skb.patch
+block-blkcg-use-__gfp_nowarn-for-best-effort-allocations-in-blkcg.patch
+nl80211-fix-null-ptr-dereference-on-invalid-mesh-configuration.patch
+locking-rwsem-xadd-fix-missed-wakeup-due-to-reordering-of-load.patch
+selinux-use-gfp_nowait-in-the-avc-kmem_caches.patch
+locking-osq_lock-fix-osq_lock-queue-corruption.patch
+mm-vmscan-clear-pgdat_writeback-when-zone-is-balanced.patch
+mm-remove-seemingly-spurious-reclaimability-check-from-laptop_mode-gating.patch
diff --git a/queue-4.9/staging-rt5208-fix-a-sleep-in-atomic-bug-in-xd_copy_page.patch b/queue-4.9/staging-rt5208-fix-a-sleep-in-atomic-bug-in-xd_copy_page.patch
new file mode 100644
index 00000000000..8e0a1f08500
--- /dev/null
+++ b/queue-4.9/staging-rt5208-fix-a-sleep-in-atomic-bug-in-xd_copy_page.patch
@@ -0,0 +1,41 @@
+From 498c4b4e9c23855d17ecc2a108d949bb68020481 Mon Sep 17 00:00:00 2001
+From: Jia-Ju Bai
+Date: Mon, 5 Jun 2017 15:30:16 +0800
+Subject: staging: rt5208: Fix a sleep-in-atomic bug in xd_copy_page
+
+From: Jia-Ju Bai
+
+commit 498c4b4e9c23855d17ecc2a108d949bb68020481 upstream.
+
+The driver may sleep under a spin lock, and the function call path is:
+rtsx_exclusive_enter_ss (acquire the lock by spin_lock)
+  rtsx_enter_ss
+    rtsx_power_off_card
+      xd_cleanup_work
+        xd_delay_write
+          xd_finish_write
+            xd_copy_page
+              wait_timeout
+                schedule_timeout --> may sleep
+
+To fix it, "wait_timeout" is replaced with mdelay in xd_copy_page.
+
+Signed-off-by: Jia-Ju Bai
+Signed-off-by: Amit Pundir
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ drivers/staging/rts5208/xd.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/staging/rts5208/xd.c
++++ b/drivers/staging/rts5208/xd.c
+@@ -1247,7 +1247,7 @@ static int xd_copy_page(struct rtsx_chip
+ 			reg = 0;
+ 			rtsx_read_register(chip, XD_CTL, &reg);
+ 			if (reg & (XD_ECC1_ERROR | XD_ECC2_ERROR)) {
+-				wait_timeout(100);
++				mdelay(100);
+
+ 				if (detect_card_cd(chip,
+ 					XD_CARD) != STATUS_SUCCESS) {
diff --git a/queue-4.9/staging-rts5208-fix-read-overflow-in-memcpy.patch b/queue-4.9/staging-rts5208-fix-read-overflow-in-memcpy.patch
new file mode 100644
index 00000000000..b1300572b42
--- /dev/null
+++ b/queue-4.9/staging-rts5208-fix-read-overflow-in-memcpy.patch
@@ -0,0 +1,34 @@
+From 88a5b39b69ab1828fd4130e2baadd184109cea69 Mon Sep 17 00:00:00 2001
+From: Daniel Micay
+Date: Mon, 5 Jun 2017 21:52:34 -0700
+Subject: staging/rts5208: Fix read overflow in memcpy
+
+From: Daniel Micay
+
+commit 88a5b39b69ab1828fd4130e2baadd184109cea69 upstream.
+
+Noticed by FORTIFY_SOURCE, this swaps memcpy() for strncpy() to
+zero-value fill the end of the buffer instead of over-reading a
+string from .rodata.
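[ A condensed sketch of the one-line fix in the hunk below: strncpy()
  stops at the NUL terminator of inquiry_string and zero-fills the rest of
  the destination, where memcpy() would read past the end of the string:

        memcpy(buf, inquiry_buf, 8);
        strncpy(buf + 8, inquiry_string, sendbytes - 8);
]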
+
+Signed-off-by: Daniel Micay
+[kees: wrote commit log]
+Signed-off-by: Kees Cook
+Cc: Greg Kroah-Hartman
+Cc: Wayne Porter
+Signed-off-by: Amit Pundir
+
+---
+ drivers/staging/rts5208/rtsx_scsi.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/staging/rts5208/rtsx_scsi.c
++++ b/drivers/staging/rts5208/rtsx_scsi.c
+@@ -536,7 +536,7 @@ static int inquiry(struct scsi_cmnd *srb
+
+ 	if (sendbytes > 8) {
+ 		memcpy(buf, inquiry_buf, 8);
+-		memcpy(buf + 8, inquiry_string, sendbytes - 8);
++		strncpy(buf + 8, inquiry_string, sendbytes - 8);
+ 		if (pro_formatter_flag) {
+ 			/* Additional Length */
+ 			buf[4] = 0x33;