From: Greg Kroah-Hartman
Date: Tue, 2 Aug 2016 07:04:26 +0000 (+0200)
Subject: 4.6-stable patches
X-Git-Tag: v3.14.75~23
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=da38c99b6f2a39ef72b20144ed91eb6ed611502e;p=thirdparty%2Fkernel%2Fstable-queue.git

4.6-stable patches

added patches:
	arc-unwind-ensure-that-.debug_frame-is-generated-vs.-.eh_frame.patch
	arc-unwind-warn-only-once-if-dw2_unwind-is-disabled.patch
	dmaengine-at_xdmac-align-descriptors-on-64-bits.patch
	dmaengine-at_xdmac-double-fifo-flush-needed-to-compute-residue.patch
	dmaengine-at_xdmac-fix-residue-corruption.patch
	fs-nilfs2-fix-potential-underflow-in-call-to-crc32_le.patch
	kernel-sysrq-watchdog-sched-core-reset-watchdog-on-all-cpus-while-processing-sysrq-w.patch
	memcg-css_alloc-should-return-an-err_ptr-value-on-error.patch
	memcg-mem_cgroup_migrate-may-be-called-with-irq-disabled.patch
	mm-compaction-abort-free-scanner-if-split-fails.patch
	mm-compaction-prevent-vm_bug_on-when-terminating-freeing-scanner.patch
	mm-memcontrol-fix-cgroup-creation-failure-after-many-small-jobs.patch
	mm-meminit-always-return-a-valid-node-from-early_pfn_to_nid.patch
	mm-meminit-ensure-node-is-online-before-checking-whether-pages-are-uninitialised.patch
	mm-slb-add-__gfp_atomic-to-the-gfp-reclaim-mask.patch
	mm-swap.c-flush-lru-pvecs-on-compound-page-arrival.patch
	mm-thp-refix-false-positive-bug-in-page_move_anon_rmap.patch
	perf-test-ignore-kcore-files-in-the-vmlinux-matches-kallsyms-test.patch
	pps-do-not-crash-when-failed-to-register.patch
	radix-tree-fix-radix_tree_iter_retry-for-tagged-iterators.patch
	sched-debug-fix-deadlock-when-enabling-sched-events.patch
	uapi-export-lirc.h-header.patch
	vmlinux.lds-account-for-destructor-sections.patch
	x86-quirks-add-early-quirk-to-reset-apple-airport-card.patch
	x86-quirks-apply-nvidia_bugs-quirk-only-on-root-bus.patch
	x86-quirks-reintroduce-scanning-of-secondary-buses.patch
	xen-blkfront-don-t-call-talk_to_blkback-when-already-connected-to-blkback.patch
	xen-blkfront-fix-resume-issues-after-a-migration.patch
	xen-blkfront-save-uncompleted-reqs-in-blkfront_resume.patch
	xen-pciback-fix-conf_space-read-write-overlap-check.patch
	xenbus-don-t-bail-early-from-xenbus_dev_request_and_reply.patch
	xenbus-don-t-bug-on-user-mode-induced-condition.patch
---

diff --git a/queue-4.6/arc-unwind-ensure-that-.debug_frame-is-generated-vs.-.eh_frame.patch b/queue-4.6/arc-unwind-ensure-that-.debug_frame-is-generated-vs.-.eh_frame.patch
new file mode 100644
index 00000000000..22a3ffad921
--- /dev/null
+++ b/queue-4.6/arc-unwind-ensure-that-.debug_frame-is-generated-vs.-.eh_frame.patch
@@ -0,0 +1,45 @@
+From f52e126cc7476196f44f3c313b7d9f0699a881fc Mon Sep 17 00:00:00 2001
+From: Vineet Gupta
+Date: Tue, 28 Jun 2016 09:42:25 +0530
+Subject: ARC: unwind: ensure that .debug_frame is generated (vs. .eh_frame)
+
+From: Vineet Gupta
+
+commit f52e126cc7476196f44f3c313b7d9f0699a881fc upstream.
+
+With recent binutils update to support dwarf CFI pseudo-ops in gas, we
+now get .eh_frame vs. .debug_frame. Although the call frame info is
+exactly the same in both, the CIE differs, which the current kernel
+unwinder can't cope with.
+
+This broke both the kernel unwinder as well as loadable modules (latter
+because of a new unhandled relo R_ARC_32_PCREL from .rela.eh_frame in
+the module loader)
+
+The ideal solution would be to switch unwinder to .eh_frame.
+For now however we can make do by just ensuring .debug_frame is
+generated by removing -fasynchronous-unwind-tables
+
+ .eh_frame    generated with -gdwarf-2 -fasynchronous-unwind-tables
+ .debug_frame generated with -gdwarf-2
+
+Fixes STAR 9001058196
+
+Signed-off-by: Vineet Gupta
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ arch/arc/Makefile | 2 --
+ 1 file changed, 2 deletions(-)
+
+--- a/arch/arc/Makefile
++++ b/arch/arc/Makefile
+@@ -66,8 +66,6 @@ endif
+ 
+ endif
+ 
+-cflags-$(CONFIG_ARC_DW2_UNWIND) += -fasynchronous-unwind-tables
+-
+ # By default gcc 4.8 generates dwarf4 which kernel unwinder can't grok
+ ifeq ($(atleast_gcc48),y)
+ cflags-$(CONFIG_ARC_DW2_UNWIND) += -gdwarf-2
diff --git a/queue-4.6/arc-unwind-warn-only-once-if-dw2_unwind-is-disabled.patch b/queue-4.6/arc-unwind-warn-only-once-if-dw2_unwind-is-disabled.patch
new file mode 100644
index 00000000000..3a54d1d4e47
--- /dev/null
+++ b/queue-4.6/arc-unwind-warn-only-once-if-dw2_unwind-is-disabled.patch
@@ -0,0 +1,41 @@
+From 9bd54517ee86cb164c734f72ea95aeba4804f10b Mon Sep 17 00:00:00 2001
+From: Alexey Brodkin
+Date: Thu, 23 Jun 2016 11:00:39 +0300
+Subject: arc: unwind: warn only once if DW2_UNWIND is disabled
+
+From: Alexey Brodkin
+
+commit 9bd54517ee86cb164c734f72ea95aeba4804f10b upstream.
+
+If CONFIG_ARC_DW2_UNWIND is disabled, every time arc_unwind_core()
+gets called the following message gets printed in the debug console:
+----------------->8---------------
+CONFIG_ARC_DW2_UNWIND needs to be enabled
+----------------->8---------------
+
+That message makes sense if the user indeed wants to see a backtrace or
+get nice function call-graphs in perf, but what if the user disabled the
+unwinder on purpose? Why pollute the debug console?
+
+So instead we'll warn the user about the possibly missing feature once
+and let him decide if that was what he or she really wanted.
+
+Signed-off-by: Alexey Brodkin
+Signed-off-by: Vineet Gupta
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ arch/arc/kernel/stacktrace.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/arch/arc/kernel/stacktrace.c
++++ b/arch/arc/kernel/stacktrace.c
+@@ -142,7 +142,7 @@ arc_unwind_core(struct task_struct *tsk,
+ * prelogue is setup (callee regs saved and then fp set and not other
+ * way around
+ */
+- pr_warn("CONFIG_ARC_DW2_UNWIND needs to be enabled\n");
++ pr_warn_once("CONFIG_ARC_DW2_UNWIND needs to be enabled\n");
+ return 0;
+
+ #endif
diff --git a/queue-4.6/dmaengine-at_xdmac-align-descriptors-on-64-bits.patch b/queue-4.6/dmaengine-at_xdmac-align-descriptors-on-64-bits.patch
new file mode 100644
index 00000000000..d5fe7164757
--- /dev/null
+++ b/queue-4.6/dmaengine-at_xdmac-align-descriptors-on-64-bits.patch
@@ -0,0 +1,43 @@
+From 4a9723e8df68cfce4048517ee32e37f78854b6fb Mon Sep 17 00:00:00 2001
+From: Ludovic Desroches
+Date: Thu, 12 May 2016 16:54:08 +0200
+Subject: dmaengine: at_xdmac: align descriptors on 64 bits
+
+From: Ludovic Desroches
+
+commit 4a9723e8df68cfce4048517ee32e37f78854b6fb upstream.
+
+Having descriptors aligned on 64 bits allows updating CNDA and CUBC in
+an atomic way.
+ +Signed-off-by: Ludovic Desroches +Fixes: e1f7c9eee707 ("dmaengine: at_xdmac: creation of the atmel +eXtended DMA Controller driver") +Reviewed-by: Nicolas Ferre +Signed-off-by: Vinod Koul +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/dma/at_xdmac.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/drivers/dma/at_xdmac.c ++++ b/drivers/dma/at_xdmac.c +@@ -242,7 +242,7 @@ struct at_xdmac_lld { + u32 mbr_dus; /* Destination Microblock Stride Register */ + }; + +- ++/* 64-bit alignment needed to update CNDA and CUBC registers in an atomic way. */ + struct at_xdmac_desc { + struct at_xdmac_lld lld; + enum dma_transfer_direction direction; +@@ -253,7 +253,7 @@ struct at_xdmac_desc { + unsigned int xfer_size; + struct list_head descs_list; + struct list_head xfer_node; +-}; ++} __aligned(sizeof(u64)); + + static inline void __iomem *at_xdmac_chan_reg_base(struct at_xdmac *atxdmac, unsigned int chan_nb) + { diff --git a/queue-4.6/dmaengine-at_xdmac-double-fifo-flush-needed-to-compute-residue.patch b/queue-4.6/dmaengine-at_xdmac-double-fifo-flush-needed-to-compute-residue.patch new file mode 100644 index 00000000000..fbc401b3b2d --- /dev/null +++ b/queue-4.6/dmaengine-at_xdmac-double-fifo-flush-needed-to-compute-residue.patch @@ -0,0 +1,65 @@ +From 9295c41d77ca93aac79cfca6fa09fa1ca5cab66f Mon Sep 17 00:00:00 2001 +From: Ludovic Desroches +Date: Thu, 12 May 2016 16:54:10 +0200 +Subject: dmaengine: at_xdmac: double FIFO flush needed to compute residue + +From: Ludovic Desroches + +commit 9295c41d77ca93aac79cfca6fa09fa1ca5cab66f upstream. + +Due to the way CUBC register is updated, a double flush is needed to +compute an accurate residue. First flush aim is to get data from the DMA +FIFO and second one ensures that we won't report data which are not in +memory. + +Signed-off-by: Ludovic Desroches +Fixes: e1f7c9eee707 ("dmaengine: at_xdmac: creation of the atmel +eXtended DMA Controller driver") +Reviewed-by: Nicolas Ferre +Signed-off-by: Vinod Koul +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/dma/at_xdmac.c | 24 +++++++++++++++++++++++- + 1 file changed, 23 insertions(+), 1 deletion(-) + +--- a/drivers/dma/at_xdmac.c ++++ b/drivers/dma/at_xdmac.c +@@ -1425,7 +1425,16 @@ at_xdmac_tx_status(struct dma_chan *chan + residue = desc->xfer_size; + /* + * Flush FIFO: only relevant when the transfer is source peripheral +- * synchronized. ++ * synchronized. Flush is needed before reading CUBC because data in ++ * the FIFO are not reported by CUBC. Reporting a residue of the ++ * transfer length while we have data in FIFO can cause issue. ++ * Usecase: atmel USART has a timeout which means I have received ++ * characters but there is no more character received for a while. On ++ * timeout, it requests the residue. If the data are in the DMA FIFO, ++ * we will return a residue of the transfer length. It means no data ++ * received. If an application is waiting for these data, it will hang ++ * since we won't have another USART timeout without receiving new ++ * data. + */ + mask = AT_XDMAC_CC_TYPE | AT_XDMAC_CC_DSYNC; + value = AT_XDMAC_CC_TYPE_PER_TRAN | AT_XDMAC_CC_DSYNC_PER2MEM; +@@ -1481,6 +1490,19 @@ at_xdmac_tx_status(struct dma_chan *chan + } + + /* ++ * Flush FIFO: only relevant when the transfer is source peripheral ++ * synchronized. Another flush is needed here because CUBC is updated ++ * when the controller sends the data write command. It can lead to ++ * report data that are not written in the memory or the device. 
The ++ * FIFO flush ensures that data are really written. ++ */ ++ if ((desc->lld.mbr_cfg & mask) == value) { ++ at_xdmac_write(atxdmac, AT_XDMAC_GSWF, atchan->mask); ++ while (!(at_xdmac_chan_read(atchan, AT_XDMAC_CIS) & AT_XDMAC_CIS_FIS)) ++ cpu_relax(); ++ } ++ ++ /* + * Remove size of all microblocks already transferred and the current + * one. Then add the remaining size to transfer of the current + * microblock. diff --git a/queue-4.6/dmaengine-at_xdmac-fix-residue-corruption.patch b/queue-4.6/dmaengine-at_xdmac-fix-residue-corruption.patch new file mode 100644 index 00000000000..a1a0cea8957 --- /dev/null +++ b/queue-4.6/dmaengine-at_xdmac-fix-residue-corruption.patch @@ -0,0 +1,99 @@ +From 53398f488821c2b5b15291e3debec6ad33f75d3d Mon Sep 17 00:00:00 2001 +From: Ludovic Desroches +Date: Thu, 12 May 2016 16:54:09 +0200 +Subject: dmaengine: at_xdmac: fix residue corruption + +From: Ludovic Desroches + +commit 53398f488821c2b5b15291e3debec6ad33f75d3d upstream. + +An unexpected value of CUBC can lead to a corrupted residue. A more +complex sequence is needed to detect an inaccurate value for NCA or CUBC. + +Signed-off-by: Ludovic Desroches +Fixes: e1f7c9eee707 ("dmaengine: at_xdmac: creation of the atmel +eXtended DMA Controller driver") +Reviewed-by: Nicolas Ferre +Signed-off-by: Vinod Koul +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/dma/at_xdmac.c | 54 +++++++++++++++++++++++++++++-------------------- + 1 file changed, 32 insertions(+), 22 deletions(-) + +--- a/drivers/dma/at_xdmac.c ++++ b/drivers/dma/at_xdmac.c +@@ -1400,6 +1400,7 @@ at_xdmac_tx_status(struct dma_chan *chan + u32 cur_nda, check_nda, cur_ubc, mask, value; + u8 dwidth = 0; + unsigned long flags; ++ bool initd; + + ret = dma_cookie_status(chan, cookie, txstate); + if (ret == DMA_COMPLETE) +@@ -1435,34 +1436,43 @@ at_xdmac_tx_status(struct dma_chan *chan + } + + /* +- * When processing the residue, we need to read two registers but we +- * can't do it in an atomic way. AT_XDMAC_CNDA is used to find where +- * we stand in the descriptor list and AT_XDMAC_CUBC is used +- * to know how many data are remaining for the current descriptor. +- * Since the dma channel is not paused to not loose data, between the +- * AT_XDMAC_CNDA and AT_XDMAC_CUBC read, we may have change of +- * descriptor. +- * For that reason, after reading AT_XDMAC_CUBC, we check if we are +- * still using the same descriptor by reading a second time +- * AT_XDMAC_CNDA. If AT_XDMAC_CNDA has changed, it means we have to +- * read again AT_XDMAC_CUBC. ++ * The easiest way to compute the residue should be to pause the DMA ++ * but doing this can lead to miss some data as some devices don't ++ * have FIFO. ++ * We need to read several registers because: ++ * - DMA is running therefore a descriptor change is possible while ++ * reading these registers ++ * - When the block transfer is done, the value of the CUBC register ++ * is set to its initial value until the fetch of the next descriptor. ++ * This value will corrupt the residue calculation so we have to skip ++ * it. ++ * ++ * INITD -------- ------------ ++ * |____________________| ++ * _______________________ _______________ ++ * NDA @desc2 \/ @desc3 ++ * _______________________/\_______________ ++ * __________ ___________ _______________ ++ * CUBC 0 \/ MAX desc1 \/ MAX desc2 ++ * __________/\___________/\_______________ ++ * ++ * Since descriptors are aligned on 64 bits, we can assume that ++ * the update of NDA and CUBC is atomic. 
+ * Memory barriers are used to ensure the read order of the registers.
+- * A max number of retries is set because unlikely it can never ends if
+- * we are transferring a lot of data with small buffers.
++ * A max number of retries is set because unlikely it could never ends.
+ */
+- cur_nda = at_xdmac_chan_read(atchan, AT_XDMAC_CNDA) & 0xfffffffc;
+- rmb();
+- cur_ubc = at_xdmac_chan_read(atchan, AT_XDMAC_CUBC);
+ for (retry = 0; retry < AT_XDMAC_RESIDUE_MAX_RETRIES; retry++) {
+- rmb();
+ check_nda = at_xdmac_chan_read(atchan, AT_XDMAC_CNDA) & 0xfffffffc;
+-
+- if (likely(cur_nda == check_nda))
+- break;
+-
+- cur_nda = check_nda;
++ rmb();
++ initd = !!(at_xdmac_chan_read(atchan, AT_XDMAC_CC) & AT_XDMAC_CC_INITD);
+ rmb();
+ cur_ubc = at_xdmac_chan_read(atchan, AT_XDMAC_CUBC);
++ rmb();
++ cur_nda = at_xdmac_chan_read(atchan, AT_XDMAC_CNDA) & 0xfffffffc;
++ rmb();
++
++ if ((check_nda == cur_nda) && initd)
++ break;
+ }
+
+ if (unlikely(retry >= AT_XDMAC_RESIDUE_MAX_RETRIES)) {
diff --git a/queue-4.6/fs-nilfs2-fix-potential-underflow-in-call-to-crc32_le.patch b/queue-4.6/fs-nilfs2-fix-potential-underflow-in-call-to-crc32_le.patch
new file mode 100644
index 00000000000..9ced3ae1cb7
--- /dev/null
+++ b/queue-4.6/fs-nilfs2-fix-potential-underflow-in-call-to-crc32_le.patch
@@ -0,0 +1,59 @@
+From 63d2f95d63396059200c391ca87161897b99e74a Mon Sep 17 00:00:00 2001
+From: Torsten Hilbrich
+Date: Fri, 24 Jun 2016 14:50:18 -0700
+Subject: fs/nilfs2: fix potential underflow in call to crc32_le
+
+From: Torsten Hilbrich
+
+commit 63d2f95d63396059200c391ca87161897b99e74a upstream.
+
+The value `bytes' comes from the filesystem which is about to be
+mounted. We cannot trust that the value is always in the range we
+expect it to be.
+
+Check its value before using it to calculate the length for the crc32_le
+call. Its value must be larger than (or equal to) sumoff + 4.
+
+This fixes a kernel bug when accidentally mounting an image file which
+had the nilfs2 magic value 0x3434 at the right offset 0x406 by chance.
+The bytes 0x01 0x00 were stored at 0x408 and were interpreted as a
+s_bytes value of 1. This caused an underflow when subtracting sumoff +
+4 (20) in the call to crc32_le.
+
+ BUG: unable to handle kernel paging request at ffff88021e600000
+ IP: crc32_le+0x36/0x100
+ ...
+ Call Trace: + nilfs_valid_sb.part.5+0x52/0x60 [nilfs2] + nilfs_load_super_block+0x142/0x300 [nilfs2] + init_nilfs+0x60/0x390 [nilfs2] + nilfs_mount+0x302/0x520 [nilfs2] + mount_fs+0x38/0x160 + vfs_kern_mount+0x67/0x110 + do_mount+0x269/0xe00 + SyS_mount+0x9f/0x100 + entry_SYSCALL_64_fastpath+0x16/0x71 + +Link: http://lkml.kernel.org/r/1466778587-5184-2-git-send-email-konishi.ryusuke@lab.ntt.co.jp +Signed-off-by: Torsten Hilbrich +Tested-by: Torsten Hilbrich +Signed-off-by: Ryusuke Konishi +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + fs/nilfs2/the_nilfs.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/nilfs2/the_nilfs.c ++++ b/fs/nilfs2/the_nilfs.c +@@ -443,7 +443,7 @@ static int nilfs_valid_sb(struct nilfs_s + if (!sbp || le16_to_cpu(sbp->s_magic) != NILFS_SUPER_MAGIC) + return 0; + bytes = le16_to_cpu(sbp->s_bytes); +- if (bytes > BLOCK_SIZE) ++ if (bytes < sumoff + 4 || bytes > BLOCK_SIZE) + return 0; + crc = crc32_le(le32_to_cpu(sbp->s_crc_seed), (unsigned char *)sbp, + sumoff); diff --git a/queue-4.6/kernel-sysrq-watchdog-sched-core-reset-watchdog-on-all-cpus-while-processing-sysrq-w.patch b/queue-4.6/kernel-sysrq-watchdog-sched-core-reset-watchdog-on-all-cpus-while-processing-sysrq-w.patch new file mode 100644 index 00000000000..8930c4d1783 --- /dev/null +++ b/queue-4.6/kernel-sysrq-watchdog-sched-core-reset-watchdog-on-all-cpus-while-processing-sysrq-w.patch @@ -0,0 +1,52 @@ +From 57675cb976eff977aefb428e68e4e0236d48a9ff Mon Sep 17 00:00:00 2001 +From: Andrey Ryabinin +Date: Thu, 9 Jun 2016 15:20:05 +0300 +Subject: kernel/sysrq, watchdog, sched/core: Reset watchdog on all CPUs while processing sysrq-w + +From: Andrey Ryabinin + +commit 57675cb976eff977aefb428e68e4e0236d48a9ff upstream. + +Lengthy output of sysrq-w may take a lot of time on slow serial console. + +Currently we reset NMI-watchdog on the current CPU to avoid spurious +lockup messages. Sometimes this doesn't work since softlockup watchdog +might trigger on another CPU which is waiting for an IPI to proceed. +We reset softlockup watchdogs on all CPUs, but we do this only after +listing all tasks, and this may be too late on a busy system. + +So, reset watchdogs CPUs earlier, in for_each_process_thread() loop. + +Signed-off-by: Andrey Ryabinin +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Link: http://lkml.kernel.org/r/1465474805-14641-1-git-send-email-aryabinin@virtuozzo.com +Signed-off-by: Ingo Molnar +Signed-off-by: Greg Kroah-Hartman + +--- + kernel/sched/core.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -4993,14 +4993,16 @@ void show_state_filter(unsigned long sta + /* + * reset the NMI-timeout, listing all files on a slow + * console might take a lot of time: ++ * Also, reset softlockup watchdogs on all CPUs, because ++ * another CPU might be blocked waiting for us to process ++ * an IPI. 
+ */
+ touch_nmi_watchdog();
++ touch_all_softlockup_watchdogs();
+ if (!state_filter || (p->state & state_filter))
+ sched_show_task(p);
+ }
+
+- touch_all_softlockup_watchdogs();
+-
+ #ifdef CONFIG_SCHED_DEBUG
+ sysrq_sched_debug_show();
+ #endif
diff --git a/queue-4.6/memcg-css_alloc-should-return-an-err_ptr-value-on-error.patch b/queue-4.6/memcg-css_alloc-should-return-an-err_ptr-value-on-error.patch
new file mode 100644
index 00000000000..2b1fac219e3
--- /dev/null
+++ b/queue-4.6/memcg-css_alloc-should-return-an-err_ptr-value-on-error.patch
@@ -0,0 +1,95 @@
+From ea3a9645866e12d2b198434f03df3c3e96fb86ce Mon Sep 17 00:00:00 2001
+From: Tejun Heo
+Date: Fri, 24 Jun 2016 14:49:58 -0700
+Subject: memcg: css_alloc should return an ERR_PTR value on error
+
+From: Tejun Heo
+
+commit ea3a9645866e12d2b198434f03df3c3e96fb86ce upstream.
+
+mem_cgroup_css_alloc() was returning NULL on failure while cgroup core
+expected it to return an ERR_PTR value leading to the following NULL
+deref after a css allocation failure. Fix it by returning
+ERR_PTR(-ENOMEM) instead. I'll also update cgroup core so that it
+can handle NULL returns.
+
+ mkdir: page allocation failure: order:6, mode:0x240c0c0(GFP_KERNEL|__GFP_COMP|__GFP_ZERO)
+ CPU: 0 PID: 8738 Comm: mkdir Not tainted 4.7.0-rc3+ #123
+ ...
+ Call Trace:
+ dump_stack+0x68/0xa1
+ warn_alloc_failed+0xd6/0x130
+ __alloc_pages_nodemask+0x4c6/0xf20
+ alloc_pages_current+0x66/0xe0
+ alloc_kmem_pages+0x14/0x80
+ kmalloc_order_trace+0x2a/0x1a0
+ __kmalloc+0x291/0x310
+ memcg_update_all_caches+0x6c/0x130
+ mem_cgroup_css_alloc+0x590/0x610
+ cgroup_apply_control_enable+0x18b/0x370
+ cgroup_mkdir+0x1de/0x2e0
+ kernfs_iop_mkdir+0x55/0x80
+ vfs_mkdir+0xb9/0x150
+ SyS_mkdir+0x66/0xd0
+ do_syscall_64+0x53/0x120
+ entry_SYSCALL64_slow_path+0x25/0x25
+ ...
+ BUG: unable to handle kernel NULL pointer dereference at 00000000000000d0 + IP: init_and_link_css+0x37/0x220 + PGD 34b1e067 PUD 3a109067 PMD 0 + Oops: 0002 [#1] SMP + Modules linked in: + CPU: 0 PID: 8738 Comm: mkdir Not tainted 4.7.0-rc3+ #123 + Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.9.2-20160422_131301-anatol 04/01/2014 + task: ffff88007cbc5200 ti: ffff8800666d4000 task.ti: ffff8800666d4000 + RIP: 0010:[] [] init_and_link_css+0x37/0x220 + RSP: 0018:ffff8800666d7d90 EFLAGS: 00010246 + RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000 + RDX: ffffffff810f2499 RSI: 0000000000000000 RDI: 0000000000000008 + RBP: ffff8800666d7db8 R08: 0000000000000003 R09: 0000000000000000 + R10: 0000000000000001 R11: 0000000000000000 R12: ffff88005a5fb400 + R13: ffffffff81f0f8a0 R14: ffff88005a5fb400 R15: 0000000000000010 + FS: 00007fc944689700(0000) GS:ffff88007fc00000(0000) knlGS:0000000000000000 + CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 + CR2: 00007f3aed0d2b80 CR3: 000000003a1e8000 CR4: 00000000000006f0 + DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 + DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 + Call Trace: + cgroup_apply_control_enable+0x1ac/0x370 + cgroup_mkdir+0x1de/0x2e0 + kernfs_iop_mkdir+0x55/0x80 + vfs_mkdir+0xb9/0x150 + SyS_mkdir+0x66/0xd0 + do_syscall_64+0x53/0x120 + entry_SYSCALL64_slow_path+0x25/0x25 + Code: 89 f5 48 89 fb 49 89 d4 48 83 ec 08 8b 05 72 3b d8 00 85 c0 0f 85 60 01 00 00 4c 89 e7 e8 72 f7 ff ff 48 8d 7b 08 48 89 d9 31 c0 <48> c7 83 d0 00 00 00 00 00 00 00 48 83 e7 f8 48 29 f9 81 c1 d8 + RIP init_and_link_css+0x37/0x220 + RSP + CR2: 00000000000000d0 + ---[ end trace a2d8836ae1e852d1 ]--- + +Link: http://lkml.kernel.org/r/20160621165740.GJ3262@mtj.duckdns.org +Signed-off-by: Tejun Heo +Reported-by: Johannes Weiner +Reviewed-by: Vladimir Davydov +Acked-by: Johannes Weiner +Acked-by: Michal Hocko +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/memcontrol.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/mm/memcontrol.c ++++ b/mm/memcontrol.c +@@ -4184,7 +4184,7 @@ mem_cgroup_css_alloc(struct cgroup_subsy + return &memcg->css; + fail: + mem_cgroup_free(memcg); +- return NULL; ++ return ERR_PTR(-ENOMEM); + } + + static int diff --git a/queue-4.6/memcg-mem_cgroup_migrate-may-be-called-with-irq-disabled.patch b/queue-4.6/memcg-mem_cgroup_migrate-may-be-called-with-irq-disabled.patch new file mode 100644 index 00000000000..bf4d1ac879f --- /dev/null +++ b/queue-4.6/memcg-mem_cgroup_migrate-may-be-called-with-irq-disabled.patch @@ -0,0 +1,128 @@ +From d93c4130a7d049b234b5d5a15808eaf5406f2789 Mon Sep 17 00:00:00 2001 +From: Tejun Heo +Date: Fri, 24 Jun 2016 14:49:54 -0700 +Subject: memcg: mem_cgroup_migrate() may be called with irq disabled + +From: Tejun Heo + +commit d93c4130a7d049b234b5d5a15808eaf5406f2789 upstream. + +mem_cgroup_migrate() uses local_irq_disable/enable() but can be called +with irq disabled from migrate_page_copy(). This ends up enabling irq +while holding a irq context lock triggering the following lockdep +warning. Fix it by using irq_save/restore instead. + + ================================= + [ INFO: inconsistent lock state ] + 4.7.0-rc1+ #52 Tainted: G W + --------------------------------- + inconsistent {IN-SOFTIRQ-W} -> {SOFTIRQ-ON-W} usage. 
+ kcompactd0/151 [HC0[0]:SC0[0]:HE1:SE1] takes: + (&(&ctx->completion_lock)->rlock){+.?.-.}, at: [<000000000038fd96>] aio_migratepage+0x156/0x1e8 + {IN-SOFTIRQ-W} state was registered at: + __lock_acquire+0x5b6/0x1930 + lock_acquire+0xee/0x270 + _raw_spin_lock_irqsave+0x66/0xb0 + aio_complete+0x98/0x328 + dio_complete+0xe4/0x1e0 + blk_update_request+0xd4/0x450 + scsi_end_request+0x48/0x1c8 + scsi_io_completion+0x272/0x698 + blk_done_softirq+0xca/0xe8 + __do_softirq+0xc8/0x518 + irq_exit+0xee/0x110 + do_IRQ+0x6a/0x88 + io_int_handler+0x11a/0x25c + __mutex_unlock_slowpath+0x144/0x1d8 + __mutex_unlock_slowpath+0x140/0x1d8 + kernfs_iop_permission+0x64/0x80 + __inode_permission+0x9e/0xf0 + link_path_walk+0x6e/0x510 + path_lookupat+0xc4/0x1a8 + filename_lookup+0x9c/0x160 + user_path_at_empty+0x5c/0x70 + SyS_readlinkat+0x68/0x140 + system_call+0xd6/0x270 + irq event stamp: 971410 + hardirqs last enabled at (971409): migrate_page_move_mapping+0x3ea/0x588 + hardirqs last disabled at (971410): _raw_spin_lock_irqsave+0x3c/0xb0 + softirqs last enabled at (970526): __do_softirq+0x460/0x518 + softirqs last disabled at (970519): irq_exit+0xee/0x110 + + other info that might help us debug this: + Possible unsafe locking scenario: + + CPU0 + ---- + lock(&(&ctx->completion_lock)->rlock); + + lock(&(&ctx->completion_lock)->rlock); + + *** DEADLOCK *** + + 3 locks held by kcompactd0/151: + #0: (&(&mapping->private_lock)->rlock){+.+.-.}, at: aio_migratepage+0x42/0x1e8 + #1: (&ctx->ring_lock){+.+.+.}, at: aio_migratepage+0x5a/0x1e8 + #2: (&(&ctx->completion_lock)->rlock){+.?.-.}, at: aio_migratepage+0x156/0x1e8 + + stack backtrace: + CPU: 20 PID: 151 Comm: kcompactd0 Tainted: G W 4.7.0-rc1+ #52 + Call Trace: + show_trace+0xea/0xf0 + show_stack+0x72/0xf0 + dump_stack+0x9a/0xd8 + print_usage_bug.part.27+0x2d4/0x2e8 + mark_lock+0x17e/0x758 + mark_held_locks+0xa2/0xd0 + trace_hardirqs_on_caller+0x140/0x1c0 + mem_cgroup_migrate+0x266/0x370 + aio_migratepage+0x16a/0x1e8 + move_to_new_page+0xb0/0x260 + migrate_pages+0x8f4/0x9f0 + compact_zone+0x4dc/0xdc8 + kcompactd_do_work+0x1aa/0x358 + kcompactd+0xba/0x2c8 + kthread+0x10a/0x110 + kernel_thread_starter+0x6/0xc + kernel_thread_starter+0x0/0xc + INFO: lockdep is turned off. 
+ +Link: http://lkml.kernel.org/r/20160620184158.GO3262@mtj.duckdns.org +Link: http://lkml.kernel.org/g/5767CFE5.7080904@de.ibm.com +Fixes: 74485cf2bc85 ("mm: migrate: consolidate mem_cgroup_migrate() calls") +Signed-off-by: Tejun Heo +Reported-by: Christian Borntraeger +Acked-by: Johannes Weiner +Acked-by: Michal Hocko +Reviewed-by: Vladimir Davydov +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/memcontrol.c | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +--- a/mm/memcontrol.c ++++ b/mm/memcontrol.c +@@ -5524,6 +5524,7 @@ void mem_cgroup_migrate(struct page *old + struct mem_cgroup *memcg; + unsigned int nr_pages; + bool compound; ++ unsigned long flags; + + VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); + VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); +@@ -5554,10 +5555,10 @@ void mem_cgroup_migrate(struct page *old + + commit_charge(newpage, memcg, false); + +- local_irq_disable(); ++ local_irq_save(flags); + mem_cgroup_charge_statistics(memcg, newpage, compound, nr_pages); + memcg_check_events(memcg, newpage); +- local_irq_enable(); ++ local_irq_restore(flags); + } + + DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key); diff --git a/queue-4.6/mm-compaction-abort-free-scanner-if-split-fails.patch b/queue-4.6/mm-compaction-abort-free-scanner-if-split-fails.patch new file mode 100644 index 00000000000..cc8afcfdb37 --- /dev/null +++ b/queue-4.6/mm-compaction-abort-free-scanner-if-split-fails.patch @@ -0,0 +1,127 @@ +From a4f04f2c6955aff5e2c08dcb40aca247ff4d7370 Mon Sep 17 00:00:00 2001 +From: David Rientjes +Date: Fri, 24 Jun 2016 14:50:10 -0700 +Subject: mm, compaction: abort free scanner if split fails + +From: David Rientjes + +commit a4f04f2c6955aff5e2c08dcb40aca247ff4d7370 upstream. + +If the memory compaction free scanner cannot successfully split a free +page (only possible due to per-zone low watermark), terminate the free +scanner rather than continuing to scan memory needlessly. If the +watermark is insufficient for a free page of order <= cc->order, then +terminate the scanner since all future splits will also likely fail. + +This prevents the compaction freeing scanner from scanning all memory on +very large zones (very noticeable for zones > 128GB, for instance) when +all splits will likely fail while holding zone->lock. + +compaction_alloc() iterating a 128GB zone has been benchmarked to take +over 400ms on some systems whereas any free page isolated and ready to +be split ends up failing in split_free_page() because of the low +watermark check and thus the iteration continues. + +The next time compaction occurs, the freeing scanner will likely start +at the end of the zone again since no success was made previously and we +get the same lengthy iteration until the zone is brought above the low +watermark. All thp page faults can take >400ms in such a state without +this fix. 
+ +Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1606211820350.97086@chino.kir.corp.google.com +Signed-off-by: David Rientjes +Acked-by: Vlastimil Babka +Cc: Minchan Kim +Cc: Joonsoo Kim +Cc: Mel Gorman +Cc: Hugh Dickins +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/compaction.c | 39 +++++++++++++++++++++------------------ + 1 file changed, 21 insertions(+), 18 deletions(-) + +--- a/mm/compaction.c ++++ b/mm/compaction.c +@@ -436,25 +436,23 @@ static unsigned long isolate_freepages_b + + /* Found a free page, break it into order-0 pages */ + isolated = split_free_page(page); ++ if (!isolated) ++ break; ++ + total_isolated += isolated; ++ cc->nr_freepages += isolated; + for (i = 0; i < isolated; i++) { + list_add(&page->lru, freelist); + page++; + } +- +- /* If a page was split, advance to the end of it */ +- if (isolated) { +- cc->nr_freepages += isolated; +- if (!strict && +- cc->nr_migratepages <= cc->nr_freepages) { +- blockpfn += isolated; +- break; +- } +- +- blockpfn += isolated - 1; +- cursor += isolated - 1; +- continue; ++ if (!strict && cc->nr_migratepages <= cc->nr_freepages) { ++ blockpfn += isolated; ++ break; + } ++ /* Advance to the end of split page */ ++ blockpfn += isolated - 1; ++ cursor += isolated - 1; ++ continue; + + isolate_fail: + if (strict) +@@ -464,6 +462,9 @@ isolate_fail: + + } + ++ if (locked) ++ spin_unlock_irqrestore(&cc->zone->lock, flags); ++ + /* + * There is a tiny chance that we have read bogus compound_order(), + * so be careful to not go outside of the pageblock. +@@ -485,9 +486,6 @@ isolate_fail: + if (strict && blockpfn < end_pfn) + total_isolated = 0; + +- if (locked) +- spin_unlock_irqrestore(&cc->zone->lock, flags); +- + /* Update the pageblock-skip if the whole pageblock was scanned */ + if (blockpfn == end_pfn) + update_pageblock_skip(cc, valid_page, total_isolated, false); +@@ -938,6 +936,7 @@ static void isolate_freepages(struct com + block_end_pfn = block_start_pfn, + block_start_pfn -= pageblock_nr_pages, + isolate_start_pfn = block_start_pfn) { ++ unsigned long isolated; + + /* + * This can iterate a massively long zone without finding any +@@ -962,8 +961,12 @@ static void isolate_freepages(struct com + continue; + + /* Found a block suitable for isolating free pages from. */ +- isolate_freepages_block(cc, &isolate_start_pfn, +- block_end_pfn, freelist, false); ++ isolated = isolate_freepages_block(cc, &isolate_start_pfn, ++ block_end_pfn, freelist, false); ++ /* If isolation failed early, do not continue needlessly */ ++ if (!isolated && isolate_start_pfn < block_end_pfn && ++ cc->nr_migratepages > cc->nr_freepages) ++ break; + + /* + * If we isolated enough freepages, or aborted due to async diff --git a/queue-4.6/mm-compaction-prevent-vm_bug_on-when-terminating-freeing-scanner.patch b/queue-4.6/mm-compaction-prevent-vm_bug_on-when-terminating-freeing-scanner.patch new file mode 100644 index 00000000000..49eeca45d76 --- /dev/null +++ b/queue-4.6/mm-compaction-prevent-vm_bug_on-when-terminating-freeing-scanner.patch @@ -0,0 +1,104 @@ +From a46cbf3bc53b6a93fb84a5ffb288c354fa807954 Mon Sep 17 00:00:00 2001 +From: David Rientjes +Date: Thu, 14 Jul 2016 12:06:50 -0700 +Subject: mm, compaction: prevent VM_BUG_ON when terminating freeing scanner + +From: David Rientjes + +commit a46cbf3bc53b6a93fb84a5ffb288c354fa807954 upstream. + +It's possible to isolate some freepages in a pageblock and then fail +split_free_page() due to the low watermark check. 
In this case, we hit +VM_BUG_ON() because the freeing scanner terminated early without a +contended lock or enough freepages. + +This should never have been a VM_BUG_ON() since it's not a fatal +condition. It should have been a VM_WARN_ON() at best, or even handled +gracefully. + +Regardless, we need to terminate anytime the full pageblock scan was not +done. The logic belongs in isolate_freepages_block(), so handle its +state gracefully by terminating the pageblock loop and making a note to +restart at the same pageblock next time since it was not possible to +complete the scan this time. + +[rientjes@google.com: don't rescan pages in a pageblock] + Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1607111244150.83138@chino.kir.corp.google.com +Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1606291436300.145590@chino.kir.corp.google.com +Signed-off-by: David Rientjes +Reported-by: Minchan Kim +Tested-by: Minchan Kim +Cc: Joonsoo Kim +Cc: Hugh Dickins +Cc: Mel Gorman +Cc: Vlastimil Babka +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/compaction.c | 36 ++++++++++++++---------------------- + 1 file changed, 14 insertions(+), 22 deletions(-) + +--- a/mm/compaction.c ++++ b/mm/compaction.c +@@ -936,8 +936,6 @@ static void isolate_freepages(struct com + block_end_pfn = block_start_pfn, + block_start_pfn -= pageblock_nr_pages, + isolate_start_pfn = block_start_pfn) { +- unsigned long isolated; +- + /* + * This can iterate a massively long zone without finding any + * suitable migration targets, so periodically check if we need +@@ -961,36 +959,30 @@ static void isolate_freepages(struct com + continue; + + /* Found a block suitable for isolating free pages from. */ +- isolated = isolate_freepages_block(cc, &isolate_start_pfn, +- block_end_pfn, freelist, false); +- /* If isolation failed early, do not continue needlessly */ +- if (!isolated && isolate_start_pfn < block_end_pfn && +- cc->nr_migratepages > cc->nr_freepages) +- break; ++ isolate_freepages_block(cc, &isolate_start_pfn, block_end_pfn, ++ freelist, false); + + /* +- * If we isolated enough freepages, or aborted due to async +- * compaction being contended, terminate the loop. +- * Remember where the free scanner should restart next time, +- * which is where isolate_freepages_block() left off. +- * But if it scanned the whole pageblock, isolate_start_pfn +- * now points at block_end_pfn, which is the start of the next +- * pageblock. +- * In that case we will however want to restart at the start +- * of the previous pageblock. ++ * If we isolated enough freepages, or aborted due to lock ++ * contention, terminate. + */ + if ((cc->nr_freepages >= cc->nr_migratepages) + || cc->contended) { +- if (isolate_start_pfn >= block_end_pfn) ++ if (isolate_start_pfn >= block_end_pfn) { ++ /* ++ * Restart at previous pageblock if more ++ * freepages can be isolated next time. ++ */ + isolate_start_pfn = + block_start_pfn - pageblock_nr_pages; ++ } + break; +- } else { ++ } else if (isolate_start_pfn < block_end_pfn) { + /* +- * isolate_freepages_block() should not terminate +- * prematurely unless contended, or isolated enough ++ * If isolation failed early, do not continue ++ * needlessly. 
+ */ +- VM_BUG_ON(isolate_start_pfn < block_end_pfn); ++ break; + } + } + diff --git a/queue-4.6/mm-memcontrol-fix-cgroup-creation-failure-after-many-small-jobs.patch b/queue-4.6/mm-memcontrol-fix-cgroup-creation-failure-after-many-small-jobs.patch new file mode 100644 index 00000000000..a736406294b --- /dev/null +++ b/queue-4.6/mm-memcontrol-fix-cgroup-creation-failure-after-many-small-jobs.patch @@ -0,0 +1,295 @@ +From 73f576c04b9410ed19660f74f97521bee6e1c546 Mon Sep 17 00:00:00 2001 +From: Johannes Weiner +Date: Wed, 20 Jul 2016 15:44:57 -0700 +Subject: mm: memcontrol: fix cgroup creation failure after many small jobs + +From: Johannes Weiner + +commit 73f576c04b9410ed19660f74f97521bee6e1c546 upstream. + +The memory controller has quite a bit of state that usually outlives the +cgroup and pins its CSS until said state disappears. At the same time +it imposes a 16-bit limit on the CSS ID space to economically store IDs +in the wild. Consequently, when we use cgroups to contain frequent but +small and short-lived jobs that leave behind some page cache, we quickly +run into the 64k limitations of outstanding CSSs. Creating a new cgroup +fails with -ENOSPC while there are only a few, or even no user-visible +cgroups in existence. + +Although pinning CSSs past cgroup removal is common, there are only two +instances that actually need an ID after a cgroup is deleted: cache +shadow entries and swapout records. + +Cache shadow entries reference the ID weakly and can deal with the CSS +having disappeared when it's looked up later. They pose no hurdle. + +Swap-out records do need to pin the css to hierarchically attribute +swapins after the cgroup has been deleted; though the only pages that +remain swapped out after offlining are tmpfs/shmem pages. And those +references are under the user's control, so they are manageable. + +This patch introduces a private 16-bit memcg ID and switches swap and +cache shadow entries over to using that. This ID can then be recycled +after offlining when the CSS remains pinned only by objects that don't +specifically need it. + +This script demonstrates the problem by faulting one cache page in a new +cgroup and deleting it again: + + set -e + mkdir -p pages + for x in `seq 128000`; do + [ $((x % 1000)) -eq 0 ] && echo $x + mkdir /cgroup/foo + echo $$ >/cgroup/foo/cgroup.procs + echo trex >pages/$x + echo $$ >/cgroup/cgroup.procs + rmdir /cgroup/foo + done + +When run on an unpatched kernel, we eventually run out of possible IDs +even though there are no visible cgroups: + + [root@ham ~]# ./cssidstress.sh + [...] + 65000 + mkdir: cannot create directory '/cgroup/foo': No space left on device + +After this patch, the IDs get released upon cgroup destruction and the +cache and css objects get released once memory reclaim kicks in. 
+ +[hannes@cmpxchg.org: init the IDR] + Link: http://lkml.kernel.org/r/20160621154601.GA22431@cmpxchg.org +Fixes: b2052564e66d ("mm: memcontrol: continue cache reclaim from offlined groups") +Link: http://lkml.kernel.org/r/20160617162516.GD19084@cmpxchg.org +Signed-off-by: Johannes Weiner +Reported-by: John Garcia +Reviewed-by: Vladimir Davydov +Acked-by: Tejun Heo +Cc: Nikolay Borisov +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + include/linux/memcontrol.h | 25 +++++-------- + mm/memcontrol.c | 82 +++++++++++++++++++++++++++++++++++++++++---- + mm/slab_common.c | 4 +- + 3 files changed, 87 insertions(+), 24 deletions(-) + +--- a/include/linux/memcontrol.h ++++ b/include/linux/memcontrol.h +@@ -97,6 +97,11 @@ enum mem_cgroup_events_target { + #define MEM_CGROUP_ID_SHIFT 16 + #define MEM_CGROUP_ID_MAX USHRT_MAX + ++struct mem_cgroup_id { ++ int id; ++ atomic_t ref; ++}; ++ + struct mem_cgroup_stat_cpu { + long count[MEMCG_NR_STAT]; + unsigned long events[MEMCG_NR_EVENTS]; +@@ -172,6 +177,9 @@ enum memcg_kmem_state { + struct mem_cgroup { + struct cgroup_subsys_state css; + ++ /* Private memcg ID. Used to ID objects that outlive the cgroup */ ++ struct mem_cgroup_id id; ++ + /* Accounted resources */ + struct page_counter memory; + struct page_counter swap; +@@ -330,22 +338,9 @@ static inline unsigned short mem_cgroup_ + if (mem_cgroup_disabled()) + return 0; + +- return memcg->css.id; +-} +- +-/** +- * mem_cgroup_from_id - look up a memcg from an id +- * @id: the id to look up +- * +- * Caller must hold rcu_read_lock() and use css_tryget() as necessary. +- */ +-static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) +-{ +- struct cgroup_subsys_state *css; +- +- css = css_from_id(id, &memory_cgrp_subsys); +- return mem_cgroup_from_css(css); ++ return memcg->id.id; + } ++struct mem_cgroup *mem_cgroup_from_id(unsigned short id); + + /** + * parent_mem_cgroup - find the accounting parent of a memcg +--- a/mm/memcontrol.c ++++ b/mm/memcontrol.c +@@ -4038,6 +4038,60 @@ static struct cftype mem_cgroup_legacy_f + { }, /* terminate */ + }; + ++/* ++ * Private memory cgroup IDR ++ * ++ * Swap-out records and page cache shadow entries need to store memcg ++ * references in constrained space, so we maintain an ID space that is ++ * limited to 16 bit (MEM_CGROUP_ID_MAX), limiting the total number of ++ * memory-controlled cgroups to 64k. ++ * ++ * However, there usually are many references to the oflline CSS after ++ * the cgroup has been destroyed, such as page cache or reclaimable ++ * slab objects, that don't need to hang on to the ID. We want to keep ++ * those dead CSS from occupying IDs, or we might quickly exhaust the ++ * relatively small ID space and prevent the creation of new cgroups ++ * even when there are much fewer than 64k cgroups - possibly none. ++ * ++ * Maintain a private 16-bit ID space for memcg, and allow the ID to ++ * be freed and recycled when it's no longer needed, which is usually ++ * when the CSS is offlined. ++ * ++ * The only exception to that are records of swapped out tmpfs/shmem ++ * pages that need to be attributed to live ancestors on swapin. But ++ * those references are manageable from userspace. 
++ */ ++ ++static DEFINE_IDR(mem_cgroup_idr); ++ ++static void mem_cgroup_id_get(struct mem_cgroup *memcg) ++{ ++ atomic_inc(&memcg->id.ref); ++} ++ ++static void mem_cgroup_id_put(struct mem_cgroup *memcg) ++{ ++ if (atomic_dec_and_test(&memcg->id.ref)) { ++ idr_remove(&mem_cgroup_idr, memcg->id.id); ++ memcg->id.id = 0; ++ ++ /* Memcg ID pins CSS */ ++ css_put(&memcg->css); ++ } ++} ++ ++/** ++ * mem_cgroup_from_id - look up a memcg from a memcg id ++ * @id: the memcg id to look up ++ * ++ * Caller must hold rcu_read_lock(). ++ */ ++struct mem_cgroup *mem_cgroup_from_id(unsigned short id) ++{ ++ WARN_ON_ONCE(!rcu_read_lock_held()); ++ return idr_find(&mem_cgroup_idr, id); ++} ++ + static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) + { + struct mem_cgroup_per_node *pn; +@@ -4097,6 +4151,12 @@ static struct mem_cgroup *mem_cgroup_all + if (!memcg) + return NULL; + ++ memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL, ++ 1, MEM_CGROUP_ID_MAX, ++ GFP_KERNEL); ++ if (memcg->id.id < 0) ++ goto fail; ++ + memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu); + if (!memcg->stat) + goto fail; +@@ -4123,8 +4183,11 @@ static struct mem_cgroup *mem_cgroup_all + #ifdef CONFIG_CGROUP_WRITEBACK + INIT_LIST_HEAD(&memcg->cgwb_list); + #endif ++ idr_replace(&mem_cgroup_idr, memcg, memcg->id.id); + return memcg; + fail: ++ if (memcg->id.id > 0) ++ idr_remove(&mem_cgroup_idr, memcg->id.id); + mem_cgroup_free(memcg); + return NULL; + } +@@ -4187,12 +4250,11 @@ fail: + return ERR_PTR(-ENOMEM); + } + +-static int +-mem_cgroup_css_online(struct cgroup_subsys_state *css) ++static int mem_cgroup_css_online(struct cgroup_subsys_state *css) + { +- if (css->id > MEM_CGROUP_ID_MAX) +- return -ENOSPC; +- ++ /* Online state pins memcg ID, memcg ID pins CSS */ ++ mem_cgroup_id_get(mem_cgroup_from_css(css)); ++ css_get(css); + return 0; + } + +@@ -4215,6 +4277,8 @@ static void mem_cgroup_css_offline(struc + + memcg_offline_kmem(memcg); + wb_memcg_offline(memcg); ++ ++ mem_cgroup_id_put(memcg); + } + + static void mem_cgroup_css_released(struct cgroup_subsys_state *css) +@@ -5736,6 +5800,7 @@ void mem_cgroup_swapout(struct page *pag + if (!memcg) + return; + ++ mem_cgroup_id_get(memcg); + oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg)); + VM_BUG_ON_PAGE(oldid, page); + mem_cgroup_swap_statistics(memcg, true); +@@ -5754,6 +5819,9 @@ void mem_cgroup_swapout(struct page *pag + VM_BUG_ON(!irqs_disabled()); + mem_cgroup_charge_statistics(memcg, page, false, -1); + memcg_check_events(memcg, page); ++ ++ if (!mem_cgroup_is_root(memcg)) ++ css_put(&memcg->css); + } + + /* +@@ -5784,11 +5852,11 @@ int mem_cgroup_try_charge_swap(struct pa + !page_counter_try_charge(&memcg->swap, 1, &counter)) + return -ENOMEM; + ++ mem_cgroup_id_get(memcg); + oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg)); + VM_BUG_ON_PAGE(oldid, page); + mem_cgroup_swap_statistics(memcg, true); + +- css_get(&memcg->css); + return 0; + } + +@@ -5817,7 +5885,7 @@ void mem_cgroup_uncharge_swap(swp_entry_ + page_counter_uncharge(&memcg->memsw, 1); + } + mem_cgroup_swap_statistics(memcg, false); +- css_put(&memcg->css); ++ mem_cgroup_id_put(memcg); + } + rcu_read_unlock(); + } +--- a/mm/slab_common.c ++++ b/mm/slab_common.c +@@ -526,8 +526,8 @@ void memcg_create_kmem_cache(struct mem_ + goto out_unlock; + + cgroup_name(css->cgroup, memcg_name_buf, sizeof(memcg_name_buf)); +- cache_name = kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name, +- css->id, memcg_name_buf); ++ cache_name = kasprintf(GFP_KERNEL, "%s(%llu:%s)", 
root_cache->name, ++ css->serial_nr, memcg_name_buf); + if (!cache_name) + goto out_unlock; + diff --git a/queue-4.6/mm-meminit-always-return-a-valid-node-from-early_pfn_to_nid.patch b/queue-4.6/mm-meminit-always-return-a-valid-node-from-early_pfn_to_nid.patch new file mode 100644 index 00000000000..f80230fe207 --- /dev/null +++ b/queue-4.6/mm-meminit-always-return-a-valid-node-from-early_pfn_to_nid.patch @@ -0,0 +1,52 @@ +From e4568d3803852d00effd41dcdd489e726b998879 Mon Sep 17 00:00:00 2001 +From: Mel Gorman +Date: Thu, 14 Jul 2016 12:07:20 -0700 +Subject: mm, meminit: always return a valid node from early_pfn_to_nid + +From: Mel Gorman + +commit e4568d3803852d00effd41dcdd489e726b998879 upstream. + +early_pfn_to_nid can return node 0 if a PFN is invalid on machines that +has no node 0. A machine with only node 1 was observed to crash with +the following message: + + BUG: unable to handle kernel paging request at 000000000002a3c8 + PGD 0 + Modules linked in: + Hardware name: Supermicro H8DSP-8/H8DSP-8, BIOS 080011 06/30/2006 + task: ffffffff81c0d500 ti: ffffffff81c00000 task.ti: ffffffff81c00000 + RIP: reserve_bootmem_region+0x6a/0xef + CR2: 000000000002a3c8 CR3: 0000000001c06000 CR4: 00000000000006b0 + Call Trace: + free_all_bootmem+0x4b/0x12a + mem_init+0x70/0xa3 + start_kernel+0x25b/0x49b + +The problem is that early_page_uninitialised uses the early_pfn_to_nid +helper which returns node 0 for invalid PFNs. No caller of +early_pfn_to_nid cares except early_page_uninitialised. This patch has +early_pfn_to_nid always return a valid node. + +Link: http://lkml.kernel.org/r/1468008031-3848-3-git-send-email-mgorman@techsingularity.net +Signed-off-by: Mel Gorman +Acked-by: David Rientjes +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/page_alloc.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -1110,7 +1110,7 @@ int __meminit early_pfn_to_nid(unsigned + spin_lock(&early_pfn_lock); + nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache); + if (nid < 0) +- nid = 0; ++ nid = first_online_node; + spin_unlock(&early_pfn_lock); + + return nid; diff --git a/queue-4.6/mm-meminit-ensure-node-is-online-before-checking-whether-pages-are-uninitialised.patch b/queue-4.6/mm-meminit-ensure-node-is-online-before-checking-whether-pages-are-uninitialised.patch new file mode 100644 index 00000000000..43223b4a201 --- /dev/null +++ b/queue-4.6/mm-meminit-ensure-node-is-online-before-checking-whether-pages-are-uninitialised.patch @@ -0,0 +1,39 @@ +From ef70b6f41cda6270165a6f27b2548ed31cfa3cb2 Mon Sep 17 00:00:00 2001 +From: Mel Gorman +Date: Thu, 14 Jul 2016 12:07:23 -0700 +Subject: mm, meminit: ensure node is online before checking whether pages are uninitialised + +From: Mel Gorman + +commit ef70b6f41cda6270165a6f27b2548ed31cfa3cb2 upstream. + +early_page_uninitialised looks up an arbitrary PFN. While a machine +without node 0 will boot with "mm, page_alloc: Always return a valid +node from early_pfn_to_nid", it works because it assumes that nodes are +always in PFN order. This is not guaranteed so this patch adds +robustness by always checking if the node being checked is online. 
+ +Link: http://lkml.kernel.org/r/1468008031-3848-4-git-send-email-mgorman@techsingularity.net +Signed-off-by: Mel Gorman +Acked-by: David Rientjes +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/page_alloc.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -286,7 +286,9 @@ static inline void reset_deferred_memini + /* Returns true if the struct page for the pfn is uninitialised */ + static inline bool __meminit early_page_uninitialised(unsigned long pfn) + { +- if (pfn >= NODE_DATA(early_pfn_to_nid(pfn))->first_deferred_pfn) ++ int nid = early_pfn_to_nid(pfn); ++ ++ if (node_online(nid) && pfn >= NODE_DATA(nid)->first_deferred_pfn) + return true; + + return false; diff --git a/queue-4.6/mm-slb-add-__gfp_atomic-to-the-gfp-reclaim-mask.patch b/queue-4.6/mm-slb-add-__gfp_atomic-to-the-gfp-reclaim-mask.patch new file mode 100644 index 00000000000..083f02b1c66 --- /dev/null +++ b/queue-4.6/mm-slb-add-__gfp_atomic-to-the-gfp-reclaim-mask.patch @@ -0,0 +1,49 @@ +From e838a45f9392a5bd2be1cd3ab0b16ae85857461c Mon Sep 17 00:00:00 2001 +From: Mel Gorman +Date: Fri, 24 Jun 2016 14:49:37 -0700 +Subject: mm, sl[au]b: add __GFP_ATOMIC to the GFP reclaim mask + +From: Mel Gorman + +commit e838a45f9392a5bd2be1cd3ab0b16ae85857461c upstream. + +Commit d0164adc89f6 ("mm, page_alloc: distinguish between being unable +to sleep, unwilling to sleep and avoiding waking kswapd") modified +__GFP_WAIT to explicitly identify the difference between atomic callers +and those that were unwilling to sleep. Later the definition was +removed entirely. + +The GFP_RECLAIM_MASK is the set of flags that affect watermark checking +and reclaim behaviour but __GFP_ATOMIC was never added. Without it, +atomic users of the slab allocator strip the __GFP_ATOMIC flag and +cannot access the page allocator atomic reserves. This patch addresses +the problem. + +The user-visible impact depends on the workload but potentially atomic +allocations unnecessarily fail without this path. + +Link: http://lkml.kernel.org/r/20160610093832.GK2527@techsingularity.net +Signed-off-by: Mel Gorman +Reported-by: Marcin Wojtas +Acked-by: Vlastimil Babka +Acked-by: Michal Hocko +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/internal.h | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +--- a/mm/internal.h ++++ b/mm/internal.h +@@ -24,7 +24,8 @@ + */ + #define GFP_RECLAIM_MASK (__GFP_RECLAIM|__GFP_HIGH|__GFP_IO|__GFP_FS|\ + __GFP_NOWARN|__GFP_REPEAT|__GFP_NOFAIL|\ +- __GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC) ++ __GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC|\ ++ __GFP_ATOMIC) + + /* The GFP flags allowed during early boot */ + #define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_RECLAIM|__GFP_IO|__GFP_FS)) diff --git a/queue-4.6/mm-swap.c-flush-lru-pvecs-on-compound-page-arrival.patch b/queue-4.6/mm-swap.c-flush-lru-pvecs-on-compound-page-arrival.patch new file mode 100644 index 00000000000..1a05a82570c --- /dev/null +++ b/queue-4.6/mm-swap.c-flush-lru-pvecs-on-compound-page-arrival.patch @@ -0,0 +1,121 @@ +From 8f182270dfec432e93fae14f9208a6b9af01009f Mon Sep 17 00:00:00 2001 +From: Lukasz Odzioba +Date: Fri, 24 Jun 2016 14:50:01 -0700 +Subject: mm/swap.c: flush lru pvecs on compound page arrival + +From: Lukasz Odzioba + +commit 8f182270dfec432e93fae14f9208a6b9af01009f upstream. 
+
+Currently we can have compound pages held on per cpu pagevecs, which
+leads to a lot of memory unavailable for reclaim when needed. In
+systems with hundreds of processors it can be GBs of memory.
+
+One of the ways of reproducing the problem is to not call munmap
+explicitly on all mapped regions (i.e. after receiving SIGTERM). After
+that some pages (with THP enabled also huge pages) may end up on
+lru_add_pvec, example below.
+
+ void main() {
+ #pragma omp parallel
+ {
+ size_t size = 55 * 1000 * 1000; // smaller than MEM/CPUS
+ void *p = mmap(NULL, size, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS , -1, 0);
+ if (p != MAP_FAILED)
+ memset(p, 0, size);
+ //munmap(p, size); // uncomment to make the problem go away
+ }
+ }
+
+When we run it with THP enabled it will leave a significant amount of
+memory on lru_add_pvec. This memory will not be reclaimed if we hit
+OOM, so when we run the above program in a loop:
+
+ for i in `seq 100`; do ./a.out; done
+
+many processes (95% in my case) will be killed by OOM.
+
+The primary point of the LRU add cache is to save the zone lru_lock
+contention with a hope that more pages will belong to the same zone and
+so their addition can be batched. The huge page is already a form of
+batched addition (it will add 512 worth of memory in one go) so skipping
+the batching seems like a safer option when compared to a potential
+excess in the caching which can be quite large and much harder to fix
+because lru_add_drain_all is way too expensive and it is not really
+clear what would be a good moment to call it.
+
+Similarly we can reproduce the problem on lru_deactivate_pvec by adding:
+madvise(p, size, MADV_FREE); after memset.
+
+This patch flushes lru pvecs on compound page arrival making the problem
+less severe - after applying it the kill rate of the above example drops
+to 0%, due to reducing the maximum amount of memory held on a pvec from
+28MB (with THP) to 56kB per CPU.
+ +Suggested-by: Michal Hocko +Link: http://lkml.kernel.org/r/1466180198-18854-1-git-send-email-lukasz.odzioba@intel.com +Signed-off-by: Lukasz Odzioba +Acked-by: Michal Hocko +Cc: Kirill Shutemov +Cc: Andrea Arcangeli +Cc: Vladimir Davydov +Cc: Ming Li +Cc: Minchan Kim +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/swap.c | 11 +++++------ + 1 file changed, 5 insertions(+), 6 deletions(-) + +--- a/mm/swap.c ++++ b/mm/swap.c +@@ -239,7 +239,7 @@ void rotate_reclaimable_page(struct page + get_page(page); + local_irq_save(flags); + pvec = this_cpu_ptr(&lru_rotate_pvecs); +- if (!pagevec_add(pvec, page)) ++ if (!pagevec_add(pvec, page) || PageCompound(page)) + pagevec_move_tail(pvec); + local_irq_restore(flags); + } +@@ -295,7 +295,7 @@ void activate_page(struct page *page) + struct pagevec *pvec = &get_cpu_var(activate_page_pvecs); + + get_page(page); +- if (!pagevec_add(pvec, page)) ++ if (!pagevec_add(pvec, page) || PageCompound(page)) + pagevec_lru_move_fn(pvec, __activate_page, NULL); + put_cpu_var(activate_page_pvecs); + } +@@ -390,9 +390,8 @@ static void __lru_cache_add(struct page + struct pagevec *pvec = &get_cpu_var(lru_add_pvec); + + get_page(page); +- if (!pagevec_space(pvec)) ++ if (!pagevec_add(pvec, page) || PageCompound(page)) + __pagevec_lru_add(pvec); +- pagevec_add(pvec, page); + put_cpu_var(lru_add_pvec); + } + +@@ -627,7 +626,7 @@ void deactivate_file_page(struct page *p + if (likely(get_page_unless_zero(page))) { + struct pagevec *pvec = &get_cpu_var(lru_deactivate_file_pvecs); + +- if (!pagevec_add(pvec, page)) ++ if (!pagevec_add(pvec, page) || PageCompound(page)) + pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL); + put_cpu_var(lru_deactivate_file_pvecs); + } +@@ -647,7 +646,7 @@ void deactivate_page(struct page *page) + struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs); + + get_page(page); +- if (!pagevec_add(pvec, page)) ++ if (!pagevec_add(pvec, page) || PageCompound(page)) + pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); + put_cpu_var(lru_deactivate_pvecs); + } diff --git a/queue-4.6/mm-thp-refix-false-positive-bug-in-page_move_anon_rmap.patch b/queue-4.6/mm-thp-refix-false-positive-bug-in-page_move_anon_rmap.patch new file mode 100644 index 00000000000..0c04c9912d1 --- /dev/null +++ b/queue-4.6/mm-thp-refix-false-positive-bug-in-page_move_anon_rmap.patch @@ -0,0 +1,110 @@ +From 5a49973d7143ebbabd76e1dcd69ee42e349bb7b9 Mon Sep 17 00:00:00 2001 +From: Hugh Dickins +Date: Thu, 14 Jul 2016 12:07:38 -0700 +Subject: mm: thp: refix false positive BUG in page_move_anon_rmap() + +From: Hugh Dickins + +commit 5a49973d7143ebbabd76e1dcd69ee42e349bb7b9 upstream. + +The VM_BUG_ON_PAGE in page_move_anon_rmap() is more trouble than it's +worth: the syzkaller fuzzer hit it again. It's still wrong for some THP +cases, because linear_page_index() was never intended to apply to +addresses before the start of a vma. + +That's easily fixed with a signed long cast inside linear_page_index(); +and Dmitry has tested such a patch, to verify the false positive. But +why extend linear_page_index() just for this case? when the avoidance in +page_move_anon_rmap() has already grown ugly, and there's no reason for +the check at all (nothing else there is using address or index). + +Remove address arg from page_move_anon_rmap(), remove VM_BUG_ON_PAGE, +remove CONFIG_DEBUG_VM PageTransHuge adjustment. + +And one more thing: should the compound_head(page) be done inside or +outside page_move_anon_rmap()? 
It's usually pushed down to the lowest +level nowadays (and mm/memory.c shows no other explicit use of it), so I +think it's better done in page_move_anon_rmap() than by caller. + +Fixes: 0798d3c022dc ("mm: thp: avoid false positive VM_BUG_ON_PAGE in page_move_anon_rmap()") +Link: http://lkml.kernel.org/r/alpine.LSU.2.11.1607120444540.12528@eggly.anvils +Signed-off-by: Hugh Dickins +Reported-by: Dmitry Vyukov +Acked-by: Kirill A. Shutemov +Cc: Mika Westerberg +Cc: Andrea Arcangeli +Cc: Rik van Riel +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + include/linux/rmap.h | 2 +- + mm/hugetlb.c | 2 +- + mm/memory.c | 3 +-- + mm/rmap.c | 9 +++------ + 4 files changed, 6 insertions(+), 10 deletions(-) + +--- a/include/linux/rmap.h ++++ b/include/linux/rmap.h +@@ -158,7 +158,7 @@ struct anon_vma *page_get_anon_vma(struc + /* + * rmap interfaces called when adding or removing pte of page + */ +-void page_move_anon_rmap(struct page *, struct vm_area_struct *, unsigned long); ++void page_move_anon_rmap(struct page *, struct vm_area_struct *); + void page_add_anon_rmap(struct page *, struct vm_area_struct *, + unsigned long, bool); + void do_page_add_anon_rmap(struct page *, struct vm_area_struct *, +--- a/mm/hugetlb.c ++++ b/mm/hugetlb.c +@@ -3328,7 +3328,7 @@ retry_avoidcopy: + /* If no-one else is actually using this page, avoid the copy + * and just make the page writable */ + if (page_mapcount(old_page) == 1 && PageAnon(old_page)) { +- page_move_anon_rmap(old_page, vma, address); ++ page_move_anon_rmap(old_page, vma); + set_huge_ptep_writable(vma, address, ptep); + return 0; + } +--- a/mm/memory.c ++++ b/mm/memory.c +@@ -2397,8 +2397,7 @@ static int do_wp_page(struct mm_struct * + * Protected against the rmap code by + * the page lock. + */ +- page_move_anon_rmap(compound_head(old_page), +- vma, address); ++ page_move_anon_rmap(old_page, vma); + } + unlock_page(old_page); + return wp_page_reuse(mm, vma, address, page_table, ptl, +--- a/mm/rmap.c ++++ b/mm/rmap.c +@@ -1084,23 +1084,20 @@ EXPORT_SYMBOL_GPL(page_mkclean); + * page_move_anon_rmap - move a page to our anon_vma + * @page: the page to move to our anon_vma + * @vma: the vma the page belongs to +- * @address: the user virtual address mapped + * + * When a page belongs exclusively to one process after a COW event, + * that page can be moved into the anon_vma that belongs to just that + * process, so the rmap code will not search the parent or sibling + * processes. 
+ */ +-void page_move_anon_rmap(struct page *page, +- struct vm_area_struct *vma, unsigned long address) ++void page_move_anon_rmap(struct page *page, struct vm_area_struct *vma) + { + struct anon_vma *anon_vma = vma->anon_vma; + ++ page = compound_head(page); ++ + VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_VMA(!anon_vma, vma); +- if (IS_ENABLED(CONFIG_DEBUG_VM) && PageTransHuge(page)) +- address &= HPAGE_PMD_MASK; +- VM_BUG_ON_PAGE(page->index != linear_page_index(vma, address), page); + + anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; + /* diff --git a/queue-4.6/perf-test-ignore-kcore-files-in-the-vmlinux-matches-kallsyms-test.patch b/queue-4.6/perf-test-ignore-kcore-files-in-the-vmlinux-matches-kallsyms-test.patch new file mode 100644 index 00000000000..77bb527e350 --- /dev/null +++ b/queue-4.6/perf-test-ignore-kcore-files-in-the-vmlinux-matches-kallsyms-test.patch @@ -0,0 +1,91 @@ +From 53d0fe68275dbdaf6a532bb4e87f00db5d36c140 Mon Sep 17 00:00:00 2001 +From: Arnaldo Carvalho de Melo +Date: Tue, 19 Apr 2016 12:16:55 -0300 +Subject: perf test: Ignore kcore files in the "vmlinux matches kallsyms" test + +From: Arnaldo Carvalho de Melo + +commit 53d0fe68275dbdaf6a532bb4e87f00db5d36c140 upstream. + +Before: + + # perf test -v kallsyms + + Maps only in vmlinux: + ffffffff81d5e000-ffffffff81ec3ac8 115e000 [kernel].init.text + ffffffff81ec3ac8-ffffffffa0000000 12c3ac8 [kernel].exit.text + ffffffffa0000000-ffffffffa000c000 0 [fjes] + ffffffffa000c000-ffffffffa0017000 0 [video] + ffffffffa0017000-ffffffffa001c000 0 [grace] + + ffffffffa0a7f000-ffffffffa0ba5000 0 [xfs] + ffffffffa0ba5000-ffffffffffffffff 0 [veth] + Maps in vmlinux with a different name in kallsyms: + Maps only in kallsyms: + ffff880000100000-ffff88001000b000 80000103000 [kernel.kallsyms] + ffff88001000b000-ffff880100000000 8001000e000 [kernel.kallsyms] + ffff880100000000-ffffc90000000000 80100003000 [kernel.kallsyms] + + ffffffffa0000000-ffffffffff600000 7fffa0003000 [kernel.kallsyms] + ffffffffff600000-ffffffffffffffff 7fffff603000 [kernel.kallsyms] + test child finished with -1 + ---- end ---- + vmlinux symtab matches kallsyms: FAILED! + # + +After: + + # perf test -v 1 + 1: vmlinux symtab matches kallsyms : + --- start --- + test child forked, pid 7058 + Looking at the vmlinux_path (8 entries long) + Using /lib/modules/4.6.0-rc1+/build/vmlinux for symbols + 0xffffffff81076870: diff end addr for aesni_gcm_dec v: 0xffffffff810791f2 k: 0xffffffff81076902 + 0xffffffff81079200: diff end addr for aesni_gcm_enc v: 0xffffffff8107bb03 k: 0xffffffff81079292 + 0xffffffff8107e8d0: diff end addr for aesni_gcm_enc_avx_gen2 v: 0xffffffff81083e76 k: 0xffffffff8107e943 + 0xffffffff81083e80: diff end addr for aesni_gcm_dec_avx_gen2 v: 0xffffffff81089611 k: 0xffffffff81083ef3 + 0xffffffff81089990: diff end addr for aesni_gcm_enc_avx_gen4 v: 0xffffffff8108e7c4 k: 0xffffffff81089a03 + 0xffffffff8108e7d0: diff end addr for aesni_gcm_dec_avx_gen4 v: 0xffffffff810937ef k: 0xffffffff8108e843 + Maps only in vmlinux: + ffffffff81d5e000-ffffffff81ec3ac8 115e000 [kernel].init.text + ffffffff81ec3ac8-ffffffffa0000000 12c3ac8 [kernel].exit.text + Maps in vmlinux with a different name in kallsyms: + Maps only in kallsyms: + test child finished with -1 + ---- end ---- + vmlinux symtab matches kallsyms: FAILED! 
+ #
+
+Cc: Adrian Hunter
+Cc: David Ahern
+Cc: Jiri Olsa
+Cc: Namhyung Kim
+Cc: Wang Nan
+Fixes: 8e0cf965f95e ("perf symbols: Add support for reading from /proc/kcore")
+Link: http://lkml.kernel.org/n/tip-n6vrwt9t89w8k769y349govx@git.kernel.org
+Signed-off-by: Arnaldo Carvalho de Melo
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ tools/perf/tests/vmlinux-kallsyms.c | 8 +++++++-
+ 1 file changed, 7 insertions(+), 1 deletion(-)
+
+--- a/tools/perf/tests/vmlinux-kallsyms.c
++++ b/tools/perf/tests/vmlinux-kallsyms.c
+@@ -54,8 +54,14 @@ int test__vmlinux_matches_kallsyms(int s
+ * Step 3:
+ *
+ * Load and split /proc/kallsyms into multiple maps, one per module.
++ * Do not use kcore, as this test was designed before kcore support
++ * and has parts that only make sense if using the non-kcore code.
++ * XXX: extend it to stress the kcore code as well, hint: the list
++ * of modules extracted from /proc/kcore, in its current form, can't
++ * be compared against the list of modules found in the "vmlinux"
++ * code and with the one got from /proc/modules from the "kallsyms" code.
+ */
+- if (machine__load_kallsyms(&kallsyms, "/proc/kallsyms", type, NULL) <= 0) {
++ if (__machine__load_kallsyms(&kallsyms, "/proc/kallsyms", type, true, NULL) <= 0) {
+ pr_debug("dso__load_kallsyms ");
+ goto out;
+ }
diff --git a/queue-4.6/pps-do-not-crash-when-failed-to-register.patch b/queue-4.6/pps-do-not-crash-when-failed-to-register.patch
new file mode 100644
index 00000000000..3b1fbfeff18
--- /dev/null
+++ b/queue-4.6/pps-do-not-crash-when-failed-to-register.patch
@@ -0,0 +1,64 @@
+From 368301f2fe4b07e5fb71dba3cc566bc59eb6705f Mon Sep 17 00:00:00 2001
+From: Jiri Slaby
+Date: Wed, 20 Jul 2016 15:45:08 -0700
+Subject: pps: do not crash when failed to register
+
+From: Jiri Slaby
+
+commit 368301f2fe4b07e5fb71dba3cc566bc59eb6705f upstream.
+
+With this command sequence:
+
+ modprobe plip
+ modprobe pps_parport
+ rmmod pps_parport
+
+the pps_parport module causes this crash:
+
+ BUG: unable to handle kernel NULL pointer dereference at (null)
+ IP: parport_detach+0x1d/0x60 [pps_parport]
+ Oops: 0000 [#1] SMP
+ ...
+ Call Trace:
+ parport_unregister_driver+0x65/0xc0 [parport]
+ SyS_delete_module+0x187/0x210
+
+The sequence that builds up to this is:
+
+ 1) plip is loaded and takes the parport device for exclusive use:
+
+ plip0: Parallel port at 0x378, using IRQ 7.
+
+ 2) pps_parport then fails to grab the device:
+
+ pps_parport: parallel port PPS client
+ parport0: cannot grant exclusive access for device pps_parport
+ pps_parport: couldn't register with parport0
+
+ 3) rmmod of pps_parport is then killed because it tries to access
+ pardev->name, but pardev (taken from port->cad) is NULL.
+
+So add a check for NULL in the test there too.
+
+Link: http://lkml.kernel.org/r/20160714115245.12651-1-jslaby@suse.cz
+Signed-off-by: Jiri Slaby
+Acked-by: Rodolfo Giometti
+Signed-off-by: Andrew Morton
+Signed-off-by: Linus Torvalds
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ drivers/pps/clients/pps_parport.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/pps/clients/pps_parport.c
++++ b/drivers/pps/clients/pps_parport.c
+@@ -195,7 +195,7 @@ static void parport_detach(struct parpor
+ struct pps_client_pp *device;
+
+ /* FIXME: oooh, this is ugly!
*/ +- if (strcmp(pardev->name, KBUILD_MODNAME)) ++ if (!pardev || strcmp(pardev->name, KBUILD_MODNAME)) + /* not our port */ + return; + diff --git a/queue-4.6/radix-tree-fix-radix_tree_iter_retry-for-tagged-iterators.patch b/queue-4.6/radix-tree-fix-radix_tree_iter_retry-for-tagged-iterators.patch new file mode 100644 index 00000000000..c537addc8fb --- /dev/null +++ b/queue-4.6/radix-tree-fix-radix_tree_iter_retry-for-tagged-iterators.patch @@ -0,0 +1,60 @@ +From 3cb9185c67304b2a7ea9be73e7d13df6fb2793a1 Mon Sep 17 00:00:00 2001 +From: Andrey Ryabinin +Date: Wed, 20 Jul 2016 15:45:00 -0700 +Subject: radix-tree: fix radix_tree_iter_retry() for tagged iterators. + +From: Andrey Ryabinin + +commit 3cb9185c67304b2a7ea9be73e7d13df6fb2793a1 upstream. + +radix_tree_iter_retry() resets slot to NULL, but it doesn't reset tags. +Then NULL slot and non-zero iter.tags passed to radix_tree_next_slot() +leading to crash: + + RIP: radix_tree_next_slot include/linux/radix-tree.h:473 + find_get_pages_tag+0x334/0x930 mm/filemap.c:1452 + .... + Call Trace: + pagevec_lookup_tag+0x3a/0x80 mm/swap.c:960 + mpage_prepare_extent_to_map+0x321/0xa90 fs/ext4/inode.c:2516 + ext4_writepages+0x10be/0x2b20 fs/ext4/inode.c:2736 + do_writepages+0x97/0x100 mm/page-writeback.c:2364 + __filemap_fdatawrite_range+0x248/0x2e0 mm/filemap.c:300 + filemap_write_and_wait_range+0x121/0x1b0 mm/filemap.c:490 + ext4_sync_file+0x34d/0xdb0 fs/ext4/fsync.c:115 + vfs_fsync_range+0x10a/0x250 fs/sync.c:195 + vfs_fsync fs/sync.c:209 + do_fsync+0x42/0x70 fs/sync.c:219 + SYSC_fdatasync fs/sync.c:232 + SyS_fdatasync+0x19/0x20 fs/sync.c:230 + entry_SYSCALL_64_fastpath+0x23/0xc1 arch/x86/entry/entry_64.S:207 + +We must reset iterator's tags to bail out from radix_tree_next_slot() +and go to the slow-path in radix_tree_next_chunk(). + +Fixes: 46437f9a554f ("radix-tree: fix race in gang lookup") +Link: http://lkml.kernel.org/r/1468495196-10604-1-git-send-email-aryabinin@virtuozzo.com +Signed-off-by: Andrey Ryabinin +Reported-by: Dmitry Vyukov +Acked-by: Konstantin Khlebnikov +Cc: Matthew Wilcox +Cc: Hugh Dickins +Cc: Ross Zwisler +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + include/linux/radix-tree.h | 1 + + 1 file changed, 1 insertion(+) + +--- a/include/linux/radix-tree.h ++++ b/include/linux/radix-tree.h +@@ -399,6 +399,7 @@ static inline __must_check + void **radix_tree_iter_retry(struct radix_tree_iter *iter) + { + iter->next_index = iter->index; ++ iter->tags = 0; + return NULL; + } + diff --git a/queue-4.6/sched-debug-fix-deadlock-when-enabling-sched-events.patch b/queue-4.6/sched-debug-fix-deadlock-when-enabling-sched-events.patch new file mode 100644 index 00000000000..e69fe4042cf --- /dev/null +++ b/queue-4.6/sched-debug-fix-deadlock-when-enabling-sched-events.patch @@ -0,0 +1,97 @@ +From eda8dca519269c92a0771668b3d5678792de7b78 Mon Sep 17 00:00:00 2001 +From: Josh Poimboeuf +Date: Mon, 13 Jun 2016 02:32:09 -0500 +Subject: sched/debug: Fix deadlock when enabling sched events + +From: Josh Poimboeuf + +commit eda8dca519269c92a0771668b3d5678792de7b78 upstream. + +I see a hang when enabling sched events: + + echo 1 > /sys/kernel/debug/tracing/events/sched/enable + +The printk buffer shows: + + BUG: spinlock recursion on CPU#1, swapper/1/0 + lock: 0xffff88007d5d8c00, .magic: dead4ead, .owner: swapper/1/0, .owner_cpu: 1 + CPU: 1 PID: 0 Comm: swapper/1 Not tainted 4.7.0-rc2+ #1 + Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.8.1-20150318_183358- 04/01/2014 + ... 
+ Call Trace: + [] dump_stack+0x85/0xc2 + [] spin_dump+0x78/0xc0 + [] do_raw_spin_lock+0x11a/0x150 + [] _raw_spin_lock+0x61/0x80 + [] ? try_to_wake_up+0x256/0x4e0 + [] try_to_wake_up+0x256/0x4e0 + [] ? _raw_spin_unlock_irqrestore+0x4a/0x80 + [] wake_up_process+0x15/0x20 + [] insert_work+0x84/0xc0 + [] __queue_work+0x18f/0x660 + [] queue_work_on+0x46/0x90 + [] drm_fb_helper_dirty.isra.11+0xcb/0xe0 [drm_kms_helper] + [] drm_fb_helper_sys_imageblit+0x30/0x40 [drm_kms_helper] + [] soft_cursor+0x1ad/0x230 + [] bit_cursor+0x649/0x680 + [] ? update_attr.isra.2+0x90/0x90 + [] fbcon_cursor+0x14a/0x1c0 + [] hide_cursor+0x28/0x90 + [] vt_console_print+0x3bf/0x3f0 + [] call_console_drivers.constprop.24+0x183/0x200 + [] console_unlock+0x3d4/0x610 + [] vprintk_emit+0x3c5/0x610 + [] vprintk_default+0x29/0x40 + [] printk+0x57/0x73 + [] enqueue_entity+0xc2e/0xc70 + [] enqueue_task_fair+0x59/0xab0 + [] ? kvm_sched_clock_read+0x9/0x20 + [] ? sched_clock+0x9/0x10 + [] activate_task+0x5c/0xa0 + [] ttwu_do_activate+0x54/0xb0 + [] sched_ttwu_pending+0x7a/0xb0 + [] scheduler_ipi+0x61/0x170 + [] smp_trace_reschedule_interrupt+0x4f/0x2a0 + [] trace_reschedule_interrupt+0x96/0xa0 + [] ? native_safe_halt+0x6/0x10 + [] ? trace_hardirqs_on+0xd/0x10 + [] default_idle+0x20/0x1a0 + [] arch_cpu_idle+0xf/0x20 + [] default_idle_call+0x2f/0x50 + [] cpu_startup_entry+0x37e/0x450 + [] start_secondary+0x160/0x1a0 + +Note the hang only occurs when echoing the above from a physical serial +console, not from an ssh session. + +The bug is caused by a deadlock where the task is trying to grab the rq +lock twice because printk()'s aren't safe in sched code. + +Signed-off-by: Josh Poimboeuf +Cc: Linus Torvalds +Cc: Matt Fleming +Cc: Mel Gorman +Cc: Mike Galbraith +Cc: Peter Zijlstra +Cc: Srikar Dronamraju +Cc: Thomas Gleixner +Fixes: cb2517653fcc ("sched/debug: Make schedstats a runtime tunable that is disabled by default") +Link: http://lkml.kernel.org/r/20160613073209.gdvdybiruljbkn3p@treble +Signed-off-by: Ingo Molnar +Signed-off-by: Greg Kroah-Hartman + +--- + kernel/sched/fair.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -3194,7 +3194,7 @@ static inline void check_schedstat_requi + trace_sched_stat_iowait_enabled() || + trace_sched_stat_blocked_enabled() || + trace_sched_stat_runtime_enabled()) { +- pr_warn_once("Scheduler tracepoints stat_sleep, stat_iowait, " ++ printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, " + "stat_blocked and stat_runtime require the " + "kernel parameter schedstats=enabled or " + "kernel.sched_schedstats=1\n"); diff --git a/queue-4.6/series b/queue-4.6/series index 54f49665fc7..f665e079812 100644 --- a/queue-4.6/series +++ b/queue-4.6/series @@ -1 +1,33 @@ usb-ohci-don-t-mark-eds-as-ed_oper-if-scheduling-fails.patch +x86-quirks-apply-nvidia_bugs-quirk-only-on-root-bus.patch +x86-quirks-reintroduce-scanning-of-secondary-buses.patch +x86-quirks-add-early-quirk-to-reset-apple-airport-card.patch +dmaengine-at_xdmac-align-descriptors-on-64-bits.patch +dmaengine-at_xdmac-fix-residue-corruption.patch +dmaengine-at_xdmac-double-fifo-flush-needed-to-compute-residue.patch +mm-slb-add-__gfp_atomic-to-the-gfp-reclaim-mask.patch +memcg-mem_cgroup_migrate-may-be-called-with-irq-disabled.patch +memcg-css_alloc-should-return-an-err_ptr-value-on-error.patch +mm-swap.c-flush-lru-pvecs-on-compound-page-arrival.patch +mm-compaction-abort-free-scanner-if-split-fails.patch +fs-nilfs2-fix-potential-underflow-in-call-to-crc32_le.patch 
+mm-compaction-prevent-vm_bug_on-when-terminating-freeing-scanner.patch +uapi-export-lirc.h-header.patch +mm-meminit-always-return-a-valid-node-from-early_pfn_to_nid.patch +mm-meminit-ensure-node-is-online-before-checking-whether-pages-are-uninitialised.patch +vmlinux.lds-account-for-destructor-sections.patch +perf-test-ignore-kcore-files-in-the-vmlinux-matches-kallsyms-test.patch +mm-thp-refix-false-positive-bug-in-page_move_anon_rmap.patch +mm-memcontrol-fix-cgroup-creation-failure-after-many-small-jobs.patch +radix-tree-fix-radix_tree_iter_retry-for-tagged-iterators.patch +pps-do-not-crash-when-failed-to-register.patch +kernel-sysrq-watchdog-sched-core-reset-watchdog-on-all-cpus-while-processing-sysrq-w.patch +sched-debug-fix-deadlock-when-enabling-sched-events.patch +arc-unwind-warn-only-once-if-dw2_unwind-is-disabled.patch +arc-unwind-ensure-that-.debug_frame-is-generated-vs.-.eh_frame.patch +xen-pciback-fix-conf_space-read-write-overlap-check.patch +xen-blkfront-save-uncompleted-reqs-in-blkfront_resume.patch +xenbus-don-t-bug-on-user-mode-induced-condition.patch +xenbus-don-t-bail-early-from-xenbus_dev_request_and_reply.patch +xen-blkfront-fix-resume-issues-after-a-migration.patch +xen-blkfront-don-t-call-talk_to_blkback-when-already-connected-to-blkback.patch diff --git a/queue-4.6/uapi-export-lirc.h-header.patch b/queue-4.6/uapi-export-lirc.h-header.patch new file mode 100644 index 00000000000..e3dcee14966 --- /dev/null +++ b/queue-4.6/uapi-export-lirc.h-header.patch @@ -0,0 +1,38 @@ +From 12cb22bb8ae9aff9d72a9c0a234f26d641b20eb6 Mon Sep 17 00:00:00 2001 +From: Mauro Carvalho Chehab +Date: Thu, 14 Jul 2016 12:07:15 -0700 +Subject: uapi: export lirc.h header + +From: Mauro Carvalho Chehab + +commit 12cb22bb8ae9aff9d72a9c0a234f26d641b20eb6 upstream. + +This header contains the userspace API for lirc. + +This is a fixup for commit b7be755733dc ("[media] bz#75751: Move +internal header file lirc.h to uapi/"). It moved the header to the +right place, but it forgot to add it at Kbuild. So, despite being at +uapi, it is not copied to the right place. + +Fixes: b7be755733dc44c72 ("[media] bz#75751: Move internal header file lirc.h to uapi/") +Link: http://lkml.kernel.org/r/320c765d32bfc82c582e336d52ffe1026c73c644.1468439021.git.mchehab@s-opensource.com +Signed-off-by: Mauro Carvalho Chehab +Cc: Alec Leamas +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + include/uapi/linux/Kbuild | 1 + + 1 file changed, 1 insertion(+) + +--- a/include/uapi/linux/Kbuild ++++ b/include/uapi/linux/Kbuild +@@ -244,6 +244,7 @@ endif + header-y += hw_breakpoint.h + header-y += l2tp.h + header-y += libc-compat.h ++header-y += lirc.h + header-y += limits.h + header-y += llc.h + header-y += loop.h diff --git a/queue-4.6/vmlinux.lds-account-for-destructor-sections.patch b/queue-4.6/vmlinux.lds-account-for-destructor-sections.patch new file mode 100644 index 00000000000..b09b4dabbd7 --- /dev/null +++ b/queue-4.6/vmlinux.lds-account-for-destructor-sections.patch @@ -0,0 +1,81 @@ +From e41f501d391265ff568f3e49d6128cc30856a36f Mon Sep 17 00:00:00 2001 +From: Dmitry Vyukov +Date: Thu, 14 Jul 2016 12:07:29 -0700 +Subject: vmlinux.lds: account for destructor sections + +From: Dmitry Vyukov + +commit e41f501d391265ff568f3e49d6128cc30856a36f upstream. + +If CONFIG_KASAN is enabled and gcc is configured with +--disable-initfini-array and/or gold linker is used, gcc emits +.ctors/.dtors and .text.startup/.text.exit sections instead of +.init_array/.fini_array. 
.dtors section is not explicitly accounted in +the linker script and messes vvar/percpu layout. + +We want: + ffffffff822bfd80 D _edata + ffffffff822c0000 D __vvar_beginning_hack + ffffffff822c0000 A __vvar_page + ffffffff822c0080 0000000000000098 D vsyscall_gtod_data + ffffffff822c1000 A __init_begin + ffffffff822c1000 D init_per_cpu__irq_stack_union + ffffffff822c1000 A __per_cpu_load + ffffffff822d3000 D init_per_cpu__gdt_page + +We got: + ffffffff8279a600 D _edata + ffffffff8279b000 A __vvar_page + ffffffff8279c000 A __init_begin + ffffffff8279c000 D init_per_cpu__irq_stack_union + ffffffff8279c000 A __per_cpu_load + ffffffff8279e000 D __vvar_beginning_hack + ffffffff8279e080 0000000000000098 D vsyscall_gtod_data + ffffffff827ae000 D init_per_cpu__gdt_page + +This happens because __vvar_page and .vvar get different addresses in +arch/x86/kernel/vmlinux.lds.S: + + . = ALIGN(PAGE_SIZE); + __vvar_page = .; + + .vvar : AT(ADDR(.vvar) - LOAD_OFFSET) { + /* work around gold bug 13023 */ + __vvar_beginning_hack = .; + +Discard .dtors/.fini_array/.text.exit, since we don't call dtors. +Merge .text.startup into init text. + +Link: http://lkml.kernel.org/r/1467386363-120030-1-git-send-email-dvyukov@google.com +Signed-off-by: Dmitry Vyukov +Reviewed-by: Andrey Ryabinin +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + include/asm-generic/vmlinux.lds.h | 4 ++++ + 1 file changed, 4 insertions(+) + +--- a/include/asm-generic/vmlinux.lds.h ++++ b/include/asm-generic/vmlinux.lds.h +@@ -540,15 +540,19 @@ + + #define INIT_TEXT \ + *(.init.text) \ ++ *(.text.startup) \ + MEM_DISCARD(init.text) + + #define EXIT_DATA \ + *(.exit.data) \ ++ *(.fini_array) \ ++ *(.dtors) \ + MEM_DISCARD(exit.data) \ + MEM_DISCARD(exit.rodata) + + #define EXIT_TEXT \ + *(.exit.text) \ ++ *(.text.exit) \ + MEM_DISCARD(exit.text) + + #define EXIT_CALL \ diff --git a/queue-4.6/x86-quirks-add-early-quirk-to-reset-apple-airport-card.patch b/queue-4.6/x86-quirks-add-early-quirk-to-reset-apple-airport-card.patch new file mode 100644 index 00000000000..c9cb1cdd759 --- /dev/null +++ b/queue-4.6/x86-quirks-add-early-quirk-to-reset-apple-airport-card.patch @@ -0,0 +1,246 @@ +From abb2bafd295fe962bbadc329dbfb2146457283ac Mon Sep 17 00:00:00 2001 +From: Lukas Wunner +Date: Sun, 12 Jun 2016 12:31:53 +0200 +Subject: x86/quirks: Add early quirk to reset Apple AirPort card +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Lukas Wunner + +commit abb2bafd295fe962bbadc329dbfb2146457283ac upstream. + +The EFI firmware on Macs contains a full-fledged network stack for +downloading OS X images from osrecovery.apple.com. Unfortunately +on Macs introduced 2011 and 2012, EFI brings up the Broadcom 4331 +wireless card on every boot and leaves it enabled even after +ExitBootServices has been called. The card continues to assert its IRQ +line, causing spurious interrupts if the IRQ is shared. It also corrupts +memory by DMAing received packets, allowing for remote code execution +over the air. This only stops when a driver is loaded for the wireless +card, which may be never if the driver is not installed or blacklisted. + +The issue seems to be constrained to the Broadcom 4331. Chris Milsted +has verified that the newer Broadcom 4360 built into the MacBookPro11,3 +(2013/2014) does not exhibit this behaviour. The chances that Apple will +ever supply a firmware fix for the older machines appear to be zero. 
+ +The solution is to reset the card on boot by writing to a reset bit in +its mmio space. This must be done as an early quirk and not as a plain +vanilla PCI quirk to successfully combat memory corruption by DMAed +packets: Matthew Garrett found out in 2012 that the packets are written +to EfiBootServicesData memory (http://mjg59.dreamwidth.org/11235.html). +This type of memory is made available to the page allocator by +efi_free_boot_services(). Plain vanilla PCI quirks run much later, in +subsys initcall level. In-between a time window would be open for memory +corruption. Random crashes occurring in this time window and attributed +to DMAed packets have indeed been observed in the wild by Chris +Bainbridge. + +When Matthew Garrett analyzed the memory corruption issue in 2012, he +sought to fix it with a grub quirk which transitions the card to D3hot: +http://git.savannah.gnu.org/cgit/grub.git/commit/?id=9d34bb85da56 + +This approach does not help users with other bootloaders and while it +may prevent DMAed packets, it does not cure the spurious interrupts +emanating from the card. Unfortunately the card's mmio space is +inaccessible in D3hot, so to reset it, we have to undo the effect of +Matthew's grub patch and transition the card back to D0. + +Note that the quirk takes a few shortcuts to reduce the amount of code: +The size of BAR 0 and the location of the PM capability is identical +on all affected machines and therefore hardcoded. Only the address of +BAR 0 differs between models. Also, it is assumed that the BCMA core +currently mapped is the 802.11 core. The EFI driver seems to always take +care of this. + +Michael Büsch, Bjorn Helgaas and Matt Fleming contributed feedback +towards finding the best solution to this problem. + +The following should be a comprehensive list of affected models: + iMac13,1 2012 21.5" [Root Port 00:1c.3 = 8086:1e16] + iMac13,2 2012 27" [Root Port 00:1c.3 = 8086:1e16] + Macmini5,1 2011 i5 2.3 GHz [Root Port 00:1c.1 = 8086:1c12] + Macmini5,2 2011 i5 2.5 GHz [Root Port 00:1c.1 = 8086:1c12] + Macmini5,3 2011 i7 2.0 GHz [Root Port 00:1c.1 = 8086:1c12] + Macmini6,1 2012 i5 2.5 GHz [Root Port 00:1c.1 = 8086:1e12] + Macmini6,2 2012 i7 2.3 GHz [Root Port 00:1c.1 = 8086:1e12] + MacBookPro8,1 2011 13" [Root Port 00:1c.1 = 8086:1c12] + MacBookPro8,2 2011 15" [Root Port 00:1c.1 = 8086:1c12] + MacBookPro8,3 2011 17" [Root Port 00:1c.1 = 8086:1c12] + MacBookPro9,1 2012 15" [Root Port 00:1c.1 = 8086:1e12] + MacBookPro9,2 2012 13" [Root Port 00:1c.1 = 8086:1e12] + MacBookPro10,1 2012 15" [Root Port 00:1c.1 = 8086:1e12] + MacBookPro10,2 2012 13" [Root Port 00:1c.1 = 8086:1e12] + +For posterity, spurious interrupts caused by the Broadcom 4331 wireless +card resulted in splats like this (stacktrace omitted): + + irq 17: nobody cared (try booting with the "irqpoll" option) + handlers: + [] pcie_isr + [] sdhci_irq [sdhci] threaded [] sdhci_thread_irq [sdhci] + [] azx_interrupt [snd_hda_codec] + Disabling IRQ #17 + +Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=79301 +Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=111781 +Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=728916 +Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=895951#c16 +Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1009819 +Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1098621 +Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1149632#c5 +Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1279130 +Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1332732 
+Tested-by: Konstantin Simanov # [MacBookPro8,1] +Tested-by: Lukas Wunner # [MacBookPro9,1] +Tested-by: Bryan Paradis # [MacBookPro9,2] +Tested-by: Andrew Worsley # [MacBookPro10,1] +Tested-by: Chris Bainbridge # [MacBookPro10,2] +Signed-off-by: Lukas Wunner +Acked-by: Rafał Miłecki +Acked-by: Matt Fleming +Cc: Andy Lutomirski +Cc: Bjorn Helgaas +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Chris Milsted +Cc: Denys Vlasenko +Cc: H. Peter Anvin +Cc: Josh Poimboeuf +Cc: Linus Torvalds +Cc: Matthew Garrett +Cc: Michael Buesch +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Cc: Yinghai Lu +Cc: b43-dev@lists.infradead.org +Cc: linux-pci@vger.kernel.org +Cc: linux-wireless@vger.kernel.org +Link: http://lkml.kernel.org/r/48d0972ac82a53d460e5fce77a07b2560db95203.1465690253.git.lukas@wunner.de +[ Did minor readability edits. ] +Signed-off-by: Ingo Molnar +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/kernel/early-quirks.c | 64 +++++++++++++++++++++++++++++++++++++++++ + drivers/bcma/bcma_private.h | 2 - + include/linux/bcma/bcma.h | 1 + 3 files changed, 65 insertions(+), 2 deletions(-) + +--- a/arch/x86/kernel/early-quirks.c ++++ b/arch/x86/kernel/early-quirks.c +@@ -11,7 +11,11 @@ + + #include + #include ++#include ++#include + #include ++#include ++#include + #include + #include + #include +@@ -21,6 +25,9 @@ + #include + #include + #include ++#include ++ ++#define dev_err(msg) pr_err("pci 0000:%02x:%02x.%d: %s", bus, slot, func, msg) + + static void __init fix_hypertransport_config(int num, int slot, int func) + { +@@ -597,6 +604,61 @@ static void __init force_disable_hpet(in + #endif + } + ++#define BCM4331_MMIO_SIZE 16384 ++#define BCM4331_PM_CAP 0x40 ++#define bcma_aread32(reg) ioread32(mmio + 1 * BCMA_CORE_SIZE + reg) ++#define bcma_awrite32(reg, val) iowrite32(val, mmio + 1 * BCMA_CORE_SIZE + reg) ++ ++static void __init apple_airport_reset(int bus, int slot, int func) ++{ ++ void __iomem *mmio; ++ u16 pmcsr; ++ u64 addr; ++ int i; ++ ++ if (!dmi_match(DMI_SYS_VENDOR, "Apple Inc.")) ++ return; ++ ++ /* Card may have been put into PCI_D3hot by grub quirk */ ++ pmcsr = read_pci_config_16(bus, slot, func, BCM4331_PM_CAP + PCI_PM_CTRL); ++ ++ if ((pmcsr & PCI_PM_CTRL_STATE_MASK) != PCI_D0) { ++ pmcsr &= ~PCI_PM_CTRL_STATE_MASK; ++ write_pci_config_16(bus, slot, func, BCM4331_PM_CAP + PCI_PM_CTRL, pmcsr); ++ mdelay(10); ++ ++ pmcsr = read_pci_config_16(bus, slot, func, BCM4331_PM_CAP + PCI_PM_CTRL); ++ if ((pmcsr & PCI_PM_CTRL_STATE_MASK) != PCI_D0) { ++ dev_err("Cannot power up Apple AirPort card\n"); ++ return; ++ } ++ } ++ ++ addr = read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_0); ++ addr |= (u64)read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_1) << 32; ++ addr &= PCI_BASE_ADDRESS_MEM_MASK; ++ ++ mmio = early_ioremap(addr, BCM4331_MMIO_SIZE); ++ if (!mmio) { ++ dev_err("Cannot iomap Apple AirPort card\n"); ++ return; ++ } ++ ++ pr_info("Resetting Apple AirPort card (left enabled by EFI)\n"); ++ ++ for (i = 0; bcma_aread32(BCMA_RESET_ST) && i < 30; i++) ++ udelay(10); ++ ++ bcma_awrite32(BCMA_RESET_CTL, BCMA_RESET_CTL_RESET); ++ bcma_aread32(BCMA_RESET_CTL); ++ udelay(1); ++ ++ bcma_awrite32(BCMA_RESET_CTL, 0); ++ bcma_aread32(BCMA_RESET_CTL); ++ udelay(10); ++ ++ early_iounmap(mmio, BCM4331_MMIO_SIZE); ++} + + #define QFLAG_APPLY_ONCE 0x1 + #define QFLAG_APPLIED 0x2 +@@ -639,6 +701,8 @@ static struct chipset early_qrk[] __init + */ + { PCI_VENDOR_ID_INTEL, 0x0f00, + PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet}, ++ { PCI_VENDOR_ID_BROADCOM, 0x4331, ++ PCI_CLASS_NETWORK_OTHER, 
PCI_ANY_ID, 0, apple_airport_reset}, + {} + }; + +--- a/drivers/bcma/bcma_private.h ++++ b/drivers/bcma/bcma_private.h +@@ -8,8 +8,6 @@ + #include + #include + +-#define BCMA_CORE_SIZE 0x1000 +- + #define bcma_err(bus, fmt, ...) \ + pr_err("bus%d: " fmt, (bus)->num, ##__VA_ARGS__) + #define bcma_warn(bus, fmt, ...) \ +--- a/include/linux/bcma/bcma.h ++++ b/include/linux/bcma/bcma.h +@@ -158,6 +158,7 @@ struct bcma_host_ops { + #define BCMA_CORE_DEFAULT 0xFFF + + #define BCMA_MAX_NR_CORES 16 ++#define BCMA_CORE_SIZE 0x1000 + + /* Chip IDs of PCIe devices */ + #define BCMA_CHIP_ID_BCM4313 0x4313 diff --git a/queue-4.6/x86-quirks-apply-nvidia_bugs-quirk-only-on-root-bus.patch b/queue-4.6/x86-quirks-apply-nvidia_bugs-quirk-only-on-root-bus.patch new file mode 100644 index 00000000000..bec033c653b --- /dev/null +++ b/queue-4.6/x86-quirks-apply-nvidia_bugs-quirk-only-on-root-bus.patch @@ -0,0 +1,59 @@ +From 447d29d1d3aed839e74c2401ef63387780ac51ed Mon Sep 17 00:00:00 2001 +From: Lukas Wunner +Date: Sun, 12 Jun 2016 12:31:53 +0200 +Subject: x86/quirks: Apply nvidia_bugs quirk only on root bus + +From: Lukas Wunner + +commit 447d29d1d3aed839e74c2401ef63387780ac51ed upstream. + +Since the following commit: + + 8659c406ade3 ("x86: only scan the root bus in early PCI quirks") + +... early quirks are only applied to devices on the root bus. + +The motivation was to prevent application of the nvidia_bugs quirk on +secondary buses. + +We're about to reintroduce scanning of secondary buses for a quirk to +reset the Broadcom 4331 wireless card on 2011/2012 Macs. To prevent +regressions, open code the requirement to apply nvidia_bugs only on the +root bus. + +Signed-off-by: Lukas Wunner +Cc: Andy Lutomirski +Cc: Bjorn Helgaas +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Denys Vlasenko +Cc: H. Peter Anvin +Cc: Josh Poimboeuf +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Cc: Yinghai Lu +Link: http://lkml.kernel.org/r/4d5477c1d76b2f0387a780f2142bbcdd9fee869b.1465690253.git.lukas@wunner.de +Signed-off-by: Ingo Molnar +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/kernel/early-quirks.c | 7 +++++++ + 1 file changed, 7 insertions(+) + +--- a/arch/x86/kernel/early-quirks.c ++++ b/arch/x86/kernel/early-quirks.c +@@ -76,6 +76,13 @@ static void __init nvidia_bugs(int num, + #ifdef CONFIG_ACPI + #ifdef CONFIG_X86_IO_APIC + /* ++ * Only applies to Nvidia root ports (bus 0) and not to ++ * Nvidia graphics cards with PCI ports on secondary buses. ++ */ ++ if (num) ++ return; ++ ++ /* + * All timer overrides on Nvidia are + * wrong unless HPET is enabled. + * Unfortunately that's not true on many Asus boards. diff --git a/queue-4.6/x86-quirks-reintroduce-scanning-of-secondary-buses.patch b/queue-4.6/x86-quirks-reintroduce-scanning-of-secondary-buses.patch new file mode 100644 index 00000000000..06124d83973 --- /dev/null +++ b/queue-4.6/x86-quirks-reintroduce-scanning-of-secondary-buses.patch @@ -0,0 +1,151 @@ +From 850c321027c2e31d0afc71588974719a4b565550 Mon Sep 17 00:00:00 2001 +From: Lukas Wunner +Date: Sun, 12 Jun 2016 12:31:53 +0200 +Subject: x86/quirks: Reintroduce scanning of secondary buses + +From: Lukas Wunner + +commit 850c321027c2e31d0afc71588974719a4b565550 upstream. + +We used to scan secondary buses until the following commit that +was applied in 2009: + + 8659c406ade3 ("x86: only scan the root bus in early PCI quirks") + +which commit constrained early quirks to the root bus only. Its +motivation was to prevent application of the nvidia_bugs quirk +on secondary buses. 
+ +We're about to add a quirk to reset the Broadcom 4331 wireless card on +2011/2012 Macs, which is located on a secondary bus behind a PCIe root +port. To facilitate that, reintroduce scanning of secondary buses. + +The commit message of 8659c406ade3 notes that scanning only the root bus +"saves quite some unnecessary scanning work". The algorithm used prior +to 8659c406ade3 was particularly time consuming because it scanned +buses 0 to 31 brute force. To avoid lengthening boot time, employ a +recursive strategy which only scans buses that are actually reachable +from the root bus. + +Yinghai Lu pointed out that the secondary bus number read from a +bridge's config space may be invalid, in particular a value of 0 would +cause an infinite loop. The PCI core goes beyond that and recurses to a +child bus only if its bus number is greater than the parent bus number +(see pci_scan_bridge()). Since the root bus is numbered 0, this implies +that secondary buses may not be 0. Do the same on early scanning. + +If this algorithm is found to significantly impact boot time or cause +infinite loops on broken hardware, it would be possible to limit its +recursion depth: The Broadcom 4331 quirk applies at depth 1, all others +at depth 0, so the bus need not be scanned deeper than that for now. An +alternative approach would be to revert to scanning only the root bus, +and apply the Broadcom 4331 quirk to the root ports 8086:1c12, 8086:1e12 +and 8086:1e16. Apple always positioned the card behind either of these +three ports. The quirk would then check presence of the card in slot 0 +below the root port and do its deed. + +Signed-off-by: Lukas Wunner +Cc: Andy Lutomirski +Cc: Bjorn Helgaas +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Denys Vlasenko +Cc: H. Peter Anvin +Cc: Josh Poimboeuf +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Cc: Yinghai Lu +Cc: linux-pci@vger.kernel.org +Link: http://lkml.kernel.org/r/f0daa70dac1a9b2483abdb31887173eb6ab77bdf.1465690253.git.lukas@wunner.de +Signed-off-by: Ingo Molnar +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/kernel/early-quirks.c | 34 +++++++++++++++++++++------------- + 1 file changed, 21 insertions(+), 13 deletions(-) + +--- a/arch/x86/kernel/early-quirks.c ++++ b/arch/x86/kernel/early-quirks.c +@@ -610,12 +610,6 @@ struct chipset { + void (*f)(int num, int slot, int func); + }; + +-/* +- * Only works for devices on the root bus. If you add any devices +- * not on bus 0 readd another loop level in early_quirks(). But +- * be careful because at least the Nvidia quirk here relies on +- * only matching on bus 0. +- */ + static struct chipset early_qrk[] __initdata = { + { PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID, + PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, nvidia_bugs }, +@@ -648,6 +642,8 @@ static struct chipset early_qrk[] __init + {} + }; + ++static void __init early_pci_scan_bus(int bus); ++ + /** + * check_dev_quirk - apply early quirks to a given PCI device + * @num: bus number +@@ -656,7 +652,7 @@ static struct chipset early_qrk[] __init + * + * Check the vendor & device ID against the early quirks table. + * +- * If the device is single function, let early_quirks() know so we don't ++ * If the device is single function, let early_pci_scan_bus() know so we don't + * poke at this device again. 
+ */
+ static int __init check_dev_quirk(int num, int slot, int func)
+@@ -665,6 +661,7 @@ static int __init check_dev_quirk(int nu
+ u16 vendor;
+ u16 device;
+ u8 type;
++ u8 sec;
+ int i;
+
+ class = read_pci_config_16(num, slot, func, PCI_CLASS_DEVICE);
+@@ -692,25 +689,36 @@ static int __init check_dev_quirk(int nu
+
+ type = read_pci_config_byte(num, slot, func,
+ PCI_HEADER_TYPE);
++
++ if ((type & 0x7f) == PCI_HEADER_TYPE_BRIDGE) {
++ sec = read_pci_config_byte(num, slot, func, PCI_SECONDARY_BUS);
++ if (sec > num)
++ early_pci_scan_bus(sec);
++ }
++
+ if (!(type & 0x80))
+ return -1;
+
+ return 0;
+ }
+
+-void __init early_quirks(void)
++static void __init early_pci_scan_bus(int bus)
+ {
+ int slot, func;
+
+- if (!early_pci_allowed())
+- return;
+-
+ /* Poor man's PCI discovery */
+- /* Only scan the root bus */
+ for (slot = 0; slot < 32; slot++)
+ for (func = 0; func < 8; func++) {
+ /* Only probe function 0 on single fn devices */
+- if (check_dev_quirk(0, slot, func))
++ if (check_dev_quirk(bus, slot, func))
+ break;
+ }
+ }
++
++void __init early_quirks(void)
++{
++ if (!early_pci_allowed())
++ return;
++
++ early_pci_scan_bus(0);
++}
diff --git a/queue-4.6/xen-blkfront-don-t-call-talk_to_blkback-when-already-connected-to-blkback.patch b/queue-4.6/xen-blkfront-don-t-call-talk_to_blkback-when-already-connected-to-blkback.patch
new file mode 100644
index 00000000000..27dd0a76a65
--- /dev/null
+++ b/queue-4.6/xen-blkfront-don-t-call-talk_to_blkback-when-already-connected-to-blkback.patch
@@ -0,0 +1,86 @@
+From efd1535270c1deb0487527bf0c3c827301a69c93 Mon Sep 17 00:00:00 2001
+From: Bob Liu
+Date: Tue, 7 Jun 2016 10:43:15 -0400
+Subject: xen-blkfront: don't call talk_to_blkback when already connected to blkback
+
+From: Bob Liu
+
+commit efd1535270c1deb0487527bf0c3c827301a69c93 upstream.
+
+Sometimes blkfront may receive the blkback_changed() notification
+(XenbusStateConnected) twice after migration, which will cause
+talk_to_blkback() to be called twice too and confuse xen-blkback.
+
+The flow is as follows:
+ blkfront blkback
+blkfront_resume()
+ > talk_to_blkback()
+ > Set blkfront to XenbusStateInitialised
+ front_changed()
+ > Connect()
+ > Set blkback to XenbusStateConnected
+
+blkback_changed()
+ > Skip talk_to_blkback()
+ because frontstate == XenbusStateInitialised
+ > blkfront_connect()
+ > Set blkfront to XenbusStateConnected
+
+-----
+And here we get another XenbusStateConnected notification leading
+to:
+-----
+blkback_changed()
+ > because now frontstate != XenbusStateInitialised
+ talk_to_blkback() is also called again
+ > blkfront state changed from
+ XenbusStateConnected to XenbusStateInitialised
+ (Which is not correct!)
+
+ front_changed():
+ > Do nothing because blkback
+ already in XenbusStateConnected
+
+Now blkback is in XenbusStateConnected but blkfront is still
+in XenbusStateInitialised - leading to no disks.
+
+Poking the XenbusStateConnected state is allowed (to deal with
+block disk changes) and has to be dealt with. The most likely
+cause of this bug is custom udev scripts hooking up the disks
+and then validating the size.
+
+Signed-off-by: Bob Liu
+Signed-off-by: Konrad Rzeszutek Wilk
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ drivers/block/xen-blkfront.c | 15 ++++++++++++++-
+ 1 file changed, 14 insertions(+), 1 deletion(-)
+
+--- a/drivers/block/xen-blkfront.c
++++ b/drivers/block/xen-blkfront.c
+@@ -2469,10 +2469,23 @@ static void blkback_changed(struct xenbu
+ break;
+
+ case XenbusStateConnected:
+- if (dev->state != XenbusStateInitialised) {
++ /*
++ * talk_to_blkback sets state to XenbusStateInitialised
++ * and blkfront_connect sets it to XenbusStateConnected
++ * (if connection went OK).
++ *
++ * If the backend (or toolstack) decides to poke at backend
++ * state (and re-trigger the watch by setting the state repeatedly
++ * to XenbusStateConnected (4)) we need to deal with this.
++ * This is allowed as this is used to communicate to the guest
++ * that the size of disk has changed!
++ */
++ if ((dev->state != XenbusStateInitialised) &&
++ (dev->state != XenbusStateConnected)) {
+ if (talk_to_blkback(dev, info))
+ break;
+ }
++
+ blkfront_connect(info);
+ break;
+
diff --git a/queue-4.6/xen-blkfront-fix-resume-issues-after-a-migration.patch b/queue-4.6/xen-blkfront-fix-resume-issues-after-a-migration.patch
new file mode 100644
index 00000000000..f69190ef26a
--- /dev/null
+++ b/queue-4.6/xen-blkfront-fix-resume-issues-after-a-migration.patch
@@ -0,0 +1,81 @@
+From 2a6f71ad99cabe436e70c3f5fcf58072cb3bc07f Mon Sep 17 00:00:00 2001
+From: Bob Liu
+Date: Tue, 31 May 2016 16:59:17 +0800
+Subject: xen-blkfront: fix resume issues after a migration
+
+From: Bob Liu
+
+commit 2a6f71ad99cabe436e70c3f5fcf58072cb3bc07f upstream.
+
+After a migration to another host (which may not have multiqueue
+support), the number of rings (block hardware queues)
+may change and the ring info structure will also be reallocated.
+
+This patch fixes two related bugs:
+ * call blk_mq_update_nr_hw_queues() to let blk-core know that the
+ number of hardware queues has changed.
+ * Don't store the rinfo pointer in hctx->driver_data, because rinfo
+ may be reallocated; use hctx->queue_num to get the rinfo structure
+ instead.
+
+Signed-off-by: Bob Liu
+Signed-off-by: Konrad Rzeszutek Wilk
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ drivers/block/xen-blkfront.c | 20 ++++++++------------
+ 1 file changed, 8 insertions(+), 12 deletions(-)
+
+--- a/drivers/block/xen-blkfront.c
++++ b/drivers/block/xen-blkfront.c
+@@ -877,8 +877,12 @@ static int blkif_queue_rq(struct blk_mq_
+ const struct blk_mq_queue_data *qd)
+ {
+ unsigned long flags;
+- struct blkfront_ring_info *rinfo = (struct blkfront_ring_info *)hctx->driver_data;
++ int qid = hctx->queue_num;
++ struct blkfront_info *info = hctx->queue->queuedata;
++ struct blkfront_ring_info *rinfo = NULL;
+
++ BUG_ON(info->nr_rings <= qid);
++ rinfo = &info->rinfo[qid];
+ blk_mq_start_request(qd->rq);
+ spin_lock_irqsave(&rinfo->ring_lock, flags);
+ if (RING_FULL(&rinfo->ring))
+@@ -904,20 +908,9 @@ out_busy:
+ return BLK_MQ_RQ_QUEUE_BUSY;
+ }
+
+-static int blk_mq_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
+- unsigned int index)
+-{
+- struct blkfront_info *info = (struct blkfront_info *)data;
+-
+- BUG_ON(info->nr_rings <= index);
+- hctx->driver_data = &info->rinfo[index];
+- return 0;
+-}
+-
+ static struct blk_mq_ops blkfront_mq_ops = {
+ .queue_rq = blkif_queue_rq,
+ .map_queue = blk_mq_map_queue,
+- .init_hctx = blk_mq_init_hctx,
+ };
+
+ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size,
+@@ -953,6 +946,7 @@ static int xlvbd_init_blk_queue(struct g
+ return PTR_ERR(rq);
+ }
+
++ rq->queuedata = info;
+ queue_flag_set_unlocked(QUEUE_FLAG_VIRT, rq);
+
+ if (info->feature_discard) {
+@@ -2137,6 +2131,8 @@ static int blkfront_resume(struct xenbus
+ return err;
+
+ err = talk_to_blkback(dev, info);
++ if (!err)
++ blk_mq_update_nr_hw_queues(&info->tag_set, info->nr_rings);
+
+ /*
+ * We have to wait for the backend to switch to
diff --git a/queue-4.6/xen-blkfront-save-uncompleted-reqs-in-blkfront_resume.patch b/queue-4.6/xen-blkfront-save-uncompleted-reqs-in-blkfront_resume.patch
new file mode 100644
index 00000000000..79ff83a15b2
--- /dev/null
+++ b/queue-4.6/xen-blkfront-save-uncompleted-reqs-in-blkfront_resume.patch
@@ -0,0 +1,177 @@
+From 7b427a59538a98161321aa46c13f4ea81b43f4eb Mon Sep 17 00:00:00 2001
+From: Bob Liu
+Date: Mon, 27 Jun 2016 16:33:24 +0800
+Subject: xen-blkfront: save uncompleted reqs in blkfront_resume()
+
+From: Bob Liu
+
+commit 7b427a59538a98161321aa46c13f4ea81b43f4eb upstream.
+
+Uncompleted reqs used to be 'saved and resubmitted' in blkfront_recover()
+during migration, but that's too late after multi-queue was introduced.
+
+After a migration to another host (which may not have multiqueue support),
+the number of rings (block hardware queues) may change, and the ring and
+shadow structures will also be reallocated.
+
+blkfront_recover() then can't 'save and resubmit' the real uncompleted
+reqs because the shadow structure has been reallocated.
+
+This patch fixes the issue by moving the 'save' logic out of
+blkfront_recover() to an earlier place, in blkfront_resume().
+
+The 'resubmit' logic is not changed and is still in blkfront_recover().
+ +Signed-off-by: Bob Liu +Signed-off-by: Konrad Rzeszutek Wilk +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/block/xen-blkfront.c | 91 ++++++++++++++++++------------------------- + 1 file changed, 40 insertions(+), 51 deletions(-) + +--- a/drivers/block/xen-blkfront.c ++++ b/drivers/block/xen-blkfront.c +@@ -207,6 +207,9 @@ struct blkfront_info + struct blk_mq_tag_set tag_set; + struct blkfront_ring_info *rinfo; + unsigned int nr_rings; ++ /* Save uncomplete reqs and bios for migration. */ ++ struct list_head requests; ++ struct bio_list bio_list; + }; + + static unsigned int nr_minors; +@@ -2007,69 +2010,22 @@ static int blkif_recover(struct blkfront + { + unsigned int i, r_index; + struct request *req, *n; +- struct blk_shadow *copy; + int rc; + struct bio *bio, *cloned_bio; +- struct bio_list bio_list, merge_bio; + unsigned int segs, offset; + int pending, size; + struct split_bio *split_bio; +- struct list_head requests; + + blkfront_gather_backend_features(info); + segs = info->max_indirect_segments ? : BLKIF_MAX_SEGMENTS_PER_REQUEST; + blk_queue_max_segments(info->rq, segs); +- bio_list_init(&bio_list); +- INIT_LIST_HEAD(&requests); + + for (r_index = 0; r_index < info->nr_rings; r_index++) { +- struct blkfront_ring_info *rinfo; +- +- rinfo = &info->rinfo[r_index]; +- /* Stage 1: Make a safe copy of the shadow state. */ +- copy = kmemdup(rinfo->shadow, sizeof(rinfo->shadow), +- GFP_NOIO | __GFP_REPEAT | __GFP_HIGH); +- if (!copy) +- return -ENOMEM; +- +- /* Stage 2: Set up free list. */ +- memset(&rinfo->shadow, 0, sizeof(rinfo->shadow)); +- for (i = 0; i < BLK_RING_SIZE(info); i++) +- rinfo->shadow[i].req.u.rw.id = i+1; +- rinfo->shadow_free = rinfo->ring.req_prod_pvt; +- rinfo->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff; ++ struct blkfront_ring_info *rinfo = &info->rinfo[r_index]; + + rc = blkfront_setup_indirect(rinfo); +- if (rc) { +- kfree(copy); ++ if (rc) + return rc; +- } +- +- for (i = 0; i < BLK_RING_SIZE(info); i++) { +- /* Not in use? */ +- if (!copy[i].request) +- continue; +- +- /* +- * Get the bios in the request so we can re-queue them. 
+- */
+- if (copy[i].request->cmd_flags &
+- (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) {
+- /*
+- * Flush operations don't contain bios, so
+- * we need to requeue the whole request
+- */
+- list_add(&copy[i].request->queuelist, &requests);
+- continue;
+- }
+- merge_bio.head = copy[i].request->bio;
+- merge_bio.tail = copy[i].request->biotail;
+- bio_list_merge(&bio_list, &merge_bio);
+- copy[i].request->bio = NULL;
+- blk_end_request_all(copy[i].request, 0);
+- }
+-
+- kfree(copy);
+ }
+ xenbus_switch_state(info->xbdev, XenbusStateConnected);
+
+@@ -2084,7 +2040,7 @@ static int blkif_recover(struct blkfront
+ kick_pending_request_queues(rinfo);
+ }
+
+- list_for_each_entry_safe(req, n, &requests, queuelist) {
++ list_for_each_entry_safe(req, n, &info->requests, queuelist) {
+ /* Requeue pending requests (flush or discard) */
+ list_del_init(&req->queuelist);
+ BUG_ON(req->nr_phys_segments > segs);
+@@ -2092,7 +2048,7 @@ static int blkif_recover(struct blkfront
+ }
+ blk_mq_kick_requeue_list(info->rq);
+
+- while ((bio = bio_list_pop(&bio_list)) != NULL) {
++ while ((bio = bio_list_pop(&info->bio_list)) != NULL) {
+ /* Traverse the list of pending bios and re-queue them */
+ if (bio_segments(bio) > segs) {
+ /*
+@@ -2138,9 +2094,42 @@ static int blkfront_resume(struct xenbus
+ {
+ struct blkfront_info *info = dev_get_drvdata(&dev->dev);
+ int err = 0;
++ unsigned int i, j;
+
+ dev_dbg(&dev->dev, "blkfront_resume: %s\n", dev->nodename);
+
++ bio_list_init(&info->bio_list);
++ INIT_LIST_HEAD(&info->requests);
++ for (i = 0; i < info->nr_rings; i++) {
++ struct blkfront_ring_info *rinfo = &info->rinfo[i];
++ struct bio_list merge_bio;
++ struct blk_shadow *shadow = rinfo->shadow;
++
++ for (j = 0; j < BLK_RING_SIZE(info); j++) {
++ /* Not in use? */
++ if (!shadow[j].request)
++ continue;
++
++ /*
++ * Get the bios in the request so we can re-queue them.
++ */
++ if (shadow[j].request->cmd_flags &
++ (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) {
++ /*
++ * Flush operations don't contain bios, so
++ * we need to requeue the whole request
++ */
++ list_add(&shadow[j].request->queuelist, &info->requests);
++ continue;
++ }
++ merge_bio.head = shadow[j].request->bio;
++ merge_bio.tail = shadow[j].request->biotail;
++ bio_list_merge(&info->bio_list, &merge_bio);
++ shadow[j].request->bio = NULL;
++ blk_mq_end_request(shadow[j].request, 0);
++ }
++ }
++
+ blkif_free(info, info->connected == BLKIF_STATE_CONNECTED);
+
+ err = negotiate_mq(info);
diff --git a/queue-4.6/xen-pciback-fix-conf_space-read-write-overlap-check.patch b/queue-4.6/xen-pciback-fix-conf_space-read-write-overlap-check.patch
new file mode 100644
index 00000000000..331f342203b
--- /dev/null
+++ b/queue-4.6/xen-pciback-fix-conf_space-read-write-overlap-check.patch
@@ -0,0 +1,55 @@
+From 02ef871ecac290919ea0c783d05da7eedeffc10e Mon Sep 17 00:00:00 2001
+From: Andrey Grodzovsky
+Date: Tue, 21 Jun 2016 14:26:36 -0400
+Subject: xen/pciback: Fix conf_space read/write overlap check.
+
+From: Andrey Grodzovsky
+
+commit 02ef871ecac290919ea0c783d05da7eedeffc10e upstream.
+
+The current overlap check evaluates to false in a case where a filter
+field is fully contained in (a proper subset of) a r/w request. This
+change applies the classical overlap check instead, to cover all the
+scenarios.
+
+More specifically, for the (Hilscher GmbH CIFX 50E-DP(M/S)) device
+driver the logic is such that the entire confspace is read and written
+in 4 byte chunks. In this case, as an example, CACHE_LINE_SIZE,
+LATENCY_TIMER and PCI_BIST arrive together in one call to
+xen_pcibk_config_write() with offset == 0xc and size == 4. With the
+existing overlap check the LATENCY_TIMER field (offset == 0xd, length
+== 1) is fully contained in the write request and hence is excluded
+from the write, which is incorrect.
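+
+To see the difference in isolation (illustration only - the helpers
+below are made up and exist nowhere in the kernel), the two predicates
+can be checked against the half-open intervals [req_start, req_end)
+and [field_start, field_end) used by the driver:
+
+ #include <assert.h>
+ #include <stdbool.h>
+
+ /* old check: matches only if the request starts or ends inside the
+ * field, so it misses a field properly contained in the request */
+ static bool old_check(int rs, int re, int fs, int fe)
+ {
+ return (rs >= fs && rs < fe) || (re > fs && re <= fe);
+ }
+
+ /* new check: classical interval-overlap test */
+ static bool new_check(int rs, int re, int fs, int fe)
+ {
+ return re > fs && fe > rs;
+ }
+
+ int main(void)
+ {
+ /* the example above: 4-byte write at 0xc vs. LATENCY_TIMER
+ * at offset 0xd, length 1 */
+ assert(!old_check(0xc, 0xc + 4, 0xd, 0xd + 1)); /* field skipped */
+ assert(new_check(0xc, 0xc + 4, 0xd, 0xd + 1)); /* field matched */
+ return 0;
+ }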
+
+Signed-off-by: Andrey Grodzovsky
+Reviewed-by: Boris Ostrovsky
+Reviewed-by: Jan Beulich
+Signed-off-by: David Vrabel
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ drivers/xen/xen-pciback/conf_space.c | 6 ++----
+ 1 file changed, 2 insertions(+), 4 deletions(-)
+
+--- a/drivers/xen/xen-pciback/conf_space.c
++++ b/drivers/xen/xen-pciback/conf_space.c
+@@ -183,8 +183,7 @@ int xen_pcibk_config_read(struct pci_dev
+ field_start = OFFSET(cfg_entry);
+ field_end = OFFSET(cfg_entry) + field->size;
+
+- if ((req_start >= field_start && req_start < field_end)
+- || (req_end > field_start && req_end <= field_end)) {
++ if (req_end > field_start && field_end > req_start) {
+ err = conf_space_read(dev, cfg_entry, field_start,
+ &tmp_val);
+ if (err)
+@@ -230,8 +229,7 @@ int xen_pcibk_config_write(struct pci_de
+ field_start = OFFSET(cfg_entry);
+ field_end = OFFSET(cfg_entry) + field->size;
+
+- if ((req_start >= field_start && req_start < field_end)
+- || (req_end > field_start && req_end <= field_end)) {
++ if (req_end > field_start && field_end > req_start) {
+ tmp_val = 0;
+
+ err = xen_pcibk_config_read(dev, field_start,
diff --git a/queue-4.6/xenbus-don-t-bail-early-from-xenbus_dev_request_and_reply.patch b/queue-4.6/xenbus-don-t-bail-early-from-xenbus_dev_request_and_reply.patch
new file mode 100644
index 00000000000..fd00317ce1f
--- /dev/null
+++ b/queue-4.6/xenbus-don-t-bail-early-from-xenbus_dev_request_and_reply.patch
@@ -0,0 +1,51 @@
+From 7469be95a487319514adce2304ad2af3553d2fc9 Mon Sep 17 00:00:00 2001
+From: Jan Beulich
+Date: Thu, 7 Jul 2016 01:32:04 -0600
+Subject: xenbus: don't bail early from xenbus_dev_request_and_reply()
+
+From: Jan Beulich
+
+commit 7469be95a487319514adce2304ad2af3553d2fc9 upstream.
+
+xenbus_dev_request_and_reply() needs to track whether a transaction is
+open. For XS_TRANSACTION_START messages it calls transaction_start()
+and for XS_TRANSACTION_END messages it calls transaction_end().
+
+If sending an XS_TRANSACTION_START message fails or responds with an
+error, the transaction is not open and transaction_end() must be
+called.
+
+If sending an XS_TRANSACTION_END message fails, the transaction is
+still open, but if an error response is returned the transaction is
+closed.
+
+Commit 027bd7e89906 ("xen/xenbus: Avoid synchronous wait on XenBus
+stalling shutdown/restart") introduced a regression where failed
+XS_TRANSACTION_START messages were leaving the transaction open. This
+can cause problems with suspend (and migration) as all transactions
+must be closed before suspending.
+
+It appears that the problematic change was added accidentally, so just
+remove it.
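+
+The rules above condense to a small truth table; as a purely
+illustrative sketch (this helper exists nowhere in the kernel, and the
+names are invented):
+
+ #include <stdbool.h>
+
+ enum outcome { SEND_FAILED, ERROR_REPLY, OK_REPLY };
+
+ /* Does a transaction remain open after the exchange? 'is_start'
+ * distinguishes XS_TRANSACTION_START from XS_TRANSACTION_END. */
+ static bool transaction_open(bool is_start, enum outcome o)
+ {
+ if (is_start)
+ return o == OK_REPLY; /* failed/error start is not open */
+ return o == SEND_FAILED; /* an error reply still closes it */
+ }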
+ +Signed-off-by: Jan Beulich +Cc: Konrad Rzeszutek Wilk +Signed-off-by: David Vrabel +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/xen/xenbus/xenbus_xs.c | 3 --- + 1 file changed, 3 deletions(-) + +--- a/drivers/xen/xenbus/xenbus_xs.c ++++ b/drivers/xen/xenbus/xenbus_xs.c +@@ -249,9 +249,6 @@ void *xenbus_dev_request_and_reply(struc + + mutex_unlock(&xs_state.request_mutex); + +- if (IS_ERR(ret)) +- return ret; +- + if ((msg->type == XS_TRANSACTION_END) || + ((req_msg.type == XS_TRANSACTION_START) && + (msg->type == XS_ERROR))) diff --git a/queue-4.6/xenbus-don-t-bug-on-user-mode-induced-condition.patch b/queue-4.6/xenbus-don-t-bug-on-user-mode-induced-condition.patch new file mode 100644 index 00000000000..c8e42611595 --- /dev/null +++ b/queue-4.6/xenbus-don-t-bug-on-user-mode-induced-condition.patch @@ -0,0 +1,57 @@ +From 0beef634b86a1350c31da5fcc2992f0d7c8a622b Mon Sep 17 00:00:00 2001 +From: Jan Beulich +Date: Thu, 7 Jul 2016 01:23:57 -0600 +Subject: xenbus: don't BUG() on user mode induced condition + +From: Jan Beulich + +commit 0beef634b86a1350c31da5fcc2992f0d7c8a622b upstream. + +Inability to locate a user mode specified transaction ID should not +lead to a kernel crash. For other than XS_TRANSACTION_START also +don't issue anything to xenbus if the specified ID doesn't match that +of any active transaction. + +Signed-off-by: Jan Beulich +Signed-off-by: David Vrabel +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/xen/xenbus/xenbus_dev_frontend.c | 14 ++++++++------ + 1 file changed, 8 insertions(+), 6 deletions(-) + +--- a/drivers/xen/xenbus/xenbus_dev_frontend.c ++++ b/drivers/xen/xenbus/xenbus_dev_frontend.c +@@ -316,11 +316,18 @@ static int xenbus_write_transaction(unsi + rc = -ENOMEM; + goto out; + } ++ } else { ++ list_for_each_entry(trans, &u->transactions, list) ++ if (trans->handle.id == u->u.msg.tx_id) ++ break; ++ if (&trans->list == &u->transactions) ++ return -ESRCH; + } + + reply = xenbus_dev_request_and_reply(&u->u.msg); + if (IS_ERR(reply)) { +- kfree(trans); ++ if (msg_type == XS_TRANSACTION_START) ++ kfree(trans); + rc = PTR_ERR(reply); + goto out; + } +@@ -333,12 +340,7 @@ static int xenbus_write_transaction(unsi + list_add(&trans->list, &u->transactions); + } + } else if (u->u.msg.type == XS_TRANSACTION_END) { +- list_for_each_entry(trans, &u->transactions, list) +- if (trans->handle.id == u->u.msg.tx_id) +- break; +- BUG_ON(&trans->list == &u->transactions); + list_del(&trans->list); +- + kfree(trans); + } +