]> git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
4.6-stable patches
authorGreg Kroah-Hartman <gregkh@linuxfoundation.org>
Tue, 2 Aug 2016 07:04:26 +0000 (09:04 +0200)
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>
Tue, 2 Aug 2016 07:04:26 +0000 (09:04 +0200)
added patches:
arc-unwind-ensure-that-.debug_frame-is-generated-vs.-.eh_frame.patch
arc-unwind-warn-only-once-if-dw2_unwind-is-disabled.patch
dmaengine-at_xdmac-align-descriptors-on-64-bits.patch
dmaengine-at_xdmac-double-fifo-flush-needed-to-compute-residue.patch
dmaengine-at_xdmac-fix-residue-corruption.patch
fs-nilfs2-fix-potential-underflow-in-call-to-crc32_le.patch
kernel-sysrq-watchdog-sched-core-reset-watchdog-on-all-cpus-while-processing-sysrq-w.patch
memcg-css_alloc-should-return-an-err_ptr-value-on-error.patch
memcg-mem_cgroup_migrate-may-be-called-with-irq-disabled.patch
mm-compaction-abort-free-scanner-if-split-fails.patch
mm-compaction-prevent-vm_bug_on-when-terminating-freeing-scanner.patch
mm-memcontrol-fix-cgroup-creation-failure-after-many-small-jobs.patch
mm-meminit-always-return-a-valid-node-from-early_pfn_to_nid.patch
mm-meminit-ensure-node-is-online-before-checking-whether-pages-are-uninitialised.patch
mm-slb-add-__gfp_atomic-to-the-gfp-reclaim-mask.patch
mm-swap.c-flush-lru-pvecs-on-compound-page-arrival.patch
mm-thp-refix-false-positive-bug-in-page_move_anon_rmap.patch
perf-test-ignore-kcore-files-in-the-vmlinux-matches-kallsyms-test.patch
pps-do-not-crash-when-failed-to-register.patch
radix-tree-fix-radix_tree_iter_retry-for-tagged-iterators.patch
sched-debug-fix-deadlock-when-enabling-sched-events.patch
uapi-export-lirc.h-header.patch
vmlinux.lds-account-for-destructor-sections.patch
x86-quirks-add-early-quirk-to-reset-apple-airport-card.patch
x86-quirks-apply-nvidia_bugs-quirk-only-on-root-bus.patch
x86-quirks-reintroduce-scanning-of-secondary-buses.patch
xen-blkfront-don-t-call-talk_to_blkback-when-already-connected-to-blkback.patch
xen-blkfront-fix-resume-issues-after-a-migration.patch
xen-blkfront-save-uncompleted-reqs-in-blkfront_resume.patch
xen-pciback-fix-conf_space-read-write-overlap-check.patch
xenbus-don-t-bail-early-from-xenbus_dev_request_and_reply.patch
xenbus-don-t-bug-on-user-mode-induced-condition.patch

33 files changed:
queue-4.6/arc-unwind-ensure-that-.debug_frame-is-generated-vs.-.eh_frame.patch [new file with mode: 0644]
queue-4.6/arc-unwind-warn-only-once-if-dw2_unwind-is-disabled.patch [new file with mode: 0644]
queue-4.6/dmaengine-at_xdmac-align-descriptors-on-64-bits.patch [new file with mode: 0644]
queue-4.6/dmaengine-at_xdmac-double-fifo-flush-needed-to-compute-residue.patch [new file with mode: 0644]
queue-4.6/dmaengine-at_xdmac-fix-residue-corruption.patch [new file with mode: 0644]
queue-4.6/fs-nilfs2-fix-potential-underflow-in-call-to-crc32_le.patch [new file with mode: 0644]
queue-4.6/kernel-sysrq-watchdog-sched-core-reset-watchdog-on-all-cpus-while-processing-sysrq-w.patch [new file with mode: 0644]
queue-4.6/memcg-css_alloc-should-return-an-err_ptr-value-on-error.patch [new file with mode: 0644]
queue-4.6/memcg-mem_cgroup_migrate-may-be-called-with-irq-disabled.patch [new file with mode: 0644]
queue-4.6/mm-compaction-abort-free-scanner-if-split-fails.patch [new file with mode: 0644]
queue-4.6/mm-compaction-prevent-vm_bug_on-when-terminating-freeing-scanner.patch [new file with mode: 0644]
queue-4.6/mm-memcontrol-fix-cgroup-creation-failure-after-many-small-jobs.patch [new file with mode: 0644]
queue-4.6/mm-meminit-always-return-a-valid-node-from-early_pfn_to_nid.patch [new file with mode: 0644]
queue-4.6/mm-meminit-ensure-node-is-online-before-checking-whether-pages-are-uninitialised.patch [new file with mode: 0644]
queue-4.6/mm-slb-add-__gfp_atomic-to-the-gfp-reclaim-mask.patch [new file with mode: 0644]
queue-4.6/mm-swap.c-flush-lru-pvecs-on-compound-page-arrival.patch [new file with mode: 0644]
queue-4.6/mm-thp-refix-false-positive-bug-in-page_move_anon_rmap.patch [new file with mode: 0644]
queue-4.6/perf-test-ignore-kcore-files-in-the-vmlinux-matches-kallsyms-test.patch [new file with mode: 0644]
queue-4.6/pps-do-not-crash-when-failed-to-register.patch [new file with mode: 0644]
queue-4.6/radix-tree-fix-radix_tree_iter_retry-for-tagged-iterators.patch [new file with mode: 0644]
queue-4.6/sched-debug-fix-deadlock-when-enabling-sched-events.patch [new file with mode: 0644]
queue-4.6/series
queue-4.6/uapi-export-lirc.h-header.patch [new file with mode: 0644]
queue-4.6/vmlinux.lds-account-for-destructor-sections.patch [new file with mode: 0644]
queue-4.6/x86-quirks-add-early-quirk-to-reset-apple-airport-card.patch [new file with mode: 0644]
queue-4.6/x86-quirks-apply-nvidia_bugs-quirk-only-on-root-bus.patch [new file with mode: 0644]
queue-4.6/x86-quirks-reintroduce-scanning-of-secondary-buses.patch [new file with mode: 0644]
queue-4.6/xen-blkfront-don-t-call-talk_to_blkback-when-already-connected-to-blkback.patch [new file with mode: 0644]
queue-4.6/xen-blkfront-fix-resume-issues-after-a-migration.patch [new file with mode: 0644]
queue-4.6/xen-blkfront-save-uncompleted-reqs-in-blkfront_resume.patch [new file with mode: 0644]
queue-4.6/xen-pciback-fix-conf_space-read-write-overlap-check.patch [new file with mode: 0644]
queue-4.6/xenbus-don-t-bail-early-from-xenbus_dev_request_and_reply.patch [new file with mode: 0644]
queue-4.6/xenbus-don-t-bug-on-user-mode-induced-condition.patch [new file with mode: 0644]

diff --git a/queue-4.6/arc-unwind-ensure-that-.debug_frame-is-generated-vs.-.eh_frame.patch b/queue-4.6/arc-unwind-ensure-that-.debug_frame-is-generated-vs.-.eh_frame.patch
new file mode 100644 (file)
index 0000000..22a3ffa
--- /dev/null
@@ -0,0 +1,45 @@
+From f52e126cc7476196f44f3c313b7d9f0699a881fc Mon Sep 17 00:00:00 2001
+From: Vineet Gupta <vgupta@synopsys.com>
+Date: Tue, 28 Jun 2016 09:42:25 +0530
+Subject: ARC: unwind: ensure that .debug_frame is generated (vs. .eh_frame)
+
+From: Vineet Gupta <vgupta@synopsys.com>
+
+commit f52e126cc7476196f44f3c313b7d9f0699a881fc upstream.
+
+With recent binutils update to support dwarf CFI pseudo-ops in gas, we
+now get .eh_frame vs. .debug_frame. Although the call frame info is
+exactly the same in both, the CIE differs, which the current kernel
+unwinder can't cope with.
+
+This broke both the kernel unwinder as well as loadable modules (latter
+because of a new unhandled relo R_ARC_32_PCREL from .rela.eh_frame in
+the module loader)
+
+The ideal solution would be to switch unwinder to .eh_frame.
+For now however we can make do by just ensureing .debug_frame is
+generated by removing -fasynchronous-unwind-tables
+
+ .eh_frame    generated with -gdwarf-2 -fasynchronous-unwind-tables
+ .debug_frame generated with -gdwarf-2
+
+Fixes STAR 9001058196
+
+Signed-off-by: Vineet Gupta <vgupta@synopsys.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/arc/Makefile |    2 --
+ 1 file changed, 2 deletions(-)
+
+--- a/arch/arc/Makefile
++++ b/arch/arc/Makefile
+@@ -66,8 +66,6 @@ endif
+ endif
+-cflags-$(CONFIG_ARC_DW2_UNWIND)               += -fasynchronous-unwind-tables
+-
+ # By default gcc 4.8 generates dwarf4 which kernel unwinder can't grok
+ ifeq ($(atleast_gcc48),y)
+ cflags-$(CONFIG_ARC_DW2_UNWIND)               += -gdwarf-2
diff --git a/queue-4.6/arc-unwind-warn-only-once-if-dw2_unwind-is-disabled.patch b/queue-4.6/arc-unwind-warn-only-once-if-dw2_unwind-is-disabled.patch
new file mode 100644 (file)
index 0000000..3a54d1d
--- /dev/null
@@ -0,0 +1,41 @@
+From 9bd54517ee86cb164c734f72ea95aeba4804f10b Mon Sep 17 00:00:00 2001
+From: Alexey Brodkin <Alexey.Brodkin@synopsys.com>
+Date: Thu, 23 Jun 2016 11:00:39 +0300
+Subject: arc: unwind: warn only once if DW2_UNWIND is disabled
+
+From: Alexey Brodkin <Alexey.Brodkin@synopsys.com>
+
+commit 9bd54517ee86cb164c734f72ea95aeba4804f10b upstream.
+
+If CONFIG_ARC_DW2_UNWIND is disabled every time arc_unwind_core()
+gets called following message gets printed in debug console:
+----------------->8---------------
+CONFIG_ARC_DW2_UNWIND needs to be enabled
+----------------->8---------------
+
+That message makes sense if user indeed wants to see a backtrace or
+get nice function call-graphs in perf but what if user disabled
+unwinder for the purpose? Why pollute his debug console?
+
+So instead we'll warn user about possibly missing feature once and
+let him decide if that was what he or she really wanted.
+
+Signed-off-by: Alexey Brodkin <abrodkin@synopsys.com>
+Signed-off-by: Vineet Gupta <vgupta@synopsys.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/arc/kernel/stacktrace.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/arch/arc/kernel/stacktrace.c
++++ b/arch/arc/kernel/stacktrace.c
+@@ -142,7 +142,7 @@ arc_unwind_core(struct task_struct *tsk,
+        * prelogue is setup (callee regs saved and then fp set and not other
+        * way around
+        */
+-      pr_warn("CONFIG_ARC_DW2_UNWIND needs to be enabled\n");
++      pr_warn_once("CONFIG_ARC_DW2_UNWIND needs to be enabled\n");
+       return 0;
+ #endif
diff --git a/queue-4.6/dmaengine-at_xdmac-align-descriptors-on-64-bits.patch b/queue-4.6/dmaengine-at_xdmac-align-descriptors-on-64-bits.patch
new file mode 100644 (file)
index 0000000..d5fe716
--- /dev/null
@@ -0,0 +1,43 @@
+From 4a9723e8df68cfce4048517ee32e37f78854b6fb Mon Sep 17 00:00:00 2001
+From: Ludovic Desroches <ludovic.desroches@atmel.com>
+Date: Thu, 12 May 2016 16:54:08 +0200
+Subject: dmaengine: at_xdmac: align descriptors on 64 bits
+
+From: Ludovic Desroches <ludovic.desroches@atmel.com>
+
+commit 4a9723e8df68cfce4048517ee32e37f78854b6fb upstream.
+
+Having descriptors aligned on 64 bits allows update CNDA and CUBC in an
+atomic way.
+
+Signed-off-by: Ludovic Desroches <ludovic.desroches@atmel.com>
+Fixes: e1f7c9eee707 ("dmaengine: at_xdmac: creation of the atmel
+eXtended DMA Controller driver")
+Reviewed-by: Nicolas Ferre <nicolas.ferre@atmel.com>
+Signed-off-by: Vinod Koul <vinod.koul@intel.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/dma/at_xdmac.c |    4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/drivers/dma/at_xdmac.c
++++ b/drivers/dma/at_xdmac.c
+@@ -242,7 +242,7 @@ struct at_xdmac_lld {
+       u32             mbr_dus;        /* Destination Microblock Stride Register */
+ };
+-
++/* 64-bit alignment needed to update CNDA and CUBC registers in an atomic way. */
+ struct at_xdmac_desc {
+       struct at_xdmac_lld             lld;
+       enum dma_transfer_direction     direction;
+@@ -253,7 +253,7 @@ struct at_xdmac_desc {
+       unsigned int                    xfer_size;
+       struct list_head                descs_list;
+       struct list_head                xfer_node;
+-};
++} __aligned(sizeof(u64));
+ static inline void __iomem *at_xdmac_chan_reg_base(struct at_xdmac *atxdmac, unsigned int chan_nb)
+ {
diff --git a/queue-4.6/dmaengine-at_xdmac-double-fifo-flush-needed-to-compute-residue.patch b/queue-4.6/dmaengine-at_xdmac-double-fifo-flush-needed-to-compute-residue.patch
new file mode 100644 (file)
index 0000000..fbc401b
--- /dev/null
@@ -0,0 +1,65 @@
+From 9295c41d77ca93aac79cfca6fa09fa1ca5cab66f Mon Sep 17 00:00:00 2001
+From: Ludovic Desroches <ludovic.desroches@atmel.com>
+Date: Thu, 12 May 2016 16:54:10 +0200
+Subject: dmaengine: at_xdmac: double FIFO flush needed to compute residue
+
+From: Ludovic Desroches <ludovic.desroches@atmel.com>
+
+commit 9295c41d77ca93aac79cfca6fa09fa1ca5cab66f upstream.
+
+Due to the way CUBC register is updated, a double flush is needed to
+compute an accurate residue. First flush aim is to get data from the DMA
+FIFO and second one ensures that we won't report data which are not in
+memory.
+
+Signed-off-by: Ludovic Desroches <ludovic.desroches@atmel.com>
+Fixes: e1f7c9eee707 ("dmaengine: at_xdmac: creation of the atmel
+eXtended DMA Controller driver")
+Reviewed-by: Nicolas Ferre <nicolas.ferre@atmel.com>
+Signed-off-by: Vinod Koul <vinod.koul@intel.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/dma/at_xdmac.c |   24 +++++++++++++++++++++++-
+ 1 file changed, 23 insertions(+), 1 deletion(-)
+
+--- a/drivers/dma/at_xdmac.c
++++ b/drivers/dma/at_xdmac.c
+@@ -1425,7 +1425,16 @@ at_xdmac_tx_status(struct dma_chan *chan
+       residue = desc->xfer_size;
+       /*
+        * Flush FIFO: only relevant when the transfer is source peripheral
+-       * synchronized.
++       * synchronized. Flush is needed before reading CUBC because data in
++       * the FIFO are not reported by CUBC. Reporting a residue of the
++       * transfer length while we have data in FIFO can cause issue.
++       * Usecase: atmel USART has a timeout which means I have received
++       * characters but there is no more character received for a while. On
++       * timeout, it requests the residue. If the data are in the DMA FIFO,
++       * we will return a residue of the transfer length. It means no data
++       * received. If an application is waiting for these data, it will hang
++       * since we won't have another USART timeout without receiving new
++       * data.
+        */
+       mask = AT_XDMAC_CC_TYPE | AT_XDMAC_CC_DSYNC;
+       value = AT_XDMAC_CC_TYPE_PER_TRAN | AT_XDMAC_CC_DSYNC_PER2MEM;
+@@ -1481,6 +1490,19 @@ at_xdmac_tx_status(struct dma_chan *chan
+       }
+       /*
++       * Flush FIFO: only relevant when the transfer is source peripheral
++       * synchronized. Another flush is needed here because CUBC is updated
++       * when the controller sends the data write command. It can lead to
++       * report data that are not written in the memory or the device. The
++       * FIFO flush ensures that data are really written.
++       */
++      if ((desc->lld.mbr_cfg & mask) == value) {
++              at_xdmac_write(atxdmac, AT_XDMAC_GSWF, atchan->mask);
++              while (!(at_xdmac_chan_read(atchan, AT_XDMAC_CIS) & AT_XDMAC_CIS_FIS))
++                      cpu_relax();
++      }
++
++      /*
+        * Remove size of all microblocks already transferred and the current
+        * one. Then add the remaining size to transfer of the current
+        * microblock.
diff --git a/queue-4.6/dmaengine-at_xdmac-fix-residue-corruption.patch b/queue-4.6/dmaengine-at_xdmac-fix-residue-corruption.patch
new file mode 100644 (file)
index 0000000..a1a0cea
--- /dev/null
@@ -0,0 +1,99 @@
+From 53398f488821c2b5b15291e3debec6ad33f75d3d Mon Sep 17 00:00:00 2001
+From: Ludovic Desroches <ludovic.desroches@atmel.com>
+Date: Thu, 12 May 2016 16:54:09 +0200
+Subject: dmaengine: at_xdmac: fix residue corruption
+
+From: Ludovic Desroches <ludovic.desroches@atmel.com>
+
+commit 53398f488821c2b5b15291e3debec6ad33f75d3d upstream.
+
+An unexpected value of CUBC can lead to a corrupted residue. A more
+complex sequence is needed to detect an inaccurate value for NCA or CUBC.
+
+Signed-off-by: Ludovic Desroches <ludovic.desroches@atmel.com>
+Fixes: e1f7c9eee707 ("dmaengine: at_xdmac: creation of the atmel
+eXtended DMA Controller driver")
+Reviewed-by: Nicolas Ferre <nicolas.ferre@atmel.com>
+Signed-off-by: Vinod Koul <vinod.koul@intel.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/dma/at_xdmac.c |   54 +++++++++++++++++++++++++++++--------------------
+ 1 file changed, 32 insertions(+), 22 deletions(-)
+
+--- a/drivers/dma/at_xdmac.c
++++ b/drivers/dma/at_xdmac.c
+@@ -1400,6 +1400,7 @@ at_xdmac_tx_status(struct dma_chan *chan
+       u32                     cur_nda, check_nda, cur_ubc, mask, value;
+       u8                      dwidth = 0;
+       unsigned long           flags;
++      bool                    initd;
+       ret = dma_cookie_status(chan, cookie, txstate);
+       if (ret == DMA_COMPLETE)
+@@ -1435,34 +1436,43 @@ at_xdmac_tx_status(struct dma_chan *chan
+       }
+       /*
+-       * When processing the residue, we need to read two registers but we
+-       * can't do it in an atomic way. AT_XDMAC_CNDA is used to find where
+-       * we stand in the descriptor list and AT_XDMAC_CUBC is used
+-       * to know how many data are remaining for the current descriptor.
+-       * Since the dma channel is not paused to not loose data, between the
+-       * AT_XDMAC_CNDA and AT_XDMAC_CUBC read, we may have change of
+-       * descriptor.
+-       * For that reason, after reading AT_XDMAC_CUBC, we check if we are
+-       * still using the same descriptor by reading a second time
+-       * AT_XDMAC_CNDA. If AT_XDMAC_CNDA has changed, it means we have to
+-       * read again AT_XDMAC_CUBC.
++       * The easiest way to compute the residue should be to pause the DMA
++       * but doing this can lead to miss some data as some devices don't
++       * have FIFO.
++       * We need to read several registers because:
++       * - DMA is running therefore a descriptor change is possible while
++       * reading these registers
++       * - When the block transfer is done, the value of the CUBC register
++       * is set to its initial value until the fetch of the next descriptor.
++       * This value will corrupt the residue calculation so we have to skip
++       * it.
++       *
++       * INITD --------                    ------------
++       *              |____________________|
++       *       _______________________  _______________
++       * NDA       @desc2             \/   @desc3
++       *       _______________________/\_______________
++       *       __________  ___________  _______________
++       * CUBC       0    \/ MAX desc1 \/  MAX desc2
++       *       __________/\___________/\_______________
++       *
++       * Since descriptors are aligned on 64 bits, we can assume that
++       * the update of NDA and CUBC is atomic.
+        * Memory barriers are used to ensure the read order of the registers.
+-       * A max number of retries is set because unlikely it can never ends if
+-       * we are transferring a lot of data with small buffers.
++       * A max number of retries is set because unlikely it could never ends.
+        */
+-      cur_nda = at_xdmac_chan_read(atchan, AT_XDMAC_CNDA) & 0xfffffffc;
+-      rmb();
+-      cur_ubc = at_xdmac_chan_read(atchan, AT_XDMAC_CUBC);
+       for (retry = 0; retry < AT_XDMAC_RESIDUE_MAX_RETRIES; retry++) {
+-              rmb();
+               check_nda = at_xdmac_chan_read(atchan, AT_XDMAC_CNDA) & 0xfffffffc;
+-
+-              if (likely(cur_nda == check_nda))
+-                      break;
+-
+-              cur_nda = check_nda;
++              rmb();
++              initd = !!(at_xdmac_chan_read(atchan, AT_XDMAC_CC) & AT_XDMAC_CC_INITD);
+               rmb();
+               cur_ubc = at_xdmac_chan_read(atchan, AT_XDMAC_CUBC);
++              rmb();
++              cur_nda = at_xdmac_chan_read(atchan, AT_XDMAC_CNDA) & 0xfffffffc;
++              rmb();
++
++              if ((check_nda == cur_nda) && initd)
++                      break;
+       }
+       if (unlikely(retry >= AT_XDMAC_RESIDUE_MAX_RETRIES)) {
diff --git a/queue-4.6/fs-nilfs2-fix-potential-underflow-in-call-to-crc32_le.patch b/queue-4.6/fs-nilfs2-fix-potential-underflow-in-call-to-crc32_le.patch
new file mode 100644 (file)
index 0000000..9ced3ae
--- /dev/null
@@ -0,0 +1,59 @@
+From 63d2f95d63396059200c391ca87161897b99e74a Mon Sep 17 00:00:00 2001
+From: Torsten Hilbrich <torsten.hilbrich@secunet.com>
+Date: Fri, 24 Jun 2016 14:50:18 -0700
+Subject: fs/nilfs2: fix potential underflow in call to crc32_le
+
+From: Torsten Hilbrich <torsten.hilbrich@secunet.com>
+
+commit 63d2f95d63396059200c391ca87161897b99e74a upstream.
+
+The value `bytes' comes from the filesystem which is about to be
+mounted.  We cannot trust that the value is always in the range we
+expect it to be.
+
+Check its value before using it to calculate the length for the crc32_le
+call.  It value must be larger (or equal) sumoff + 4.
+
+This fixes a kernel bug when accidentially mounting an image file which
+had the nilfs2 magic value 0x3434 at the right offset 0x406 by chance.
+The bytes 0x01 0x00 were stored at 0x408 and were interpreted as a
+s_bytes value of 1.  This caused an underflow when substracting sumoff +
+4 (20) in the call to crc32_le.
+
+  BUG: unable to handle kernel paging request at ffff88021e600000
+  IP:  crc32_le+0x36/0x100
+  ...
+  Call Trace:
+    nilfs_valid_sb.part.5+0x52/0x60 [nilfs2]
+    nilfs_load_super_block+0x142/0x300 [nilfs2]
+    init_nilfs+0x60/0x390 [nilfs2]
+    nilfs_mount+0x302/0x520 [nilfs2]
+    mount_fs+0x38/0x160
+    vfs_kern_mount+0x67/0x110
+    do_mount+0x269/0xe00
+    SyS_mount+0x9f/0x100
+    entry_SYSCALL_64_fastpath+0x16/0x71
+
+Link: http://lkml.kernel.org/r/1466778587-5184-2-git-send-email-konishi.ryusuke@lab.ntt.co.jp
+Signed-off-by: Torsten Hilbrich <torsten.hilbrich@secunet.com>
+Tested-by: Torsten Hilbrich <torsten.hilbrich@secunet.com>
+Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/nilfs2/the_nilfs.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/nilfs2/the_nilfs.c
++++ b/fs/nilfs2/the_nilfs.c
+@@ -443,7 +443,7 @@ static int nilfs_valid_sb(struct nilfs_s
+       if (!sbp || le16_to_cpu(sbp->s_magic) != NILFS_SUPER_MAGIC)
+               return 0;
+       bytes = le16_to_cpu(sbp->s_bytes);
+-      if (bytes > BLOCK_SIZE)
++      if (bytes < sumoff + 4 || bytes > BLOCK_SIZE)
+               return 0;
+       crc = crc32_le(le32_to_cpu(sbp->s_crc_seed), (unsigned char *)sbp,
+                      sumoff);
diff --git a/queue-4.6/kernel-sysrq-watchdog-sched-core-reset-watchdog-on-all-cpus-while-processing-sysrq-w.patch b/queue-4.6/kernel-sysrq-watchdog-sched-core-reset-watchdog-on-all-cpus-while-processing-sysrq-w.patch
new file mode 100644 (file)
index 0000000..8930c4d
--- /dev/null
@@ -0,0 +1,52 @@
+From 57675cb976eff977aefb428e68e4e0236d48a9ff Mon Sep 17 00:00:00 2001
+From: Andrey Ryabinin <aryabinin@virtuozzo.com>
+Date: Thu, 9 Jun 2016 15:20:05 +0300
+Subject: kernel/sysrq, watchdog, sched/core: Reset watchdog on all CPUs while processing sysrq-w
+
+From: Andrey Ryabinin <aryabinin@virtuozzo.com>
+
+commit 57675cb976eff977aefb428e68e4e0236d48a9ff upstream.
+
+Lengthy output of sysrq-w may take a lot of time on slow serial console.
+
+Currently we reset NMI-watchdog on the current CPU to avoid spurious
+lockup messages. Sometimes this doesn't work since softlockup watchdog
+might trigger on another CPU which is waiting for an IPI to proceed.
+We reset softlockup watchdogs on all CPUs, but we do this only after
+listing all tasks, and this may be too late on a busy system.
+
+So, reset watchdogs CPUs earlier, in for_each_process_thread() loop.
+
+Signed-off-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Link: http://lkml.kernel.org/r/1465474805-14641-1-git-send-email-aryabinin@virtuozzo.com
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ kernel/sched/core.c |    6 ++++--
+ 1 file changed, 4 insertions(+), 2 deletions(-)
+
+--- a/kernel/sched/core.c
++++ b/kernel/sched/core.c
+@@ -4993,14 +4993,16 @@ void show_state_filter(unsigned long sta
+               /*
+                * reset the NMI-timeout, listing all files on a slow
+                * console might take a lot of time:
++               * Also, reset softlockup watchdogs on all CPUs, because
++               * another CPU might be blocked waiting for us to process
++               * an IPI.
+                */
+               touch_nmi_watchdog();
++              touch_all_softlockup_watchdogs();
+               if (!state_filter || (p->state & state_filter))
+                       sched_show_task(p);
+       }
+-      touch_all_softlockup_watchdogs();
+-
+ #ifdef CONFIG_SCHED_DEBUG
+       sysrq_sched_debug_show();
+ #endif
diff --git a/queue-4.6/memcg-css_alloc-should-return-an-err_ptr-value-on-error.patch b/queue-4.6/memcg-css_alloc-should-return-an-err_ptr-value-on-error.patch
new file mode 100644 (file)
index 0000000..2b1fac2
--- /dev/null
@@ -0,0 +1,95 @@
+From ea3a9645866e12d2b198434f03df3c3e96fb86ce Mon Sep 17 00:00:00 2001
+From: Tejun Heo <tj@kernel.org>
+Date: Fri, 24 Jun 2016 14:49:58 -0700
+Subject: memcg: css_alloc should return an ERR_PTR value on error
+
+From: Tejun Heo <tj@kernel.org>
+
+commit ea3a9645866e12d2b198434f03df3c3e96fb86ce upstream.
+
+mem_cgroup_css_alloc() was returning NULL on failure while cgroup core
+expected it to return an ERR_PTR value leading to the following NULL
+deref after a css allocation failure.  Fix it by return
+ERR_PTR(-ENOMEM) instead.  I'll also update cgroup core so that it
+can handle NULL returns.
+
+  mkdir: page allocation failure: order:6, mode:0x240c0c0(GFP_KERNEL|__GFP_COMP|__GFP_ZERO)
+  CPU: 0 PID: 8738 Comm: mkdir Not tainted 4.7.0-rc3+ #123
+  ...
+  Call Trace:
+    dump_stack+0x68/0xa1
+    warn_alloc_failed+0xd6/0x130
+    __alloc_pages_nodemask+0x4c6/0xf20
+    alloc_pages_current+0x66/0xe0
+    alloc_kmem_pages+0x14/0x80
+    kmalloc_order_trace+0x2a/0x1a0
+    __kmalloc+0x291/0x310
+    memcg_update_all_caches+0x6c/0x130
+    mem_cgroup_css_alloc+0x590/0x610
+    cgroup_apply_control_enable+0x18b/0x370
+    cgroup_mkdir+0x1de/0x2e0
+    kernfs_iop_mkdir+0x55/0x80
+    vfs_mkdir+0xb9/0x150
+    SyS_mkdir+0x66/0xd0
+    do_syscall_64+0x53/0x120
+    entry_SYSCALL64_slow_path+0x25/0x25
+  ...
+  BUG: unable to handle kernel NULL pointer dereference at 00000000000000d0
+  IP:  init_and_link_css+0x37/0x220
+  PGD 34b1e067 PUD 3a109067 PMD 0
+  Oops: 0002 [#1] SMP
+  Modules linked in:
+  CPU: 0 PID: 8738 Comm: mkdir Not tainted 4.7.0-rc3+ #123
+  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.9.2-20160422_131301-anatol 04/01/2014
+  task: ffff88007cbc5200 ti: ffff8800666d4000 task.ti: ffff8800666d4000
+  RIP: 0010:[<ffffffff810f2ca7>]  [<ffffffff810f2ca7>] init_and_link_css+0x37/0x220
+  RSP: 0018:ffff8800666d7d90  EFLAGS: 00010246
+  RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000
+  RDX: ffffffff810f2499 RSI: 0000000000000000 RDI: 0000000000000008
+  RBP: ffff8800666d7db8 R08: 0000000000000003 R09: 0000000000000000
+  R10: 0000000000000001 R11: 0000000000000000 R12: ffff88005a5fb400
+  R13: ffffffff81f0f8a0 R14: ffff88005a5fb400 R15: 0000000000000010
+  FS:  00007fc944689700(0000) GS:ffff88007fc00000(0000) knlGS:0000000000000000
+  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+  CR2: 00007f3aed0d2b80 CR3: 000000003a1e8000 CR4: 00000000000006f0
+  DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
+  DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
+  Call Trace:
+    cgroup_apply_control_enable+0x1ac/0x370
+    cgroup_mkdir+0x1de/0x2e0
+    kernfs_iop_mkdir+0x55/0x80
+    vfs_mkdir+0xb9/0x150
+    SyS_mkdir+0x66/0xd0
+    do_syscall_64+0x53/0x120
+    entry_SYSCALL64_slow_path+0x25/0x25
+  Code: 89 f5 48 89 fb 49 89 d4 48 83 ec 08 8b 05 72 3b d8 00 85 c0 0f 85 60 01 00 00 4c 89 e7 e8 72 f7 ff ff 48 8d 7b 08 48 89 d9 31 c0 <48> c7 83 d0 00 00 00 00 00 00 00 48 83 e7 f8 48 29 f9 81 c1 d8
+  RIP   init_and_link_css+0x37/0x220
+   RSP <ffff8800666d7d90>
+  CR2: 00000000000000d0
+  ---[ end trace a2d8836ae1e852d1 ]---
+
+Link: http://lkml.kernel.org/r/20160621165740.GJ3262@mtj.duckdns.org
+Signed-off-by: Tejun Heo <tj@kernel.org>
+Reported-by: Johannes Weiner <hannes@cmpxchg.org>
+Reviewed-by: Vladimir Davydov <vdavydov@virtuozzo.com>
+Acked-by: Johannes Weiner <hannes@cmpxchg.org>
+Acked-by: Michal Hocko <mhocko@suse.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/memcontrol.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/mm/memcontrol.c
++++ b/mm/memcontrol.c
+@@ -4184,7 +4184,7 @@ mem_cgroup_css_alloc(struct cgroup_subsy
+       return &memcg->css;
+ fail:
+       mem_cgroup_free(memcg);
+-      return NULL;
++      return ERR_PTR(-ENOMEM);
+ }
+ static int
diff --git a/queue-4.6/memcg-mem_cgroup_migrate-may-be-called-with-irq-disabled.patch b/queue-4.6/memcg-mem_cgroup_migrate-may-be-called-with-irq-disabled.patch
new file mode 100644 (file)
index 0000000..bf4d1ac
--- /dev/null
@@ -0,0 +1,128 @@
+From d93c4130a7d049b234b5d5a15808eaf5406f2789 Mon Sep 17 00:00:00 2001
+From: Tejun Heo <tj@kernel.org>
+Date: Fri, 24 Jun 2016 14:49:54 -0700
+Subject: memcg: mem_cgroup_migrate() may be called with irq disabled
+
+From: Tejun Heo <tj@kernel.org>
+
+commit d93c4130a7d049b234b5d5a15808eaf5406f2789 upstream.
+
+mem_cgroup_migrate() uses local_irq_disable/enable() but can be called
+with irq disabled from migrate_page_copy().  This ends up enabling irq
+while holding a irq context lock triggering the following lockdep
+warning.  Fix it by using irq_save/restore instead.
+
+  =================================
+  [ INFO: inconsistent lock state ]
+  4.7.0-rc1+ #52 Tainted: G        W
+  ---------------------------------
+  inconsistent {IN-SOFTIRQ-W} -> {SOFTIRQ-ON-W} usage.
+  kcompactd0/151 [HC0[0]:SC0[0]:HE1:SE1] takes:
+   (&(&ctx->completion_lock)->rlock){+.?.-.}, at: [<000000000038fd96>] aio_migratepage+0x156/0x1e8
+  {IN-SOFTIRQ-W} state was registered at:
+     __lock_acquire+0x5b6/0x1930
+     lock_acquire+0xee/0x270
+     _raw_spin_lock_irqsave+0x66/0xb0
+     aio_complete+0x98/0x328
+     dio_complete+0xe4/0x1e0
+     blk_update_request+0xd4/0x450
+     scsi_end_request+0x48/0x1c8
+     scsi_io_completion+0x272/0x698
+     blk_done_softirq+0xca/0xe8
+     __do_softirq+0xc8/0x518
+     irq_exit+0xee/0x110
+     do_IRQ+0x6a/0x88
+     io_int_handler+0x11a/0x25c
+     __mutex_unlock_slowpath+0x144/0x1d8
+     __mutex_unlock_slowpath+0x140/0x1d8
+     kernfs_iop_permission+0x64/0x80
+     __inode_permission+0x9e/0xf0
+     link_path_walk+0x6e/0x510
+     path_lookupat+0xc4/0x1a8
+     filename_lookup+0x9c/0x160
+     user_path_at_empty+0x5c/0x70
+     SyS_readlinkat+0x68/0x140
+     system_call+0xd6/0x270
+  irq event stamp: 971410
+  hardirqs last  enabled at (971409):  migrate_page_move_mapping+0x3ea/0x588
+  hardirqs last disabled at (971410):  _raw_spin_lock_irqsave+0x3c/0xb0
+  softirqs last  enabled at (970526):  __do_softirq+0x460/0x518
+  softirqs last disabled at (970519):  irq_exit+0xee/0x110
+
+  other info that might help us debug this:
+   Possible unsafe locking scenario:
+
+        CPU0
+        ----
+    lock(&(&ctx->completion_lock)->rlock);
+    <Interrupt>
+      lock(&(&ctx->completion_lock)->rlock);
+
+    *** DEADLOCK ***
+
+  3 locks held by kcompactd0/151:
+   #0:  (&(&mapping->private_lock)->rlock){+.+.-.}, at:  aio_migratepage+0x42/0x1e8
+   #1:  (&ctx->ring_lock){+.+.+.}, at:  aio_migratepage+0x5a/0x1e8
+   #2:  (&(&ctx->completion_lock)->rlock){+.?.-.}, at:  aio_migratepage+0x156/0x1e8
+
+  stack backtrace:
+  CPU: 20 PID: 151 Comm: kcompactd0 Tainted: G        W       4.7.0-rc1+ #52
+  Call Trace:
+    show_trace+0xea/0xf0
+    show_stack+0x72/0xf0
+    dump_stack+0x9a/0xd8
+    print_usage_bug.part.27+0x2d4/0x2e8
+    mark_lock+0x17e/0x758
+    mark_held_locks+0xa2/0xd0
+    trace_hardirqs_on_caller+0x140/0x1c0
+    mem_cgroup_migrate+0x266/0x370
+    aio_migratepage+0x16a/0x1e8
+    move_to_new_page+0xb0/0x260
+    migrate_pages+0x8f4/0x9f0
+    compact_zone+0x4dc/0xdc8
+    kcompactd_do_work+0x1aa/0x358
+    kcompactd+0xba/0x2c8
+    kthread+0x10a/0x110
+    kernel_thread_starter+0x6/0xc
+    kernel_thread_starter+0x0/0xc
+  INFO: lockdep is turned off.
+
+Link: http://lkml.kernel.org/r/20160620184158.GO3262@mtj.duckdns.org
+Link: http://lkml.kernel.org/g/5767CFE5.7080904@de.ibm.com
+Fixes: 74485cf2bc85 ("mm: migrate: consolidate mem_cgroup_migrate() calls")
+Signed-off-by: Tejun Heo <tj@kernel.org>
+Reported-by: Christian Borntraeger <borntraeger@de.ibm.com>
+Acked-by: Johannes Weiner <hannes@cmpxchg.org>
+Acked-by: Michal Hocko <mhocko@suse.com>
+Reviewed-by: Vladimir Davydov <vdavydov@virtuozzo.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/memcontrol.c |    5 +++--
+ 1 file changed, 3 insertions(+), 2 deletions(-)
+
+--- a/mm/memcontrol.c
++++ b/mm/memcontrol.c
+@@ -5524,6 +5524,7 @@ void mem_cgroup_migrate(struct page *old
+       struct mem_cgroup *memcg;
+       unsigned int nr_pages;
+       bool compound;
++      unsigned long flags;
+       VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
+       VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
+@@ -5554,10 +5555,10 @@ void mem_cgroup_migrate(struct page *old
+       commit_charge(newpage, memcg, false);
+-      local_irq_disable();
++      local_irq_save(flags);
+       mem_cgroup_charge_statistics(memcg, newpage, compound, nr_pages);
+       memcg_check_events(memcg, newpage);
+-      local_irq_enable();
++      local_irq_restore(flags);
+ }
+ DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
diff --git a/queue-4.6/mm-compaction-abort-free-scanner-if-split-fails.patch b/queue-4.6/mm-compaction-abort-free-scanner-if-split-fails.patch
new file mode 100644 (file)
index 0000000..cc8afcf
--- /dev/null
@@ -0,0 +1,127 @@
+From a4f04f2c6955aff5e2c08dcb40aca247ff4d7370 Mon Sep 17 00:00:00 2001
+From: David Rientjes <rientjes@google.com>
+Date: Fri, 24 Jun 2016 14:50:10 -0700
+Subject: mm, compaction: abort free scanner if split fails
+
+From: David Rientjes <rientjes@google.com>
+
+commit a4f04f2c6955aff5e2c08dcb40aca247ff4d7370 upstream.
+
+If the memory compaction free scanner cannot successfully split a free
+page (only possible due to per-zone low watermark), terminate the free
+scanner rather than continuing to scan memory needlessly.  If the
+watermark is insufficient for a free page of order <= cc->order, then
+terminate the scanner since all future splits will also likely fail.
+
+This prevents the compaction freeing scanner from scanning all memory on
+very large zones (very noticeable for zones > 128GB, for instance) when
+all splits will likely fail while holding zone->lock.
+
+compaction_alloc() iterating a 128GB zone has been benchmarked to take
+over 400ms on some systems whereas any free page isolated and ready to
+be split ends up failing in split_free_page() because of the low
+watermark check and thus the iteration continues.
+
+The next time compaction occurs, the freeing scanner will likely start
+at the end of the zone again since no success was made previously and we
+get the same lengthy iteration until the zone is brought above the low
+watermark.  All thp page faults can take >400ms in such a state without
+this fix.
+
+Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1606211820350.97086@chino.kir.corp.google.com
+Signed-off-by: David Rientjes <rientjes@google.com>
+Acked-by: Vlastimil Babka <vbabka@suse.cz>
+Cc: Minchan Kim <minchan@kernel.org>
+Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
+Cc: Mel Gorman <mgorman@techsingularity.net>
+Cc: Hugh Dickins <hughd@google.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/compaction.c |   39 +++++++++++++++++++++------------------
+ 1 file changed, 21 insertions(+), 18 deletions(-)
+
+--- a/mm/compaction.c
++++ b/mm/compaction.c
+@@ -436,25 +436,23 @@ static unsigned long isolate_freepages_b
+               /* Found a free page, break it into order-0 pages */
+               isolated = split_free_page(page);
++              if (!isolated)
++                      break;
++
+               total_isolated += isolated;
++              cc->nr_freepages += isolated;
+               for (i = 0; i < isolated; i++) {
+                       list_add(&page->lru, freelist);
+                       page++;
+               }
+-
+-              /* If a page was split, advance to the end of it */
+-              if (isolated) {
+-                      cc->nr_freepages += isolated;
+-                      if (!strict &&
+-                              cc->nr_migratepages <= cc->nr_freepages) {
+-                              blockpfn += isolated;
+-                              break;
+-                      }
+-
+-                      blockpfn += isolated - 1;
+-                      cursor += isolated - 1;
+-                      continue;
++              if (!strict && cc->nr_migratepages <= cc->nr_freepages) {
++                      blockpfn += isolated;
++                      break;
+               }
++              /* Advance to the end of split page */
++              blockpfn += isolated - 1;
++              cursor += isolated - 1;
++              continue;
+ isolate_fail:
+               if (strict)
+@@ -464,6 +462,9 @@ isolate_fail:
+       }
++      if (locked)
++              spin_unlock_irqrestore(&cc->zone->lock, flags);
++
+       /*
+        * There is a tiny chance that we have read bogus compound_order(),
+        * so be careful to not go outside of the pageblock.
+@@ -485,9 +486,6 @@ isolate_fail:
+       if (strict && blockpfn < end_pfn)
+               total_isolated = 0;
+-      if (locked)
+-              spin_unlock_irqrestore(&cc->zone->lock, flags);
+-
+       /* Update the pageblock-skip if the whole pageblock was scanned */
+       if (blockpfn == end_pfn)
+               update_pageblock_skip(cc, valid_page, total_isolated, false);
+@@ -938,6 +936,7 @@ static void isolate_freepages(struct com
+                               block_end_pfn = block_start_pfn,
+                               block_start_pfn -= pageblock_nr_pages,
+                               isolate_start_pfn = block_start_pfn) {
++              unsigned long isolated;
+               /*
+                * This can iterate a massively long zone without finding any
+@@ -962,8 +961,12 @@ static void isolate_freepages(struct com
+                       continue;
+               /* Found a block suitable for isolating free pages from. */
+-              isolate_freepages_block(cc, &isolate_start_pfn,
+-                                      block_end_pfn, freelist, false);
++              isolated = isolate_freepages_block(cc, &isolate_start_pfn,
++                                              block_end_pfn, freelist, false);
++              /* If isolation failed early, do not continue needlessly */
++              if (!isolated && isolate_start_pfn < block_end_pfn &&
++                  cc->nr_migratepages > cc->nr_freepages)
++                      break;
+               /*
+                * If we isolated enough freepages, or aborted due to async
diff --git a/queue-4.6/mm-compaction-prevent-vm_bug_on-when-terminating-freeing-scanner.patch b/queue-4.6/mm-compaction-prevent-vm_bug_on-when-terminating-freeing-scanner.patch
new file mode 100644 (file)
index 0000000..49eeca4
--- /dev/null
@@ -0,0 +1,104 @@
+From a46cbf3bc53b6a93fb84a5ffb288c354fa807954 Mon Sep 17 00:00:00 2001
+From: David Rientjes <rientjes@google.com>
+Date: Thu, 14 Jul 2016 12:06:50 -0700
+Subject: mm, compaction: prevent VM_BUG_ON when terminating freeing scanner
+
+From: David Rientjes <rientjes@google.com>
+
+commit a46cbf3bc53b6a93fb84a5ffb288c354fa807954 upstream.
+
+It's possible to isolate some freepages in a pageblock and then fail
+split_free_page() due to the low watermark check.  In this case, we hit
+VM_BUG_ON() because the freeing scanner terminated early without a
+contended lock or enough freepages.
+
+This should never have been a VM_BUG_ON() since it's not a fatal
+condition.  It should have been a VM_WARN_ON() at best, or even handled
+gracefully.
+
+Regardless, we need to terminate anytime the full pageblock scan was not
+done.  The logic belongs in isolate_freepages_block(), so handle its
+state gracefully by terminating the pageblock loop and making a note to
+restart at the same pageblock next time since it was not possible to
+complete the scan this time.
+
+[rientjes@google.com: don't rescan pages in a pageblock]
+  Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1607111244150.83138@chino.kir.corp.google.com
+Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1606291436300.145590@chino.kir.corp.google.com
+Signed-off-by: David Rientjes <rientjes@google.com>
+Reported-by: Minchan Kim <minchan@kernel.org>
+Tested-by: Minchan Kim <minchan@kernel.org>
+Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
+Cc: Hugh Dickins <hughd@google.com>
+Cc: Mel Gorman <mgorman@techsingularity.net>
+Cc: Vlastimil Babka <vbabka@suse.cz>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/compaction.c |   36 ++++++++++++++----------------------
+ 1 file changed, 14 insertions(+), 22 deletions(-)
+
+--- a/mm/compaction.c
++++ b/mm/compaction.c
+@@ -936,8 +936,6 @@ static void isolate_freepages(struct com
+                               block_end_pfn = block_start_pfn,
+                               block_start_pfn -= pageblock_nr_pages,
+                               isolate_start_pfn = block_start_pfn) {
+-              unsigned long isolated;
+-
+               /*
+                * This can iterate a massively long zone without finding any
+                * suitable migration targets, so periodically check if we need
+@@ -961,36 +959,30 @@ static void isolate_freepages(struct com
+                       continue;
+               /* Found a block suitable for isolating free pages from. */
+-              isolated = isolate_freepages_block(cc, &isolate_start_pfn,
+-                                              block_end_pfn, freelist, false);
+-              /* If isolation failed early, do not continue needlessly */
+-              if (!isolated && isolate_start_pfn < block_end_pfn &&
+-                  cc->nr_migratepages > cc->nr_freepages)
+-                      break;
++              isolate_freepages_block(cc, &isolate_start_pfn, block_end_pfn,
++                                      freelist, false);
+               /*
+-               * If we isolated enough freepages, or aborted due to async
+-               * compaction being contended, terminate the loop.
+-               * Remember where the free scanner should restart next time,
+-               * which is where isolate_freepages_block() left off.
+-               * But if it scanned the whole pageblock, isolate_start_pfn
+-               * now points at block_end_pfn, which is the start of the next
+-               * pageblock.
+-               * In that case we will however want to restart at the start
+-               * of the previous pageblock.
++               * If we isolated enough freepages, or aborted due to lock
++               * contention, terminate.
+                */
+               if ((cc->nr_freepages >= cc->nr_migratepages)
+                                                       || cc->contended) {
+-                      if (isolate_start_pfn >= block_end_pfn)
++                      if (isolate_start_pfn >= block_end_pfn) {
++                              /*
++                               * Restart at previous pageblock if more
++                               * freepages can be isolated next time.
++                               */
+                               isolate_start_pfn =
+                                       block_start_pfn - pageblock_nr_pages;
++                      }
+                       break;
+-              } else {
++              } else if (isolate_start_pfn < block_end_pfn) {
+                       /*
+-                       * isolate_freepages_block() should not terminate
+-                       * prematurely unless contended, or isolated enough
++                       * If isolation failed early, do not continue
++                       * needlessly.
+                        */
+-                      VM_BUG_ON(isolate_start_pfn < block_end_pfn);
++                      break;
+               }
+       }
diff --git a/queue-4.6/mm-memcontrol-fix-cgroup-creation-failure-after-many-small-jobs.patch b/queue-4.6/mm-memcontrol-fix-cgroup-creation-failure-after-many-small-jobs.patch
new file mode 100644 (file)
index 0000000..a736406
--- /dev/null
@@ -0,0 +1,295 @@
+From 73f576c04b9410ed19660f74f97521bee6e1c546 Mon Sep 17 00:00:00 2001
+From: Johannes Weiner <hannes@cmpxchg.org>
+Date: Wed, 20 Jul 2016 15:44:57 -0700
+Subject: mm: memcontrol: fix cgroup creation failure after many small jobs
+
+From: Johannes Weiner <hannes@cmpxchg.org>
+
+commit 73f576c04b9410ed19660f74f97521bee6e1c546 upstream.
+
+The memory controller has quite a bit of state that usually outlives the
+cgroup and pins its CSS until said state disappears.  At the same time
+it imposes a 16-bit limit on the CSS ID space to economically store IDs
+in the wild.  Consequently, when we use cgroups to contain frequent but
+small and short-lived jobs that leave behind some page cache, we quickly
+run into the 64k limitations of outstanding CSSs.  Creating a new cgroup
+fails with -ENOSPC while there are only a few, or even no user-visible
+cgroups in existence.
+
+Although pinning CSSs past cgroup removal is common, there are only two
+instances that actually need an ID after a cgroup is deleted: cache
+shadow entries and swapout records.
+
+Cache shadow entries reference the ID weakly and can deal with the CSS
+having disappeared when it's looked up later.  They pose no hurdle.
+
+Swap-out records do need to pin the css to hierarchically attribute
+swapins after the cgroup has been deleted; though the only pages that
+remain swapped out after offlining are tmpfs/shmem pages.  And those
+references are under the user's control, so they are manageable.
+
+This patch introduces a private 16-bit memcg ID and switches swap and
+cache shadow entries over to using that.  This ID can then be recycled
+after offlining when the CSS remains pinned only by objects that don't
+specifically need it.
+
+This script demonstrates the problem by faulting one cache page in a new
+cgroup and deleting it again:
+
+  set -e
+  mkdir -p pages
+  for x in `seq 128000`; do
+    [ $((x % 1000)) -eq 0 ] && echo $x
+    mkdir /cgroup/foo
+    echo $$ >/cgroup/foo/cgroup.procs
+    echo trex >pages/$x
+    echo $$ >/cgroup/cgroup.procs
+    rmdir /cgroup/foo
+  done
+
+When run on an unpatched kernel, we eventually run out of possible IDs
+even though there are no visible cgroups:
+
+  [root@ham ~]# ./cssidstress.sh
+  [...]
+  65000
+  mkdir: cannot create directory '/cgroup/foo': No space left on device
+
+After this patch, the IDs get released upon cgroup destruction and the
+cache and css objects get released once memory reclaim kicks in.
+
+[hannes@cmpxchg.org: init the IDR]
+  Link: http://lkml.kernel.org/r/20160621154601.GA22431@cmpxchg.org
+Fixes: b2052564e66d ("mm: memcontrol: continue cache reclaim from offlined groups")
+Link: http://lkml.kernel.org/r/20160617162516.GD19084@cmpxchg.org
+Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
+Reported-by: John Garcia <john.garcia@mesosphere.io>
+Reviewed-by: Vladimir Davydov <vdavydov@virtuozzo.com>
+Acked-by: Tejun Heo <tj@kernel.org>
+Cc: Nikolay Borisov <kernel@kyup.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/memcontrol.h |   25 +++++--------
+ mm/memcontrol.c            |   82 +++++++++++++++++++++++++++++++++++++++++----
+ mm/slab_common.c           |    4 +-
+ 3 files changed, 87 insertions(+), 24 deletions(-)
+
+--- a/include/linux/memcontrol.h
++++ b/include/linux/memcontrol.h
+@@ -97,6 +97,11 @@ enum mem_cgroup_events_target {
+ #define MEM_CGROUP_ID_SHIFT   16
+ #define MEM_CGROUP_ID_MAX     USHRT_MAX
++struct mem_cgroup_id {
++      int id;
++      atomic_t ref;
++};
++
+ struct mem_cgroup_stat_cpu {
+       long count[MEMCG_NR_STAT];
+       unsigned long events[MEMCG_NR_EVENTS];
+@@ -172,6 +177,9 @@ enum memcg_kmem_state {
+ struct mem_cgroup {
+       struct cgroup_subsys_state css;
++      /* Private memcg ID. Used to ID objects that outlive the cgroup */
++      struct mem_cgroup_id id;
++
+       /* Accounted resources */
+       struct page_counter memory;
+       struct page_counter swap;
+@@ -330,22 +338,9 @@ static inline unsigned short mem_cgroup_
+       if (mem_cgroup_disabled())
+               return 0;
+-      return memcg->css.id;
+-}
+-
+-/**
+- * mem_cgroup_from_id - look up a memcg from an id
+- * @id: the id to look up
+- *
+- * Caller must hold rcu_read_lock() and use css_tryget() as necessary.
+- */
+-static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
+-{
+-      struct cgroup_subsys_state *css;
+-
+-      css = css_from_id(id, &memory_cgrp_subsys);
+-      return mem_cgroup_from_css(css);
++      return memcg->id.id;
+ }
++struct mem_cgroup *mem_cgroup_from_id(unsigned short id);
+ /**
+  * parent_mem_cgroup - find the accounting parent of a memcg
+--- a/mm/memcontrol.c
++++ b/mm/memcontrol.c
+@@ -4038,6 +4038,60 @@ static struct cftype mem_cgroup_legacy_f
+       { },    /* terminate */
+ };
++/*
++ * Private memory cgroup IDR
++ *
++ * Swap-out records and page cache shadow entries need to store memcg
++ * references in constrained space, so we maintain an ID space that is
++ * limited to 16 bit (MEM_CGROUP_ID_MAX), limiting the total number of
++ * memory-controlled cgroups to 64k.
++ *
++ * However, there usually are many references to the offline CSS after
++ * the cgroup has been destroyed, such as page cache or reclaimable
++ * slab objects, that don't need to hang on to the ID. We want to keep
++ * those dead CSS from occupying IDs, or we might quickly exhaust the
++ * relatively small ID space and prevent the creation of new cgroups
++ * even when there are much fewer than 64k cgroups - possibly none.
++ *
++ * Maintain a private 16-bit ID space for memcg, and allow the ID to
++ * be freed and recycled when it's no longer needed, which is usually
++ * when the CSS is offlined.
++ *
++ * The only exception to that are records of swapped out tmpfs/shmem
++ * pages that need to be attributed to live ancestors on swapin. But
++ * those references are manageable from userspace.
++ */
++
++static DEFINE_IDR(mem_cgroup_idr);
++
++static void mem_cgroup_id_get(struct mem_cgroup *memcg)
++{
++      atomic_inc(&memcg->id.ref);
++}
++
++static void mem_cgroup_id_put(struct mem_cgroup *memcg)
++{
++      if (atomic_dec_and_test(&memcg->id.ref)) {
++              idr_remove(&mem_cgroup_idr, memcg->id.id);
++              memcg->id.id = 0;
++
++              /* Memcg ID pins CSS */
++              css_put(&memcg->css);
++      }
++}
++
++/**
++ * mem_cgroup_from_id - look up a memcg from a memcg id
++ * @id: the memcg id to look up
++ *
++ * Caller must hold rcu_read_lock().
++ */
++struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
++{
++      WARN_ON_ONCE(!rcu_read_lock_held());
++      return idr_find(&mem_cgroup_idr, id);
++}
++
+ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
+ {
+       struct mem_cgroup_per_node *pn;
+@@ -4097,6 +4151,12 @@ static struct mem_cgroup *mem_cgroup_all
+       if (!memcg)
+               return NULL;
++      memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL,
++                               1, MEM_CGROUP_ID_MAX,
++                               GFP_KERNEL);
++      if (memcg->id.id < 0)
++              goto fail;
++
+       memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
+       if (!memcg->stat)
+               goto fail;
+@@ -4123,8 +4183,11 @@ static struct mem_cgroup *mem_cgroup_all
+ #ifdef CONFIG_CGROUP_WRITEBACK
+       INIT_LIST_HEAD(&memcg->cgwb_list);
+ #endif
++      idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
+       return memcg;
+ fail:
++      if (memcg->id.id > 0)
++              idr_remove(&mem_cgroup_idr, memcg->id.id);
+       mem_cgroup_free(memcg);
+       return NULL;
+ }
+@@ -4187,12 +4250,11 @@ fail:
+       return ERR_PTR(-ENOMEM);
+ }
+-static int
+-mem_cgroup_css_online(struct cgroup_subsys_state *css)
++static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
+ {
+-      if (css->id > MEM_CGROUP_ID_MAX)
+-              return -ENOSPC;
+-
++      /* Online state pins memcg ID, memcg ID pins CSS */
++      mem_cgroup_id_get(mem_cgroup_from_css(css));
++      css_get(css);
+       return 0;
+ }
+@@ -4215,6 +4277,8 @@ static void mem_cgroup_css_offline(struc
+       memcg_offline_kmem(memcg);
+       wb_memcg_offline(memcg);
++
++      mem_cgroup_id_put(memcg);
+ }
+ static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
+@@ -5736,6 +5800,7 @@ void mem_cgroup_swapout(struct page *pag
+       if (!memcg)
+               return;
++      mem_cgroup_id_get(memcg);
+       oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg));
+       VM_BUG_ON_PAGE(oldid, page);
+       mem_cgroup_swap_statistics(memcg, true);
+@@ -5754,6 +5819,9 @@ void mem_cgroup_swapout(struct page *pag
+       VM_BUG_ON(!irqs_disabled());
+       mem_cgroup_charge_statistics(memcg, page, false, -1);
+       memcg_check_events(memcg, page);
++
++      if (!mem_cgroup_is_root(memcg))
++              css_put(&memcg->css);
+ }
+ /*
+@@ -5784,11 +5852,11 @@ int mem_cgroup_try_charge_swap(struct pa
+           !page_counter_try_charge(&memcg->swap, 1, &counter))
+               return -ENOMEM;
++      mem_cgroup_id_get(memcg);
+       oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg));
+       VM_BUG_ON_PAGE(oldid, page);
+       mem_cgroup_swap_statistics(memcg, true);
+-      css_get(&memcg->css);
+       return 0;
+ }
+@@ -5817,7 +5885,7 @@ void mem_cgroup_uncharge_swap(swp_entry_
+                               page_counter_uncharge(&memcg->memsw, 1);
+               }
+               mem_cgroup_swap_statistics(memcg, false);
+-              css_put(&memcg->css);
++              mem_cgroup_id_put(memcg);
+       }
+       rcu_read_unlock();
+ }
+--- a/mm/slab_common.c
++++ b/mm/slab_common.c
+@@ -526,8 +526,8 @@ void memcg_create_kmem_cache(struct mem_
+               goto out_unlock;
+       cgroup_name(css->cgroup, memcg_name_buf, sizeof(memcg_name_buf));
+-      cache_name = kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name,
+-                             css->id, memcg_name_buf);
++      cache_name = kasprintf(GFP_KERNEL, "%s(%llu:%s)", root_cache->name,
++                             css->serial_nr, memcg_name_buf);
+       if (!cache_name)
+               goto out_unlock;
diff --git a/queue-4.6/mm-meminit-always-return-a-valid-node-from-early_pfn_to_nid.patch b/queue-4.6/mm-meminit-always-return-a-valid-node-from-early_pfn_to_nid.patch
new file mode 100644 (file)
index 0000000..f80230f
--- /dev/null
@@ -0,0 +1,52 @@
+From e4568d3803852d00effd41dcdd489e726b998879 Mon Sep 17 00:00:00 2001
+From: Mel Gorman <mgorman@techsingularity.net>
+Date: Thu, 14 Jul 2016 12:07:20 -0700
+Subject: mm, meminit: always return a valid node from early_pfn_to_nid
+
+From: Mel Gorman <mgorman@techsingularity.net>
+
+commit e4568d3803852d00effd41dcdd489e726b998879 upstream.
+
+early_pfn_to_nid can return node 0 if a PFN is invalid on machines that
+has no node 0.  A machine with only node 1 was observed to crash with
+the following message:
+
+   BUG: unable to handle kernel paging request at 000000000002a3c8
+   PGD 0
+   Modules linked in:
+   Hardware name: Supermicro H8DSP-8/H8DSP-8, BIOS 080011  06/30/2006
+   task: ffffffff81c0d500 ti: ffffffff81c00000 task.ti: ffffffff81c00000
+   RIP: reserve_bootmem_region+0x6a/0xef
+   CR2: 000000000002a3c8 CR3: 0000000001c06000 CR4: 00000000000006b0
+   Call Trace:
+      free_all_bootmem+0x4b/0x12a
+      mem_init+0x70/0xa3
+      start_kernel+0x25b/0x49b
+
+The problem is that early_page_uninitialised uses the early_pfn_to_nid
+helper which returns node 0 for invalid PFNs.  No caller of
+early_pfn_to_nid cares except early_page_uninitialised.  This patch has
+early_pfn_to_nid always return a valid node.
+
+Link: http://lkml.kernel.org/r/1468008031-3848-3-git-send-email-mgorman@techsingularity.net
+Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
+Acked-by: David Rientjes <rientjes@google.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/page_alloc.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -1110,7 +1110,7 @@ int __meminit early_pfn_to_nid(unsigned
+       spin_lock(&early_pfn_lock);
+       nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache);
+       if (nid < 0)
+-              nid = 0;
++              nid = first_online_node;
+       spin_unlock(&early_pfn_lock);
+       return nid;
diff --git a/queue-4.6/mm-meminit-ensure-node-is-online-before-checking-whether-pages-are-uninitialised.patch b/queue-4.6/mm-meminit-ensure-node-is-online-before-checking-whether-pages-are-uninitialised.patch
new file mode 100644 (file)
index 0000000..43223b4
--- /dev/null
@@ -0,0 +1,39 @@
+From ef70b6f41cda6270165a6f27b2548ed31cfa3cb2 Mon Sep 17 00:00:00 2001
+From: Mel Gorman <mgorman@techsingularity.net>
+Date: Thu, 14 Jul 2016 12:07:23 -0700
+Subject: mm, meminit: ensure node is online before checking whether pages are uninitialised
+
+From: Mel Gorman <mgorman@techsingularity.net>
+
+commit ef70b6f41cda6270165a6f27b2548ed31cfa3cb2 upstream.
+
+early_page_uninitialised looks up an arbitrary PFN.  While a machine
+without node 0 will boot with "mm, page_alloc: Always return a valid
+node from early_pfn_to_nid", it works because it assumes that nodes are
+always in PFN order.  This is not guaranteed so this patch adds
+robustness by always checking if the node being checked is online.
+
+Link: http://lkml.kernel.org/r/1468008031-3848-4-git-send-email-mgorman@techsingularity.net
+Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
+Acked-by: David Rientjes <rientjes@google.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/page_alloc.c |    4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -286,7 +286,9 @@ static inline void reset_deferred_memini
+ /* Returns true if the struct page for the pfn is uninitialised */
+ static inline bool __meminit early_page_uninitialised(unsigned long pfn)
+ {
+-      if (pfn >= NODE_DATA(early_pfn_to_nid(pfn))->first_deferred_pfn)
++      int nid = early_pfn_to_nid(pfn);
++
++      if (node_online(nid) && pfn >= NODE_DATA(nid)->first_deferred_pfn)
+               return true;
+       return false;
diff --git a/queue-4.6/mm-slb-add-__gfp_atomic-to-the-gfp-reclaim-mask.patch b/queue-4.6/mm-slb-add-__gfp_atomic-to-the-gfp-reclaim-mask.patch
new file mode 100644 (file)
index 0000000..083f02b
--- /dev/null
@@ -0,0 +1,49 @@
+From e838a45f9392a5bd2be1cd3ab0b16ae85857461c Mon Sep 17 00:00:00 2001
+From: Mel Gorman <mgorman@techsingularity.net>
+Date: Fri, 24 Jun 2016 14:49:37 -0700
+Subject: mm, sl[au]b: add __GFP_ATOMIC to the GFP reclaim mask
+
+From: Mel Gorman <mgorman@techsingularity.net>
+
+commit e838a45f9392a5bd2be1cd3ab0b16ae85857461c upstream.
+
+Commit d0164adc89f6 ("mm, page_alloc: distinguish between being unable
+to sleep, unwilling to sleep and avoiding waking kswapd") modified
+__GFP_WAIT to explicitly identify the difference between atomic callers
+and those that were unwilling to sleep.  Later the definition was
+removed entirely.
+
+The GFP_RECLAIM_MASK is the set of flags that affect watermark checking
+and reclaim behaviour but __GFP_ATOMIC was never added.  Without it,
+atomic users of the slab allocator strip the __GFP_ATOMIC flag and
+cannot access the page allocator atomic reserves.  This patch addresses
+the problem.
+
+The user-visible impact depends on the workload but potentially atomic
+allocations unnecessarily fail without this patch.
+
+Link: http://lkml.kernel.org/r/20160610093832.GK2527@techsingularity.net
+Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
+Reported-by: Marcin Wojtas <mw@semihalf.com>
+Acked-by: Vlastimil Babka <vbabka@suse.cz>
+Acked-by: Michal Hocko <mhocko@suse.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/internal.h |    3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/mm/internal.h
++++ b/mm/internal.h
+@@ -24,7 +24,8 @@
+  */
+ #define GFP_RECLAIM_MASK (__GFP_RECLAIM|__GFP_HIGH|__GFP_IO|__GFP_FS|\
+                       __GFP_NOWARN|__GFP_REPEAT|__GFP_NOFAIL|\
+-                      __GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC)
++                      __GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC|\
++                      __GFP_ATOMIC)
+ /* The GFP flags allowed during early boot */
+ #define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_RECLAIM|__GFP_IO|__GFP_FS))
diff --git a/queue-4.6/mm-swap.c-flush-lru-pvecs-on-compound-page-arrival.patch b/queue-4.6/mm-swap.c-flush-lru-pvecs-on-compound-page-arrival.patch
new file mode 100644 (file)
index 0000000..1a05a82
--- /dev/null
@@ -0,0 +1,121 @@
+From 8f182270dfec432e93fae14f9208a6b9af01009f Mon Sep 17 00:00:00 2001
+From: Lukasz Odzioba <lukasz.odzioba@intel.com>
+Date: Fri, 24 Jun 2016 14:50:01 -0700
+Subject: mm/swap.c: flush lru pvecs on compound page arrival
+
+From: Lukasz Odzioba <lukasz.odzioba@intel.com>
+
+commit 8f182270dfec432e93fae14f9208a6b9af01009f upstream.
+
+Currently we can have compound pages held on per cpu pagevecs, which
+leads to a lot of memory unavailable for reclaim when needed.  In the
+systems with hundreds of processors it can be GBs of memory.
+
+One of the ways of reproducing the problem is to not call munmap
+explicitly on all mapped regions (i.e.  after receiving SIGTERM).  After
+that some pages (with THP enabled also huge pages) may end up on
+lru_add_pvec, example below.
+
+  void main() {
+  #pragma omp parallel
+  {
+       size_t size = 55 * 1000 * 1000; // smaller than  MEM/CPUS
+       void *p = mmap(NULL, size, PROT_READ | PROT_WRITE,
+               MAP_PRIVATE | MAP_ANONYMOUS , -1, 0);
+       if (p != MAP_FAILED)
+               memset(p, 0, size);
+       //munmap(p, size); // uncomment to make the problem go away
+  }
+  }
+
+When we run it with THP enabled it will leave significant amount of
+memory on lru_add_pvec.  This memory will be not reclaimed if we hit
+OOM, so when we run above program in a loop:
+
+       for i in `seq 100`; do ./a.out; done
+
+many processes (95% in my case) will be killed by OOM.
+
+The primary point of the LRU add cache is to save the zone lru_lock
+contention with a hope that more pages will belong to the same zone and
+so their addition can be batched.  The huge page is already a form of
+batched addition (it will add 512 worth of memory in one go) so skipping
+the batching seems like a safer option when compared to a potential
+excess in the caching which can be quite large and much harder to fix
+because lru_add_drain_all is way too expensive and it is not really clear
+what would be a good moment to call it.
+
+Similarly we can reproduce the problem on lru_deactivate_pvec by adding:
+madvise(p, size, MADV_FREE); after memset.
+
+This patch flushes lru pvecs on compound page arrival making the problem
+less severe - after applying it kill rate of above example drops to 0%,
+due to reducing maximum amount of memory held on pvec from 28MB (with
+THP) to 56kB per CPU.
+
+Suggested-by: Michal Hocko <mhocko@suse.com>
+Link: http://lkml.kernel.org/r/1466180198-18854-1-git-send-email-lukasz.odzioba@intel.com
+Signed-off-by: Lukasz Odzioba <lukasz.odzioba@intel.com>
+Acked-by: Michal Hocko <mhocko@suse.com>
+Cc: Kirill Shutemov <kirill.shutemov@linux.intel.com>
+Cc: Andrea Arcangeli <aarcange@redhat.com>
+Cc: Vladimir Davydov <vdavydov@parallels.com>
+Cc: Ming Li <mingli199x@qq.com>
+Cc: Minchan Kim <minchan@kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/swap.c |   11 +++++------
+ 1 file changed, 5 insertions(+), 6 deletions(-)
+
+--- a/mm/swap.c
++++ b/mm/swap.c
+@@ -239,7 +239,7 @@ void rotate_reclaimable_page(struct page
+               get_page(page);
+               local_irq_save(flags);
+               pvec = this_cpu_ptr(&lru_rotate_pvecs);
+-              if (!pagevec_add(pvec, page))
++              if (!pagevec_add(pvec, page) || PageCompound(page))
+                       pagevec_move_tail(pvec);
+               local_irq_restore(flags);
+       }
+@@ -295,7 +295,7 @@ void activate_page(struct page *page)
+               struct pagevec *pvec = &get_cpu_var(activate_page_pvecs);
+               get_page(page);
+-              if (!pagevec_add(pvec, page))
++              if (!pagevec_add(pvec, page) || PageCompound(page))
+                       pagevec_lru_move_fn(pvec, __activate_page, NULL);
+               put_cpu_var(activate_page_pvecs);
+       }
+@@ -390,9 +390,8 @@ static void __lru_cache_add(struct page
+       struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
+       get_page(page);
+-      if (!pagevec_space(pvec))
++      if (!pagevec_add(pvec, page) || PageCompound(page))
+               __pagevec_lru_add(pvec);
+-      pagevec_add(pvec, page);
+       put_cpu_var(lru_add_pvec);
+ }
+@@ -627,7 +626,7 @@ void deactivate_file_page(struct page *p
+       if (likely(get_page_unless_zero(page))) {
+               struct pagevec *pvec = &get_cpu_var(lru_deactivate_file_pvecs);
+-              if (!pagevec_add(pvec, page))
++              if (!pagevec_add(pvec, page) || PageCompound(page))
+                       pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
+               put_cpu_var(lru_deactivate_file_pvecs);
+       }
+@@ -647,7 +646,7 @@ void deactivate_page(struct page *page)
+               struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs);
+               get_page(page);
+-              if (!pagevec_add(pvec, page))
++              if (!pagevec_add(pvec, page) || PageCompound(page))
+                       pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
+               put_cpu_var(lru_deactivate_pvecs);
+       }
diff --git a/queue-4.6/mm-thp-refix-false-positive-bug-in-page_move_anon_rmap.patch b/queue-4.6/mm-thp-refix-false-positive-bug-in-page_move_anon_rmap.patch
new file mode 100644 (file)
index 0000000..0c04c99
--- /dev/null
@@ -0,0 +1,110 @@
+From 5a49973d7143ebbabd76e1dcd69ee42e349bb7b9 Mon Sep 17 00:00:00 2001
+From: Hugh Dickins <hughd@google.com>
+Date: Thu, 14 Jul 2016 12:07:38 -0700
+Subject: mm: thp: refix false positive BUG in page_move_anon_rmap()
+
+From: Hugh Dickins <hughd@google.com>
+
+commit 5a49973d7143ebbabd76e1dcd69ee42e349bb7b9 upstream.
+
+The VM_BUG_ON_PAGE in page_move_anon_rmap() is more trouble than it's
+worth: the syzkaller fuzzer hit it again.  It's still wrong for some THP
+cases, because linear_page_index() was never intended to apply to
+addresses before the start of a vma.
+
+That's easily fixed with a signed long cast inside linear_page_index();
+and Dmitry has tested such a patch, to verify the false positive.  But
+why extend linear_page_index() just for this case? when the avoidance in
+page_move_anon_rmap() has already grown ugly, and there's no reason for
+the check at all (nothing else there is using address or index).
+
+Remove address arg from page_move_anon_rmap(), remove VM_BUG_ON_PAGE,
+remove CONFIG_DEBUG_VM PageTransHuge adjustment.
+
+And one more thing: should the compound_head(page) be done inside or
+outside page_move_anon_rmap()? It's usually pushed down to the lowest
+level nowadays (and mm/memory.c shows no other explicit use of it), so I
+think it's better done in page_move_anon_rmap() than by caller.
+
+Fixes: 0798d3c022dc ("mm: thp: avoid false positive VM_BUG_ON_PAGE in page_move_anon_rmap()")
+Link: http://lkml.kernel.org/r/alpine.LSU.2.11.1607120444540.12528@eggly.anvils
+Signed-off-by: Hugh Dickins <hughd@google.com>
+Reported-by: Dmitry Vyukov <dvyukov@google.com>
+Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+Cc: Mika Westerberg <mika.westerberg@linux.intel.com>
+Cc: Andrea Arcangeli <aarcange@redhat.com>
+Cc: Rik van Riel <riel@redhat.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/rmap.h |    2 +-
+ mm/hugetlb.c         |    2 +-
+ mm/memory.c          |    3 +--
+ mm/rmap.c            |    9 +++------
+ 4 files changed, 6 insertions(+), 10 deletions(-)
+
+--- a/include/linux/rmap.h
++++ b/include/linux/rmap.h
+@@ -158,7 +158,7 @@ struct anon_vma *page_get_anon_vma(struc
+ /*
+  * rmap interfaces called when adding or removing pte of page
+  */
+-void page_move_anon_rmap(struct page *, struct vm_area_struct *, unsigned long);
++void page_move_anon_rmap(struct page *, struct vm_area_struct *);
+ void page_add_anon_rmap(struct page *, struct vm_area_struct *,
+               unsigned long, bool);
+ void do_page_add_anon_rmap(struct page *, struct vm_area_struct *,
+--- a/mm/hugetlb.c
++++ b/mm/hugetlb.c
+@@ -3328,7 +3328,7 @@ retry_avoidcopy:
+       /* If no-one else is actually using this page, avoid the copy
+        * and just make the page writable */
+       if (page_mapcount(old_page) == 1 && PageAnon(old_page)) {
+-              page_move_anon_rmap(old_page, vma, address);
++              page_move_anon_rmap(old_page, vma);
+               set_huge_ptep_writable(vma, address, ptep);
+               return 0;
+       }
+--- a/mm/memory.c
++++ b/mm/memory.c
+@@ -2397,8 +2397,7 @@ static int do_wp_page(struct mm_struct *
+                                * Protected against the rmap code by
+                                * the page lock.
+                                */
+-                              page_move_anon_rmap(compound_head(old_page),
+-                                                  vma, address);
++                              page_move_anon_rmap(old_page, vma);
+                       }
+                       unlock_page(old_page);
+                       return wp_page_reuse(mm, vma, address, page_table, ptl,
+--- a/mm/rmap.c
++++ b/mm/rmap.c
+@@ -1084,23 +1084,20 @@ EXPORT_SYMBOL_GPL(page_mkclean);
+  * page_move_anon_rmap - move a page to our anon_vma
+  * @page:     the page to move to our anon_vma
+  * @vma:      the vma the page belongs to
+- * @address:  the user virtual address mapped
+  *
+  * When a page belongs exclusively to one process after a COW event,
+  * that page can be moved into the anon_vma that belongs to just that
+  * process, so the rmap code will not search the parent or sibling
+  * processes.
+  */
+-void page_move_anon_rmap(struct page *page,
+-      struct vm_area_struct *vma, unsigned long address)
++void page_move_anon_rmap(struct page *page, struct vm_area_struct *vma)
+ {
+       struct anon_vma *anon_vma = vma->anon_vma;
++      page = compound_head(page);
++
+       VM_BUG_ON_PAGE(!PageLocked(page), page);
+       VM_BUG_ON_VMA(!anon_vma, vma);
+-      if (IS_ENABLED(CONFIG_DEBUG_VM) && PageTransHuge(page))
+-              address &= HPAGE_PMD_MASK;
+-      VM_BUG_ON_PAGE(page->index != linear_page_index(vma, address), page);
+       anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
+       /*
diff --git a/queue-4.6/perf-test-ignore-kcore-files-in-the-vmlinux-matches-kallsyms-test.patch b/queue-4.6/perf-test-ignore-kcore-files-in-the-vmlinux-matches-kallsyms-test.patch
new file mode 100644 (file)
index 0000000..77bb527
--- /dev/null
@@ -0,0 +1,91 @@
+From 53d0fe68275dbdaf6a532bb4e87f00db5d36c140 Mon Sep 17 00:00:00 2001
+From: Arnaldo Carvalho de Melo <acme@redhat.com>
+Date: Tue, 19 Apr 2016 12:16:55 -0300
+Subject: perf test: Ignore kcore files in the "vmlinux matches kallsyms" test
+
+From: Arnaldo Carvalho de Melo <acme@redhat.com>
+
+commit 53d0fe68275dbdaf6a532bb4e87f00db5d36c140 upstream.
+
+Before:
+
+  # perf test -v kallsyms
+<SNIP>
+  Maps only in vmlinux:
+   ffffffff81d5e000-ffffffff81ec3ac8 115e000 [kernel].init.text
+   ffffffff81ec3ac8-ffffffffa0000000 12c3ac8 [kernel].exit.text
+   ffffffffa0000000-ffffffffa000c000 0 [fjes]
+   ffffffffa000c000-ffffffffa0017000 0 [video]
+   ffffffffa0017000-ffffffffa001c000 0 [grace]
+<SNIP>
+   ffffffffa0a7f000-ffffffffa0ba5000 0 [xfs]
+   ffffffffa0ba5000-ffffffffffffffff 0 [veth]
+  Maps in vmlinux with a different name in kallsyms:
+  Maps only in kallsyms:
+   ffff880000100000-ffff88001000b000 80000103000 [kernel.kallsyms]
+   ffff88001000b000-ffff880100000000 8001000e000 [kernel.kallsyms]
+   ffff880100000000-ffffc90000000000 80100003000 [kernel.kallsyms]
+<SNIP>
+   ffffffffa0000000-ffffffffff600000 7fffa0003000 [kernel.kallsyms]
+   ffffffffff600000-ffffffffffffffff 7fffff603000 [kernel.kallsyms]
+  test child finished with -1
+  ---- end ----
+  vmlinux symtab matches kallsyms: FAILED!
+  #
+
+After:
+
+  # perf test -v 1
+   1: vmlinux symtab matches kallsyms                          :
+  --- start ---
+  test child forked, pid 7058
+  Looking at the vmlinux_path (8 entries long)
+  Using /lib/modules/4.6.0-rc1+/build/vmlinux for symbols
+  0xffffffff81076870: diff end addr for aesni_gcm_dec v: 0xffffffff810791f2 k: 0xffffffff81076902
+  0xffffffff81079200: diff end addr for aesni_gcm_enc v: 0xffffffff8107bb03 k: 0xffffffff81079292
+  0xffffffff8107e8d0: diff end addr for aesni_gcm_enc_avx_gen2 v: 0xffffffff81083e76 k: 0xffffffff8107e943
+  0xffffffff81083e80: diff end addr for aesni_gcm_dec_avx_gen2 v: 0xffffffff81089611 k: 0xffffffff81083ef3
+  0xffffffff81089990: diff end addr for aesni_gcm_enc_avx_gen4 v: 0xffffffff8108e7c4 k: 0xffffffff81089a03
+  0xffffffff8108e7d0: diff end addr for aesni_gcm_dec_avx_gen4 v: 0xffffffff810937ef k: 0xffffffff8108e843
+  Maps only in vmlinux:
+   ffffffff81d5e000-ffffffff81ec3ac8 115e000 [kernel].init.text
+   ffffffff81ec3ac8-ffffffffa0000000 12c3ac8 [kernel].exit.text
+  Maps in vmlinux with a different name in kallsyms:
+  Maps only in kallsyms:
+  test child finished with -1
+  ---- end ----
+ vmlinux symtab matches kallsyms: FAILED!
+  #
+
+Cc: Adrian Hunter <adrian.hunter@intel.com>
+Cc: David Ahern <dsahern@gmail.com>
+Cc: Jiri Olsa <jolsa@kernel.org>
+Cc: Namhyung Kim <namhyung@kernel.org>
+Cc: Wang Nan <wangnan0@huawei.com>
+Fixes: 8e0cf965f95e ("perf symbols: Add support for reading from /proc/kcore")
+Link: http://lkml.kernel.org/n/tip-n6vrwt9t89w8k769y349govx@git.kernel.org
+Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ tools/perf/tests/vmlinux-kallsyms.c |    8 +++++++-
+ 1 file changed, 7 insertions(+), 1 deletion(-)
+
+--- a/tools/perf/tests/vmlinux-kallsyms.c
++++ b/tools/perf/tests/vmlinux-kallsyms.c
+@@ -54,8 +54,14 @@ int test__vmlinux_matches_kallsyms(int s
+        * Step 3:
+        *
+        * Load and split /proc/kallsyms into multiple maps, one per module.
++       * Do not use kcore, as this test was designed before kcore support
++       * and has parts that only make sense if using the non-kcore code.
++       * XXX: extend it to stress the kcorre code as well, hint: the list
++       * of modules extracted from /proc/kcore, in its current form, can't
++       * be compacted against the list of modules found in the "vmlinux"
++       * code and with the one got from /proc/modules from the "kallsyms" code.
+        */
+-      if (machine__load_kallsyms(&kallsyms, "/proc/kallsyms", type, NULL) <= 0) {
++      if (__machine__load_kallsyms(&kallsyms, "/proc/kallsyms", type, true, NULL) <= 0) {
+               pr_debug("dso__load_kallsyms ");
+               goto out;
+       }
diff --git a/queue-4.6/pps-do-not-crash-when-failed-to-register.patch b/queue-4.6/pps-do-not-crash-when-failed-to-register.patch
new file mode 100644 (file)
index 0000000..3b1fbfe
--- /dev/null
@@ -0,0 +1,64 @@
+From 368301f2fe4b07e5fb71dba3cc566bc59eb6705f Mon Sep 17 00:00:00 2001
+From: Jiri Slaby <jslaby@suse.cz>
+Date: Wed, 20 Jul 2016 15:45:08 -0700
+Subject: pps: do not crash when failed to register
+
+From: Jiri Slaby <jslaby@suse.cz>
+
+commit 368301f2fe4b07e5fb71dba3cc566bc59eb6705f upstream.
+
+With this command sequence:
+
+  modprobe plip
+  modprobe pps_parport
+  rmmod pps_parport
+
+the partport_pps modules causes this crash:
+
+  BUG: unable to handle kernel NULL pointer dereference at (null)
+  IP: parport_detach+0x1d/0x60 [pps_parport]
+  Oops: 0000 [#1] SMP
+  ...
+  Call Trace:
+    parport_unregister_driver+0x65/0xc0 [parport]
+    SyS_delete_module+0x187/0x210
+
+The sequence that builds up to this is:
+
+ 1) plip is loaded and takes the parport device for exclusive use:
+
+    plip0: Parallel port at 0x378, using IRQ 7.
+
+ 2) pps_parport then fails to grab the device:
+
+    pps_parport: parallel port PPS client
+    parport0: cannot grant exclusive access for device pps_parport
+    pps_parport: couldn't register with parport0
+
+ 3) rmmod of pps_parport is then killed because it tries to access
+    pardev->name, but pardev (taken from port->cad) is NULL.
+
+So add a check for NULL in the test there too.
+
+Link: http://lkml.kernel.org/r/20160714115245.12651-1-jslaby@suse.cz
+Signed-off-by: Jiri Slaby <jslaby@suse.cz>
+Acked-by: Rodolfo Giometti <giometti@enneenne.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/pps/clients/pps_parport.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/pps/clients/pps_parport.c
++++ b/drivers/pps/clients/pps_parport.c
+@@ -195,7 +195,7 @@ static void parport_detach(struct parpor
+       struct pps_client_pp *device;
+       /* FIXME: oooh, this is ugly! */
+-      if (strcmp(pardev->name, KBUILD_MODNAME))
++      if (!pardev || strcmp(pardev->name, KBUILD_MODNAME))
+               /* not our port */
+               return;
diff --git a/queue-4.6/radix-tree-fix-radix_tree_iter_retry-for-tagged-iterators.patch b/queue-4.6/radix-tree-fix-radix_tree_iter_retry-for-tagged-iterators.patch
new file mode 100644 (file)
index 0000000..c537add
--- /dev/null
@@ -0,0 +1,60 @@
+From 3cb9185c67304b2a7ea9be73e7d13df6fb2793a1 Mon Sep 17 00:00:00 2001
+From: Andrey Ryabinin <aryabinin@virtuozzo.com>
+Date: Wed, 20 Jul 2016 15:45:00 -0700
+Subject: radix-tree: fix radix_tree_iter_retry() for tagged iterators.
+
+From: Andrey Ryabinin <aryabinin@virtuozzo.com>
+
+commit 3cb9185c67304b2a7ea9be73e7d13df6fb2793a1 upstream.
+
+radix_tree_iter_retry() resets slot to NULL, but it doesn't reset tags.
+Then NULL slot and non-zero iter.tags passed to radix_tree_next_slot()
+leading to crash:
+
+  RIP: radix_tree_next_slot include/linux/radix-tree.h:473
+    find_get_pages_tag+0x334/0x930 mm/filemap.c:1452
+  ....
+  Call Trace:
+    pagevec_lookup_tag+0x3a/0x80 mm/swap.c:960
+    mpage_prepare_extent_to_map+0x321/0xa90 fs/ext4/inode.c:2516
+    ext4_writepages+0x10be/0x2b20 fs/ext4/inode.c:2736
+    do_writepages+0x97/0x100 mm/page-writeback.c:2364
+    __filemap_fdatawrite_range+0x248/0x2e0 mm/filemap.c:300
+    filemap_write_and_wait_range+0x121/0x1b0 mm/filemap.c:490
+    ext4_sync_file+0x34d/0xdb0 fs/ext4/fsync.c:115
+    vfs_fsync_range+0x10a/0x250 fs/sync.c:195
+    vfs_fsync fs/sync.c:209
+    do_fsync+0x42/0x70 fs/sync.c:219
+    SYSC_fdatasync fs/sync.c:232
+    SyS_fdatasync+0x19/0x20 fs/sync.c:230
+    entry_SYSCALL_64_fastpath+0x23/0xc1 arch/x86/entry/entry_64.S:207
+
+We must reset iterator's tags to bail out from radix_tree_next_slot()
+and go to the slow-path in radix_tree_next_chunk().
+
+Fixes: 46437f9a554f ("radix-tree: fix race in gang lookup")
+Link: http://lkml.kernel.org/r/1468495196-10604-1-git-send-email-aryabinin@virtuozzo.com
+Signed-off-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
+Reported-by: Dmitry Vyukov <dvyukov@google.com>
+Acked-by: Konstantin Khlebnikov <koct9i@gmail.com>
+Cc: Matthew Wilcox <willy@linux.intel.com>
+Cc: Hugh Dickins <hughd@google.com>
+Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/radix-tree.h |    1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/include/linux/radix-tree.h
++++ b/include/linux/radix-tree.h
+@@ -399,6 +399,7 @@ static inline __must_check
+ void **radix_tree_iter_retry(struct radix_tree_iter *iter)
+ {
+       iter->next_index = iter->index;
++      iter->tags = 0;
+       return NULL;
+ }
diff --git a/queue-4.6/sched-debug-fix-deadlock-when-enabling-sched-events.patch b/queue-4.6/sched-debug-fix-deadlock-when-enabling-sched-events.patch
new file mode 100644 (file)
index 0000000..e69fe40
--- /dev/null
@@ -0,0 +1,97 @@
+From eda8dca519269c92a0771668b3d5678792de7b78 Mon Sep 17 00:00:00 2001
+From: Josh Poimboeuf <jpoimboe@redhat.com>
+Date: Mon, 13 Jun 2016 02:32:09 -0500
+Subject: sched/debug: Fix deadlock when enabling sched events
+
+From: Josh Poimboeuf <jpoimboe@redhat.com>
+
+commit eda8dca519269c92a0771668b3d5678792de7b78 upstream.
+
+I see a hang when enabling sched events:
+
+  echo 1 > /sys/kernel/debug/tracing/events/sched/enable
+
+The printk buffer shows:
+
+  BUG: spinlock recursion on CPU#1, swapper/1/0
+   lock: 0xffff88007d5d8c00, .magic: dead4ead, .owner: swapper/1/0, .owner_cpu: 1
+  CPU: 1 PID: 0 Comm: swapper/1 Not tainted 4.7.0-rc2+ #1
+  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.8.1-20150318_183358- 04/01/2014
+  ...
+  Call Trace:
+   <IRQ>  [<ffffffff8143d663>] dump_stack+0x85/0xc2
+   [<ffffffff81115948>] spin_dump+0x78/0xc0
+   [<ffffffff81115aea>] do_raw_spin_lock+0x11a/0x150
+   [<ffffffff81891471>] _raw_spin_lock+0x61/0x80
+   [<ffffffff810e5466>] ? try_to_wake_up+0x256/0x4e0
+   [<ffffffff810e5466>] try_to_wake_up+0x256/0x4e0
+   [<ffffffff81891a0a>] ? _raw_spin_unlock_irqrestore+0x4a/0x80
+   [<ffffffff810e5705>] wake_up_process+0x15/0x20
+   [<ffffffff810cebb4>] insert_work+0x84/0xc0
+   [<ffffffff810ced7f>] __queue_work+0x18f/0x660
+   [<ffffffff810cf9a6>] queue_work_on+0x46/0x90
+   [<ffffffffa00cd95b>] drm_fb_helper_dirty.isra.11+0xcb/0xe0 [drm_kms_helper]
+   [<ffffffffa00cdac0>] drm_fb_helper_sys_imageblit+0x30/0x40 [drm_kms_helper]
+   [<ffffffff814babcd>] soft_cursor+0x1ad/0x230
+   [<ffffffff814ba379>] bit_cursor+0x649/0x680
+   [<ffffffff814b9d30>] ? update_attr.isra.2+0x90/0x90
+   [<ffffffff814b5e6a>] fbcon_cursor+0x14a/0x1c0
+   [<ffffffff81555ef8>] hide_cursor+0x28/0x90
+   [<ffffffff81558b6f>] vt_console_print+0x3bf/0x3f0
+   [<ffffffff81122c63>] call_console_drivers.constprop.24+0x183/0x200
+   [<ffffffff811241f4>] console_unlock+0x3d4/0x610
+   [<ffffffff811247f5>] vprintk_emit+0x3c5/0x610
+   [<ffffffff81124bc9>] vprintk_default+0x29/0x40
+   [<ffffffff811e965b>] printk+0x57/0x73
+   [<ffffffff810f7a9e>] enqueue_entity+0xc2e/0xc70
+   [<ffffffff810f7b39>] enqueue_task_fair+0x59/0xab0
+   [<ffffffff8106dcd9>] ? kvm_sched_clock_read+0x9/0x20
+   [<ffffffff8103fb39>] ? sched_clock+0x9/0x10
+   [<ffffffff810e3fcc>] activate_task+0x5c/0xa0
+   [<ffffffff810e4514>] ttwu_do_activate+0x54/0xb0
+   [<ffffffff810e5cea>] sched_ttwu_pending+0x7a/0xb0
+   [<ffffffff810e5e51>] scheduler_ipi+0x61/0x170
+   [<ffffffff81059e7f>] smp_trace_reschedule_interrupt+0x4f/0x2a0
+   [<ffffffff81893ba6>] trace_reschedule_interrupt+0x96/0xa0
+   <EOI>  [<ffffffff8106e0d6>] ? native_safe_halt+0x6/0x10
+   [<ffffffff8110fb1d>] ? trace_hardirqs_on+0xd/0x10
+   [<ffffffff81040ac0>] default_idle+0x20/0x1a0
+   [<ffffffff8104147f>] arch_cpu_idle+0xf/0x20
+   [<ffffffff81102f8f>] default_idle_call+0x2f/0x50
+   [<ffffffff8110332e>] cpu_startup_entry+0x37e/0x450
+   [<ffffffff8105af70>] start_secondary+0x160/0x1a0
+
+Note the hang only occurs when echoing the above from a physical serial
+console, not from an ssh session.
+
+The bug is caused by a deadlock where the task is trying to grab the rq
+lock twice because printk()'s aren't safe in sched code.
+
+Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Matt Fleming <matt@codeblueprint.co.uk>
+Cc: Mel Gorman <mgorman@techsingularity.net>
+Cc: Mike Galbraith <efault@gmx.de>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Fixes: cb2517653fcc ("sched/debug: Make schedstats a runtime tunable that is disabled by default")
+Link: http://lkml.kernel.org/r/20160613073209.gdvdybiruljbkn3p@treble
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ kernel/sched/fair.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/kernel/sched/fair.c
++++ b/kernel/sched/fair.c
+@@ -3194,7 +3194,7 @@ static inline void check_schedstat_requi
+                       trace_sched_stat_iowait_enabled()  ||
+                       trace_sched_stat_blocked_enabled() ||
+                       trace_sched_stat_runtime_enabled())  {
+-              pr_warn_once("Scheduler tracepoints stat_sleep, stat_iowait, "
++              printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, "
+                            "stat_blocked and stat_runtime require the "
+                            "kernel parameter schedstats=enabled or "
+                            "kernel.sched_schedstats=1\n");
index 54f49665fc764ba7e932675216dc92aadab268aa..f665e0798122dd48e90fe8528709707d1278d090 100644 (file)
@@ -1 +1,33 @@
 usb-ohci-don-t-mark-eds-as-ed_oper-if-scheduling-fails.patch
+x86-quirks-apply-nvidia_bugs-quirk-only-on-root-bus.patch
+x86-quirks-reintroduce-scanning-of-secondary-buses.patch
+x86-quirks-add-early-quirk-to-reset-apple-airport-card.patch
+dmaengine-at_xdmac-align-descriptors-on-64-bits.patch
+dmaengine-at_xdmac-fix-residue-corruption.patch
+dmaengine-at_xdmac-double-fifo-flush-needed-to-compute-residue.patch
+mm-slb-add-__gfp_atomic-to-the-gfp-reclaim-mask.patch
+memcg-mem_cgroup_migrate-may-be-called-with-irq-disabled.patch
+memcg-css_alloc-should-return-an-err_ptr-value-on-error.patch
+mm-swap.c-flush-lru-pvecs-on-compound-page-arrival.patch
+mm-compaction-abort-free-scanner-if-split-fails.patch
+fs-nilfs2-fix-potential-underflow-in-call-to-crc32_le.patch
+mm-compaction-prevent-vm_bug_on-when-terminating-freeing-scanner.patch
+uapi-export-lirc.h-header.patch
+mm-meminit-always-return-a-valid-node-from-early_pfn_to_nid.patch
+mm-meminit-ensure-node-is-online-before-checking-whether-pages-are-uninitialised.patch
+vmlinux.lds-account-for-destructor-sections.patch
+perf-test-ignore-kcore-files-in-the-vmlinux-matches-kallsyms-test.patch
+mm-thp-refix-false-positive-bug-in-page_move_anon_rmap.patch
+mm-memcontrol-fix-cgroup-creation-failure-after-many-small-jobs.patch
+radix-tree-fix-radix_tree_iter_retry-for-tagged-iterators.patch
+pps-do-not-crash-when-failed-to-register.patch
+kernel-sysrq-watchdog-sched-core-reset-watchdog-on-all-cpus-while-processing-sysrq-w.patch
+sched-debug-fix-deadlock-when-enabling-sched-events.patch
+arc-unwind-warn-only-once-if-dw2_unwind-is-disabled.patch
+arc-unwind-ensure-that-.debug_frame-is-generated-vs.-.eh_frame.patch
+xen-pciback-fix-conf_space-read-write-overlap-check.patch
+xen-blkfront-save-uncompleted-reqs-in-blkfront_resume.patch
+xenbus-don-t-bug-on-user-mode-induced-condition.patch
+xenbus-don-t-bail-early-from-xenbus_dev_request_and_reply.patch
+xen-blkfront-fix-resume-issues-after-a-migration.patch
+xen-blkfront-don-t-call-talk_to_blkback-when-already-connected-to-blkback.patch
diff --git a/queue-4.6/uapi-export-lirc.h-header.patch b/queue-4.6/uapi-export-lirc.h-header.patch
new file mode 100644 (file)
index 0000000..e3dcee1
--- /dev/null
@@ -0,0 +1,38 @@
+From 12cb22bb8ae9aff9d72a9c0a234f26d641b20eb6 Mon Sep 17 00:00:00 2001
+From: Mauro Carvalho Chehab <mchehab@s-opensource.com>
+Date: Thu, 14 Jul 2016 12:07:15 -0700
+Subject: uapi: export lirc.h header
+
+From: Mauro Carvalho Chehab <mchehab@s-opensource.com>
+
+commit 12cb22bb8ae9aff9d72a9c0a234f26d641b20eb6 upstream.
+
+This header contains the userspace API for lirc.
+
+This is a fixup for commit b7be755733dc ("[media] bz#75751: Move
+internal header file lirc.h to uapi/").  It moved the header to the
+right place, but it forgot to add it at Kbuild.  So, despite being at
+uapi, it is not copied to the right place.
+
+Fixes: b7be755733dc44c72 ("[media] bz#75751: Move internal header file lirc.h to uapi/")
+Link: http://lkml.kernel.org/r/320c765d32bfc82c582e336d52ffe1026c73c644.1468439021.git.mchehab@s-opensource.com
+Signed-off-by: Mauro Carvalho Chehab <mchehab@s-opensource.com>
+Cc: Alec Leamas <leamas.alec@gmail.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/uapi/linux/Kbuild |    1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/include/uapi/linux/Kbuild
++++ b/include/uapi/linux/Kbuild
+@@ -244,6 +244,7 @@ endif
+ header-y += hw_breakpoint.h
+ header-y += l2tp.h
+ header-y += libc-compat.h
++header-y += lirc.h
+ header-y += limits.h
+ header-y += llc.h
+ header-y += loop.h
diff --git a/queue-4.6/vmlinux.lds-account-for-destructor-sections.patch b/queue-4.6/vmlinux.lds-account-for-destructor-sections.patch
new file mode 100644 (file)
index 0000000..b09b4da
--- /dev/null
@@ -0,0 +1,81 @@
+From e41f501d391265ff568f3e49d6128cc30856a36f Mon Sep 17 00:00:00 2001
+From: Dmitry Vyukov <dvyukov@google.com>
+Date: Thu, 14 Jul 2016 12:07:29 -0700
+Subject: vmlinux.lds: account for destructor sections
+
+From: Dmitry Vyukov <dvyukov@google.com>
+
+commit e41f501d391265ff568f3e49d6128cc30856a36f upstream.
+
+If CONFIG_KASAN is enabled and gcc is configured with
+--disable-initfini-array and/or gold linker is used, gcc emits
+.ctors/.dtors and .text.startup/.text.exit sections instead of
+.init_array/.fini_array.  .dtors section is not explicitly accounted in
+the linker script and messes vvar/percpu layout.
+
+We want:
+  ffffffff822bfd80 D _edata
+  ffffffff822c0000 D __vvar_beginning_hack
+  ffffffff822c0000 A __vvar_page
+  ffffffff822c0080 0000000000000098 D vsyscall_gtod_data
+  ffffffff822c1000 A __init_begin
+  ffffffff822c1000 D init_per_cpu__irq_stack_union
+  ffffffff822c1000 A __per_cpu_load
+  ffffffff822d3000 D init_per_cpu__gdt_page
+
+We got:
+  ffffffff8279a600 D _edata
+  ffffffff8279b000 A __vvar_page
+  ffffffff8279c000 A __init_begin
+  ffffffff8279c000 D init_per_cpu__irq_stack_union
+  ffffffff8279c000 A __per_cpu_load
+  ffffffff8279e000 D __vvar_beginning_hack
+  ffffffff8279e080 0000000000000098 D vsyscall_gtod_data
+  ffffffff827ae000 D init_per_cpu__gdt_page
+
+This happens because __vvar_page and .vvar get different addresses in
+arch/x86/kernel/vmlinux.lds.S:
+
+       . = ALIGN(PAGE_SIZE);
+       __vvar_page = .;
+
+       .vvar : AT(ADDR(.vvar) - LOAD_OFFSET) {
+               /* work around gold bug 13023 */
+               __vvar_beginning_hack = .;
+
+Discard .dtors/.fini_array/.text.exit, since we don't call dtors.
+Merge .text.startup into init text.
+
+Link: http://lkml.kernel.org/r/1467386363-120030-1-git-send-email-dvyukov@google.com
+Signed-off-by: Dmitry Vyukov <dvyukov@google.com>
+Reviewed-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/asm-generic/vmlinux.lds.h |    4 ++++
+ 1 file changed, 4 insertions(+)
+
+--- a/include/asm-generic/vmlinux.lds.h
++++ b/include/asm-generic/vmlinux.lds.h
+@@ -540,15 +540,19 @@
+ #define INIT_TEXT                                                     \
+       *(.init.text)                                                   \
++      *(.text.startup)                                                \
+       MEM_DISCARD(init.text)
+ #define EXIT_DATA                                                     \
+       *(.exit.data)                                                   \
++      *(.fini_array)                                                  \
++      *(.dtors)                                                       \
+       MEM_DISCARD(exit.data)                                          \
+       MEM_DISCARD(exit.rodata)
+ #define EXIT_TEXT                                                     \
+       *(.exit.text)                                                   \
++      *(.text.exit)                                                   \
+       MEM_DISCARD(exit.text)
+ #define EXIT_CALL                                                     \
diff --git a/queue-4.6/x86-quirks-add-early-quirk-to-reset-apple-airport-card.patch b/queue-4.6/x86-quirks-add-early-quirk-to-reset-apple-airport-card.patch
new file mode 100644 (file)
index 0000000..c9cb1cd
--- /dev/null
@@ -0,0 +1,246 @@
+From abb2bafd295fe962bbadc329dbfb2146457283ac Mon Sep 17 00:00:00 2001
+From: Lukas Wunner <lukas@wunner.de>
+Date: Sun, 12 Jun 2016 12:31:53 +0200
+Subject: x86/quirks: Add early quirk to reset Apple AirPort card
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Lukas Wunner <lukas@wunner.de>
+
+commit abb2bafd295fe962bbadc329dbfb2146457283ac upstream.
+
+The EFI firmware on Macs contains a full-fledged network stack for
+downloading OS X images from osrecovery.apple.com. Unfortunately
+on Macs introduced 2011 and 2012, EFI brings up the Broadcom 4331
+wireless card on every boot and leaves it enabled even after
+ExitBootServices has been called. The card continues to assert its IRQ
+line, causing spurious interrupts if the IRQ is shared. It also corrupts
+memory by DMAing received packets, allowing for remote code execution
+over the air. This only stops when a driver is loaded for the wireless
+card, which may be never if the driver is not installed or blacklisted.
+
+The issue seems to be constrained to the Broadcom 4331. Chris Milsted
+has verified that the newer Broadcom 4360 built into the MacBookPro11,3
+(2013/2014) does not exhibit this behaviour. The chances that Apple will
+ever supply a firmware fix for the older machines appear to be zero.
+
+The solution is to reset the card on boot by writing to a reset bit in
+its mmio space. This must be done as an early quirk and not as a plain
+vanilla PCI quirk to successfully combat memory corruption by DMAed
+packets: Matthew Garrett found out in 2012 that the packets are written
+to EfiBootServicesData memory (http://mjg59.dreamwidth.org/11235.html).
+This type of memory is made available to the page allocator by
+efi_free_boot_services(). Plain vanilla PCI quirks run much later, in
+subsys initcall level. In-between a time window would be open for memory
+corruption. Random crashes occurring in this time window and attributed
+to DMAed packets have indeed been observed in the wild by Chris
+Bainbridge.
+
+When Matthew Garrett analyzed the memory corruption issue in 2012, he
+sought to fix it with a grub quirk which transitions the card to D3hot:
+http://git.savannah.gnu.org/cgit/grub.git/commit/?id=9d34bb85da56
+
+This approach does not help users with other bootloaders and while it
+may prevent DMAed packets, it does not cure the spurious interrupts
+emanating from the card. Unfortunately the card's mmio space is
+inaccessible in D3hot, so to reset it, we have to undo the effect of
+Matthew's grub patch and transition the card back to D0.
+
+Note that the quirk takes a few shortcuts to reduce the amount of code:
+The size of BAR 0 and the location of the PM capability is identical
+on all affected machines and therefore hardcoded. Only the address of
+BAR 0 differs between models. Also, it is assumed that the BCMA core
+currently mapped is the 802.11 core. The EFI driver seems to always take
+care of this.
+
+Michael Büsch, Bjorn Helgaas and Matt Fleming contributed feedback
+towards finding the best solution to this problem.
+
+The following should be a comprehensive list of affected models:
+    iMac13,1        2012  21.5"       [Root Port 00:1c.3 = 8086:1e16]
+    iMac13,2        2012  27"         [Root Port 00:1c.3 = 8086:1e16]
+    Macmini5,1      2011  i5 2.3 GHz  [Root Port 00:1c.1 = 8086:1c12]
+    Macmini5,2      2011  i5 2.5 GHz  [Root Port 00:1c.1 = 8086:1c12]
+    Macmini5,3      2011  i7 2.0 GHz  [Root Port 00:1c.1 = 8086:1c12]
+    Macmini6,1      2012  i5 2.5 GHz  [Root Port 00:1c.1 = 8086:1e12]
+    Macmini6,2      2012  i7 2.3 GHz  [Root Port 00:1c.1 = 8086:1e12]
+    MacBookPro8,1   2011  13"         [Root Port 00:1c.1 = 8086:1c12]
+    MacBookPro8,2   2011  15"         [Root Port 00:1c.1 = 8086:1c12]
+    MacBookPro8,3   2011  17"         [Root Port 00:1c.1 = 8086:1c12]
+    MacBookPro9,1   2012  15"         [Root Port 00:1c.1 = 8086:1e12]
+    MacBookPro9,2   2012  13"         [Root Port 00:1c.1 = 8086:1e12]
+    MacBookPro10,1  2012  15"         [Root Port 00:1c.1 = 8086:1e12]
+    MacBookPro10,2  2012  13"         [Root Port 00:1c.1 = 8086:1e12]
+
+For posterity, spurious interrupts caused by the Broadcom 4331 wireless
+card resulted in splats like this (stacktrace omitted):
+
+    irq 17: nobody cared (try booting with the "irqpoll" option)
+    handlers:
+    [<ffffffff81374370>] pcie_isr
+    [<ffffffffc0704550>] sdhci_irq [sdhci] threaded [<ffffffffc07013c0>] sdhci_thread_irq [sdhci]
+    [<ffffffffc0a0b960>] azx_interrupt [snd_hda_codec]
+    Disabling IRQ #17
+
+Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=79301
+Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=111781
+Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=728916
+Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=895951#c16
+Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1009819
+Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1098621
+Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1149632#c5
+Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1279130
+Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1332732
+Tested-by: Konstantin Simanov <k.simanov@stlk.ru>        # [MacBookPro8,1]
+Tested-by: Lukas Wunner <lukas@wunner.de>                # [MacBookPro9,1]
+Tested-by: Bryan Paradis <bryan.paradis@gmail.com>       # [MacBookPro9,2]
+Tested-by: Andrew Worsley <amworsley@gmail.com>          # [MacBookPro10,1]
+Tested-by: Chris Bainbridge <chris.bainbridge@gmail.com> # [MacBookPro10,2]
+Signed-off-by: Lukas Wunner <lukas@wunner.de>
+Acked-by: Rafał Miłecki <zajec5@gmail.com>
+Acked-by: Matt Fleming <matt@codeblueprint.co.uk>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Bjorn Helgaas <bhelgaas@google.com>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Chris Milsted <cmilsted@redhat.com>
+Cc: Denys Vlasenko <dvlasenk@redhat.com>
+Cc: H. Peter Anvin <hpa@zytor.com>
+Cc: Josh Poimboeuf <jpoimboe@redhat.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Matthew Garrett <mjg59@srcf.ucam.org>
+Cc: Michael Buesch <m@bues.ch>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: Yinghai Lu <yinghai@kernel.org>
+Cc: b43-dev@lists.infradead.org
+Cc: linux-pci@vger.kernel.org
+Cc: linux-wireless@vger.kernel.org
+Link: http://lkml.kernel.org/r/48d0972ac82a53d460e5fce77a07b2560db95203.1465690253.git.lukas@wunner.de
+[ Did minor readability edits. ]
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kernel/early-quirks.c |   64 +++++++++++++++++++++++++++++++++++++++++
+ drivers/bcma/bcma_private.h    |    2 -
+ include/linux/bcma/bcma.h      |    1 
+ 3 files changed, 65 insertions(+), 2 deletions(-)
+
+--- a/arch/x86/kernel/early-quirks.c
++++ b/arch/x86/kernel/early-quirks.c
+@@ -11,7 +11,11 @@
+ #include <linux/pci.h>
+ #include <linux/acpi.h>
++#include <linux/delay.h>
++#include <linux/dmi.h>
+ #include <linux/pci_ids.h>
++#include <linux/bcma/bcma.h>
++#include <linux/bcma/bcma_regs.h>
+ #include <drm/i915_drm.h>
+ #include <asm/pci-direct.h>
+ #include <asm/dma.h>
+@@ -21,6 +25,9 @@
+ #include <asm/iommu.h>
+ #include <asm/gart.h>
+ #include <asm/irq_remapping.h>
++#include <asm/early_ioremap.h>
++
++#define dev_err(msg)  pr_err("pci 0000:%02x:%02x.%d: %s", bus, slot, func, msg)
+ static void __init fix_hypertransport_config(int num, int slot, int func)
+ {
+@@ -597,6 +604,61 @@ static void __init force_disable_hpet(in
+ #endif
+ }
++#define BCM4331_MMIO_SIZE     16384
++#define BCM4331_PM_CAP                0x40
++#define bcma_aread32(reg)     ioread32(mmio + 1 * BCMA_CORE_SIZE + reg)
++#define bcma_awrite32(reg, val)       iowrite32(val, mmio + 1 * BCMA_CORE_SIZE + reg)
++
++static void __init apple_airport_reset(int bus, int slot, int func)
++{
++      void __iomem *mmio;
++      u16 pmcsr;
++      u64 addr;
++      int i;
++
++      if (!dmi_match(DMI_SYS_VENDOR, "Apple Inc."))
++              return;
++
++      /* Card may have been put into PCI_D3hot by grub quirk */
++      pmcsr = read_pci_config_16(bus, slot, func, BCM4331_PM_CAP + PCI_PM_CTRL);
++
++      if ((pmcsr & PCI_PM_CTRL_STATE_MASK) != PCI_D0) {
++              pmcsr &= ~PCI_PM_CTRL_STATE_MASK;
++              write_pci_config_16(bus, slot, func, BCM4331_PM_CAP + PCI_PM_CTRL, pmcsr);
++              mdelay(10);
++
++              pmcsr = read_pci_config_16(bus, slot, func, BCM4331_PM_CAP + PCI_PM_CTRL);
++              if ((pmcsr & PCI_PM_CTRL_STATE_MASK) != PCI_D0) {
++                      dev_err("Cannot power up Apple AirPort card\n");
++                      return;
++              }
++      }
++
++      addr  =      read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_0);
++      addr |= (u64)read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_1) << 32;
++      addr &= PCI_BASE_ADDRESS_MEM_MASK;
++
++      mmio = early_ioremap(addr, BCM4331_MMIO_SIZE);
++      if (!mmio) {
++              dev_err("Cannot iomap Apple AirPort card\n");
++              return;
++      }
++
++      pr_info("Resetting Apple AirPort card (left enabled by EFI)\n");
++
++      for (i = 0; bcma_aread32(BCMA_RESET_ST) && i < 30; i++)
++              udelay(10);
++
++      bcma_awrite32(BCMA_RESET_CTL, BCMA_RESET_CTL_RESET);
++      bcma_aread32(BCMA_RESET_CTL);
++      udelay(1);
++
++      bcma_awrite32(BCMA_RESET_CTL, 0);
++      bcma_aread32(BCMA_RESET_CTL);
++      udelay(10);
++
++      early_iounmap(mmio, BCM4331_MMIO_SIZE);
++}
+ #define QFLAG_APPLY_ONCE      0x1
+ #define QFLAG_APPLIED         0x2
+@@ -639,6 +701,8 @@ static struct chipset early_qrk[] __init
+        */
+       { PCI_VENDOR_ID_INTEL, 0x0f00,
+               PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet},
++      { PCI_VENDOR_ID_BROADCOM, 0x4331,
++        PCI_CLASS_NETWORK_OTHER, PCI_ANY_ID, 0, apple_airport_reset},
+       {}
+ };
+--- a/drivers/bcma/bcma_private.h
++++ b/drivers/bcma/bcma_private.h
+@@ -8,8 +8,6 @@
+ #include <linux/bcma/bcma.h>
+ #include <linux/delay.h>
+-#define BCMA_CORE_SIZE                0x1000
+-
+ #define bcma_err(bus, fmt, ...) \
+       pr_err("bus%d: " fmt, (bus)->num, ##__VA_ARGS__)
+ #define bcma_warn(bus, fmt, ...) \
+--- a/include/linux/bcma/bcma.h
++++ b/include/linux/bcma/bcma.h
+@@ -158,6 +158,7 @@ struct bcma_host_ops {
+ #define BCMA_CORE_DEFAULT             0xFFF
+ #define BCMA_MAX_NR_CORES             16
++#define BCMA_CORE_SIZE                        0x1000
+ /* Chip IDs of PCIe devices */
+ #define BCMA_CHIP_ID_BCM4313  0x4313
diff --git a/queue-4.6/x86-quirks-apply-nvidia_bugs-quirk-only-on-root-bus.patch b/queue-4.6/x86-quirks-apply-nvidia_bugs-quirk-only-on-root-bus.patch
new file mode 100644 (file)
index 0000000..bec033c
--- /dev/null
@@ -0,0 +1,59 @@
+From 447d29d1d3aed839e74c2401ef63387780ac51ed Mon Sep 17 00:00:00 2001
+From: Lukas Wunner <lukas@wunner.de>
+Date: Sun, 12 Jun 2016 12:31:53 +0200
+Subject: x86/quirks: Apply nvidia_bugs quirk only on root bus
+
+From: Lukas Wunner <lukas@wunner.de>
+
+commit 447d29d1d3aed839e74c2401ef63387780ac51ed upstream.
+
+Since the following commit:
+
+  8659c406ade3 ("x86: only scan the root bus in early PCI quirks")
+
+... early quirks are only applied to devices on the root bus.
+
+The motivation was to prevent application of the nvidia_bugs quirk on
+secondary buses.
+
+We're about to reintroduce scanning of secondary buses for a quirk to
+reset the Broadcom 4331 wireless card on 2011/2012 Macs. To prevent
+regressions, open code the requirement to apply nvidia_bugs only on the
+root bus.
+
+Signed-off-by: Lukas Wunner <lukas@wunner.de>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Bjorn Helgaas <bhelgaas@google.com>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Denys Vlasenko <dvlasenk@redhat.com>
+Cc: H. Peter Anvin <hpa@zytor.com>
+Cc: Josh Poimboeuf <jpoimboe@redhat.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: Yinghai Lu <yinghai@kernel.org>
+Link: http://lkml.kernel.org/r/4d5477c1d76b2f0387a780f2142bbcdd9fee869b.1465690253.git.lukas@wunner.de
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kernel/early-quirks.c |    7 +++++++
+ 1 file changed, 7 insertions(+)
+
+--- a/arch/x86/kernel/early-quirks.c
++++ b/arch/x86/kernel/early-quirks.c
+@@ -76,6 +76,13 @@ static void __init nvidia_bugs(int num,
+ #ifdef CONFIG_ACPI
+ #ifdef CONFIG_X86_IO_APIC
+       /*
++       * Only applies to Nvidia root ports (bus 0) and not to
++       * Nvidia graphics cards with PCI ports on secondary buses.
++       */
++      if (num)
++              return;
++
++      /*
+        * All timer overrides on Nvidia are
+        * wrong unless HPET is enabled.
+        * Unfortunately that's not true on many Asus boards.
diff --git a/queue-4.6/x86-quirks-reintroduce-scanning-of-secondary-buses.patch b/queue-4.6/x86-quirks-reintroduce-scanning-of-secondary-buses.patch
new file mode 100644 (file)
index 0000000..06124d8
--- /dev/null
@@ -0,0 +1,151 @@
+From 850c321027c2e31d0afc71588974719a4b565550 Mon Sep 17 00:00:00 2001
+From: Lukas Wunner <lukas@wunner.de>
+Date: Sun, 12 Jun 2016 12:31:53 +0200
+Subject: x86/quirks: Reintroduce scanning of secondary buses
+
+From: Lukas Wunner <lukas@wunner.de>
+
+commit 850c321027c2e31d0afc71588974719a4b565550 upstream.
+
+We used to scan secondary buses until the following commit that
+was applied in 2009:
+
+  8659c406ade3 ("x86: only scan the root bus in early PCI quirks")
+
+which commit constrained early quirks to the root bus only. Its
+motivation was to prevent application of the nvidia_bugs quirk
+on secondary buses.
+
+We're about to add a quirk to reset the Broadcom 4331 wireless card on
+2011/2012 Macs, which is located on a secondary bus behind a PCIe root
+port. To facilitate that, reintroduce scanning of secondary buses.
+
+The commit message of 8659c406ade3 notes that scanning only the root bus
+"saves quite some unnecessary scanning work". The algorithm used prior
+to 8659c406ade3 was particularly time consuming because it scanned
+buses 0 to 31 brute force. To avoid lengthening boot time, employ a
+recursive strategy which only scans buses that are actually reachable
+from the root bus.
+
+Yinghai Lu pointed out that the secondary bus number read from a
+bridge's config space may be invalid, in particular a value of 0 would
+cause an infinite loop. The PCI core goes beyond that and recurses to a
+child bus only if its bus number is greater than the parent bus number
+(see pci_scan_bridge()). Since the root bus is numbered 0, this implies
+that secondary buses may not be 0. Do the same on early scanning.
+
+If this algorithm is found to significantly impact boot time or cause
+infinite loops on broken hardware, it would be possible to limit its
+recursion depth: The Broadcom 4331 quirk applies at depth 1, all others
+at depth 0, so the bus need not be scanned deeper than that for now. An
+alternative approach would be to revert to scanning only the root bus,
+and apply the Broadcom 4331 quirk to the root ports 8086:1c12, 8086:1e12
+and 8086:1e16. Apple always positioned the card behind either of these
+three ports. The quirk would then check presence of the card in slot 0
+below the root port and do its deed.
+
+Signed-off-by: Lukas Wunner <lukas@wunner.de>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Bjorn Helgaas <bhelgaas@google.com>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Denys Vlasenko <dvlasenk@redhat.com>
+Cc: H. Peter Anvin <hpa@zytor.com>
+Cc: Josh Poimboeuf <jpoimboe@redhat.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: Yinghai Lu <yinghai@kernel.org>
+Cc: linux-pci@vger.kernel.org
+Link: http://lkml.kernel.org/r/f0daa70dac1a9b2483abdb31887173eb6ab77bdf.1465690253.git.lukas@wunner.de
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kernel/early-quirks.c |   34 +++++++++++++++++++++-------------
+ 1 file changed, 21 insertions(+), 13 deletions(-)
+
+--- a/arch/x86/kernel/early-quirks.c
++++ b/arch/x86/kernel/early-quirks.c
+@@ -610,12 +610,6 @@ struct chipset {
+       void (*f)(int num, int slot, int func);
+ };
+-/*
+- * Only works for devices on the root bus. If you add any devices
+- * not on bus 0 readd another loop level in early_quirks(). But
+- * be careful because at least the Nvidia quirk here relies on
+- * only matching on bus 0.
+- */
+ static struct chipset early_qrk[] __initdata = {
+       { PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID,
+         PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, nvidia_bugs },
+@@ -648,6 +642,8 @@ static struct chipset early_qrk[] __init
+       {}
+ };
++static void __init early_pci_scan_bus(int bus);
++
+ /**
+  * check_dev_quirk - apply early quirks to a given PCI device
+  * @num: bus number
+@@ -656,7 +652,7 @@ static struct chipset early_qrk[] __init
+  *
+  * Check the vendor & device ID against the early quirks table.
+  *
+- * If the device is single function, let early_quirks() know so we don't
++ * If the device is single function, let early_pci_scan_bus() know so we don't
+  * poke at this device again.
+  */
+ static int __init check_dev_quirk(int num, int slot, int func)
+@@ -665,6 +661,7 @@ static int __init check_dev_quirk(int nu
+       u16 vendor;
+       u16 device;
+       u8 type;
++      u8 sec;
+       int i;
+       class = read_pci_config_16(num, slot, func, PCI_CLASS_DEVICE);
+@@ -692,25 +689,36 @@ static int __init check_dev_quirk(int nu
+       type = read_pci_config_byte(num, slot, func,
+                                   PCI_HEADER_TYPE);
++
++      if ((type & 0x7f) == PCI_HEADER_TYPE_BRIDGE) {
++              sec = read_pci_config_byte(num, slot, func, PCI_SECONDARY_BUS);
++              if (sec > num)
++                      early_pci_scan_bus(sec);
++      }
++
+       if (!(type & 0x80))
+               return -1;
+       return 0;
+ }
+-void __init early_quirks(void)
++static void __init early_pci_scan_bus(int bus)
+ {
+       int slot, func;
+-      if (!early_pci_allowed())
+-              return;
+-
+       /* Poor man's PCI discovery */
+-      /* Only scan the root bus */
+       for (slot = 0; slot < 32; slot++)
+               for (func = 0; func < 8; func++) {
+                       /* Only probe function 0 on single fn devices */
+-                      if (check_dev_quirk(0, slot, func))
++                      if (check_dev_quirk(bus, slot, func))
+                               break;
+               }
+ }
++
++void __init early_quirks(void)
++{
++      if (!early_pci_allowed())
++              return;
++
++      early_pci_scan_bus(0);
++}
diff --git a/queue-4.6/xen-blkfront-don-t-call-talk_to_blkback-when-already-connected-to-blkback.patch b/queue-4.6/xen-blkfront-don-t-call-talk_to_blkback-when-already-connected-to-blkback.patch
new file mode 100644 (file)
index 0000000..27dd0a7
--- /dev/null
@@ -0,0 +1,86 @@
+From efd1535270c1deb0487527bf0c3c827301a69c93 Mon Sep 17 00:00:00 2001
+From: Bob Liu <bob.liu@oracle.com>
+Date: Tue, 7 Jun 2016 10:43:15 -0400
+Subject: xen-blkfront: don't call talk_to_blkback when already connected to blkback
+
+From: Bob Liu <bob.liu@oracle.com>
+
+commit efd1535270c1deb0487527bf0c3c827301a69c93 upstream.
+
+Sometimes blkfront may twice receive blkback_changed() notification
+(XenbusStateConnected) after migration, which will cause
+talk_to_blkback() to be called twice too and confuse xen-blkback.
+
+The flow is as follow:
+   blkfront                                        blkback
+blkfront_resume()
+ > talk_to_blkback()
+  > Set blkfront to XenbusStateInitialised
+                                                front changed()
+                                                 > Connect()
+                                                  > Set blkback to XenbusStateConnected
+
+blkback_changed()
+ > Skip talk_to_blkback()
+   because frontstate == XenbusStateInitialised
+ > blkfront_connect()
+  > Set blkfront to XenbusStateConnected
+
+-----
+And here we get another XenbusStateConnected notification leading
+to:
+-----
+blkback_changed()
+ > because now frontstate != XenbusStateInitialised
+   talk_to_blkback() is also called again
+  > blkfront state changed from
+  XenbusStateConnected to XenbusStateInitialised
+    (Which is not correct!)
+
+                                               front_changed():
+                                                 > Do nothing because blkback
+                                                   already in XenbusStateConnected
+
+Now blkback is in XenbusStateConnected but blkfront is still
+in XenbusStateInitialised - leading to no disks.
+
+Poking of the XenbusStateConnected state is allowed (to deal with
+block disk change) and has to be dealt with. The most likely
+cause of this bug are custom udev scripts hooking up the disks
+and then validating the size.
+
+Signed-off-by: Bob Liu <bob.liu@oracle.com>
+Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/block/xen-blkfront.c |   15 ++++++++++++++-
+ 1 file changed, 14 insertions(+), 1 deletion(-)
+
+--- a/drivers/block/xen-blkfront.c
++++ b/drivers/block/xen-blkfront.c
+@@ -2469,10 +2469,23 @@ static void blkback_changed(struct xenbu
+               break;
+       case XenbusStateConnected:
+-              if (dev->state != XenbusStateInitialised) {
++              /*
++               * talk_to_blkback sets state to XenbusStateInitialised
++               * and blkfront_connect sets it to XenbusStateConnected
++               * (if connection went OK).
++               *
++               * If the backend (or toolstack) decides to poke at backend
++               * state (and re-trigger the watch by setting the state repeatedly
++               * to XenbusStateConnected (4)) we need to deal with this.
++               * This is allowed as this is used to communicate to the guest
++               * that the size of disk has changed!
++               */
++              if ((dev->state != XenbusStateInitialised) &&
++                  (dev->state != XenbusStateConnected)) {
+                       if (talk_to_blkback(dev, info))
+                               break;
+               }
++
+               blkfront_connect(info);
+               break;
diff --git a/queue-4.6/xen-blkfront-fix-resume-issues-after-a-migration.patch b/queue-4.6/xen-blkfront-fix-resume-issues-after-a-migration.patch
new file mode 100644 (file)
index 0000000..f69190e
--- /dev/null
@@ -0,0 +1,81 @@
+From 2a6f71ad99cabe436e70c3f5fcf58072cb3bc07f Mon Sep 17 00:00:00 2001
+From: Bob Liu <bob.liu@oracle.com>
+Date: Tue, 31 May 2016 16:59:17 +0800
+Subject: xen-blkfront: fix resume issues after a migration
+
+From: Bob Liu <bob.liu@oracle.com>
+
+commit 2a6f71ad99cabe436e70c3f5fcf58072cb3bc07f upstream.
+
+After a migrate to another host (which may not have multiqueue
+support), the number of rings (block hardware queues)
+may be changed and the ring info structure will also be reallocated.
+
+This patch fixes two related bugs:
+ * call blk_mq_update_nr_hw_queues() to make blk-core know the number
+   of hardware queues have been changed.
+ * Don't store rinfo pointer to hctx->driver_data, because rinfo may be
+   reallocated so use hctx->queue_num to get the rinfo structure instead.
+
+Signed-off-by: Bob Liu <bob.liu@oracle.com>
+Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/block/xen-blkfront.c |   20 ++++++++------------
+ 1 file changed, 8 insertions(+), 12 deletions(-)
+
+--- a/drivers/block/xen-blkfront.c
++++ b/drivers/block/xen-blkfront.c
+@@ -877,8 +877,12 @@ static int blkif_queue_rq(struct blk_mq_
+                         const struct blk_mq_queue_data *qd)
+ {
+       unsigned long flags;
+-      struct blkfront_ring_info *rinfo = (struct blkfront_ring_info *)hctx->driver_data;
++      int qid = hctx->queue_num;
++      struct blkfront_info *info = hctx->queue->queuedata;
++      struct blkfront_ring_info *rinfo = NULL;
++      BUG_ON(info->nr_rings <= qid);
++      rinfo = &info->rinfo[qid];
+       blk_mq_start_request(qd->rq);
+       spin_lock_irqsave(&rinfo->ring_lock, flags);
+       if (RING_FULL(&rinfo->ring))
+@@ -904,20 +908,9 @@ out_busy:
+       return BLK_MQ_RQ_QUEUE_BUSY;
+ }
+-static int blk_mq_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
+-                          unsigned int index)
+-{
+-      struct blkfront_info *info = (struct blkfront_info *)data;
+-
+-      BUG_ON(info->nr_rings <= index);
+-      hctx->driver_data = &info->rinfo[index];
+-      return 0;
+-}
+-
+ static struct blk_mq_ops blkfront_mq_ops = {
+       .queue_rq = blkif_queue_rq,
+       .map_queue = blk_mq_map_queue,
+-      .init_hctx = blk_mq_init_hctx,
+ };
+ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size,
+@@ -953,6 +946,7 @@ static int xlvbd_init_blk_queue(struct g
+               return PTR_ERR(rq);
+       }
++      rq->queuedata = info;
+       queue_flag_set_unlocked(QUEUE_FLAG_VIRT, rq);
+       if (info->feature_discard) {
+@@ -2137,6 +2131,8 @@ static int blkfront_resume(struct xenbus
+               return err;
+       err = talk_to_blkback(dev, info);
++      if (!err)
++              blk_mq_update_nr_hw_queues(&info->tag_set, info->nr_rings);
+       /*
+        * We have to wait for the backend to switch to
diff --git a/queue-4.6/xen-blkfront-save-uncompleted-reqs-in-blkfront_resume.patch b/queue-4.6/xen-blkfront-save-uncompleted-reqs-in-blkfront_resume.patch
new file mode 100644 (file)
index 0000000..79ff83a
--- /dev/null
@@ -0,0 +1,177 @@
+From 7b427a59538a98161321aa46c13f4ea81b43f4eb Mon Sep 17 00:00:00 2001
+From: Bob Liu <bob.liu@oracle.com>
+Date: Mon, 27 Jun 2016 16:33:24 +0800
+Subject: xen-blkfront: save uncompleted reqs in blkfront_resume()
+
+From: Bob Liu <bob.liu@oracle.com>
+
+commit 7b427a59538a98161321aa46c13f4ea81b43f4eb upstream.
+
+Uncompleted reqs used to be 'saved and resubmitted' in blkfront_recover() during
+migration, but that's too late after multi-queue was introduced.
+
+After a migrate to another host (which may not have multiqueue support), the
+number of rings (block hardware queues) may be changed and the ring and shadow
+structure will also be reallocated.
+
+The blkfront_recover() then can't 'save and resubmit' the real
+uncompleted reqs because shadow structure have been reallocated.
+
+This patch fixes this issue by moving the 'save' logic out of
+blkfront_recover() to earlier place in blkfront_resume().
+
+The 'resubmit' is not changed and still in blkfront_recover().
+
+Signed-off-by: Bob Liu <bob.liu@oracle.com>
+Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/block/xen-blkfront.c |   91 ++++++++++++++++++-------------------------
+ 1 file changed, 40 insertions(+), 51 deletions(-)
+
+--- a/drivers/block/xen-blkfront.c
++++ b/drivers/block/xen-blkfront.c
+@@ -207,6 +207,9 @@ struct blkfront_info
+       struct blk_mq_tag_set tag_set;
+       struct blkfront_ring_info *rinfo;
+       unsigned int nr_rings;
++      /* Save uncomplete reqs and bios for migration. */
++      struct list_head requests;
++      struct bio_list bio_list;
+ };
+ static unsigned int nr_minors;
+@@ -2007,69 +2010,22 @@ static int blkif_recover(struct blkfront
+ {
+       unsigned int i, r_index;
+       struct request *req, *n;
+-      struct blk_shadow *copy;
+       int rc;
+       struct bio *bio, *cloned_bio;
+-      struct bio_list bio_list, merge_bio;
+       unsigned int segs, offset;
+       int pending, size;
+       struct split_bio *split_bio;
+-      struct list_head requests;
+       blkfront_gather_backend_features(info);
+       segs = info->max_indirect_segments ? : BLKIF_MAX_SEGMENTS_PER_REQUEST;
+       blk_queue_max_segments(info->rq, segs);
+-      bio_list_init(&bio_list);
+-      INIT_LIST_HEAD(&requests);
+       for (r_index = 0; r_index < info->nr_rings; r_index++) {
+-              struct blkfront_ring_info *rinfo;
+-
+-              rinfo = &info->rinfo[r_index];
+-              /* Stage 1: Make a safe copy of the shadow state. */
+-              copy = kmemdup(rinfo->shadow, sizeof(rinfo->shadow),
+-                             GFP_NOIO | __GFP_REPEAT | __GFP_HIGH);
+-              if (!copy)
+-                      return -ENOMEM;
+-
+-              /* Stage 2: Set up free list. */
+-              memset(&rinfo->shadow, 0, sizeof(rinfo->shadow));
+-              for (i = 0; i < BLK_RING_SIZE(info); i++)
+-                      rinfo->shadow[i].req.u.rw.id = i+1;
+-              rinfo->shadow_free = rinfo->ring.req_prod_pvt;
+-              rinfo->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff;
++              struct blkfront_ring_info *rinfo = &info->rinfo[r_index];
+               rc = blkfront_setup_indirect(rinfo);
+-              if (rc) {
+-                      kfree(copy);
++              if (rc)
+                       return rc;
+-              }
+-
+-              for (i = 0; i < BLK_RING_SIZE(info); i++) {
+-                      /* Not in use? */
+-                      if (!copy[i].request)
+-                              continue;
+-
+-                      /*
+-                       * Get the bios in the request so we can re-queue them.
+-                       */
+-                      if (copy[i].request->cmd_flags &
+-                          (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) {
+-                              /*
+-                               * Flush operations don't contain bios, so
+-                               * we need to requeue the whole request
+-                               */
+-                              list_add(&copy[i].request->queuelist, &requests);
+-                              continue;
+-                      }
+-                      merge_bio.head = copy[i].request->bio;
+-                      merge_bio.tail = copy[i].request->biotail;
+-                      bio_list_merge(&bio_list, &merge_bio);
+-                      copy[i].request->bio = NULL;
+-                      blk_end_request_all(copy[i].request, 0);
+-              }
+-
+-              kfree(copy);
+       }
+       xenbus_switch_state(info->xbdev, XenbusStateConnected);
+@@ -2084,7 +2040,7 @@ static int blkif_recover(struct blkfront
+               kick_pending_request_queues(rinfo);
+       }
+-      list_for_each_entry_safe(req, n, &requests, queuelist) {
++      list_for_each_entry_safe(req, n, &info->requests, queuelist) {
+               /* Requeue pending requests (flush or discard) */
+               list_del_init(&req->queuelist);
+               BUG_ON(req->nr_phys_segments > segs);
+@@ -2092,7 +2048,7 @@ static int blkif_recover(struct blkfront
+       }
+       blk_mq_kick_requeue_list(info->rq);
+-      while ((bio = bio_list_pop(&bio_list)) != NULL) {
++      while ((bio = bio_list_pop(&info->bio_list)) != NULL) {
+               /* Traverse the list of pending bios and re-queue them */
+               if (bio_segments(bio) > segs) {
+                       /*
+@@ -2138,9 +2094,42 @@ static int blkfront_resume(struct xenbus
+ {
+       struct blkfront_info *info = dev_get_drvdata(&dev->dev);
+       int err = 0;
++      unsigned int i, j;
+       dev_dbg(&dev->dev, "blkfront_resume: %s\n", dev->nodename);
++      bio_list_init(&info->bio_list);
++      INIT_LIST_HEAD(&info->requests);
++      for (i = 0; i < info->nr_rings; i++) {
++              struct blkfront_ring_info *rinfo = &info->rinfo[i];
++              struct bio_list merge_bio;
++              struct blk_shadow *shadow = rinfo->shadow;
++
++              for (j = 0; j < BLK_RING_SIZE(info); j++) {
++                      /* Not in use? */
++                      if (!shadow[j].request)
++                              continue;
++
++                      /*
++                       * Get the bios in the request so we can re-queue them.
++                       */
++                      if (shadow[j].request->cmd_flags &
++                                      (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) {
++                              /*
++                               * Flush operations don't contain bios, so
++                               * we need to requeue the whole request
++                               */
++                              list_add(&shadow[j].request->queuelist, &info->requests);
++                              continue;
++                      }
++                      merge_bio.head = shadow[j].request->bio;
++                      merge_bio.tail = shadow[j].request->biotail;
++                      bio_list_merge(&info->bio_list, &merge_bio);
++                      shadow[j].request->bio = NULL;
++                      blk_mq_end_request(shadow[j].request, 0);
++              }
++      }
++
+       blkif_free(info, info->connected == BLKIF_STATE_CONNECTED);
+       err = negotiate_mq(info);
diff --git a/queue-4.6/xen-pciback-fix-conf_space-read-write-overlap-check.patch b/queue-4.6/xen-pciback-fix-conf_space-read-write-overlap-check.patch
new file mode 100644 (file)
index 0000000..331f342
--- /dev/null
@@ -0,0 +1,55 @@
+From 02ef871ecac290919ea0c783d05da7eedeffc10e Mon Sep 17 00:00:00 2001
+From: Andrey Grodzovsky <andrey2805@gmail.com>
+Date: Tue, 21 Jun 2016 14:26:36 -0400
+Subject: xen/pciback: Fix conf_space read/write overlap check.
+
+From: Andrey Grodzovsky <andrey2805@gmail.com>
+
+commit 02ef871ecac290919ea0c783d05da7eedeffc10e upstream.
+
+Current overlap check is evaluating to false a case where a filter
+field is fully contained (proper subset) of a r/w request.  This
+change applies classical overlap check instead to include all the
+scenarios.
+
+More specifically, for (Hilscher GmbH CIFX 50E-DP(M/S)) device driver
+the logic is such that the entire confspace is read and written in 4
+byte chunks. In this case as an example, CACHE_LINE_SIZE,
+LATENCY_TIMER and PCI_BIST are arriving together in one call to
+xen_pcibk_config_write() with offset == 0xc and size == 4.  With the
+existing overlap check the LATENCY_TIMER field (offset == 0xd, length
+== 1) is fully contained in the write request and hence is excluded
+from write, which is incorrect.
+
+Signed-off-by: Andrey Grodzovsky <andrey2805@gmail.com>
+Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Reviewed-by: Jan Beulich <JBeulich@suse.com>
+Signed-off-by: David Vrabel <david.vrabel@citrix.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/xen/xen-pciback/conf_space.c |    6 ++----
+ 1 file changed, 2 insertions(+), 4 deletions(-)
+
+--- a/drivers/xen/xen-pciback/conf_space.c
++++ b/drivers/xen/xen-pciback/conf_space.c
+@@ -183,8 +183,7 @@ int xen_pcibk_config_read(struct pci_dev
+               field_start = OFFSET(cfg_entry);
+               field_end = OFFSET(cfg_entry) + field->size;
+-              if ((req_start >= field_start && req_start < field_end)
+-                  || (req_end > field_start && req_end <= field_end)) {
++               if (req_end > field_start && field_end > req_start) {
+                       err = conf_space_read(dev, cfg_entry, field_start,
+                                             &tmp_val);
+                       if (err)
+@@ -230,8 +229,7 @@ int xen_pcibk_config_write(struct pci_de
+               field_start = OFFSET(cfg_entry);
+               field_end = OFFSET(cfg_entry) + field->size;
+-              if ((req_start >= field_start && req_start < field_end)
+-                  || (req_end > field_start && req_end <= field_end)) {
++               if (req_end > field_start && field_end > req_start) {
+                       tmp_val = 0;
+                       err = xen_pcibk_config_read(dev, field_start,
diff --git a/queue-4.6/xenbus-don-t-bail-early-from-xenbus_dev_request_and_reply.patch b/queue-4.6/xenbus-don-t-bail-early-from-xenbus_dev_request_and_reply.patch
new file mode 100644 (file)
index 0000000..fd00317
--- /dev/null
@@ -0,0 +1,51 @@
+From 7469be95a487319514adce2304ad2af3553d2fc9 Mon Sep 17 00:00:00 2001
+From: Jan Beulich <JBeulich@suse.com>
+Date: Thu, 7 Jul 2016 01:32:04 -0600
+Subject: xenbus: don't bail early from xenbus_dev_request_and_reply()
+
+From: Jan Beulich <JBeulich@suse.com>
+
+commit 7469be95a487319514adce2304ad2af3553d2fc9 upstream.
+
+xenbus_dev_request_and_reply() needs to track whether a transaction is
+open.  For XS_TRANSACTION_START messages it calls transaction_start()
+and for XS_TRANSACTION_END messages it calls transaction_end().
+
+If sending an XS_TRANSACTION_START message fails or responds with
+an error, the transaction is not open and transaction_end() must be
+called.
+
+If sending an XS_TRANSACTION_END message fails, the transaction is
+still open, but if an error response is returned the transaction is
+closed.
+
+Commit 027bd7e89906 ("xen/xenbus: Avoid synchronous wait on XenBus
+stalling shutdown/restart") introduced a regression where failed
+XS_TRANSACTION_START messages were leaving the transaction open.  This
+can cause problems with suspend (and migration) as all transactions
+must be closed before suspending.
+
+It appears that the problematic change was added accidentally, so just
+remove it.
+
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+Signed-off-by: David Vrabel <david.vrabel@citrix.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/xen/xenbus/xenbus_xs.c |    3 ---
+ 1 file changed, 3 deletions(-)
+
+--- a/drivers/xen/xenbus/xenbus_xs.c
++++ b/drivers/xen/xenbus/xenbus_xs.c
+@@ -249,9 +249,6 @@ void *xenbus_dev_request_and_reply(struc
+       mutex_unlock(&xs_state.request_mutex);
+-      if (IS_ERR(ret))
+-              return ret;
+-
+       if ((msg->type == XS_TRANSACTION_END) ||
+           ((req_msg.type == XS_TRANSACTION_START) &&
+            (msg->type == XS_ERROR)))
diff --git a/queue-4.6/xenbus-don-t-bug-on-user-mode-induced-condition.patch b/queue-4.6/xenbus-don-t-bug-on-user-mode-induced-condition.patch
new file mode 100644 (file)
index 0000000..c8e4261
--- /dev/null
@@ -0,0 +1,57 @@
+From 0beef634b86a1350c31da5fcc2992f0d7c8a622b Mon Sep 17 00:00:00 2001
+From: Jan Beulich <JBeulich@suse.com>
+Date: Thu, 7 Jul 2016 01:23:57 -0600
+Subject: xenbus: don't BUG() on user mode induced condition
+
+From: Jan Beulich <JBeulich@suse.com>
+
+commit 0beef634b86a1350c31da5fcc2992f0d7c8a622b upstream.
+
+Inability to locate a user mode specified transaction ID should not
+lead to a kernel crash. For other than XS_TRANSACTION_START also
+don't issue anything to xenbus if the specified ID doesn't match that
+of any active transaction.
+
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Signed-off-by: David Vrabel <david.vrabel@citrix.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/xen/xenbus/xenbus_dev_frontend.c |   14 ++++++++------
+ 1 file changed, 8 insertions(+), 6 deletions(-)
+
+--- a/drivers/xen/xenbus/xenbus_dev_frontend.c
++++ b/drivers/xen/xenbus/xenbus_dev_frontend.c
+@@ -316,11 +316,18 @@ static int xenbus_write_transaction(unsi
+                       rc = -ENOMEM;
+                       goto out;
+               }
++      } else {
++              list_for_each_entry(trans, &u->transactions, list)
++                      if (trans->handle.id == u->u.msg.tx_id)
++                              break;
++              if (&trans->list == &u->transactions)
++                      return -ESRCH;
+       }
+       reply = xenbus_dev_request_and_reply(&u->u.msg);
+       if (IS_ERR(reply)) {
+-              kfree(trans);
++              if (msg_type == XS_TRANSACTION_START)
++                      kfree(trans);
+               rc = PTR_ERR(reply);
+               goto out;
+       }
+@@ -333,12 +340,7 @@ static int xenbus_write_transaction(unsi
+                       list_add(&trans->list, &u->transactions);
+               }
+       } else if (u->u.msg.type == XS_TRANSACTION_END) {
+-              list_for_each_entry(trans, &u->transactions, list)
+-                      if (trans->handle.id == u->u.msg.tx_id)
+-                              break;
+-              BUG_ON(&trans->list == &u->transactions);
+               list_del(&trans->list);
+-
+               kfree(trans);
+       }