git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
4.14-stable patches
author Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Tue, 7 Aug 2018 14:29:44 +0000 (16:29 +0200)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Tue, 7 Aug 2018 14:29:44 +0000 (16:29 +0200)
added patches:
intel_idle-graceful-probe-failure-when-mwait-is-disabled.patch
nvme-pci-allocate-device-queues-storage-space-at-probe.patch
nvme-pci-fix-queue-double-allocations.patch
nvmet-fc-fix-target-sgl-list-on-large-transfers.patch
xfs-catch-inode-allocation-state-mismatch-corruption.patch
xfs-don-t-call-xfs_da_shrink_inode-with-null-bp.patch
xfs-validate-cached-inodes-are-free-when-allocated.patch

queue-4.14/intel_idle-graceful-probe-failure-when-mwait-is-disabled.patch [new file with mode: 0644]
queue-4.14/nvme-pci-allocate-device-queues-storage-space-at-probe.patch [new file with mode: 0644]
queue-4.14/nvme-pci-fix-queue-double-allocations.patch [new file with mode: 0644]
queue-4.14/nvmet-fc-fix-target-sgl-list-on-large-transfers.patch [new file with mode: 0644]
queue-4.14/series
queue-4.14/xfs-catch-inode-allocation-state-mismatch-corruption.patch [new file with mode: 0644]
queue-4.14/xfs-don-t-call-xfs_da_shrink_inode-with-null-bp.patch [new file with mode: 0644]
queue-4.14/xfs-validate-cached-inodes-are-free-when-allocated.patch [new file with mode: 0644]

diff --git a/queue-4.14/intel_idle-graceful-probe-failure-when-mwait-is-disabled.patch b/queue-4.14/intel_idle-graceful-probe-failure-when-mwait-is-disabled.patch
new file mode 100644 (file)
index 0000000..20cc17e
--- /dev/null
@@ -0,0 +1,51 @@
+From a4c447533a18ee86e07232d6344ba12b1f9c5077 Mon Sep 17 00:00:00 2001
+From: Len Brown <len.brown@intel.com>
+Date: Thu, 9 Nov 2017 02:19:39 -0500
+Subject: intel_idle: Graceful probe failure when MWAIT is disabled
+
+From: Len Brown <len.brown@intel.com>
+
+commit a4c447533a18ee86e07232d6344ba12b1f9c5077 upstream.
+
+When MWAIT is disabled, intel_idle refuses to probe.
+But it may mislead the user by blaming this on the model number:
+
+intel_idle: does not run on family 6 model 79
+
+So defer the check for MWAIT until after the model-number whitelist check succeeds,
+and if the MWAIT check fails, tell the user how to fix it:
+
+intel_idle: Please enable MWAIT in BIOS SETUP
+
+Signed-off-by: Len Brown <len.brown@intel.com>
+Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Cc: Eduardo Valentin <eduval@amazon.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/idle/intel_idle.c |    7 ++++++-
+ 1 file changed, 6 insertions(+), 1 deletion(-)
+
+--- a/drivers/idle/intel_idle.c
++++ b/drivers/idle/intel_idle.c
+@@ -1061,7 +1061,7 @@ static const struct idle_cpu idle_cpu_dn
+ };
+ #define ICPU(model, cpu) \
+-      { X86_VENDOR_INTEL, 6, model, X86_FEATURE_MWAIT, (unsigned long)&cpu }
++      { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, (unsigned long)&cpu }
+ static const struct x86_cpu_id intel_idle_ids[] __initconst = {
+       ICPU(INTEL_FAM6_NEHALEM_EP,             idle_cpu_nehalem),
+@@ -1125,6 +1125,11 @@ static int __init intel_idle_probe(void)
+               return -ENODEV;
+       }
++      if (!boot_cpu_has(X86_FEATURE_MWAIT)) {
++              pr_debug("Please enable MWAIT in BIOS SETUP\n");
++              return -ENODEV;
++      }
++
+       if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
+               return -ENODEV;
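
A side note on the check the patch adds: when MWAIT is disabled in BIOS
SETUP, the CPU typically stops advertising the MONITOR/MWAIT feature in
CPUID (leaf 1, ECX bit 3), which is what the X86_FEATURE_MWAIT test above
corresponds to. A minimal userspace sketch of that check (a hypothetical
standalone program, not part of the patch) might look like:

/* Query CPUID leaf 1 and test ECX bit 3 (MONITOR/MWAIT); firmware that
 * disables MWAIT clears this bit, which is what makes intel_idle bail. */
#include <cpuid.h>
#include <stdio.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
		return 1;

	if (ecx & (1u << 3))
		printf("MWAIT advertised by CPUID\n");
	else
		printf("MWAIT not advertised (possibly disabled in BIOS SETUP)\n");
	return 0;
}
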
diff --git a/queue-4.14/nvme-pci-allocate-device-queues-storage-space-at-probe.patch b/queue-4.14/nvme-pci-allocate-device-queues-storage-space-at-probe.patch
new file mode 100644 (file)
index 0000000..4f56d9b
--- /dev/null
@@ -0,0 +1,282 @@
+From 147b27e4bd08406a6abebedbb478b431ec197be1 Mon Sep 17 00:00:00 2001
+From: Sagi Grimberg <sagi@grimberg.me>
+Date: Sun, 14 Jan 2018 12:39:01 +0200
+Subject: nvme-pci: allocate device queues storage space at probe
+
+From: Sagi Grimberg <sagi@grimberg.me>
+
+commit 147b27e4bd08406a6abebedbb478b431ec197be1 upstream.
+
+Setting 'nvmeq' in nvme_init_request() may cause a race, because
+.init_request is called while the io scheduler is being switched,
+which can happen while the NVMe device is being reset and its nvme
+queues are being freed and recreated. We don't have any
+synchronization between the two paths.
+
+This patch changes the nvmeq allocation to occur at probe time, so
+there is no way we can dereference a NULL nvmeq in .init_request.
+
+[   93.268391] kernel BUG at drivers/nvme/host/pci.c:408!
+[   93.274146] invalid opcode: 0000 [#1] SMP
+[   93.278618] Modules linked in: nfsv3 nfs_acl rpcsec_gss_krb5 auth_rpcgss
+nfsv4 dns_resolver nfs lockd grace fscache sunrpc ipmi_ssif vfat fat
+intel_rapl sb_edac x86_pkg_temp_thermal intel_powerclamp coretemp kvm_intel
+kvm irqbypass crct10dif_pclmul crc32_pclmul ghash_clmulni_intel iTCO_wdt
+intel_cstate ipmi_si iTCO_vendor_support intel_uncore mxm_wmi mei_me
+ipmi_devintf intel_rapl_perf pcspkr sg ipmi_msghandler lpc_ich dcdbas mei
+shpchp acpi_power_meter wmi dm_multipath ip_tables xfs libcrc32c sd_mod
+mgag200 i2c_algo_bit drm_kms_helper syscopyarea sysfillrect sysimgblt
+fb_sys_fops ttm drm ahci libahci nvme libata crc32c_intel nvme_core tg3
+megaraid_sas ptp i2c_core pps_core dm_mirror dm_region_hash dm_log dm_mod
+[   93.349071] CPU: 5 PID: 1842 Comm: sh Not tainted 4.15.0-rc2.ming+ #4
+[   93.356256] Hardware name: Dell Inc. PowerEdge R730xd/072T6D, BIOS 2.5.5 08/16/2017
+[   93.364801] task: 00000000fb8abf2a task.stack: 0000000028bd82d1
+[   93.371408] RIP: 0010:nvme_init_request+0x36/0x40 [nvme]
+[   93.377333] RSP: 0018:ffffc90002537ca8 EFLAGS: 00010246
+[   93.383161] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000008
+[   93.391122] RDX: 0000000000000000 RSI: ffff880276ae0000 RDI: ffff88047bae9008
+[   93.399084] RBP: ffff88047bae9008 R08: ffff88047bae9008 R09: 0000000009dabc00
+[   93.407045] R10: 0000000000000004 R11: 000000000000299c R12: ffff880186bc1f00
+[   93.415007] R13: ffff880276ae0000 R14: 0000000000000000 R15: 0000000000000071
+[   93.422969] FS:  00007f33cf288740(0000) GS:ffff88047ba80000(0000) knlGS:0000000000000000
+[   93.431996] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+[   93.438407] CR2: 00007f33cf28e000 CR3: 000000047e5bb006 CR4: 00000000001606e0
+[   93.446368] Call Trace:
+[   93.449103]  blk_mq_alloc_rqs+0x231/0x2a0
+[   93.453579]  blk_mq_sched_alloc_tags.isra.8+0x42/0x80
+[   93.459214]  blk_mq_init_sched+0x7e/0x140
+[   93.463687]  elevator_switch+0x5a/0x1f0
+[   93.467966]  ? elevator_get.isra.17+0x52/0xc0
+[   93.472826]  elv_iosched_store+0xde/0x150
+[   93.477299]  queue_attr_store+0x4e/0x90
+[   93.481580]  kernfs_fop_write+0xfa/0x180
+[   93.485958]  __vfs_write+0x33/0x170
+[   93.489851]  ? __inode_security_revalidate+0x4c/0x60
+[   93.495390]  ? selinux_file_permission+0xda/0x130
+[   93.500641]  ? _cond_resched+0x15/0x30
+[   93.504815]  vfs_write+0xad/0x1a0
+[   93.508512]  SyS_write+0x52/0xc0
+[   93.512113]  do_syscall_64+0x61/0x1a0
+[   93.516199]  entry_SYSCALL64_slow_path+0x25/0x25
+[   93.521351] RIP: 0033:0x7f33ce96aab0
+[   93.525337] RSP: 002b:00007ffe57570238 EFLAGS: 00000246 ORIG_RAX: 0000000000000001
+[   93.533785] RAX: ffffffffffffffda RBX: 0000000000000006 RCX: 00007f33ce96aab0
+[   93.541746] RDX: 0000000000000006 RSI: 00007f33cf28e000 RDI: 0000000000000001
+[   93.549707] RBP: 00007f33cf28e000 R08: 000000000000000a R09: 00007f33cf288740
+[   93.557669] R10: 00007f33cf288740 R11: 0000000000000246 R12: 00007f33cec42400
+[   93.565630] R13: 0000000000000006 R14: 0000000000000001 R15: 0000000000000000
+[   93.573592] Code: 4c 8d 40 08 4c 39 c7 74 16 48 8b 00 48 8b 04 08 48 85 c0
+74 16 48 89 86 78 01 00 00 31 c0 c3 8d 4a 01 48 63 c9 48 c1 e1 03 eb de <0f>
+0b 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 48 85 f6 53 48 89
+[   93.594676] RIP: nvme_init_request+0x36/0x40 [nvme] RSP: ffffc90002537ca8
+[   93.602273] ---[ end trace 810dde3993e5f14e ]---
+
+Reported-by: Yi Zhang <yi.zhang@redhat.com>
+Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Jon Derrick <jonathan.derrick@intel.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/nvme/host/pci.c |   61 +++++++++++++++++++-----------------------------
+ 1 file changed, 25 insertions(+), 36 deletions(-)
+
+--- a/drivers/nvme/host/pci.c
++++ b/drivers/nvme/host/pci.c
+@@ -77,7 +77,7 @@ static void nvme_dev_disable(struct nvme
+  * Represents an NVM Express device.  Each nvme_dev is a PCI function.
+  */
+ struct nvme_dev {
+-      struct nvme_queue **queues;
++      struct nvme_queue *queues;
+       struct blk_mq_tag_set tagset;
+       struct blk_mq_tag_set admin_tagset;
+       u32 __iomem *dbs;
+@@ -348,7 +348,7 @@ static int nvme_admin_init_hctx(struct b
+                               unsigned int hctx_idx)
+ {
+       struct nvme_dev *dev = data;
+-      struct nvme_queue *nvmeq = dev->queues[0];
++      struct nvme_queue *nvmeq = &dev->queues[0];
+       WARN_ON(hctx_idx != 0);
+       WARN_ON(dev->admin_tagset.tags[0] != hctx->tags);
+@@ -370,7 +370,7 @@ static int nvme_init_hctx(struct blk_mq_
+                         unsigned int hctx_idx)
+ {
+       struct nvme_dev *dev = data;
+-      struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1];
++      struct nvme_queue *nvmeq = &dev->queues[hctx_idx + 1];
+       if (!nvmeq->tags)
+               nvmeq->tags = &dev->tagset.tags[hctx_idx];
+@@ -386,7 +386,7 @@ static int nvme_init_request(struct blk_
+       struct nvme_dev *dev = set->driver_data;
+       struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+       int queue_idx = (set == &dev->tagset) ? hctx_idx + 1 : 0;
+-      struct nvme_queue *nvmeq = dev->queues[queue_idx];
++      struct nvme_queue *nvmeq = &dev->queues[queue_idx];
+       BUG_ON(!nvmeq);
+       iod->nvmeq = nvmeq;
+@@ -900,7 +900,7 @@ static int nvme_poll(struct blk_mq_hw_ct
+ static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl, int aer_idx)
+ {
+       struct nvme_dev *dev = to_nvme_dev(ctrl);
+-      struct nvme_queue *nvmeq = dev->queues[0];
++      struct nvme_queue *nvmeq = &dev->queues[0];
+       struct nvme_command c;
+       memset(&c, 0, sizeof(c));
+@@ -1146,7 +1146,6 @@ static void nvme_free_queue(struct nvme_
+       if (nvmeq->sq_cmds)
+               dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
+                                       nvmeq->sq_cmds, nvmeq->sq_dma_addr);
+-      kfree(nvmeq);
+ }
+ static void nvme_free_queues(struct nvme_dev *dev, int lowest)
+@@ -1154,10 +1153,8 @@ static void nvme_free_queues(struct nvme
+       int i;
+       for (i = dev->ctrl.queue_count - 1; i >= lowest; i--) {
+-              struct nvme_queue *nvmeq = dev->queues[i];
+               dev->ctrl.queue_count--;
+-              dev->queues[i] = NULL;
+-              nvme_free_queue(nvmeq);
++              nvme_free_queue(&dev->queues[i]);
+       }
+ }
+@@ -1189,10 +1186,8 @@ static int nvme_suspend_queue(struct nvm
+ static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown)
+ {
+-      struct nvme_queue *nvmeq = dev->queues[0];
++      struct nvme_queue *nvmeq = &dev->queues[0];
+-      if (!nvmeq)
+-              return;
+       if (nvme_suspend_queue(nvmeq))
+               return;
+@@ -1246,13 +1241,10 @@ static int nvme_alloc_sq_cmds(struct nvm
+       return 0;
+ }
+-static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
+-                                                      int depth, int node)
++static int nvme_alloc_queue(struct nvme_dev *dev, int qid,
++              int depth, int node)
+ {
+-      struct nvme_queue *nvmeq = kzalloc_node(sizeof(*nvmeq), GFP_KERNEL,
+-                                                      node);
+-      if (!nvmeq)
+-              return NULL;
++      struct nvme_queue *nvmeq = &dev->queues[qid];
+       nvmeq->cqes = dma_zalloc_coherent(dev->dev, CQ_SIZE(depth),
+                                         &nvmeq->cq_dma_addr, GFP_KERNEL);
+@@ -1271,17 +1263,15 @@ static struct nvme_queue *nvme_alloc_que
+       nvmeq->q_depth = depth;
+       nvmeq->qid = qid;
+       nvmeq->cq_vector = -1;
+-      dev->queues[qid] = nvmeq;
+       dev->ctrl.queue_count++;
+-      return nvmeq;
++      return 0;
+  free_cqdma:
+       dma_free_coherent(dev->dev, CQ_SIZE(depth), (void *)nvmeq->cqes,
+                                                       nvmeq->cq_dma_addr);
+  free_nvmeq:
+-      kfree(nvmeq);
+-      return NULL;
++      return -ENOMEM;
+ }
+ static int queue_request_irq(struct nvme_queue *nvmeq)
+@@ -1468,14 +1458,12 @@ static int nvme_pci_configure_admin_queu
+       if (result < 0)
+               return result;
+-      nvmeq = dev->queues[0];
+-      if (!nvmeq) {
+-              nvmeq = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH,
+-                                      dev_to_node(dev->dev));
+-              if (!nvmeq)
+-                      return -ENOMEM;
+-      }
++      result = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH,
++                      dev_to_node(dev->dev));
++      if (result)
++              return result;
++      nvmeq = &dev->queues[0];
+       aqa = nvmeq->q_depth - 1;
+       aqa |= aqa << 16;
+@@ -1505,7 +1493,7 @@ static int nvme_create_io_queues(struct
+       for (i = dev->ctrl.queue_count; i <= dev->max_qid; i++) {
+               /* vector == qid - 1, match nvme_create_queue */
+-              if (!nvme_alloc_queue(dev, i, dev->q_depth,
++              if (nvme_alloc_queue(dev, i, dev->q_depth,
+                    pci_irq_get_node(to_pci_dev(dev->dev), i - 1))) {
+                       ret = -ENOMEM;
+                       break;
+@@ -1514,7 +1502,7 @@ static int nvme_create_io_queues(struct
+       max = min(dev->max_qid, dev->ctrl.queue_count - 1);
+       for (i = dev->online_queues; i <= max; i++) {
+-              ret = nvme_create_queue(dev->queues[i], i);
++              ret = nvme_create_queue(&dev->queues[i], i);
+               if (ret)
+                       break;
+       }
+@@ -1770,7 +1758,7 @@ static int nvme_setup_host_mem(struct nv
+ static int nvme_setup_io_queues(struct nvme_dev *dev)
+ {
+-      struct nvme_queue *adminq = dev->queues[0];
++      struct nvme_queue *adminq = &dev->queues[0];
+       struct pci_dev *pdev = to_pci_dev(dev->dev);
+       int result, nr_io_queues;
+       unsigned long size;
+@@ -1896,7 +1884,7 @@ static void nvme_disable_io_queues(struc
+  retry:
+               timeout = ADMIN_TIMEOUT;
+               for (; i > 0; i--, sent++)
+-                      if (nvme_delete_queue(dev->queues[i], opcode))
++                      if (nvme_delete_queue(&dev->queues[i], opcode))
+                               break;
+               while (sent--) {
+@@ -2081,7 +2069,7 @@ static void nvme_dev_disable(struct nvme
+       queues = dev->online_queues - 1;
+       for (i = dev->ctrl.queue_count - 1; i > 0; i--)
+-              nvme_suspend_queue(dev->queues[i]);
++              nvme_suspend_queue(&dev->queues[i]);
+       if (dead) {
+               /* A device might become IO incapable very soon during
+@@ -2089,7 +2077,7 @@ static void nvme_dev_disable(struct nvme
+                * queue_count can be 0 here.
+                */
+               if (dev->ctrl.queue_count)
+-                      nvme_suspend_queue(dev->queues[0]);
++                      nvme_suspend_queue(&dev->queues[0]);
+       } else {
+               nvme_disable_io_queues(dev, queues);
+               nvme_disable_admin_queue(dev, shutdown);
+@@ -2345,7 +2333,8 @@ static int nvme_probe(struct pci_dev *pd
+       dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node);
+       if (!dev)
+               return -ENOMEM;
+-      dev->queues = kzalloc_node((num_possible_cpus() + 1) * sizeof(void *),
++
++      dev->queues = kzalloc_node((num_possible_cpus() + 1) * sizeof(struct nvme_queue),
+                                                       GFP_KERNEL, node);
+       if (!dev->queues)
+               goto free;
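
A minimal standalone sketch (plain C with hypothetical names, not the
driver code) of the allocation pattern the patch above switches to: all
queue structures live in one flat array sized at probe time, so a later
lookup of &dev->queues[i] is always a valid pointer, unlike a lazily
filled array of pointers where a slot can still be NULL:

#include <stdio.h>
#include <stdlib.h>

struct queue { int qid; int depth; };

struct dev {
	struct queue *queues;	/* flat array, one slot per possible queue */
	int nr_queues;
};

static int dev_probe(struct dev *d, int nr_possible)
{
	/* allocate storage for every possible queue up front */
	d->queues = calloc(nr_possible + 1, sizeof(*d->queues));
	if (!d->queues)
		return -1;	/* -ENOMEM in the kernel */
	d->nr_queues = nr_possible + 1;
	return 0;
}

int main(void)
{
	struct dev d = { 0 };

	if (dev_probe(&d, 4))
		return 1;
	printf("queue 3 lives at %p\n", (void *)&d.queues[3]); /* never NULL */
	free(d.queues);
	return 0;
}
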
diff --git a/queue-4.14/nvme-pci-fix-queue-double-allocations.patch b/queue-4.14/nvme-pci-fix-queue-double-allocations.patch
new file mode 100644 (file)
index 0000000..e411c44
--- /dev/null
@@ -0,0 +1,34 @@
+From 62314e405fa101dbb82563394f9dfc225e3f1167 Mon Sep 17 00:00:00 2001
+From: Keith Busch <keith.busch@intel.com>
+Date: Tue, 23 Jan 2018 09:16:19 -0700
+Subject: nvme-pci: Fix queue double allocations
+
+From: Keith Busch <keith.busch@intel.com>
+
+commit 62314e405fa101dbb82563394f9dfc225e3f1167 upstream.
+
+The queue count records the highest queue that has been allocated, so
+don't reallocate a queue below that mark.
+
+Fixes: 147b27e4bd0 ("nvme-pci: allocate device queues storage space at probe")
+Signed-off-by: Keith Busch <keith.busch@intel.com>
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Jon Derrick <jonathan.derrick@intel.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/nvme/host/pci.c |    3 +++
+ 1 file changed, 3 insertions(+)
+
+--- a/drivers/nvme/host/pci.c
++++ b/drivers/nvme/host/pci.c
+@@ -1246,6 +1246,9 @@ static int nvme_alloc_queue(struct nvme_
+ {
+       struct nvme_queue *nvmeq = &dev->queues[qid];
++      if (dev->ctrl.queue_count > qid)
++              return 0;
++
+       nvmeq->cqes = dma_zalloc_coherent(dev->dev, CQ_SIZE(depth),
+                                         &nvmeq->cq_dma_addr, GFP_KERNEL);
+       if (!nvmeq->cqes)
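
A standalone sketch (plain C, hypothetical names) of the guard this patch
adds: the queue count acts as a high-water mark of already-initialised
queues, so calling the allocation path again for the same qid - as the
reset path can after the admin queue was set up at probe - returns early
instead of redoing the allocation on live state:

#include <stdio.h>

struct dev { int queue_count; };

static int queue_alloc(struct dev *d, int qid)
{
	if (d->queue_count > qid)
		return 0;	/* qid already allocated, nothing to do */
	/* ... allocate completion/submission queue memory here ... */
	d->queue_count = qid + 1;
	printf("allocated queue %d\n", qid);
	return 0;
}

int main(void)
{
	struct dev d = { 0 };

	queue_alloc(&d, 0);	/* admin queue set up at probe */
	queue_alloc(&d, 0);	/* reset path: returns early, no double allocation */
	return 0;
}
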
diff --git a/queue-4.14/nvmet-fc-fix-target-sgl-list-on-large-transfers.patch b/queue-4.14/nvmet-fc-fix-target-sgl-list-on-large-transfers.patch
new file mode 100644 (file)
index 0000000..67bd9e2
--- /dev/null
@@ -0,0 +1,127 @@
+From d082dc1562a2ff0947b214796f12faaa87e816a9 Mon Sep 17 00:00:00 2001
+From: James Smart <jsmart2021@gmail.com>
+Date: Mon, 16 Jul 2018 14:38:14 -0700
+Subject: nvmet-fc: fix target sgl list on large transfers
+
+From: James Smart <jsmart2021@gmail.com>
+
+commit d082dc1562a2ff0947b214796f12faaa87e816a9 upstream.
+
+The existing code that carves up the sg list expected one sg element
+per page, which can be very wrong when an IOMMU remaps multiple
+memory pages to fewer bus addresses. Hitting this error required a
+large io payload (greater than 256k) and a system that maps on a
+per-page basis. It's possible that large ios could still get by fine
+if the system condensed the sgl list into the first 64 elements.
+
+This patch corrects the sg list handling by specifically walking the
+sg list element by element and attempting to divide the transfer up
+on a per-sg element boundary. While doing so, it still tries to keep
+sequences under 256k, but will exceed that rule if a single sg element
+is larger than 256k.
+
+Fixes: 48fa362b6c3f ("nvmet-fc: simplify sg list handling")
+Cc: <stable@vger.kernel.org> # 4.14
+Signed-off-by: James Smart <james.smart@broadcom.com>
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+
+---
+ drivers/nvme/target/fc.c |   44 +++++++++++++++++++++++++++++++++++---------
+ 1 file changed, 35 insertions(+), 9 deletions(-)
+
+--- a/drivers/nvme/target/fc.c
++++ b/drivers/nvme/target/fc.c
+@@ -58,8 +58,8 @@ struct nvmet_fc_ls_iod {
+       struct work_struct              work;
+ } __aligned(sizeof(unsigned long long));
++/* desired maximum for a single sequence - if sg list allows it */
+ #define NVMET_FC_MAX_SEQ_LENGTH               (256 * 1024)
+-#define NVMET_FC_MAX_XFR_SGENTS               (NVMET_FC_MAX_SEQ_LENGTH / PAGE_SIZE)
+ enum nvmet_fcp_datadir {
+       NVMET_FCP_NODATA,
+@@ -74,6 +74,7 @@ struct nvmet_fc_fcp_iod {
+       struct nvme_fc_cmd_iu           cmdiubuf;
+       struct nvme_fc_ersp_iu          rspiubuf;
+       dma_addr_t                      rspdma;
++      struct scatterlist              *next_sg;
+       struct scatterlist              *data_sg;
+       int                             data_sg_cnt;
+       u32                             total_length;
+@@ -1000,8 +1001,7 @@ nvmet_fc_register_targetport(struct nvme
+       INIT_LIST_HEAD(&newrec->assoc_list);
+       kref_init(&newrec->ref);
+       ida_init(&newrec->assoc_cnt);
+-      newrec->max_sg_cnt = min_t(u32, NVMET_FC_MAX_XFR_SGENTS,
+-                                      template->max_sgl_segments);
++      newrec->max_sg_cnt = template->max_sgl_segments;
+       ret = nvmet_fc_alloc_ls_iodlist(newrec);
+       if (ret) {
+@@ -1717,6 +1717,7 @@ nvmet_fc_alloc_tgt_pgs(struct nvmet_fc_f
+                               ((fod->io_dir == NVMET_FCP_WRITE) ?
+                                       DMA_FROM_DEVICE : DMA_TO_DEVICE));
+                               /* note: write from initiator perspective */
++      fod->next_sg = fod->data_sg;
+       return 0;
+@@ -1874,24 +1875,49 @@ nvmet_fc_transfer_fcp_data(struct nvmet_
+                               struct nvmet_fc_fcp_iod *fod, u8 op)
+ {
+       struct nvmefc_tgt_fcp_req *fcpreq = fod->fcpreq;
++      struct scatterlist *sg = fod->next_sg;
+       unsigned long flags;
+-      u32 tlen;
++      u32 remaininglen = fod->total_length - fod->offset;
++      u32 tlen = 0;
+       int ret;
+       fcpreq->op = op;
+       fcpreq->offset = fod->offset;
+       fcpreq->timeout = NVME_FC_TGTOP_TIMEOUT_SEC;
+-      tlen = min_t(u32, tgtport->max_sg_cnt * PAGE_SIZE,
+-                      (fod->total_length - fod->offset));
++      /*
++       * for next sequence:
++       *  break at a sg element boundary
++       *  attempt to keep sequence length capped at
++       *    NVMET_FC_MAX_SEQ_LENGTH but allow sequence to
++       *    be longer if a single sg element is larger
++       *    than that amount. This is done to avoid creating
++       *    a new sg list to use for the tgtport api.
++       */
++      fcpreq->sg = sg;
++      fcpreq->sg_cnt = 0;
++      while (tlen < remaininglen &&
++             fcpreq->sg_cnt < tgtport->max_sg_cnt &&
++             tlen + sg_dma_len(sg) < NVMET_FC_MAX_SEQ_LENGTH) {
++              fcpreq->sg_cnt++;
++              tlen += sg_dma_len(sg);
++              sg = sg_next(sg);
++      }
++      if (tlen < remaininglen && fcpreq->sg_cnt == 0) {
++              fcpreq->sg_cnt++;
++              tlen += min_t(u32, sg_dma_len(sg), remaininglen);
++              sg = sg_next(sg);
++      }
++      if (tlen < remaininglen)
++              fod->next_sg = sg;
++      else
++              fod->next_sg = NULL;
++
+       fcpreq->transfer_length = tlen;
+       fcpreq->transferred_length = 0;
+       fcpreq->fcp_error = 0;
+       fcpreq->rsplen = 0;
+-      fcpreq->sg = &fod->data_sg[fod->offset / PAGE_SIZE];
+-      fcpreq->sg_cnt = DIV_ROUND_UP(tlen, PAGE_SIZE);
+-
+       /*
+        * If the last READDATA request: check if LLDD supports
+        * combined xfr with response.
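
A standalone sketch (plain C, hypothetical names, not the target code) of
the carving logic described in the patch above: accumulate whole sg
elements until the 256k target would be crossed, and let a sequence
exceed the target only when a single element is itself larger than the
cap. The real driver additionally caps the element count at the LLDD's
max_sg_cnt, which is omitted here:

#include <stdio.h>

#define MAX_SEQ_LEN	(256 * 1024)

struct sg_elem { unsigned int len; };

/* Return the byte length of the next sequence and how many elements it
 * consumed; never split an element across sequences. */
static unsigned int next_sequence(const struct sg_elem *sg, int nelems,
				  unsigned int remaining, int *used)
{
	unsigned int tlen = 0;
	int cnt = 0;

	while (cnt < nelems && tlen < remaining &&
	       tlen + sg[cnt].len < MAX_SEQ_LEN) {
		tlen += sg[cnt].len;
		cnt++;
	}
	/* a single element larger than the cap still forms one sequence */
	if (cnt == 0 && nelems > 0 && tlen < remaining) {
		tlen = sg[0].len < remaining ? sg[0].len : remaining;
		cnt = 1;
	}
	*used = cnt;
	return tlen;
}

int main(void)
{
	struct sg_elem sg[] = { { 128 * 1024 }, { 192 * 1024 }, { 64 * 1024 } };
	int used;
	unsigned int len = next_sequence(sg, 3, 384 * 1024, &used);

	printf("next sequence: %u bytes over %d sg element(s)\n", len, used);
	return 0;
}
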
index 17e9d5b9836db15de61c77b80a8a2f4c3f715798..8380ea2b12131f38c4cd8922be58391892b23dec 100644 (file)
@@ -11,3 +11,10 @@ acpi-pci-bail-early-in-acpi_pci_add_bus-if-there-is-no-acpi-handle.patch
 ring_buffer-tracing-inherit-the-tracing-setting-to-next-ring-buffer.patch
 i2c-imx-fix-reinit_completion-use.patch
 btrfs-fix-file-data-corruption-after-cloning-a-range-and-fsync.patch
+nvme-pci-allocate-device-queues-storage-space-at-probe.patch
+nvme-pci-fix-queue-double-allocations.patch
+nvmet-fc-fix-target-sgl-list-on-large-transfers.patch
+intel_idle-graceful-probe-failure-when-mwait-is-disabled.patch
+xfs-catch-inode-allocation-state-mismatch-corruption.patch
+xfs-validate-cached-inodes-are-free-when-allocated.patch
+xfs-don-t-call-xfs_da_shrink_inode-with-null-bp.patch
diff --git a/queue-4.14/xfs-catch-inode-allocation-state-mismatch-corruption.patch b/queue-4.14/xfs-catch-inode-allocation-state-mismatch-corruption.patch
new file mode 100644 (file)
index 0000000..a571375
--- /dev/null
@@ -0,0 +1,184 @@
+From ee457001ed6c6f31ddad69c24c1da8f377d8472d Mon Sep 17 00:00:00 2001
+From: Dave Chinner <dchinner@redhat.com>
+Date: Fri, 23 Mar 2018 10:22:53 -0700
+Subject: xfs: catch inode allocation state mismatch corruption
+
+From: Dave Chinner <dchinner@redhat.com>
+
+commit ee457001ed6c6f31ddad69c24c1da8f377d8472d upstream.
+
+We recently came across a V4 filesystem causing memory corruption
+due to a newly allocated inode being set up twice and added to the
+superblock inode list twice. From code inspection, the only way this
+could happen is if a newly allocated inode was not marked as free on
+disk (i.e. di_mode wasn't zero).
+
+Running the metadump on an upstream debug kernel fails during inode
+allocation like so:
+
+XFS: Assertion failed: ip->i_d.di_nblocks == 0, file: fs/xfs/xfs_inode.c, line: 838
+ ------------[ cut here ]------------
+kernel BUG at fs/xfs/xfs_message.c:114!
+invalid opcode: 0000 [#1] PREEMPT SMP
+CPU: 11 PID: 3496 Comm: mkdir Not tainted 4.16.0-rc5-dgc #442
+Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014
+RIP: 0010:assfail+0x28/0x30
+RSP: 0018:ffffc9000236fc80 EFLAGS: 00010202
+RAX: 00000000ffffffea RBX: 0000000000004000 RCX: 0000000000000000
+RDX: 00000000ffffffc0 RSI: 000000000000000a RDI: ffffffff8227211b
+RBP: ffffc9000236fce8 R08: 0000000000000000 R09: 0000000000000000
+R10: 0000000000000bec R11: f000000000000000 R12: ffffc9000236fd30
+R13: ffff8805c76bab80 R14: ffff8805c77ac800 R15: ffff88083fb12e10
+FS:  00007fac8cbff040(0000) GS:ffff88083fd00000(0000) knlGS:0000000000000000
+CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+CR2: 00007fffa6783ff8 CR3: 00000005c6e2b003 CR4: 00000000000606e0
+Call Trace:
+ xfs_ialloc+0x383/0x570
+ xfs_dir_ialloc+0x6a/0x2a0
+ xfs_create+0x412/0x670
+ xfs_generic_create+0x1f7/0x2c0
+ ? capable_wrt_inode_uidgid+0x3f/0x50
+ vfs_mkdir+0xfb/0x1b0
+ SyS_mkdir+0xcf/0xf0
+ do_syscall_64+0x73/0x1a0
+ entry_SYSCALL_64_after_hwframe+0x42/0xb7
+
+Extracting the inode number we crashed on from an event trace and
+looking at it with xfs_db:
+
+xfs_db> inode 184452204
+xfs_db> p
+core.magic = 0x494e
+core.mode = 0100644
+core.version = 2
+core.format = 2 (extents)
+core.nlinkv2 = 1
+core.onlink = 0
+.....
+
+Confirms that it is not a free inode on disk. xfs_repair
+also trips over this inode:
+
+.....
+zero length extent (off = 0, fsbno = 0) in ino 184452204
+correcting nextents for inode 184452204
+bad attribute fork in inode 184452204, would clear attr fork
+bad nblocks 1 for inode 184452204, would reset to 0
+bad anextents 1 for inode 184452204, would reset to 0
+imap claims in-use inode 184452204 is free, would correct imap
+would have cleared inode 184452204
+.....
+disconnected inode 184452204, would move to lost+found
+
+And so we have a situation where the directory structure and the
+inobt think the inode is free, but the inode on disk thinks it is
+still in use. Where this corruption came from is not possible to
+diagnose, but we can detect it and prevent the kernel from oopsing
+on lookup. The reproducer now results in:
+
+$ sudo mkdir /mnt/scratch/{0,1,2,3,4,5}{0,1,2,3,4,5}
+mkdir: cannot create directory ‘/mnt/scratch/00’: File exists
+mkdir: cannot create directory ‘/mnt/scratch/01’: File exists
+mkdir: cannot create directory ‘/mnt/scratch/03’: Structure needs cleaning
+mkdir: cannot create directory ‘/mnt/scratch/04’: Input/output error
+mkdir: cannot create directory ‘/mnt/scratch/05’: Input/output error
+....
+
+And this corruption shutdown:
+
+[   54.843517] XFS (loop0): Corruption detected! Free inode 0xafe846c not marked free on disk
+[   54.845885] XFS (loop0): Internal error xfs_trans_cancel at line 1023 of file fs/xfs/xfs_trans.c.  Caller xfs_create+0x425/0x670
+[   54.848994] CPU: 10 PID: 3541 Comm: mkdir Not tainted 4.16.0-rc5-dgc #443
+[   54.850753] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014
+[   54.852859] Call Trace:
+[   54.853531]  dump_stack+0x85/0xc5
+[   54.854385]  xfs_trans_cancel+0x197/0x1c0
+[   54.855421]  xfs_create+0x425/0x670
+[   54.856314]  xfs_generic_create+0x1f7/0x2c0
+[   54.857390]  ? capable_wrt_inode_uidgid+0x3f/0x50
+[   54.858586]  vfs_mkdir+0xfb/0x1b0
+[   54.859458]  SyS_mkdir+0xcf/0xf0
+[   54.860254]  do_syscall_64+0x73/0x1a0
+[   54.861193]  entry_SYSCALL_64_after_hwframe+0x42/0xb7
+[   54.862492] RIP: 0033:0x7fb73bddf547
+[   54.863358] RSP: 002b:00007ffdaa553338 EFLAGS: 00000246 ORIG_RAX: 0000000000000053
+[   54.865133] RAX: ffffffffffffffda RBX: 00007ffdaa55449a RCX: 00007fb73bddf547
+[   54.866766] RDX: 0000000000000001 RSI: 00000000000001ff RDI: 00007ffdaa55449a
+[   54.868432] RBP: 00007ffdaa55449a R08: 00000000000001ff R09: 00005623a8670dd0
+[   54.870110] R10: 00007fb73be72d5b R11: 0000000000000246 R12: 00000000000001ff
+[   54.871752] R13: 00007ffdaa5534b0 R14: 0000000000000000 R15: 00007ffdaa553500
+[   54.873429] XFS (loop0): xfs_do_force_shutdown(0x8) called from line 1024 of file fs/xfs/xfs_trans.c.  Return address = ffffffff814cd050
+[   54.882790] XFS (loop0): Corruption of in-memory data detected.  Shutting down filesystem
+[   54.884597] XFS (loop0): Please umount the filesystem and rectify the problem(s)
+
+Note that this crash is only possible on v4 filesystems, or on v5
+filesystems mounted with the ikeep mount option. For all other v5
+filesystems, this problem cannot occur because we don't read inodes
+we are allocating from disk - we simply overwrite them with the new
+inode information.
+
+Signed-Off-By: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com>
+Tested-by: Carlos Maiolino <cmaiolino@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Cc: Eduardo Valentin <eduval@amazon.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/xfs_icache.c |   23 ++++++++++++++++++++++-
+ 1 file changed, 22 insertions(+), 1 deletion(-)
+
+--- a/fs/xfs/xfs_icache.c
++++ b/fs/xfs/xfs_icache.c
+@@ -475,7 +475,28 @@ xfs_iget_cache_miss(
+       trace_xfs_iget_miss(ip);
+-      if ((VFS_I(ip)->i_mode == 0) && !(flags & XFS_IGET_CREATE)) {
++
++      /*
++       * If we are allocating a new inode, then check what was returned is
++       * actually a free, empty inode. If we are not allocating an inode,
++       * the check we didn't find a free inode.
++       */
++      if (flags & XFS_IGET_CREATE) {
++              if (VFS_I(ip)->i_mode != 0) {
++                      xfs_warn(mp,
++"Corruption detected! Free inode 0x%llx not marked free on disk",
++                              ino);
++                      error = -EFSCORRUPTED;
++                      goto out_destroy;
++              }
++              if (ip->i_d.di_nblocks != 0) {
++                      xfs_warn(mp,
++"Corruption detected! Free inode 0x%llx has blocks allocated!",
++                              ino);
++                      error = -EFSCORRUPTED;
++                      goto out_destroy;
++              }
++      } else if (VFS_I(ip)->i_mode == 0) {
+               error = -ENOENT;
+               goto out_destroy;
+       }
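
A condensed standalone sketch (plain C, hypothetical names) of the
allocation-state cross-check the patch above introduces: an inode handed
out for allocation must look free on disk (mode 0, no blocks), while an
inode found by a plain lookup must not look free. The real code warns
separately for the mode and nblocks cases; EFSCORRUPTED below stands in
for the kernel's internal error code, which maps to EUCLEAN:

#include <stdio.h>

#define SK_ENOENT		2	/* lookup raced with unlink */
#define SK_EFSCORRUPTED		117	/* kernel's EFSCORRUPTED (EUCLEAN) */
#define SK_IGET_CREATE		0x1

static int check_free_state(unsigned int i_mode, unsigned long long nblocks,
			    int flags)
{
	if (flags & SK_IGET_CREATE) {
		/* allocating: the inode must genuinely be free and empty */
		if (i_mode != 0 || nblocks != 0)
			return -SK_EFSCORRUPTED;
		return 0;
	}
	/* plain lookup: a free inode means the lookup raced with unlink */
	if (i_mode == 0)
		return -SK_ENOENT;
	return 0;
}

int main(void)
{
	/* the corrupt case from the commit message: an inode the inobt
	 * claims is free, but whose on-disk mode is 0100644 */
	printf("%d\n", check_free_state(0100644, 0, SK_IGET_CREATE));
	return 0;
}
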
diff --git a/queue-4.14/xfs-don-t-call-xfs_da_shrink_inode-with-null-bp.patch b/queue-4.14/xfs-don-t-call-xfs_da_shrink_inode-with-null-bp.patch
new file mode 100644 (file)
index 0000000..af1e6d6
--- /dev/null
@@ -0,0 +1,45 @@
+From bb3d48dcf86a97dc25fe9fc2c11938e19cb4399a Mon Sep 17 00:00:00 2001
+From: Eric Sandeen <sandeen@sandeen.net>
+Date: Fri, 8 Jun 2018 09:53:49 -0700
+Subject: xfs: don't call xfs_da_shrink_inode with NULL bp
+
+From: Eric Sandeen <sandeen@sandeen.net>
+
+commit bb3d48dcf86a97dc25fe9fc2c11938e19cb4399a upstream.
+
+xfs_attr3_leaf_create may have errored out before instantiating a buffer,
+for example if the blkno is out of range.  In that case there is no work
+to do to remove it, and in fact xfs_da_shrink_inode will lead to an oops
+if we try.
+
+This also seems to fix a flaw where the original error from
+xfs_attr3_leaf_create gets overwritten in the cleanup case, and it
+removes a pointless assignment to bp which isn't used after this.
+
+Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=199969
+Reported-by: Xu, Wen <wen.xu@gatech.edu>
+Tested-by: Xu, Wen <wen.xu@gatech.edu>
+Signed-off-by: Eric Sandeen <sandeen@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Cc: Eduardo Valentin <eduval@amazon.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/libxfs/xfs_attr_leaf.c |    5 ++---
+ 1 file changed, 2 insertions(+), 3 deletions(-)
+
+--- a/fs/xfs/libxfs/xfs_attr_leaf.c
++++ b/fs/xfs/libxfs/xfs_attr_leaf.c
+@@ -785,9 +785,8 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t
+       ASSERT(blkno == 0);
+       error = xfs_attr3_leaf_create(args, blkno, &bp);
+       if (error) {
+-              error = xfs_da_shrink_inode(args, 0, bp);
+-              bp = NULL;
+-              if (error)
++              /* xfs_attr3_leaf_create may not have instantiated a block */
++              if (bp && (xfs_da_shrink_inode(args, 0, bp) != 0))
+                       goto out;
+               xfs_idata_realloc(dp, size, XFS_ATTR_FORK);     /* try to put */
+               memcpy(ifp->if_u1.if_data, tmpbuffer, size);    /* it back */
diff --git a/queue-4.14/xfs-validate-cached-inodes-are-free-when-allocated.patch b/queue-4.14/xfs-validate-cached-inodes-are-free-when-allocated.patch
new file mode 100644 (file)
index 0000000..1c7638d
--- /dev/null
@@ -0,0 +1,157 @@
+From afca6c5b2595fc44383919fba740c194b0b76aff Mon Sep 17 00:00:00 2001
+From: Dave Chinner <dchinner@redhat.com>
+Date: Tue, 17 Apr 2018 17:17:34 -0700
+Subject: xfs: validate cached inodes are free when allocated
+
+From: Dave Chinner <dchinner@redhat.com>
+
+commit afca6c5b2595fc44383919fba740c194b0b76aff upstream.
+
+A recent fuzzed filesystem image caused random dcache corruption
+when the reproducer was run. This often showed up as panics in
+lookup_slow() on a null inode->i_ops pointer when doing pathwalks.
+
+BUG: unable to handle kernel NULL pointer dereference at 0000000000000000
+....
+Call Trace:
+ lookup_slow+0x44/0x60
+ walk_component+0x3dd/0x9f0
+ link_path_walk+0x4a7/0x830
+ path_lookupat+0xc1/0x470
+ filename_lookup+0x129/0x270
+ user_path_at_empty+0x36/0x40
+ path_listxattr+0x98/0x110
+ SyS_listxattr+0x13/0x20
+ do_syscall_64+0xf5/0x280
+ entry_SYSCALL_64_after_hwframe+0x42/0xb7
+
+but had many different failure modes including deadlocks trying to
+lock the inode that was just allocated or KASAN reports of
+use-after-free violations.
+
+The cause of the problem was a corrupt INOBT on a v4 fs where the
+root inode was marked as free in the inobt record. Hence when we
+allocated an inode, it chose the root inode to allocate, found it in
+the cache and re-initialised it.
+
+We recently fixed a similar inode allocation issue caused by inobt
+record corruption in xfs_iget_cache_miss() in commit
+ee457001ed6c ("xfs: catch inode allocation state mismatch
+corruption"). This change adds similar checks to the cache-hit path
+to catch it, and turns the reproducer into a corruption shutdown
+situation.
+
+Reported-by: Wen Xu <wen.xu@gatech.edu>
+Signed-Off-By: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+[darrick: fix typos in comment]
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Cc: Eduardo Valentin <eduval@amazon.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/xfs_icache.c |   73 ++++++++++++++++++++++++++++++++++------------------
+ 1 file changed, 48 insertions(+), 25 deletions(-)
+
+--- a/fs/xfs/xfs_icache.c
++++ b/fs/xfs/xfs_icache.c
+@@ -306,6 +306,46 @@ xfs_reinit_inode(
+ }
+ /*
++ * If we are allocating a new inode, then check what was returned is
++ * actually a free, empty inode. If we are not allocating an inode,
++ * then check we didn't find a free inode.
++ *
++ * Returns:
++ *    0               if the inode free state matches the lookup context
++ *    -ENOENT         if the inode is free and we are not allocating
++ *    -EFSCORRUPTED   if there is any state mismatch at all
++ */
++static int
++xfs_iget_check_free_state(
++      struct xfs_inode        *ip,
++      int                     flags)
++{
++      if (flags & XFS_IGET_CREATE) {
++              /* should be a free inode */
++              if (VFS_I(ip)->i_mode != 0) {
++                      xfs_warn(ip->i_mount,
++"Corruption detected! Free inode 0x%llx not marked free! (mode 0x%x)",
++                              ip->i_ino, VFS_I(ip)->i_mode);
++                      return -EFSCORRUPTED;
++              }
++
++              if (ip->i_d.di_nblocks != 0) {
++                      xfs_warn(ip->i_mount,
++"Corruption detected! Free inode 0x%llx has blocks allocated!",
++                              ip->i_ino);
++                      return -EFSCORRUPTED;
++              }
++              return 0;
++      }
++
++      /* should be an allocated inode */
++      if (VFS_I(ip)->i_mode == 0)
++              return -ENOENT;
++
++      return 0;
++}
++
++/*
+  * Check the validity of the inode we just found it the cache
+  */
+ static int
+@@ -354,12 +394,12 @@ xfs_iget_cache_hit(
+       }
+       /*
+-       * If lookup is racing with unlink return an error immediately.
++       * Check the inode free state is valid. This also detects lookup
++       * racing with unlinks.
+        */
+-      if (VFS_I(ip)->i_mode == 0 && !(flags & XFS_IGET_CREATE)) {
+-              error = -ENOENT;
++      error = xfs_iget_check_free_state(ip, flags);
++      if (error)
+               goto out_error;
+-      }
+       /*
+        * If IRECLAIMABLE is set, we've torn down the VFS inode already.
+@@ -477,29 +517,12 @@ xfs_iget_cache_miss(
+       /*
+-       * If we are allocating a new inode, then check what was returned is
+-       * actually a free, empty inode. If we are not allocating an inode,
+-       * the check we didn't find a free inode.
++       * Check the inode free state is valid. This also detects lookup
++       * racing with unlinks.
+        */
+-      if (flags & XFS_IGET_CREATE) {
+-              if (VFS_I(ip)->i_mode != 0) {
+-                      xfs_warn(mp,
+-"Corruption detected! Free inode 0x%llx not marked free on disk",
+-                              ino);
+-                      error = -EFSCORRUPTED;
+-                      goto out_destroy;
+-              }
+-              if (ip->i_d.di_nblocks != 0) {
+-                      xfs_warn(mp,
+-"Corruption detected! Free inode 0x%llx has blocks allocated!",
+-                              ino);
+-                      error = -EFSCORRUPTED;
+-                      goto out_destroy;
+-              }
+-      } else if (VFS_I(ip)->i_mode == 0) {
+-              error = -ENOENT;
++      error = xfs_iget_check_free_state(ip, flags);
++      if (error)
+               goto out_destroy;
+-      }
+       /*
+        * Preload the radix tree so we can insert safely under the