--- /dev/null
+From a4c447533a18ee86e07232d6344ba12b1f9c5077 Mon Sep 17 00:00:00 2001
+From: Len Brown <len.brown@intel.com>
+Date: Thu, 9 Nov 2017 02:19:39 -0500
+Subject: intel_idle: Graceful probe failure when MWAIT is disabled
+
+From: Len Brown <len.brown@intel.com>
+
+commit a4c447533a18ee86e07232d6344ba12b1f9c5077 upstream.
+
+When MWAIT is disabled, intel_idle refuses to probe.
+But it may mis-lead the user by blaming this on the model number:
+
+intel_idle: does not run on family 6 modesl 79
+
+So defer the check for MWAIT until after the model# white-list check succeeds,
+and if the MWAIT check fails, tell the user how to fix it:
+
+intel_idle: Please enable MWAIT in BIOS SETUP
+
+Signed-off-by: Len Brown <len.brown@intel.com>
+Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Cc: Eduardo Valentin <eduval@amazon.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/idle/intel_idle.c | 7 ++++++-
+ 1 file changed, 6 insertions(+), 1 deletion(-)
+
+--- a/drivers/idle/intel_idle.c
++++ b/drivers/idle/intel_idle.c
+@@ -1061,7 +1061,7 @@ static const struct idle_cpu idle_cpu_dn
+ };
+
+ #define ICPU(model, cpu) \
+- { X86_VENDOR_INTEL, 6, model, X86_FEATURE_MWAIT, (unsigned long)&cpu }
++ { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, (unsigned long)&cpu }
+
+ static const struct x86_cpu_id intel_idle_ids[] __initconst = {
+ ICPU(INTEL_FAM6_NEHALEM_EP, idle_cpu_nehalem),
+@@ -1125,6 +1125,11 @@ static int __init intel_idle_probe(void)
+ return -ENODEV;
+ }
+
++ if (!boot_cpu_has(X86_FEATURE_MWAIT)) {
++ pr_debug("Please enable MWAIT in BIOS SETUP\n");
++ return -ENODEV;
++ }
++
+ if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
+ return -ENODEV;
+
--- /dev/null
+From 147b27e4bd08406a6abebedbb478b431ec197be1 Mon Sep 17 00:00:00 2001
+From: Sagi Grimberg <sagi@grimberg.me>
+Date: Sun, 14 Jan 2018 12:39:01 +0200
+Subject: nvme-pci: allocate device queues storage space at probe
+
+From: Sagi Grimberg <sagi@grimberg.me>
+
+commit 147b27e4bd08406a6abebedbb478b431ec197be1 upstream.
+
+It may cause race by setting 'nvmeq' in nvme_init_request()
+because .init_request is called inside switching io scheduler, which
+may happen when the NVMe device is being reset and its nvme queues
+are being freed and created. We don't have any sync between the two
+paths.
+
+This patch changes the nvmeq allocation to occur at probe time so
+there is no way we can dereference it at init_request.
+
+[ 93.268391] kernel BUG at drivers/nvme/host/pci.c:408!
+[ 93.274146] invalid opcode: 0000 [#1] SMP
+[ 93.278618] Modules linked in: nfsv3 nfs_acl rpcsec_gss_krb5 auth_rpcgss
+nfsv4 dns_resolver nfs lockd grace fscache sunrpc ipmi_ssif vfat fat
+intel_rapl sb_edac x86_pkg_temp_thermal intel_powerclamp coretemp kvm_intel
+kvm irqbypass crct10dif_pclmul crc32_pclmul ghash_clmulni_intel iTCO_wdt
+intel_cstate ipmi_si iTCO_vendor_support intel_uncore mxm_wmi mei_me
+ipmi_devintf intel_rapl_perf pcspkr sg ipmi_msghandler lpc_ich dcdbas mei
+shpchp acpi_power_meter wmi dm_multipath ip_tables xfs libcrc32c sd_mod
+mgag200 i2c_algo_bit drm_kms_helper syscopyarea sysfillrect sysimgblt
+fb_sys_fops ttm drm ahci libahci nvme libata crc32c_intel nvme_core tg3
+megaraid_sas ptp i2c_core pps_core dm_mirror dm_region_hash dm_log dm_mod
+[ 93.349071] CPU: 5 PID: 1842 Comm: sh Not tainted 4.15.0-rc2.ming+ #4
+[ 93.356256] Hardware name: Dell Inc. PowerEdge R730xd/072T6D, BIOS 2.5.5 08/16/2017
+[ 93.364801] task: 00000000fb8abf2a task.stack: 0000000028bd82d1
+[ 93.371408] RIP: 0010:nvme_init_request+0x36/0x40 [nvme]
+[ 93.377333] RSP: 0018:ffffc90002537ca8 EFLAGS: 00010246
+[ 93.383161] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000008
+[ 93.391122] RDX: 0000000000000000 RSI: ffff880276ae0000 RDI: ffff88047bae9008
+[ 93.399084] RBP: ffff88047bae9008 R08: ffff88047bae9008 R09: 0000000009dabc00
+[ 93.407045] R10: 0000000000000004 R11: 000000000000299c R12: ffff880186bc1f00
+[ 93.415007] R13: ffff880276ae0000 R14: 0000000000000000 R15: 0000000000000071
+[ 93.422969] FS: 00007f33cf288740(0000) GS:ffff88047ba80000(0000) knlGS:0000000000000000
+[ 93.431996] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+[ 93.438407] CR2: 00007f33cf28e000 CR3: 000000047e5bb006 CR4: 00000000001606e0
+[ 93.446368] Call Trace:
+[ 93.449103] blk_mq_alloc_rqs+0x231/0x2a0
+[ 93.453579] blk_mq_sched_alloc_tags.isra.8+0x42/0x80
+[ 93.459214] blk_mq_init_sched+0x7e/0x140
+[ 93.463687] elevator_switch+0x5a/0x1f0
+[ 93.467966] ? elevator_get.isra.17+0x52/0xc0
+[ 93.472826] elv_iosched_store+0xde/0x150
+[ 93.477299] queue_attr_store+0x4e/0x90
+[ 93.481580] kernfs_fop_write+0xfa/0x180
+[ 93.485958] __vfs_write+0x33/0x170
+[ 93.489851] ? __inode_security_revalidate+0x4c/0x60
+[ 93.495390] ? selinux_file_permission+0xda/0x130
+[ 93.500641] ? _cond_resched+0x15/0x30
+[ 93.504815] vfs_write+0xad/0x1a0
+[ 93.508512] SyS_write+0x52/0xc0
+[ 93.512113] do_syscall_64+0x61/0x1a0
+[ 93.516199] entry_SYSCALL64_slow_path+0x25/0x25
+[ 93.521351] RIP: 0033:0x7f33ce96aab0
+[ 93.525337] RSP: 002b:00007ffe57570238 EFLAGS: 00000246 ORIG_RAX: 0000000000000001
+[ 93.533785] RAX: ffffffffffffffda RBX: 0000000000000006 RCX: 00007f33ce96aab0
+[ 93.541746] RDX: 0000000000000006 RSI: 00007f33cf28e000 RDI: 0000000000000001
+[ 93.549707] RBP: 00007f33cf28e000 R08: 000000000000000a R09: 00007f33cf288740
+[ 93.557669] R10: 00007f33cf288740 R11: 0000000000000246 R12: 00007f33cec42400
+[ 93.565630] R13: 0000000000000006 R14: 0000000000000001 R15: 0000000000000000
+[ 93.573592] Code: 4c 8d 40 08 4c 39 c7 74 16 48 8b 00 48 8b 04 08 48 85 c0
+74 16 48 89 86 78 01 00 00 31 c0 c3 8d 4a 01 48 63 c9 48 c1 e1 03 eb de <0f>
+0b 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 48 85 f6 53 48 89
+[ 93.594676] RIP: nvme_init_request+0x36/0x40 [nvme] RSP: ffffc90002537ca8
+[ 93.602273] ---[ end trace 810dde3993e5f14e ]---
+
+Reported-by: Yi Zhang <yi.zhang@redhat.com>
+Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Jon Derrick <jonathan.derrick@intel.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/nvme/host/pci.c | 61 +++++++++++++++++++-----------------------------
+ 1 file changed, 25 insertions(+), 36 deletions(-)
+
+--- a/drivers/nvme/host/pci.c
++++ b/drivers/nvme/host/pci.c
+@@ -77,7 +77,7 @@ static void nvme_dev_disable(struct nvme
+ * Represents an NVM Express device. Each nvme_dev is a PCI function.
+ */
+ struct nvme_dev {
+- struct nvme_queue **queues;
++ struct nvme_queue *queues;
+ struct blk_mq_tag_set tagset;
+ struct blk_mq_tag_set admin_tagset;
+ u32 __iomem *dbs;
+@@ -348,7 +348,7 @@ static int nvme_admin_init_hctx(struct b
+ unsigned int hctx_idx)
+ {
+ struct nvme_dev *dev = data;
+- struct nvme_queue *nvmeq = dev->queues[0];
++ struct nvme_queue *nvmeq = &dev->queues[0];
+
+ WARN_ON(hctx_idx != 0);
+ WARN_ON(dev->admin_tagset.tags[0] != hctx->tags);
+@@ -370,7 +370,7 @@ static int nvme_init_hctx(struct blk_mq_
+ unsigned int hctx_idx)
+ {
+ struct nvme_dev *dev = data;
+- struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1];
++ struct nvme_queue *nvmeq = &dev->queues[hctx_idx + 1];
+
+ if (!nvmeq->tags)
+ nvmeq->tags = &dev->tagset.tags[hctx_idx];
+@@ -386,7 +386,7 @@ static int nvme_init_request(struct blk_
+ struct nvme_dev *dev = set->driver_data;
+ struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+ int queue_idx = (set == &dev->tagset) ? hctx_idx + 1 : 0;
+- struct nvme_queue *nvmeq = dev->queues[queue_idx];
++ struct nvme_queue *nvmeq = &dev->queues[queue_idx];
+
+ BUG_ON(!nvmeq);
+ iod->nvmeq = nvmeq;
+@@ -900,7 +900,7 @@ static int nvme_poll(struct blk_mq_hw_ct
+ static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl, int aer_idx)
+ {
+ struct nvme_dev *dev = to_nvme_dev(ctrl);
+- struct nvme_queue *nvmeq = dev->queues[0];
++ struct nvme_queue *nvmeq = &dev->queues[0];
+ struct nvme_command c;
+
+ memset(&c, 0, sizeof(c));
+@@ -1146,7 +1146,6 @@ static void nvme_free_queue(struct nvme_
+ if (nvmeq->sq_cmds)
+ dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
+ nvmeq->sq_cmds, nvmeq->sq_dma_addr);
+- kfree(nvmeq);
+ }
+
+ static void nvme_free_queues(struct nvme_dev *dev, int lowest)
+@@ -1154,10 +1153,8 @@ static void nvme_free_queues(struct nvme
+ int i;
+
+ for (i = dev->ctrl.queue_count - 1; i >= lowest; i--) {
+- struct nvme_queue *nvmeq = dev->queues[i];
+ dev->ctrl.queue_count--;
+- dev->queues[i] = NULL;
+- nvme_free_queue(nvmeq);
++ nvme_free_queue(&dev->queues[i]);
+ }
+ }
+
+@@ -1189,10 +1186,8 @@ static int nvme_suspend_queue(struct nvm
+
+ static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown)
+ {
+- struct nvme_queue *nvmeq = dev->queues[0];
++ struct nvme_queue *nvmeq = &dev->queues[0];
+
+- if (!nvmeq)
+- return;
+ if (nvme_suspend_queue(nvmeq))
+ return;
+
+@@ -1246,13 +1241,10 @@ static int nvme_alloc_sq_cmds(struct nvm
+ return 0;
+ }
+
+-static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
+- int depth, int node)
++static int nvme_alloc_queue(struct nvme_dev *dev, int qid,
++ int depth, int node)
+ {
+- struct nvme_queue *nvmeq = kzalloc_node(sizeof(*nvmeq), GFP_KERNEL,
+- node);
+- if (!nvmeq)
+- return NULL;
++ struct nvme_queue *nvmeq = &dev->queues[qid];
+
+ nvmeq->cqes = dma_zalloc_coherent(dev->dev, CQ_SIZE(depth),
+ &nvmeq->cq_dma_addr, GFP_KERNEL);
+@@ -1271,17 +1263,15 @@ static struct nvme_queue *nvme_alloc_que
+ nvmeq->q_depth = depth;
+ nvmeq->qid = qid;
+ nvmeq->cq_vector = -1;
+- dev->queues[qid] = nvmeq;
+ dev->ctrl.queue_count++;
+
+- return nvmeq;
++ return 0;
+
+ free_cqdma:
+ dma_free_coherent(dev->dev, CQ_SIZE(depth), (void *)nvmeq->cqes,
+ nvmeq->cq_dma_addr);
+ free_nvmeq:
+- kfree(nvmeq);
+- return NULL;
++ return -ENOMEM;
+ }
+
+ static int queue_request_irq(struct nvme_queue *nvmeq)
+@@ -1468,14 +1458,12 @@ static int nvme_pci_configure_admin_queu
+ if (result < 0)
+ return result;
+
+- nvmeq = dev->queues[0];
+- if (!nvmeq) {
+- nvmeq = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH,
+- dev_to_node(dev->dev));
+- if (!nvmeq)
+- return -ENOMEM;
+- }
++ result = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH,
++ dev_to_node(dev->dev));
++ if (result)
++ return result;
+
++ nvmeq = &dev->queues[0];
+ aqa = nvmeq->q_depth - 1;
+ aqa |= aqa << 16;
+
+@@ -1505,7 +1493,7 @@ static int nvme_create_io_queues(struct
+
+ for (i = dev->ctrl.queue_count; i <= dev->max_qid; i++) {
+ /* vector == qid - 1, match nvme_create_queue */
+- if (!nvme_alloc_queue(dev, i, dev->q_depth,
++ if (nvme_alloc_queue(dev, i, dev->q_depth,
+ pci_irq_get_node(to_pci_dev(dev->dev), i - 1))) {
+ ret = -ENOMEM;
+ break;
+@@ -1514,7 +1502,7 @@ static int nvme_create_io_queues(struct
+
+ max = min(dev->max_qid, dev->ctrl.queue_count - 1);
+ for (i = dev->online_queues; i <= max; i++) {
+- ret = nvme_create_queue(dev->queues[i], i);
++ ret = nvme_create_queue(&dev->queues[i], i);
+ if (ret)
+ break;
+ }
+@@ -1770,7 +1758,7 @@ static int nvme_setup_host_mem(struct nv
+
+ static int nvme_setup_io_queues(struct nvme_dev *dev)
+ {
+- struct nvme_queue *adminq = dev->queues[0];
++ struct nvme_queue *adminq = &dev->queues[0];
+ struct pci_dev *pdev = to_pci_dev(dev->dev);
+ int result, nr_io_queues;
+ unsigned long size;
+@@ -1896,7 +1884,7 @@ static void nvme_disable_io_queues(struc
+ retry:
+ timeout = ADMIN_TIMEOUT;
+ for (; i > 0; i--, sent++)
+- if (nvme_delete_queue(dev->queues[i], opcode))
++ if (nvme_delete_queue(&dev->queues[i], opcode))
+ break;
+
+ while (sent--) {
+@@ -2081,7 +2069,7 @@ static void nvme_dev_disable(struct nvme
+
+ queues = dev->online_queues - 1;
+ for (i = dev->ctrl.queue_count - 1; i > 0; i--)
+- nvme_suspend_queue(dev->queues[i]);
++ nvme_suspend_queue(&dev->queues[i]);
+
+ if (dead) {
+ /* A device might become IO incapable very soon during
+@@ -2089,7 +2077,7 @@ static void nvme_dev_disable(struct nvme
+ * queue_count can be 0 here.
+ */
+ if (dev->ctrl.queue_count)
+- nvme_suspend_queue(dev->queues[0]);
++ nvme_suspend_queue(&dev->queues[0]);
+ } else {
+ nvme_disable_io_queues(dev, queues);
+ nvme_disable_admin_queue(dev, shutdown);
+@@ -2345,7 +2333,8 @@ static int nvme_probe(struct pci_dev *pd
+ dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node);
+ if (!dev)
+ return -ENOMEM;
+- dev->queues = kzalloc_node((num_possible_cpus() + 1) * sizeof(void *),
++
++ dev->queues = kzalloc_node((num_possible_cpus() + 1) * sizeof(struct nvme_queue),
+ GFP_KERNEL, node);
+ if (!dev->queues)
+ goto free;
--- /dev/null
+From 62314e405fa101dbb82563394f9dfc225e3f1167 Mon Sep 17 00:00:00 2001
+From: Keith Busch <keith.busch@intel.com>
+Date: Tue, 23 Jan 2018 09:16:19 -0700
+Subject: nvme-pci: Fix queue double allocations
+
+From: Keith Busch <keith.busch@intel.com>
+
+commit 62314e405fa101dbb82563394f9dfc225e3f1167 upstream.
+
+The queue count says the highest queue that's been allocated, so don't
+reallocate a queue lower than that.
+
+Fixes: 147b27e4bd0 ("nvme-pci: allocate device queues storage space at probe")
+Signed-off-by: Keith Busch <keith.busch@intel.com>
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Jon Derrick <jonathan.derrick@intel.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/nvme/host/pci.c | 3 +++
+ 1 file changed, 3 insertions(+)
+
+--- a/drivers/nvme/host/pci.c
++++ b/drivers/nvme/host/pci.c
+@@ -1246,6 +1246,9 @@ static int nvme_alloc_queue(struct nvme_
+ {
+ struct nvme_queue *nvmeq = &dev->queues[qid];
+
++ if (dev->ctrl.queue_count > qid)
++ return 0;
++
+ nvmeq->cqes = dma_zalloc_coherent(dev->dev, CQ_SIZE(depth),
+ &nvmeq->cq_dma_addr, GFP_KERNEL);
+ if (!nvmeq->cqes)
--- /dev/null
+From d082dc1562a2ff0947b214796f12faaa87e816a9 Mon Sep 17 00:00:00 2001
+From: James Smart <jsmart2021@gmail.com>
+Date: Mon, 16 Jul 2018 14:38:14 -0700
+Subject: nvmet-fc: fix target sgl list on large transfers
+
+From: James Smart <jsmart2021@gmail.com>
+
+commit d082dc1562a2ff0947b214796f12faaa87e816a9 upstream.
+
+The existing code to carve up the sg list expected an sg element-per-page
+which can be very incorrect with iommu's remapping multiple memory pages
+to fewer bus addresses. To hit this error required a large io payload
+(greater than 256k) and a system that maps on a per-page basis. It's
+possible that large ios could get by fine if the system condensed the
+sgl list into the first 64 elements.
+
+This patch corrects the sg list handling by specifically walking the
+sg list element by element and attempting to divide the transfer up
+on a per-sg element boundary. While doing so, it still tries to keep
+sequences under 256k, but will exceed that rule if a single sg element
+is larger than 256k.
+
+Fixes: 48fa362b6c3f ("nvmet-fc: simplify sg list handling")
+Cc: <stable@vger.kernel.org> # 4.14
+Signed-off-by: James Smart <james.smart@broadcom.com>
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+
+---
+ drivers/nvme/target/fc.c | 44 +++++++++++++++++++++++++++++++++++---------
+ 1 file changed, 35 insertions(+), 9 deletions(-)
+
+--- a/drivers/nvme/target/fc.c
++++ b/drivers/nvme/target/fc.c
+@@ -58,8 +58,8 @@ struct nvmet_fc_ls_iod {
+ struct work_struct work;
+ } __aligned(sizeof(unsigned long long));
+
++/* desired maximum for a single sequence - if sg list allows it */
+ #define NVMET_FC_MAX_SEQ_LENGTH (256 * 1024)
+-#define NVMET_FC_MAX_XFR_SGENTS (NVMET_FC_MAX_SEQ_LENGTH / PAGE_SIZE)
+
+ enum nvmet_fcp_datadir {
+ NVMET_FCP_NODATA,
+@@ -74,6 +74,7 @@ struct nvmet_fc_fcp_iod {
+ struct nvme_fc_cmd_iu cmdiubuf;
+ struct nvme_fc_ersp_iu rspiubuf;
+ dma_addr_t rspdma;
++ struct scatterlist *next_sg;
+ struct scatterlist *data_sg;
+ int data_sg_cnt;
+ u32 total_length;
+@@ -1000,8 +1001,7 @@ nvmet_fc_register_targetport(struct nvme
+ INIT_LIST_HEAD(&newrec->assoc_list);
+ kref_init(&newrec->ref);
+ ida_init(&newrec->assoc_cnt);
+- newrec->max_sg_cnt = min_t(u32, NVMET_FC_MAX_XFR_SGENTS,
+- template->max_sgl_segments);
++ newrec->max_sg_cnt = template->max_sgl_segments;
+
+ ret = nvmet_fc_alloc_ls_iodlist(newrec);
+ if (ret) {
+@@ -1717,6 +1717,7 @@ nvmet_fc_alloc_tgt_pgs(struct nvmet_fc_f
+ ((fod->io_dir == NVMET_FCP_WRITE) ?
+ DMA_FROM_DEVICE : DMA_TO_DEVICE));
+ /* note: write from initiator perspective */
++ fod->next_sg = fod->data_sg;
+
+ return 0;
+
+@@ -1874,24 +1875,49 @@ nvmet_fc_transfer_fcp_data(struct nvmet_
+ struct nvmet_fc_fcp_iod *fod, u8 op)
+ {
+ struct nvmefc_tgt_fcp_req *fcpreq = fod->fcpreq;
++ struct scatterlist *sg = fod->next_sg;
+ unsigned long flags;
+- u32 tlen;
++ u32 remaininglen = fod->total_length - fod->offset;
++ u32 tlen = 0;
+ int ret;
+
+ fcpreq->op = op;
+ fcpreq->offset = fod->offset;
+ fcpreq->timeout = NVME_FC_TGTOP_TIMEOUT_SEC;
+
+- tlen = min_t(u32, tgtport->max_sg_cnt * PAGE_SIZE,
+- (fod->total_length - fod->offset));
++ /*
++ * for next sequence:
++ * break at a sg element boundary
++ * attempt to keep sequence length capped at
++ * NVMET_FC_MAX_SEQ_LENGTH but allow sequence to
++ * be longer if a single sg element is larger
++ * than that amount. This is done to avoid creating
++ * a new sg list to use for the tgtport api.
++ */
++ fcpreq->sg = sg;
++ fcpreq->sg_cnt = 0;
++ while (tlen < remaininglen &&
++ fcpreq->sg_cnt < tgtport->max_sg_cnt &&
++ tlen + sg_dma_len(sg) < NVMET_FC_MAX_SEQ_LENGTH) {
++ fcpreq->sg_cnt++;
++ tlen += sg_dma_len(sg);
++ sg = sg_next(sg);
++ }
++ if (tlen < remaininglen && fcpreq->sg_cnt == 0) {
++ fcpreq->sg_cnt++;
++ tlen += min_t(u32, sg_dma_len(sg), remaininglen);
++ sg = sg_next(sg);
++ }
++ if (tlen < remaininglen)
++ fod->next_sg = sg;
++ else
++ fod->next_sg = NULL;
++
+ fcpreq->transfer_length = tlen;
+ fcpreq->transferred_length = 0;
+ fcpreq->fcp_error = 0;
+ fcpreq->rsplen = 0;
+
+- fcpreq->sg = &fod->data_sg[fod->offset / PAGE_SIZE];
+- fcpreq->sg_cnt = DIV_ROUND_UP(tlen, PAGE_SIZE);
+-
+ /*
+ * If the last READDATA request: check if LLDD supports
+ * combined xfr with response.
ring_buffer-tracing-inherit-the-tracing-setting-to-next-ring-buffer.patch
i2c-imx-fix-reinit_completion-use.patch
btrfs-fix-file-data-corruption-after-cloning-a-range-and-fsync.patch
+nvme-pci-allocate-device-queues-storage-space-at-probe.patch
+nvme-pci-fix-queue-double-allocations.patch
+nvmet-fc-fix-target-sgl-list-on-large-transfers.patch
+intel_idle-graceful-probe-failure-when-mwait-is-disabled.patch
+xfs-catch-inode-allocation-state-mismatch-corruption.patch
+xfs-validate-cached-inodes-are-free-when-allocated.patch
+xfs-don-t-call-xfs_da_shrink_inode-with-null-bp.patch
--- /dev/null
+From ee457001ed6c6f31ddad69c24c1da8f377d8472d Mon Sep 17 00:00:00 2001
+From: Dave Chinner <dchinner@redhat.com>
+Date: Fri, 23 Mar 2018 10:22:53 -0700
+Subject: xfs: catch inode allocation state mismatch corruption
+
+From: Dave Chinner <dchinner@redhat.com>
+
+commit ee457001ed6c6f31ddad69c24c1da8f377d8472d upstream.
+
+We recently came across a V4 filesystem causing memory corruption
+due to a newly allocated inode being setup twice and being added to
+the superblock inode list twice. From code inspection, the only way
+this could happen is if a newly allocated inode was not marked as
+free on disk (i.e. di_mode wasn't zero).
+
+Running the metadump on an upstream debug kernel fails during inode
+allocation like so:
+
+XFS: Assertion failed: ip->i_d.di_nblocks == 0, file: fs/xfs/xfs_inod=
+e.c, line: 838
+ ------------[ cut here ]------------
+kernel BUG at fs/xfs/xfs_message.c:114!
+invalid opcode: 0000 [#1] PREEMPT SMP
+CPU: 11 PID: 3496 Comm: mkdir Not tainted 4.16.0-rc5-dgc #442
+Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/0=
+1/2014
+RIP: 0010:assfail+0x28/0x30
+RSP: 0018:ffffc9000236fc80 EFLAGS: 00010202
+RAX: 00000000ffffffea RBX: 0000000000004000 RCX: 0000000000000000
+RDX: 00000000ffffffc0 RSI: 000000000000000a RDI: ffffffff8227211b
+RBP: ffffc9000236fce8 R08: 0000000000000000 R09: 0000000000000000
+R10: 0000000000000bec R11: f000000000000000 R12: ffffc9000236fd30
+R13: ffff8805c76bab80 R14: ffff8805c77ac800 R15: ffff88083fb12e10
+FS: 00007fac8cbff040(0000) GS:ffff88083fd00000(0000) knlGS:0000000000000=
+000
+CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+CR2: 00007fffa6783ff8 CR3: 00000005c6e2b003 CR4: 00000000000606e0
+Call Trace:
+ xfs_ialloc+0x383/0x570
+ xfs_dir_ialloc+0x6a/0x2a0
+ xfs_create+0x412/0x670
+ xfs_generic_create+0x1f7/0x2c0
+ ? capable_wrt_inode_uidgid+0x3f/0x50
+ vfs_mkdir+0xfb/0x1b0
+ SyS_mkdir+0xcf/0xf0
+ do_syscall_64+0x73/0x1a0
+ entry_SYSCALL_64_after_hwframe+0x42/0xb7
+
+Extracting the inode number we crashed on from an event trace and
+looking at it with xfs_db:
+
+xfs_db> inode 184452204
+xfs_db> p
+core.magic = 0x494e
+core.mode = 0100644
+core.version = 2
+core.format = 2 (extents)
+core.nlinkv2 = 1
+core.onlink = 0
+.....
+
+Confirms that it is not a free inode on disk. xfs_repair
+also trips over this inode:
+
+.....
+zero length extent (off = 0, fsbno = 0) in ino 184452204
+correcting nextents for inode 184452204
+bad attribute fork in inode 184452204, would clear attr fork
+bad nblocks 1 for inode 184452204, would reset to 0
+bad anextents 1 for inode 184452204, would reset to 0
+imap claims in-use inode 184452204 is free, would correct imap
+would have cleared inode 184452204
+.....
+disconnected inode 184452204, would move to lost+found
+
+And so we have a situation where the directory structure and the
+inobt thinks the inode is free, but the inode on disk thinks it is
+still in use. Where this corruption came from is not possible to
+diagnose, but we can detect it and prevent the kernel from oopsing
+on lookup. The reproducer now results in:
+
+$ sudo mkdir /mnt/scratch/{0,1,2,3,4,5}{0,1,2,3,4,5}
+mkdir: cannot create directory =E2=80=98/mnt/scratch/00=E2=80=99: File ex=
+ists
+mkdir: cannot create directory =E2=80=98/mnt/scratch/01=E2=80=99: File ex=
+ists
+mkdir: cannot create directory =E2=80=98/mnt/scratch/03=E2=80=99: Structu=
+re needs cleaning
+mkdir: cannot create directory =E2=80=98/mnt/scratch/04=E2=80=99: Input/o=
+utput error
+mkdir: cannot create directory =E2=80=98/mnt/scratch/05=E2=80=99: Input/o=
+utput error
+....
+
+And this corruption shutdown:
+
+[ 54.843517] XFS (loop0): Corruption detected! Free inode 0xafe846c not=
+ marked free on disk
+[ 54.845885] XFS (loop0): Internal error xfs_trans_cancel at line 1023 =
+of file fs/xfs/xfs_trans.c. Caller xfs_create+0x425/0x670
+[ 54.848994] CPU: 10 PID: 3541 Comm: mkdir Not tainted 4.16.0-rc5-dgc #=
+443
+[ 54.850753] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIO=
+S 1.10.2-1 04/01/2014
+[ 54.852859] Call Trace:
+[ 54.853531] dump_stack+0x85/0xc5
+[ 54.854385] xfs_trans_cancel+0x197/0x1c0
+[ 54.855421] xfs_create+0x425/0x670
+[ 54.856314] xfs_generic_create+0x1f7/0x2c0
+[ 54.857390] ? capable_wrt_inode_uidgid+0x3f/0x50
+[ 54.858586] vfs_mkdir+0xfb/0x1b0
+[ 54.859458] SyS_mkdir+0xcf/0xf0
+[ 54.860254] do_syscall_64+0x73/0x1a0
+[ 54.861193] entry_SYSCALL_64_after_hwframe+0x42/0xb7
+[ 54.862492] RIP: 0033:0x7fb73bddf547
+[ 54.863358] RSP: 002b:00007ffdaa553338 EFLAGS: 00000246 ORIG_RAX: 0000=
+000000000053
+[ 54.865133] RAX: ffffffffffffffda RBX: 00007ffdaa55449a RCX: 00007fb73=
+bddf547
+[ 54.866766] RDX: 0000000000000001 RSI: 00000000000001ff RDI: 00007ffda=
+a55449a
+[ 54.868432] RBP: 00007ffdaa55449a R08: 00000000000001ff R09: 00005623a=
+8670dd0
+[ 54.870110] R10: 00007fb73be72d5b R11: 0000000000000246 R12: 000000000=
+00001ff
+[ 54.871752] R13: 00007ffdaa5534b0 R14: 0000000000000000 R15: 00007ffda=
+a553500
+[ 54.873429] XFS (loop0): xfs_do_force_shutdown(0x8) called from line 1=
+024 of file fs/xfs/xfs_trans.c. Return address = ffffffff814cd050
+[ 54.882790] XFS (loop0): Corruption of in-memory data detected. Shutt=
+ing down filesystem
+[ 54.884597] XFS (loop0): Please umount the filesystem and rectify the =
+problem(s)
+
+Note that this crash is only possible on v4 filesystems or v5
+filesystems mounted with the ikeep mount option. For all other V5
+filesystems, this problem cannot occur because we don't read inodes
+we are allocating from disk - we simply overwrite them with the new
+inode information.
+
+Signed-Off-By: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com>
+Tested-by: Carlos Maiolino <cmaiolino@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Cc: Eduardo Valentin <eduval@amazon.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/xfs_icache.c | 23 ++++++++++++++++++++++-
+ 1 file changed, 22 insertions(+), 1 deletion(-)
+
+--- a/fs/xfs/xfs_icache.c
++++ b/fs/xfs/xfs_icache.c
+@@ -475,7 +475,28 @@ xfs_iget_cache_miss(
+
+ trace_xfs_iget_miss(ip);
+
+- if ((VFS_I(ip)->i_mode == 0) && !(flags & XFS_IGET_CREATE)) {
++
++ /*
++ * If we are allocating a new inode, then check what was returned is
++ * actually a free, empty inode. If we are not allocating an inode,
++ * the check we didn't find a free inode.
++ */
++ if (flags & XFS_IGET_CREATE) {
++ if (VFS_I(ip)->i_mode != 0) {
++ xfs_warn(mp,
++"Corruption detected! Free inode 0x%llx not marked free on disk",
++ ino);
++ error = -EFSCORRUPTED;
++ goto out_destroy;
++ }
++ if (ip->i_d.di_nblocks != 0) {
++ xfs_warn(mp,
++"Corruption detected! Free inode 0x%llx has blocks allocated!",
++ ino);
++ error = -EFSCORRUPTED;
++ goto out_destroy;
++ }
++ } else if (VFS_I(ip)->i_mode == 0) {
+ error = -ENOENT;
+ goto out_destroy;
+ }
--- /dev/null
+From bb3d48dcf86a97dc25fe9fc2c11938e19cb4399a Mon Sep 17 00:00:00 2001
+From: Eric Sandeen <sandeen@sandeen.net>
+Date: Fri, 8 Jun 2018 09:53:49 -0700
+Subject: xfs: don't call xfs_da_shrink_inode with NULL bp
+
+From: Eric Sandeen <sandeen@sandeen.net>
+
+commit bb3d48dcf86a97dc25fe9fc2c11938e19cb4399a upstream.
+
+xfs_attr3_leaf_create may have errored out before instantiating a buffer,
+for example if the blkno is out of range. In that case there is no work
+to do to remove it, and in fact xfs_da_shrink_inode will lead to an oops
+if we try.
+
+This also seems to fix a flaw where the original error from
+xfs_attr3_leaf_create gets overwritten in the cleanup case, and it
+removes a pointless assignment to bp which isn't used after this.
+
+Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=199969
+Reported-by: Xu, Wen <wen.xu@gatech.edu>
+Tested-by: Xu, Wen <wen.xu@gatech.edu>
+Signed-off-by: Eric Sandeen <sandeen@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Cc: Eduardo Valentin <eduval@amazon.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/libxfs/xfs_attr_leaf.c | 5 ++---
+ 1 file changed, 2 insertions(+), 3 deletions(-)
+
+--- a/fs/xfs/libxfs/xfs_attr_leaf.c
++++ b/fs/xfs/libxfs/xfs_attr_leaf.c
+@@ -785,9 +785,8 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t
+ ASSERT(blkno == 0);
+ error = xfs_attr3_leaf_create(args, blkno, &bp);
+ if (error) {
+- error = xfs_da_shrink_inode(args, 0, bp);
+- bp = NULL;
+- if (error)
++ /* xfs_attr3_leaf_create may not have instantiated a block */
++ if (bp && (xfs_da_shrink_inode(args, 0, bp) != 0))
+ goto out;
+ xfs_idata_realloc(dp, size, XFS_ATTR_FORK); /* try to put */
+ memcpy(ifp->if_u1.if_data, tmpbuffer, size); /* it back */
--- /dev/null
+From afca6c5b2595fc44383919fba740c194b0b76aff Mon Sep 17 00:00:00 2001
+From: Dave Chinner <dchinner@redhat.com>
+Date: Tue, 17 Apr 2018 17:17:34 -0700
+Subject: xfs: validate cached inodes are free when allocated
+
+From: Dave Chinner <dchinner@redhat.com>
+
+commit afca6c5b2595fc44383919fba740c194b0b76aff upstream.
+
+A recent fuzzed filesystem image cached random dcache corruption
+when the reproducer was run. This often showed up as panics in
+lookup_slow() on a null inode->i_ops pointer when doing pathwalks.
+
+BUG: unable to handle kernel NULL pointer dereference at 0000000000000000
+....
+Call Trace:
+ lookup_slow+0x44/0x60
+ walk_component+0x3dd/0x9f0
+ link_path_walk+0x4a7/0x830
+ path_lookupat+0xc1/0x470
+ filename_lookup+0x129/0x270
+ user_path_at_empty+0x36/0x40
+ path_listxattr+0x98/0x110
+ SyS_listxattr+0x13/0x20
+ do_syscall_64+0xf5/0x280
+ entry_SYSCALL_64_after_hwframe+0x42/0xb7
+
+but had many different failure modes including deadlocks trying to
+lock the inode that was just allocated or KASAN reports of
+use-after-free violations.
+
+The cause of the problem was a corrupt INOBT on a v4 fs where the
+root inode was marked as free in the inobt record. Hence when we
+allocated an inode, it chose the root inode to allocate, found it in
+the cache and re-initialised it.
+
+We recently fixed a similar inode allocation issue caused by inobt
+record corruption problem in xfs_iget_cache_miss() in commit
+ee457001ed6c ("xfs: catch inode allocation state mismatch
+corruption"). This change adds similar checks to the cache-hit path
+to catch it, and turns the reproducer into a corruption shutdown
+situation.
+
+Reported-by: Wen Xu <wen.xu@gatech.edu>
+Signed-Off-By: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+[darrick: fix typos in comment]
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Cc: Eduardo Valentin <eduval@amazon.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/xfs_icache.c | 73 ++++++++++++++++++++++++++++++++++------------------
+ 1 file changed, 48 insertions(+), 25 deletions(-)
+
+--- a/fs/xfs/xfs_icache.c
++++ b/fs/xfs/xfs_icache.c
+@@ -306,6 +306,46 @@ xfs_reinit_inode(
+ }
+
+ /*
++ * If we are allocating a new inode, then check what was returned is
++ * actually a free, empty inode. If we are not allocating an inode,
++ * then check we didn't find a free inode.
++ *
++ * Returns:
++ * 0 if the inode free state matches the lookup context
++ * -ENOENT if the inode is free and we are not allocating
++ * -EFSCORRUPTED if there is any state mismatch at all
++ */
++static int
++xfs_iget_check_free_state(
++ struct xfs_inode *ip,
++ int flags)
++{
++ if (flags & XFS_IGET_CREATE) {
++ /* should be a free inode */
++ if (VFS_I(ip)->i_mode != 0) {
++ xfs_warn(ip->i_mount,
++"Corruption detected! Free inode 0x%llx not marked free! (mode 0x%x)",
++ ip->i_ino, VFS_I(ip)->i_mode);
++ return -EFSCORRUPTED;
++ }
++
++ if (ip->i_d.di_nblocks != 0) {
++ xfs_warn(ip->i_mount,
++"Corruption detected! Free inode 0x%llx has blocks allocated!",
++ ip->i_ino);
++ return -EFSCORRUPTED;
++ }
++ return 0;
++ }
++
++ /* should be an allocated inode */
++ if (VFS_I(ip)->i_mode == 0)
++ return -ENOENT;
++
++ return 0;
++}
++
++/*
+ * Check the validity of the inode we just found it the cache
+ */
+ static int
+@@ -354,12 +394,12 @@ xfs_iget_cache_hit(
+ }
+
+ /*
+- * If lookup is racing with unlink return an error immediately.
++ * Check the inode free state is valid. This also detects lookup
++ * racing with unlinks.
+ */
+- if (VFS_I(ip)->i_mode == 0 && !(flags & XFS_IGET_CREATE)) {
+- error = -ENOENT;
++ error = xfs_iget_check_free_state(ip, flags);
++ if (error)
+ goto out_error;
+- }
+
+ /*
+ * If IRECLAIMABLE is set, we've torn down the VFS inode already.
+@@ -477,29 +517,12 @@ xfs_iget_cache_miss(
+
+
+ /*
+- * If we are allocating a new inode, then check what was returned is
+- * actually a free, empty inode. If we are not allocating an inode,
+- * the check we didn't find a free inode.
++ * Check the inode free state is valid. This also detects lookup
++ * racing with unlinks.
+ */
+- if (flags & XFS_IGET_CREATE) {
+- if (VFS_I(ip)->i_mode != 0) {
+- xfs_warn(mp,
+-"Corruption detected! Free inode 0x%llx not marked free on disk",
+- ino);
+- error = -EFSCORRUPTED;
+- goto out_destroy;
+- }
+- if (ip->i_d.di_nblocks != 0) {
+- xfs_warn(mp,
+-"Corruption detected! Free inode 0x%llx has blocks allocated!",
+- ino);
+- error = -EFSCORRUPTED;
+- goto out_destroy;
+- }
+- } else if (VFS_I(ip)->i_mode == 0) {
+- error = -ENOENT;
++ error = xfs_iget_check_free_state(ip, flags);
++ if (error)
+ goto out_destroy;
+- }
+
+ /*
+ * Preload the radix tree so we can insert safely under the