From 159b7eede32ea403f9dfdd66f2f7bc8403377e3c Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Sun, 22 Apr 2018 09:30:31 +0200
Subject: [PATCH] 4.4-stable patches

added patches:
	ext4-don-t-allow-r-w-mounts-if-metadata-blocks-overlap-the-superblock.patch
	vfio-pci-virtualize-maximum-payload-size.patch
	vfio-pci-virtualize-maximum-read-request-size.patch
	vfio-pci-virtualize-pcie-af-flr.patch
---
 ...tadata-blocks-overlap-the-superblock.patch |  57 +++++++
 queue-4.4/series                              |   4 +
 ...-pci-virtualize-maximum-payload-size.patch |  46 ++++++
 ...virtualize-maximum-read-request-size.patch |  80 ++++++++++
 .../vfio-pci-virtualize-pcie-af-flr.patch     | 146 ++++++++++++++++++
 5 files changed, 333 insertions(+)
 create mode 100644 queue-4.4/ext4-don-t-allow-r-w-mounts-if-metadata-blocks-overlap-the-superblock.patch
 create mode 100644 queue-4.4/vfio-pci-virtualize-maximum-payload-size.patch
 create mode 100644 queue-4.4/vfio-pci-virtualize-maximum-read-request-size.patch
 create mode 100644 queue-4.4/vfio-pci-virtualize-pcie-af-flr.patch

diff --git a/queue-4.4/ext4-don-t-allow-r-w-mounts-if-metadata-blocks-overlap-the-superblock.patch b/queue-4.4/ext4-don-t-allow-r-w-mounts-if-metadata-blocks-overlap-the-superblock.patch
new file mode 100644
index 00000000000..7870dd76887
--- /dev/null
+++ b/queue-4.4/ext4-don-t-allow-r-w-mounts-if-metadata-blocks-overlap-the-superblock.patch
@@ -0,0 +1,57 @@
+From 18db4b4e6fc31eda838dd1c1296d67dbcb3dc957 Mon Sep 17 00:00:00 2001
+From: Theodore Ts'o <tytso@mit.edu>
+Date: Thu, 29 Mar 2018 22:10:35 -0400
+Subject: ext4: don't allow r/w mounts if metadata blocks overlap the superblock
+
+From: Theodore Ts'o <tytso@mit.edu>
+
+commit 18db4b4e6fc31eda838dd1c1296d67dbcb3dc957 upstream.
+
+If some metadata block, such as an allocation bitmap, overlaps the
+superblock, it's very likely that if the file system is mounted
+read/write, the results will not be pretty.  So disallow r/w mounts
+for file systems corrupted in this particular way.
+
+Backport notes:
+3.18.y is missing bc98a42c1f7d ("VFS: Convert sb->s_flags & MS_RDONLY to sb_rdonly(sb)")
+and e462ec50cb5f ("VFS: Differentiate mount flags (MS_*) from internal superblock flags")
+so we simply use the sb MS_RDONLY check from pre bc98a42c1f7d in place of the sb_rdonly
+function used in the upstream variant of the patch.
+
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+Cc: stable@vger.kernel.org
+Signed-off-by: Harsh Shandilya <harsh@prjkt.io>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/ext4/super.c |    6 ++++++
+ 1 file changed, 6 insertions(+)
+
+--- a/fs/ext4/super.c
++++ b/fs/ext4/super.c
+@@ -2131,6 +2131,8 @@ static int ext4_check_descriptors(struct
+ 			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
+ 				 "Block bitmap for group %u overlaps "
+ 				 "superblock", i);
++			if (!(sb->s_flags & MS_RDONLY))
++				return 0;
+ 		}
+ 		if (block_bitmap < first_block || block_bitmap > last_block) {
+ 			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
+@@ -2143,6 +2145,8 @@ static int ext4_check_descriptors(struct
+ 			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
+ 				 "Inode bitmap for group %u overlaps "
+ 				 "superblock", i);
++			if (!(sb->s_flags & MS_RDONLY))
++				return 0;
+ 		}
+ 		if (inode_bitmap < first_block || inode_bitmap > last_block) {
+ 			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
+@@ -2155,6 +2159,8 @@ static int ext4_check_descriptors(struct
+ 			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
+ 				 "Inode table for group %u overlaps "
+ 				 "superblock", i);
++			if (!(sb->s_flags & MS_RDONLY))
++				return 0;
+ 		}
+ 		if (inode_table < first_block ||
+ 		    inode_table + sbi->s_itb_per_group - 1 > last_block) {
diff --git a/queue-4.4/series b/queue-4.4/series
index 19ad99de2d4..3cf2ecd7b0e 100644
--- a/queue-4.4/series
+++ b/queue-4.4/series
@@ -68,3 +68,7 @@ alsa-pcm-avoid-potential-races-between-oss-ioctls-and-read-write.patch
 alsa-pcm-return-ebusy-for-oss-ioctls-changing-busy-streams.patch
 alsa-pcm-fix-mutex-unbalance-in-oss-emulation-ioctls.patch
 alsa-pcm-fix-endless-loop-for-xrun-recovery-in-oss-emulation.patch
+vfio-pci-virtualize-pcie-af-flr.patch
+vfio-pci-virtualize-maximum-payload-size.patch
+vfio-pci-virtualize-maximum-read-request-size.patch
+ext4-don-t-allow-r-w-mounts-if-metadata-blocks-overlap-the-superblock.patch
diff --git a/queue-4.4/vfio-pci-virtualize-maximum-payload-size.patch b/queue-4.4/vfio-pci-virtualize-maximum-payload-size.patch
new file mode 100644
index 00000000000..f01c94b7863
--- /dev/null
+++ b/queue-4.4/vfio-pci-virtualize-maximum-payload-size.patch
@@ -0,0 +1,46 @@
+From 523184972b282cd9ca17a76f6ca4742394856818 Mon Sep 17 00:00:00 2001
+From: Alex Williamson <alex.williamson@redhat.com>
+Date: Mon, 2 Oct 2017 12:39:09 -0600
+Subject: vfio/pci: Virtualize Maximum Payload Size
+
+From: Alex Williamson <alex.williamson@redhat.com>
+
+commit 523184972b282cd9ca17a76f6ca4742394856818 upstream.
+
+With virtual PCI-Express chipsets, we now see userspace/guest drivers
+trying to match the physical MPS setting to a virtual downstream port.
+Of course a lone physical device surrounded by virtual interconnects
+cannot make a correct decision for a proper MPS setting.  Instead,
+let's virtualize the MPS control register so that writes through to
+hardware are disallowed.  Userspace drivers like QEMU assume they can
+write anything to the device and we'll filter out anything dangerous.
+Since mismatched MPS can lead to AER and other faults, let's add it
+to the kernel side rather than relying on userspace virtualization to
+handle it.
+
+Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
+Reviewed-by: Eric Auger <eric.auger@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/vfio/pci/vfio_pci_config.c |    6 ++++--
+ 1 file changed, 4 insertions(+), 2 deletions(-)
+
+--- a/drivers/vfio/pci/vfio_pci_config.c
++++ b/drivers/vfio/pci/vfio_pci_config.c
+@@ -799,11 +799,13 @@ static int __init init_pci_cap_exp_perm(
+ 
+ 	/*
+ 	 * Allow writes to device control fields, except devctl_phantom,
+-	 * which could confuse IOMMU, and the ARI bit in devctl2, which
++	 * which could confuse IOMMU, MPS, which can break communication
++	 * with other physical devices, and the ARI bit in devctl2, which
+ 	 * is set at probe time.  FLR gets virtualized via our writefn.
+ 	 */
+ 	p_setw(perm, PCI_EXP_DEVCTL,
+-	       PCI_EXP_DEVCTL_BCR_FLR, ~PCI_EXP_DEVCTL_PHANTOM);
++	       PCI_EXP_DEVCTL_BCR_FLR | PCI_EXP_DEVCTL_PAYLOAD,
++	       ~PCI_EXP_DEVCTL_PHANTOM);
+ 	p_setw(perm, PCI_EXP_DEVCTL2, NO_VIRT, ~PCI_EXP_DEVCTL2_ARI);
+ 	return 0;
+ }
diff --git a/queue-4.4/vfio-pci-virtualize-maximum-read-request-size.patch b/queue-4.4/vfio-pci-virtualize-maximum-read-request-size.patch
new file mode 100644
index 00000000000..d5f969dae16
--- /dev/null
+++ b/queue-4.4/vfio-pci-virtualize-maximum-read-request-size.patch
@@ -0,0 +1,80 @@
+From cf0d53ba4947aad6e471491d5b20a567cbe92e56 Mon Sep 17 00:00:00 2001
+From: Alex Williamson <alex.williamson@redhat.com>
+Date: Mon, 2 Oct 2017 12:39:10 -0600
+Subject: vfio/pci: Virtualize Maximum Read Request Size
+
+From: Alex Williamson <alex.williamson@redhat.com>
+
+commit cf0d53ba4947aad6e471491d5b20a567cbe92e56 upstream.
+
+MRRS defines the maximum read request size a device is allowed to
+make.  Drivers will often increase this to allow more data transfer
+with a single request.  Completions to this request are bound by the
+MPS setting for the bus.  Aside from device quirks (none known), it
+doesn't seem to make sense to set an MRRS value less than MPS, yet
+this is a likely scenario given that user drivers do not have a
+system-wide view of the PCI topology.  Virtualize MRRS such that the
+user can set MRRS >= MPS, but use MPS as the floor value that we'll
+write to hardware.
+
+Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/vfio/pci/vfio_pci_config.c |   29 ++++++++++++++++++++++++++---
+ 1 file changed, 26 insertions(+), 3 deletions(-)
+
+--- a/drivers/vfio/pci/vfio_pci_config.c
++++ b/drivers/vfio/pci/vfio_pci_config.c
+@@ -758,6 +758,7 @@ static int vfio_exp_config_write(struct
+ {
+ 	__le16 *ctrl = (__le16 *)(vdev->vconfig + pos -
+ 				  offset + PCI_EXP_DEVCTL);
++	int readrq = le16_to_cpu(*ctrl) & PCI_EXP_DEVCTL_READRQ;
+ 
+ 	count = vfio_default_config_write(vdev, pos, count, perm, offset, val);
+ 	if (count < 0)
+@@ -783,6 +784,27 @@ static int vfio_exp_config_write(struct
+ 			pci_try_reset_function(vdev->pdev);
+ 	}
+ 
++	/*
++	 * MPS is virtualized to the user, writes do not change the physical
++	 * register since determining a proper MPS value requires a system wide
++	 * device view.  The MRRS is largely independent of MPS, but since the
++	 * user does not have that system-wide view, they might set a safe, but
++	 * inefficiently low value.  Here we allow writes through to hardware,
++	 * but we set the floor to the physical device MPS setting, so that
++	 * we can at least use full TLPs, as defined by the MPS value.
++	 *
++	 * NB, if any devices actually depend on an artificially low MRRS
++	 * setting, this will need to be revisited, perhaps with a quirk
++	 * through pcie_set_readrq().
++	 */
++	if (readrq != (le16_to_cpu(*ctrl) & PCI_EXP_DEVCTL_READRQ)) {
++		readrq = 128 <<
++			((le16_to_cpu(*ctrl) & PCI_EXP_DEVCTL_READRQ) >> 12);
++		readrq = max(readrq, pcie_get_mps(vdev->pdev));
++
++		pcie_set_readrq(vdev->pdev, readrq);
++	}
++
+ 	return count;
+ }
+ 
+@@ -801,11 +823,12 @@ static int __init init_pci_cap_exp_perm(
+ 	 * Allow writes to device control fields, except devctl_phantom,
+ 	 * which could confuse IOMMU, MPS, which can break communication
+ 	 * with other physical devices, and the ARI bit in devctl2, which
+-	 * is set at probe time.  FLR gets virtualized via our writefn.
++	 * is set at probe time.  FLR and MRRS get virtualized via our
++	 * writefn.
+ 	 */
+ 	p_setw(perm, PCI_EXP_DEVCTL,
+-	       PCI_EXP_DEVCTL_BCR_FLR | PCI_EXP_DEVCTL_PAYLOAD,
+-	       ~PCI_EXP_DEVCTL_PHANTOM);
++	       PCI_EXP_DEVCTL_BCR_FLR | PCI_EXP_DEVCTL_PAYLOAD |
++	       PCI_EXP_DEVCTL_READRQ, ~PCI_EXP_DEVCTL_PHANTOM);
+ 	p_setw(perm, PCI_EXP_DEVCTL2, NO_VIRT, ~PCI_EXP_DEVCTL2_ARI);
+ 	return 0;
+ }
diff --git a/queue-4.4/vfio-pci-virtualize-pcie-af-flr.patch b/queue-4.4/vfio-pci-virtualize-pcie-af-flr.patch
new file mode 100644
index 00000000000..f5007f4cad0
--- /dev/null
+++ b/queue-4.4/vfio-pci-virtualize-pcie-af-flr.patch
@@ -0,0 +1,146 @@
+From ddf9dc0eb5314d6dac8b19b1cc37c739c6896e7e Mon Sep 17 00:00:00 2001
+From: Alex Williamson <alex.williamson@redhat.com>
+Date: Mon, 26 Sep 2016 13:52:16 -0600
+Subject: vfio-pci: Virtualize PCIe & AF FLR
+
+From: Alex Williamson <alex.williamson@redhat.com>
+
+commit ddf9dc0eb5314d6dac8b19b1cc37c739c6896e7e upstream.
+
+We use a BAR restore trick to try to detect when a user has performed
+a device reset, possibly through FLR or other backdoors, to put things
+back into a working state.  This is important for backdoor resets, but
+we can actually just virtualize the "front door" resets provided via
+PCIe and AF FLR.  Set these bits as virtualized + writable, allowing
+the default write to set them in vconfig, then we can simply check the
+bit, perform an FLR of our own, and clear the bit.  We don't actually
+have the granularity in PCI to specify the type of reset we want to
+do, but generally devices don't implement both PCIe and AF FLR and
+we'll favor these over other types of reset, so we should generally
+line up.  We do test whether the device provides the requested FLR type
+to stay consistent with hardware capabilities though.
+
+This seems to fix several instances of devices getting into bad states
+with userspace drivers, like dpdk, running inside a VM.
+
+Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
+Reviewed-by: Greg Rose <grose@lightfleet.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/vfio/pci/vfio_pci_config.c |   82 ++++++++++++++++++++++++++++++++++---
+ 1 file changed, 77 insertions(+), 5 deletions(-)
+
+--- a/drivers/vfio/pci/vfio_pci_config.c
++++ b/drivers/vfio/pci/vfio_pci_config.c
+@@ -752,6 +752,40 @@ static int __init init_pci_cap_pcix_perm
+ 	return 0;
+ }
+ 
++static int vfio_exp_config_write(struct vfio_pci_device *vdev, int pos,
++				 int count, struct perm_bits *perm,
++				 int offset, __le32 val)
++{
++	__le16 *ctrl = (__le16 *)(vdev->vconfig + pos -
++				  offset + PCI_EXP_DEVCTL);
++
++	count = vfio_default_config_write(vdev, pos, count, perm, offset, val);
++	if (count < 0)
++		return count;
++
++	/*
++	 * The FLR bit is virtualized, if set and the device supports PCIe
++	 * FLR, issue a reset_function.  Regardless, clear the bit, the spec
++	 * requires it to be always read as zero.  NB, reset_function might
++	 * not use a PCIe FLR, we don't have that level of granularity.
++	 */
++	if (*ctrl & cpu_to_le16(PCI_EXP_DEVCTL_BCR_FLR)) {
++		u32 cap;
++		int ret;
++
++		*ctrl &= ~cpu_to_le16(PCI_EXP_DEVCTL_BCR_FLR);
++
++		ret = pci_user_read_config_dword(vdev->pdev,
++						 pos - offset + PCI_EXP_DEVCAP,
++						 &cap);
++
++		if (!ret && (cap & PCI_EXP_DEVCAP_FLR))
++			pci_try_reset_function(vdev->pdev);
++	}
++
++	return count;
++}
++
+ /* Permissions for PCI Express capability */
+ static int __init init_pci_cap_exp_perm(struct perm_bits *perm)
+ {
+@@ -759,26 +793,64 @@ static int __init init_pci_cap_exp_perm(
+ 	if (alloc_perm_bits(perm, PCI_CAP_EXP_ENDPOINT_SIZEOF_V2))
+ 		return -ENOMEM;
+ 
++	perm->writefn = vfio_exp_config_write;
++
+ 	p_setb(perm, PCI_CAP_LIST_NEXT, (u8)ALL_VIRT, NO_WRITE);
+ 
+ 	/*
+-	 * Allow writes to device control fields (includes FLR!)
+-	 * but not to devctl_phantom which could confuse IOMMU
+-	 * or to the ARI bit in devctl2 which is set at probe time
++	 * Allow writes to device control fields, except devctl_phantom,
++	 * which could confuse IOMMU, and the ARI bit in devctl2, which
++	 * is set at probe time.  FLR gets virtualized via our writefn.
+ 	 */
+-	p_setw(perm, PCI_EXP_DEVCTL, NO_VIRT, ~PCI_EXP_DEVCTL_PHANTOM);
++	p_setw(perm, PCI_EXP_DEVCTL,
++	       PCI_EXP_DEVCTL_BCR_FLR, ~PCI_EXP_DEVCTL_PHANTOM);
+ 	p_setw(perm, PCI_EXP_DEVCTL2, NO_VIRT, ~PCI_EXP_DEVCTL2_ARI);
+ 	return 0;
+ }
+ 
++static int vfio_af_config_write(struct vfio_pci_device *vdev, int pos,
++				int count, struct perm_bits *perm,
++				int offset, __le32 val)
++{
++	u8 *ctrl = vdev->vconfig + pos - offset + PCI_AF_CTRL;
++
++	count = vfio_default_config_write(vdev, pos, count, perm, offset, val);
++	if (count < 0)
++		return count;
++
++	/*
++	 * The FLR bit is virtualized, if set and the device supports AF
++	 * FLR, issue a reset_function.  Regardless, clear the bit, the spec
++	 * requires it to be always read as zero.  NB, reset_function might
++	 * not use an AF FLR, we don't have that level of granularity.
++	 */
++	if (*ctrl & PCI_AF_CTRL_FLR) {
++		u8 cap;
++		int ret;
++
++		*ctrl &= ~PCI_AF_CTRL_FLR;
++
++		ret = pci_user_read_config_byte(vdev->pdev,
++						pos - offset + PCI_AF_CAP,
++						&cap);
++
++		if (!ret && (cap & PCI_AF_CAP_FLR) && (cap & PCI_AF_CAP_TP))
++			pci_try_reset_function(vdev->pdev);
++	}
++
++	return count;
++}
++
+ /* Permissions for Advanced Function capability */
+ static int __init init_pci_cap_af_perm(struct perm_bits *perm)
+ {
+ 	if (alloc_perm_bits(perm, pci_cap_length[PCI_CAP_ID_AF]))
+ 		return -ENOMEM;
+ 
++	perm->writefn = vfio_af_config_write;
++
+ 	p_setb(perm, PCI_CAP_LIST_NEXT, (u8)ALL_VIRT, NO_WRITE);
+-	p_setb(perm, PCI_AF_CTRL, NO_VIRT, PCI_AF_CTRL_FLR);
++	p_setb(perm, PCI_AF_CTRL, PCI_AF_CTRL_FLR, PCI_AF_CTRL_FLR);
+ 	return 0;
+ }
+ 
-- 
2.47.3