--- /dev/null
+From debd1c065d2037919a7da67baf55cc683fee09f0 Mon Sep 17 00:00:00 2001
+From: Nikolay Borisov <nborisov@suse.com>
+Date: Fri, 17 May 2019 10:44:25 +0300
+Subject: btrfs: Ensure replaced device doesn't have pending chunk allocation
+
+From: Nikolay Borisov <nborisov@suse.com>
+
+commit debd1c065d2037919a7da67baf55cc683fee09f0 upstream.
+
+Recent FITRIM work, namely bbbf7243d62d ("btrfs: combine device update
+operations during transaction commit") combined the way certain
+operations are recorded in a transaction. As a result an ASSERT was added
+in dev_replace_finish to ensure the new code works correctly.
+Unfortunately I got reports that it's possible to trigger the assert,
+meaning that during a device replace it's possible to have an unfinished
+chunk allocation on the source device.
+
+This is supposed to be prevented by the fact that a transaction is
+committed before finishing the replace operation and later acquiring the
+chunk mutex. This is not sufficient since by the time the transaction is
+committed and the chunk mutex acquired it's possible to allocate a chunk
+depending on the workload being executed on the replaced device. This
+bug has been present ever since device replace was introduced but there
+was never code which checks for it.
+
+The correct way to fix this is to ensure that there is no pending device
+modification operation when the chunk mutex is acquired and, if there is,
+to repeat the transaction commit. Unfortunately it's not possible to just
+exclude the source device from btrfs_fs_devices::dev_alloc_list since
+this causes ENOSPC to be hit in transaction commit.
+
+Fixing this in another way would require adding special cases to handle the
+last writes and forbid new ones. The looped transaction fix is more
+obvious, and can be easily backported. The runtime of dev-replace is
+long so there's no noticeable delay caused by that.
+
+Reported-by: David Sterba <dsterba@suse.com>
+Fixes: 391cd9df81ac ("Btrfs: fix unprotected alloc list insertion during the finishing procedure of replace")
+CC: stable@vger.kernel.org # 4.4+
+Signed-off-by: Nikolay Borisov <nborisov@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/dev-replace.c | 29 +++++++++++++++++++----------
+ fs/btrfs/volumes.c | 2 ++
+ fs/btrfs/volumes.h | 5 +++++
+ 3 files changed, 26 insertions(+), 10 deletions(-)
+
+--- a/fs/btrfs/dev-replace.c
++++ b/fs/btrfs/dev-replace.c
+@@ -512,18 +512,27 @@ static int btrfs_dev_replace_finishing(s
+ }
+ btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
+
+- trans = btrfs_start_transaction(root, 0);
+- if (IS_ERR(trans)) {
+- mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+- return PTR_ERR(trans);
++ while (1) {
++ trans = btrfs_start_transaction(root, 0);
++ if (IS_ERR(trans)) {
++ mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
++ return PTR_ERR(trans);
++ }
++ ret = btrfs_commit_transaction(trans);
++ WARN_ON(ret);
++ mutex_lock(&uuid_mutex);
++ /* keep away write_all_supers() during the finishing procedure */
++ mutex_lock(&fs_info->fs_devices->device_list_mutex);
++ mutex_lock(&fs_info->chunk_mutex);
++ if (src_device->has_pending_chunks) {
++ mutex_unlock(&root->fs_info->chunk_mutex);
++ mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
++ mutex_unlock(&uuid_mutex);
++ } else {
++ break;
++ }
+ }
+- ret = btrfs_commit_transaction(trans);
+- WARN_ON(ret);
+
+- mutex_lock(&uuid_mutex);
+- /* keep away write_all_supers() during the finishing procedure */
+- mutex_lock(&fs_info->fs_devices->device_list_mutex);
+- mutex_lock(&fs_info->chunk_mutex);
+ btrfs_dev_replace_lock(dev_replace, 1);
+ dev_replace->replace_state =
+ scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED
+--- a/fs/btrfs/volumes.c
++++ b/fs/btrfs/volumes.c
+@@ -4851,6 +4851,7 @@ static int __btrfs_alloc_chunk(struct bt
+ for (i = 0; i < map->num_stripes; i++) {
+ num_bytes = map->stripes[i].dev->bytes_used + stripe_size;
+ btrfs_device_set_bytes_used(map->stripes[i].dev, num_bytes);
++ map->stripes[i].dev->has_pending_chunks = true;
+ }
+
+ atomic64_sub(stripe_size * map->num_stripes, &info->free_chunk_space);
+@@ -7310,6 +7311,7 @@ void btrfs_update_commit_device_bytes_us
+ for (i = 0; i < map->num_stripes; i++) {
+ dev = map->stripes[i].dev;
+ dev->commit_bytes_used = dev->bytes_used;
++ dev->has_pending_chunks = false;
+ }
+ }
+ mutex_unlock(&fs_info->chunk_mutex);
+--- a/fs/btrfs/volumes.h
++++ b/fs/btrfs/volumes.h
+@@ -61,6 +61,11 @@ struct btrfs_device {
+
+ spinlock_t io_lock ____cacheline_aligned;
+ int running_pending;
++ /* When true means this device has pending chunk alloc in
++ * current transaction. Protected by chunk_mutex.
++ */
++ bool has_pending_chunks;
++
+ /* regular prio bios */
+ struct btrfs_pending_bios pending_bios;
+ /* sync bios */
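For readers scanning the queue, the retry pattern this patch adds to
btrfs_dev_replace_finishing() boils down to the sketch below (identifiers
match the kernel code; the uuid and device-list mutexes and the error paths
are trimmed, so this is a reading aid rather than the literal hunk):

	while (1) {
		trans = btrfs_start_transaction(root, 0);
		if (IS_ERR(trans))
			return PTR_ERR(trans);
		/* Flush the device/chunk updates pending in this transaction. */
		btrfs_commit_transaction(trans);

		/* Chunk allocation is serialized by chunk_mutex. */
		mutex_lock(&fs_info->chunk_mutex);
		if (!src_device->has_pending_chunks)
			break;	/* nothing raced in after the commit */

		/*
		 * A chunk was allocated on the source device between the
		 * commit and taking chunk_mutex: drop the lock and commit
		 * again until the device is quiescent.
		 */
		mutex_unlock(&fs_info->chunk_mutex);
	}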
drm-imx-only-send-event-on-crtc-disable-if-kept-disabled.patch
ftrace-x86-remove-possible-deadlock-between-register_kprobe-and-ftrace_run_update_code.patch
mm-vmscan.c-prevent-useless-kswapd-loops.patch
+btrfs-ensure-replaced-device-doesn-t-have-pending-chunk-allocation.patch
+vhost-net-set-packet-weight-of-tx-polling-to-2-vq-size.patch
+vhost_net-use-packet-weight-for-rx-handler-too.patch
+vhost_net-introduce-vhost_exceeds_weight.patch
+vhost-introduce-vhost_exceeds_weight.patch
+vhost_net-fix-possible-infinite-loop.patch
+vhost-vsock-add-weight-support.patch
+vhost-scsi-add-weight-support.patch
--- /dev/null
+From e82b9b0727ff6d665fff2d326162b460dded554d Mon Sep 17 00:00:00 2001
+From: Jason Wang <jasowang@redhat.com>
+Date: Fri, 17 May 2019 00:29:49 -0400
+Subject: vhost: introduce vhost_exceeds_weight()
+
+From: Jason Wang <jasowang@redhat.com>
+
+commit e82b9b0727ff6d665fff2d326162b460dded554d upstream.
+
+We used to have vhost_exceeds_weight() for vhost-net to:
+
+- prevent vhost kthread from hogging the cpu
+- balance the time spent between TX and RX
+
+This function could be useful for vsock and scsi as well. So move it
+to vhost.c. A device must specify a weight, which counts the number of
+requests; it can also specify a byte_weight, which counts the number
+of bytes that have been processed.
+
+Signed-off-by: Jason Wang <jasowang@redhat.com>
+Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
+Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
+Signed-off-by: Balbir Singh <sblbir@amzn.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/vhost/net.c | 18 +++++-------------
+ drivers/vhost/scsi.c | 8 +++++++-
+ drivers/vhost/vhost.c | 20 +++++++++++++++++++-
+ drivers/vhost/vhost.h | 6 +++++-
+ drivers/vhost/vsock.c | 11 ++++++++++-
+ 5 files changed, 46 insertions(+), 17 deletions(-)
+
+--- a/drivers/vhost/net.c
++++ b/drivers/vhost/net.c
+@@ -446,12 +446,6 @@ static bool vhost_exceeds_maxpend(struct
+ == nvq->done_idx;
+ }
+
+-static bool vhost_exceeds_weight(int pkts, int total_len)
+-{
+- return total_len >= VHOST_NET_WEIGHT ||
+- pkts >= VHOST_NET_PKT_WEIGHT;
+-}
+-
+ /* Expects to be always run from workqueue - which acts as
+ * read-size critical section for our kind of RCU. */
+ static void handle_tx(struct vhost_net *net)
+@@ -584,10 +578,9 @@ static void handle_tx(struct vhost_net *
+ else
+ vhost_zerocopy_signal_used(net, vq);
+ vhost_net_tx_packet(net);
+- if (unlikely(vhost_exceeds_weight(++sent_pkts, total_len))) {
+- vhost_poll_queue(&vq->poll);
++ if (unlikely(vhost_exceeds_weight(vq, ++sent_pkts,
++ total_len)))
+ break;
+- }
+ }
+ out:
+ mutex_unlock(&vq->mutex);
+@@ -867,10 +860,8 @@ static void handle_rx(struct vhost_net *
+ vhost_log_write(vq, vq_log, log, vhost_len,
+ vq->iov, in);
+ total_len += vhost_len;
+- if (unlikely(vhost_exceeds_weight(++recv_pkts, total_len))) {
+- vhost_poll_queue(&vq->poll);
++ if (unlikely(vhost_exceeds_weight(vq, ++recv_pkts, total_len)))
+ goto out;
+- }
+ }
+ vhost_net_enable_vq(net, vq);
+ out:
+@@ -949,7 +940,8 @@ static int vhost_net_open(struct inode *
+ n->vqs[i].sock_hlen = 0;
+ vhost_net_buf_init(&n->vqs[i].rxq);
+ }
+- vhost_dev_init(dev, vqs, VHOST_NET_VQ_MAX);
++ vhost_dev_init(dev, vqs, VHOST_NET_VQ_MAX,
++ VHOST_NET_WEIGHT, VHOST_NET_PKT_WEIGHT);
+
+ vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT, dev);
+ vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN, dev);
+--- a/drivers/vhost/scsi.c
++++ b/drivers/vhost/scsi.c
+@@ -58,6 +58,12 @@
+ #define VHOST_SCSI_PREALLOC_UPAGES 2048
+ #define VHOST_SCSI_PREALLOC_PROT_SGLS 512
+
++/* Max number of requests before requeueing the job.
++ * Using this limit prevents one virtqueue from starving others with
++ * request.
++ */
++#define VHOST_SCSI_WEIGHT 256
++
+ struct vhost_scsi_inflight {
+ /* Wait for the flush operation to finish */
+ struct completion comp;
+@@ -1427,7 +1433,7 @@ static int vhost_scsi_open(struct inode
+ vqs[i] = &vs->vqs[i].vq;
+ vs->vqs[i].vq.handle_kick = vhost_scsi_handle_kick;
+ }
+- vhost_dev_init(&vs->dev, vqs, VHOST_SCSI_MAX_VQ);
++ vhost_dev_init(&vs->dev, vqs, VHOST_SCSI_MAX_VQ, VHOST_SCSI_WEIGHT, 0);
+
+ vhost_scsi_init_inflight(vs, NULL);
+
+--- a/drivers/vhost/vhost.c
++++ b/drivers/vhost/vhost.c
+@@ -412,8 +412,24 @@ static void vhost_dev_free_iovecs(struct
+ vhost_vq_free_iovecs(dev->vqs[i]);
+ }
+
++bool vhost_exceeds_weight(struct vhost_virtqueue *vq,
++ int pkts, int total_len)
++{
++ struct vhost_dev *dev = vq->dev;
++
++ if ((dev->byte_weight && total_len >= dev->byte_weight) ||
++ pkts >= dev->weight) {
++ vhost_poll_queue(&vq->poll);
++ return true;
++ }
++
++ return false;
++}
++EXPORT_SYMBOL_GPL(vhost_exceeds_weight);
++
+ void vhost_dev_init(struct vhost_dev *dev,
+- struct vhost_virtqueue **vqs, int nvqs)
++ struct vhost_virtqueue **vqs, int nvqs,
++ int weight, int byte_weight)
+ {
+ struct vhost_virtqueue *vq;
+ int i;
+@@ -427,6 +443,8 @@ void vhost_dev_init(struct vhost_dev *de
+ dev->iotlb = NULL;
+ dev->mm = NULL;
+ dev->worker = NULL;
++ dev->weight = weight;
++ dev->byte_weight = byte_weight;
+ init_llist_head(&dev->work_list);
+ init_waitqueue_head(&dev->wait);
+ INIT_LIST_HEAD(&dev->read_list);
+--- a/drivers/vhost/vhost.h
++++ b/drivers/vhost/vhost.h
+@@ -173,9 +173,13 @@ struct vhost_dev {
+ struct list_head read_list;
+ struct list_head pending_list;
+ wait_queue_head_t wait;
++ int weight;
++ int byte_weight;
+ };
+
+-void vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue **vqs, int nvqs);
++bool vhost_exceeds_weight(struct vhost_virtqueue *vq, int pkts, int total_len);
++void vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue **vqs,
++ int nvqs, int weight, int byte_weight);
+ long vhost_dev_set_owner(struct vhost_dev *dev);
+ bool vhost_dev_has_owner(struct vhost_dev *dev);
+ long vhost_dev_check_owner(struct vhost_dev *);
+--- a/drivers/vhost/vsock.c
++++ b/drivers/vhost/vsock.c
+@@ -21,6 +21,14 @@
+ #include "vhost.h"
+
+ #define VHOST_VSOCK_DEFAULT_HOST_CID 2
++/* Max number of bytes transferred before requeueing the job.
++ * Using this limit prevents one virtqueue from starving others. */
++#define VHOST_VSOCK_WEIGHT 0x80000
++/* Max number of packets transferred before requeueing the job.
++ * Using this limit prevents one virtqueue from starving others with
++ * small pkts.
++ */
++#define VHOST_VSOCK_PKT_WEIGHT 256
+
+ enum {
+ VHOST_VSOCK_FEATURES = VHOST_FEATURES,
+@@ -531,7 +539,8 @@ static int vhost_vsock_dev_open(struct i
+ vsock->vqs[VSOCK_VQ_TX].handle_kick = vhost_vsock_handle_tx_kick;
+ vsock->vqs[VSOCK_VQ_RX].handle_kick = vhost_vsock_handle_rx_kick;
+
+- vhost_dev_init(&vsock->dev, vqs, ARRAY_SIZE(vsock->vqs));
++ vhost_dev_init(&vsock->dev, vqs, ARRAY_SIZE(vsock->vqs),
++ VHOST_VSOCK_PKT_WEIGHT, VHOST_VSOCK_WEIGHT);
+
+ file->private_data = vsock;
+ spin_lock_init(&vsock->send_pkt_list_lock);
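As a usage sketch (not part of the patch), a backend built on top of this
change registers its limits once at open time and lets the core decide when
to requeue. In the sketch below only vhost_dev_init() and
vhost_exceeds_weight() come from this patch; the VHOST_FOO_* constants,
struct vhost_foo, handle_foo_vq() and process_one_request() are illustrative
stand-ins:

	/* Illustrative limits for a hypothetical "foo" backend. */
	#define VHOST_FOO_WEIGHT	0x80000	/* max bytes handled per work item */
	#define VHOST_FOO_PKT_WEIGHT	256	/* max requests handled per work item */

	static void handle_foo_vq(struct vhost_virtqueue *vq)
	{
		int pkts = 0, total_len = 0;

		do {
			/* Stand-in for the backend's own descriptor handling;
			 * assume it returns bytes consumed, 0 when empty. */
			int len = process_one_request(vq);
			if (!len)
				break;
			total_len += len;
			/* The core queues vq->poll and returns true once
			 * either the packet or the byte limit is reached. */
		} while (!vhost_exceeds_weight(vq, ++pkts, total_len));
	}

	static void foo_register_weights(struct vhost_foo *foo,
					 struct vhost_virtqueue **vqs, int nvqs)
	{
		/* weight counts requests, byte_weight counts bytes
		 * (a byte_weight of 0 disables the byte check). */
		vhost_dev_init(&foo->dev, vqs, nvqs,
			       VHOST_FOO_PKT_WEIGHT, VHOST_FOO_WEIGHT);
	}

The remaining backports in this series convert net, vsock and scsi to this
same pattern.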
--- /dev/null
+From a2ac99905f1ea8b15997a6ec39af69aa28a3653b Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?haibinzhang=28=E5=BC=A0=E6=B5=B7=E6=96=8C=29?=
+ <haibinzhang@tencent.com>
+Date: Mon, 9 Apr 2018 07:22:17 +0000
+Subject: vhost-net: set packet weight of tx polling to 2 * vq size
+
+From: haibinzhang(张海斌) <haibinzhang@tencent.com>
+
+commit a2ac99905f1ea8b15997a6ec39af69aa28a3653b upstream.
+
+handle_tx will delay rx for tens or even hundreds of milliseconds when tx is
+busy polling udp packets with small length (e.g. a 1 byte udp payload), because
+the VHOST_NET_WEIGHT limit takes into account only the bytes sent, not the
+number of packets.
+
+Ping-Latencies shown below were tested between two Virtual Machines using
+netperf (UDP_STREAM, len=1), and then another machine pinged the client:
+
+vq size=256
+Packet-Weight Ping-Latencies(millisecond)
+ min avg max
+Origin 3.319 18.489 57.303
+64 1.643 2.021 2.552
+128 1.825 2.600 3.224
+256 1.997 2.710 4.295
+512 1.860 3.171 4.631
+1024 2.002 4.173 9.056
+2048 2.257 5.650 9.688
+4096 2.093 8.508 15.943
+
+vq size=512
+Packet-Weight Ping-Latencies(millisecond)
+ min avg max
+Origin 6.537 29.177 66.245
+64 2.798 3.614 4.403
+128 2.861 3.820 4.775
+256 3.008 4.018 4.807
+512 3.254 4.523 5.824
+1024 3.079 5.335 7.747
+2048 3.944 8.201 12.762
+4096 4.158 11.057 19.985
+
+Seems pretty consistent, a small dip at 2 VQ sizes.
+Ring size is a hint from device about a burst size it can tolerate. Based on
+benchmarks, set the weight to 2 * vq size.
+
+To evaluate this change, additional tests were done using netperf (RR, TX)
+between two machines with Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz, and the
+vq size was tweaked through qemu. The results shown below do not show obvious
+changes.
+
+vq size=256 TCP_RR vq size=512 TCP_RR
+size/sessions/+thu%/+normalize% size/sessions/+thu%/+normalize%
+ 1/ 1/ -7%/ -2% 1/ 1/ 0%/ -2%
+ 1/ 4/ +1%/ 0% 1/ 4/ +1%/ 0%
+ 1/ 8/ +1%/ -2% 1/ 8/ 0%/ +1%
+ 64/ 1/ -6%/ 0% 64/ 1/ +7%/ +3%
+ 64/ 4/ 0%/ +2% 64/ 4/ -1%/ +1%
+ 64/ 8/ 0%/ 0% 64/ 8/ -1%/ -2%
+ 256/ 1/ -3%/ -4% 256/ 1/ -4%/ -2%
+ 256/ 4/ +3%/ +4% 256/ 4/ +1%/ +2%
+ 256/ 8/ +2%/ 0% 256/ 8/ +1%/ -1%
+
+vq size=256 UDP_RR vq size=512 UDP_RR
+size/sessions/+thu%/+normalize% size/sessions/+thu%/+normalize%
+ 1/ 1/ -5%/ +1% 1/ 1/ -3%/ -2%
+ 1/ 4/ +4%/ +1% 1/ 4/ -2%/ +2%
+ 1/ 8/ -1%/ -1% 1/ 8/ -1%/ 0%
+ 64/ 1/ -2%/ -3% 64/ 1/ +1%/ +1%
+ 64/ 4/ -5%/ -1% 64/ 4/ +2%/ 0%
+ 64/ 8/ 0%/ -1% 64/ 8/ -2%/ +1%
+ 256/ 1/ +7%/ +1% 256/ 1/ -7%/ 0%
+ 256/ 4/ +1%/ +1% 256/ 4/ -3%/ -4%
+ 256/ 8/ +2%/ +2% 256/ 8/ +1%/ +1%
+
+vq size=256 TCP_STREAM vq size=512 TCP_STREAM
+size/sessions/+thu%/+normalize% size/sessions/+thu%/+normalize%
+ 64/ 1/ 0%/ -3% 64/ 1/ 0%/ 0%
+ 64/ 4/ +3%/ -1% 64/ 4/ -2%/ +4%
+ 64/ 8/ +9%/ -4% 64/ 8/ -1%/ +2%
+ 256/ 1/ +1%/ -4% 256/ 1/ +1%/ +1%
+ 256/ 4/ -1%/ -1% 256/ 4/ -3%/ 0%
+ 256/ 8/ +7%/ +5% 256/ 8/ -3%/ 0%
+ 512/ 1/ +1%/ 0% 512/ 1/ -1%/ -1%
+ 512/ 4/ +1%/ -1% 512/ 4/ 0%/ 0%
+ 512/ 8/ +7%/ -5% 512/ 8/ +6%/ -1%
+1024/ 1/ 0%/ -1% 1024/ 1/ 0%/ +1%
+1024/ 4/ +3%/ 0% 1024/ 4/ +1%/ 0%
+1024/ 8/ +8%/ +5% 1024/ 8/ -1%/ 0%
+2048/ 1/ +2%/ +2% 2048/ 1/ -1%/ 0%
+2048/ 4/ +1%/ 0% 2048/ 4/ 0%/ -1%
+2048/ 8/ -2%/ 0% 2048/ 8/ 5%/ -1%
+4096/ 1/ -2%/ 0% 4096/ 1/ -2%/ 0%
+4096/ 4/ +2%/ 0% 4096/ 4/ 0%/ 0%
+4096/ 8/ +9%/ -2% 4096/ 8/ -5%/ -1%
+
+Acked-by: Michael S. Tsirkin <mst@redhat.com>
+Signed-off-by: Haibin Zhang <haibinzhang@tencent.com>
+Signed-off-by: Yunfang Tai <yunfangtai@tencent.com>
+Signed-off-by: Lidong Chen <lidongchen@tencent.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Balbir Singh <sblbir@amazon.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/vhost/net.c | 8 +++++++-
+ 1 file changed, 7 insertions(+), 1 deletion(-)
+
+--- a/drivers/vhost/net.c
++++ b/drivers/vhost/net.c
+@@ -44,6 +44,10 @@ MODULE_PARM_DESC(experimental_zcopytx, "
+ * Using this limit prevents one virtqueue from starving others. */
+ #define VHOST_NET_WEIGHT 0x80000
+
++/* Max number of packets transferred before requeueing the job.
++ * Using this limit prevents one virtqueue from starving rx. */
++#define VHOST_NET_PKT_WEIGHT(vq) ((vq)->num * 2)
++
+ /* MAX number of TX used buffers for outstanding zerocopy */
+ #define VHOST_MAX_PEND 128
+ #define VHOST_GOODCOPY_LEN 256
+@@ -461,6 +465,7 @@ static void handle_tx(struct vhost_net *
+ struct socket *sock;
+ struct vhost_net_ubuf_ref *uninitialized_var(ubufs);
+ bool zcopy, zcopy_used;
++ int sent_pkts = 0;
+
+ mutex_lock(&vq->mutex);
+ sock = vq->private_data;
+@@ -572,7 +577,8 @@ static void handle_tx(struct vhost_net *
+ else
+ vhost_zerocopy_signal_used(net, vq);
+ vhost_net_tx_packet(net);
+- if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
++ if (unlikely(total_len >= VHOST_NET_WEIGHT) ||
++ unlikely(++sent_pkts >= VHOST_NET_PKT_WEIGHT(vq))) {
+ vhost_poll_queue(&vq->poll);
+ break;
+ }
--- /dev/null
+From c1ea02f15ab5efb3e93fc3144d895410bf79fcf2 Mon Sep 17 00:00:00 2001
+From: Jason Wang <jasowang@redhat.com>
+Date: Fri, 17 May 2019 00:29:52 -0400
+Subject: vhost: scsi: add weight support
+
+From: Jason Wang <jasowang@redhat.com>
+
+commit c1ea02f15ab5efb3e93fc3144d895410bf79fcf2 upstream.
+
+This patch checks the weight and exits the loop if we exceed it. This
+is useful for preventing the scsi kthread from hogging the cpu, which
+is guest triggerable.
+
+This addresses CVE-2019-3900.
+
+Cc: Paolo Bonzini <pbonzini@redhat.com>
+Cc: Stefan Hajnoczi <stefanha@redhat.com>
+Fixes: 057cbf49a1f0 ("tcm_vhost: Initial merge for vhost level target fabric driver")
+Signed-off-by: Jason Wang <jasowang@redhat.com>
+Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
+Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
+Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
+Signed-off-by: Balbir Singh <sblbir@amzn.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/vhost/scsi.c | 6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+--- a/drivers/vhost/scsi.c
++++ b/drivers/vhost/scsi.c
+@@ -846,7 +846,7 @@ vhost_scsi_handle_vq(struct vhost_scsi *
+ u64 tag;
+ u32 exp_data_len, data_direction;
+ unsigned int out = 0, in = 0;
+- int head, ret, prot_bytes;
++ int head, ret, prot_bytes, c = 0;
+ size_t req_size, rsp_size = sizeof(struct virtio_scsi_cmd_resp);
+ size_t out_size, in_size;
+ u16 lun;
+@@ -865,7 +865,7 @@ vhost_scsi_handle_vq(struct vhost_scsi *
+
+ vhost_disable_notify(&vs->dev, vq);
+
+- for (;;) {
++ do {
+ head = vhost_get_vq_desc(vq, vq->iov,
+ ARRAY_SIZE(vq->iov), &out, &in,
+ NULL, NULL);
+@@ -1080,7 +1080,7 @@ vhost_scsi_handle_vq(struct vhost_scsi *
+ */
+ INIT_WORK(&cmd->work, vhost_scsi_submission_work);
+ queue_work(vhost_scsi_workqueue, &cmd->work);
+- }
++ } while (likely(!vhost_exceeds_weight(vq, ++c, 0)));
+ out:
+ mutex_unlock(&vq->mutex);
+ }
--- /dev/null
+From e79b431fb901ba1106670bcc80b9b617b25def7d Mon Sep 17 00:00:00 2001
+From: Jason Wang <jasowang@redhat.com>
+Date: Fri, 17 May 2019 00:29:51 -0400
+Subject: vhost: vsock: add weight support
+
+From: Jason Wang <jasowang@redhat.com>
+
+commit e79b431fb901ba1106670bcc80b9b617b25def7d upstream.
+
+This patch checks the weight and exits the loop if we exceed it. This
+is useful for preventing the vsock kthread from hogging the cpu, which
+is guest triggerable. The weight can help avoid starving requests from
+one direction while the other direction is being processed.
+
+The value of weight is picked from vhost-net.
+
+This addresses CVE-2019-3900.
+
+Cc: Stefan Hajnoczi <stefanha@redhat.com>
+Fixes: 433fc58e6bf2 ("VSOCK: Introduce vhost_vsock.ko")
+Signed-off-by: Jason Wang <jasowang@redhat.com>
+Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
+Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
+Signed-off-by: Balbir Singh <sblbir@amzn.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/vhost/vsock.c | 16 ++++++++++------
+ 1 file changed, 10 insertions(+), 6 deletions(-)
+
+--- a/drivers/vhost/vsock.c
++++ b/drivers/vhost/vsock.c
+@@ -86,6 +86,7 @@ vhost_transport_do_send_pkt(struct vhost
+ struct vhost_virtqueue *vq)
+ {
+ struct vhost_virtqueue *tx_vq = &vsock->vqs[VSOCK_VQ_TX];
++ int pkts = 0, total_len = 0;
+ bool added = false;
+ bool restart_tx = false;
+
+@@ -97,7 +98,7 @@ vhost_transport_do_send_pkt(struct vhost
+ /* Avoid further vmexits, we're already processing the virtqueue */
+ vhost_disable_notify(&vsock->dev, vq);
+
+- for (;;) {
++ do {
+ struct virtio_vsock_pkt *pkt;
+ struct iov_iter iov_iter;
+ unsigned out, in;
+@@ -182,8 +183,9 @@ vhost_transport_do_send_pkt(struct vhost
+ */
+ virtio_transport_deliver_tap_pkt(pkt);
+
++ total_len += pkt->len;
+ virtio_transport_free_pkt(pkt);
+- }
++ } while(likely(!vhost_exceeds_weight(vq, ++pkts, total_len)));
+ if (added)
+ vhost_signal(&vsock->dev, vq);
+
+@@ -358,7 +360,7 @@ static void vhost_vsock_handle_tx_kick(s
+ struct vhost_vsock *vsock = container_of(vq->dev, struct vhost_vsock,
+ dev);
+ struct virtio_vsock_pkt *pkt;
+- int head;
++ int head, pkts = 0, total_len = 0;
+ unsigned int out, in;
+ bool added = false;
+
+@@ -368,7 +370,7 @@ static void vhost_vsock_handle_tx_kick(s
+ goto out;
+
+ vhost_disable_notify(&vsock->dev, vq);
+- for (;;) {
++ do {
+ u32 len;
+
+ if (!vhost_vsock_more_replies(vsock)) {
+@@ -409,9 +411,11 @@ static void vhost_vsock_handle_tx_kick(s
+ else
+ virtio_transport_free_pkt(pkt);
+
+- vhost_add_used(vq, head, sizeof(pkt->hdr) + len);
++ len += sizeof(pkt->hdr);
++ vhost_add_used(vq, head, len);
++ total_len += len;
+ added = true;
+- }
++ } while(likely(!vhost_exceeds_weight(vq, ++pkts, total_len)));
+
+ no_more_replies:
+ if (added)
--- /dev/null
+From e2412c07f8f3040593dfb88207865a3cd58680c0 Mon Sep 17 00:00:00 2001
+From: Jason Wang <jasowang@redhat.com>
+Date: Fri, 17 May 2019 00:29:50 -0400
+Subject: vhost_net: fix possible infinite loop
+
+From: Jason Wang <jasowang@redhat.com>
+
+commit e2412c07f8f3040593dfb88207865a3cd58680c0 upstream.
+
+When the rx buffer is too small for a packet, we will discard the vq
+descriptor and retry it for the next packet:
+
+while ((sock_len = vhost_net_rx_peek_head_len(net, sock->sk,
+ &busyloop_intr))) {
+...
+ /* On overrun, truncate and discard */
+ if (unlikely(headcount > UIO_MAXIOV)) {
+ iov_iter_init(&msg.msg_iter, READ, vq->iov, 1, 1);
+ err = sock->ops->recvmsg(sock, &msg,
+ 1, MSG_DONTWAIT | MSG_TRUNC);
+ pr_debug("Discarded rx packet: len %zd\n", sock_len);
+ continue;
+ }
+...
+}
+
+This makes it possible to trigger an infinite while..continue loop
+through the cooperation of two VMs, like:
+
+1) Malicious VM1 allocates a 1 byte rx buffer and tries to slow down the
+   vhost process as much as possible, e.g. by using indirect descriptors
+   or similar.
+2) Malicious VM2 generates packets to VM1 as fast as possible
+
+Fix this by checking against the weight at the end of the RX and TX
+loops. This also eliminates other similar cases when:
+
+- userspace is consuming the packets in the meanwhile
+- a theoretical TOCTOU attack where the guest moves the avail index back
+  and forth to hit the continue after vhost finds the guest just added
+  new buffers
+
+This addresses CVE-2019-3900.
+
+Fixes: d8316f3991d20 ("vhost: fix total length when packets are too short")
+Fixes: 3a4d5c94e9593 ("vhost_net: a kernel-level virtio server")
+Signed-off-by: Jason Wang <jasowang@redhat.com>
+Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
+Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Signed-off-by: Balbir Singh <sblbir@amzn.com>
+
+---
+ drivers/vhost/net.c | 20 ++++++++++----------
+ 1 file changed, 10 insertions(+), 10 deletions(-)
+
+--- a/drivers/vhost/net.c
++++ b/drivers/vhost/net.c
+@@ -482,7 +482,7 @@ static void handle_tx(struct vhost_net *
+ hdr_size = nvq->vhost_hlen;
+ zcopy = nvq->ubufs;
+
+- for (;;) {
++ do {
+ /* Release DMAs done buffers first */
+ if (zcopy)
+ vhost_zerocopy_signal_used(net, vq);
+@@ -578,10 +578,7 @@ static void handle_tx(struct vhost_net *
+ else
+ vhost_zerocopy_signal_used(net, vq);
+ vhost_net_tx_packet(net);
+- if (unlikely(vhost_exceeds_weight(vq, ++sent_pkts,
+- total_len)))
+- break;
+- }
++ } while (likely(!vhost_exceeds_weight(vq, ++sent_pkts, total_len)));
+ out:
+ mutex_unlock(&vq->mutex);
+ }
+@@ -779,7 +776,11 @@ static void handle_rx(struct vhost_net *
+ vq->log : NULL;
+ mergeable = vhost_has_feature(vq, VIRTIO_NET_F_MRG_RXBUF);
+
+- while ((sock_len = vhost_net_rx_peek_head_len(net, sock->sk))) {
++ do {
++ sock_len = vhost_net_rx_peek_head_len(net, sock->sk);
++
++ if (!sock_len)
++ break;
+ sock_len += sock_hlen;
+ vhost_len = sock_len + vhost_hlen;
+ headcount = get_rx_bufs(vq, vq->heads, vhost_len,
+@@ -860,9 +861,8 @@ static void handle_rx(struct vhost_net *
+ vhost_log_write(vq, vq_log, log, vhost_len,
+ vq->iov, in);
+ total_len += vhost_len;
+- if (unlikely(vhost_exceeds_weight(vq, ++recv_pkts, total_len)))
+- goto out;
+- }
++ } while (likely(!vhost_exceeds_weight(vq, ++recv_pkts, total_len)));
++
+ vhost_net_enable_vq(net, vq);
+ out:
+ mutex_unlock(&vq->mutex);
+@@ -941,7 +941,7 @@ static int vhost_net_open(struct inode *
+ vhost_net_buf_init(&n->vqs[i].rxq);
+ }
+ vhost_dev_init(dev, vqs, VHOST_NET_VQ_MAX,
+- VHOST_NET_WEIGHT, VHOST_NET_PKT_WEIGHT);
++ VHOST_NET_PKT_WEIGHT, VHOST_NET_WEIGHT);
+
+ vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT, dev);
+ vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN, dev);
--- /dev/null
+From 272f35cba53d088085e5952fd81d7a133ab90789 Mon Sep 17 00:00:00 2001
+From: Jason Wang <jasowang@redhat.com>
+Date: Fri, 20 Jul 2018 08:15:15 +0800
+Subject: vhost_net: introduce vhost_exceeds_weight()
+
+From: Jason Wang <jasowang@redhat.com>
+
+commit 272f35cba53d088085e5952fd81d7a133ab90789 upstream.
+
+Signed-off-by: Jason Wang <jasowang@redhat.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Balbir Singh <sblbir@amzn.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/vhost/net.c | 13 ++++++++-----
+ 1 file changed, 8 insertions(+), 5 deletions(-)
+
+--- a/drivers/vhost/net.c
++++ b/drivers/vhost/net.c
+@@ -446,6 +446,12 @@ static bool vhost_exceeds_maxpend(struct
+ == nvq->done_idx;
+ }
+
++static bool vhost_exceeds_weight(int pkts, int total_len)
++{
++ return total_len >= VHOST_NET_WEIGHT ||
++ pkts >= VHOST_NET_PKT_WEIGHT;
++}
++
+ /* Expects to be always run from workqueue - which acts as
+ * read-size critical section for our kind of RCU. */
+ static void handle_tx(struct vhost_net *net)
+@@ -550,7 +556,6 @@ static void handle_tx(struct vhost_net *
+ msg.msg_control = NULL;
+ ubufs = NULL;
+ }
+-
+ total_len += len;
+ if (total_len < VHOST_NET_WEIGHT &&
+ !vhost_vq_avail_empty(&net->dev, vq) &&
+@@ -579,8 +584,7 @@ static void handle_tx(struct vhost_net *
+ else
+ vhost_zerocopy_signal_used(net, vq);
+ vhost_net_tx_packet(net);
+- if (unlikely(total_len >= VHOST_NET_WEIGHT) ||
+- unlikely(++sent_pkts >= VHOST_NET_PKT_WEIGHT)) {
++ if (unlikely(vhost_exceeds_weight(++sent_pkts, total_len))) {
+ vhost_poll_queue(&vq->poll);
+ break;
+ }
+@@ -863,8 +867,7 @@ static void handle_rx(struct vhost_net *
+ vhost_log_write(vq, vq_log, log, vhost_len,
+ vq->iov, in);
+ total_len += vhost_len;
+- if (unlikely(total_len >= VHOST_NET_WEIGHT) ||
+- unlikely(++recv_pkts >= VHOST_NET_PKT_WEIGHT)) {
++ if (unlikely(vhost_exceeds_weight(++recv_pkts, total_len))) {
+ vhost_poll_queue(&vq->poll);
+ goto out;
+ }
--- /dev/null
+From db688c24eada63b1efe6d0d7d835e5c3bdd71fd3 Mon Sep 17 00:00:00 2001
+From: Paolo Abeni <pabeni@redhat.com>
+Date: Tue, 24 Apr 2018 10:34:36 +0200
+Subject: vhost_net: use packet weight for rx handler, too
+
+From: Paolo Abeni <pabeni@redhat.com>
+
+commit db688c24eada63b1efe6d0d7d835e5c3bdd71fd3 upstream.
+
+Similar to commit a2ac99905f1e ("vhost-net: set packet weight of
+tx polling to 2 * vq size"), we need a packet-based limit for
+handle_rx, too - otherwise, under rx flood with small packets,
+tx can be delayed for a very long time, even without busypolling.
+
+The pkt limit applied to handle_rx must be the same as the one applied
+by handle_tx, or we will get unfair scheduling between rx and tx.
+Tying such a limit to the queue length makes it less effective for
+large queue length values and can introduce large process scheduler
+latencies, so a constant value is used, just like the existing bytes
+limit.
+
+The selected limit has been validated with PVP[1] performance
+test with different queue sizes:
+
+queue size 256 512 1024
+
+baseline 366 354 362
+weight 128 715 723 670
+weight 256 740 745 733
+weight 512 600 460 583
+weight 1024 423 427 418
+
+A packet weight of 256 gives peak performance under all the
+tested scenarios.
+
+No measurable regression in unidirectional performance tests has
+been detected.
+
+[1] https://developers.redhat.com/blog/2017/06/05/measuring-and-comparing-open-vswitch-performance/
+
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Acked-by: Jason Wang <jasowang@redhat.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Balbir Singh <sblbir@amazon.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/vhost/net.c | 12 ++++++++----
+ 1 file changed, 8 insertions(+), 4 deletions(-)
+
+--- a/drivers/vhost/net.c
++++ b/drivers/vhost/net.c
+@@ -45,8 +45,10 @@ MODULE_PARM_DESC(experimental_zcopytx, "
+ #define VHOST_NET_WEIGHT 0x80000
+
+ /* Max number of packets transferred before requeueing the job.
+- * Using this limit prevents one virtqueue from starving rx. */
+-#define VHOST_NET_PKT_WEIGHT(vq) ((vq)->num * 2)
++ * Using this limit prevents one virtqueue from starving others with small
++ * pkts.
++ */
++#define VHOST_NET_PKT_WEIGHT 256
+
+ /* MAX number of TX used buffers for outstanding zerocopy */
+ #define VHOST_MAX_PEND 128
+@@ -578,7 +580,7 @@ static void handle_tx(struct vhost_net *
+ vhost_zerocopy_signal_used(net, vq);
+ vhost_net_tx_packet(net);
+ if (unlikely(total_len >= VHOST_NET_WEIGHT) ||
+- unlikely(++sent_pkts >= VHOST_NET_PKT_WEIGHT(vq))) {
++ unlikely(++sent_pkts >= VHOST_NET_PKT_WEIGHT)) {
+ vhost_poll_queue(&vq->poll);
+ break;
+ }
+@@ -760,6 +762,7 @@ static void handle_rx(struct vhost_net *
+ struct socket *sock;
+ struct iov_iter fixup;
+ __virtio16 num_buffers;
++ int recv_pkts = 0;
+
+ mutex_lock_nested(&vq->mutex, 0);
+ sock = vq->private_data;
+@@ -860,7 +863,8 @@ static void handle_rx(struct vhost_net *
+ vhost_log_write(vq, vq_log, log, vhost_len,
+ vq->iov, in);
+ total_len += vhost_len;
+- if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
++ if (unlikely(total_len >= VHOST_NET_WEIGHT) ||
++ unlikely(++recv_pkts >= VHOST_NET_PKT_WEIGHT)) {
+ vhost_poll_queue(&vq->poll);
+ goto out;
+ }