]> git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
4.14-stable patches
authorGreg Kroah-Hartman <gregkh@linuxfoundation.org>
Fri, 5 Jul 2019 11:15:39 +0000 (13:15 +0200)
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>
Fri, 5 Jul 2019 11:15:39 +0000 (13:15 +0200)
added patches:
btrfs-ensure-replaced-device-doesn-t-have-pending-chunk-allocation.patch
vhost-introduce-vhost_exceeds_weight.patch
vhost-net-set-packet-weight-of-tx-polling-to-2-vq-size.patch
vhost-scsi-add-weight-support.patch
vhost-vsock-add-weight-support.patch
vhost_net-fix-possible-infinite-loop.patch
vhost_net-introduce-vhost_exceeds_weight.patch
vhost_net-use-packet-weight-for-rx-handler-too.patch

queue-4.14/btrfs-ensure-replaced-device-doesn-t-have-pending-chunk-allocation.patch [new file with mode: 0644]
queue-4.14/series
queue-4.14/vhost-introduce-vhost_exceeds_weight.patch [new file with mode: 0644]
queue-4.14/vhost-net-set-packet-weight-of-tx-polling-to-2-vq-size.patch [new file with mode: 0644]
queue-4.14/vhost-scsi-add-weight-support.patch [new file with mode: 0644]
queue-4.14/vhost-vsock-add-weight-support.patch [new file with mode: 0644]
queue-4.14/vhost_net-fix-possible-infinite-loop.patch [new file with mode: 0644]
queue-4.14/vhost_net-introduce-vhost_exceeds_weight.patch [new file with mode: 0644]
queue-4.14/vhost_net-use-packet-weight-for-rx-handler-too.patch [new file with mode: 0644]

diff --git a/queue-4.14/btrfs-ensure-replaced-device-doesn-t-have-pending-chunk-allocation.patch b/queue-4.14/btrfs-ensure-replaced-device-doesn-t-have-pending-chunk-allocation.patch
new file mode 100644 (file)
index 0000000..0a841a3
--- /dev/null
@@ -0,0 +1,123 @@
+From debd1c065d2037919a7da67baf55cc683fee09f0 Mon Sep 17 00:00:00 2001
+From: Nikolay Borisov <nborisov@suse.com>
+Date: Fri, 17 May 2019 10:44:25 +0300
+Subject: btrfs: Ensure replaced device doesn't have pending chunk allocation
+
+From: Nikolay Borisov <nborisov@suse.com>
+
+commit debd1c065d2037919a7da67baf55cc683fee09f0 upstream.
+
+Recent FITRIM work, namely bbbf7243d62d ("btrfs: combine device update
+operations during transaction commit") combined the way certain
+operations are recorded in a transaction. As a result an ASSERT was added
+in dev_replace_finish to ensure the new code works correctly.
+Unfortunately I got reports that it's possible to trigger the assert,
+meaning that during a device replace it's possible to have an unfinished
+chunk allocation on the source device.
+
+This is supposed to be prevented by the fact that a transaction is
+committed before finishing the replace operation and later acquiring the
+chunk mutex. This is not sufficient since by the time the transaction is
+committed and the chunk mutex acquired it's possible to allocate a chunk
+depending on the workload being executed on the replaced device. This
+bug has been present ever since device replace was introduced but there
+was never code which checks for it.
+
+The correct way to fix is to ensure that there is no pending device
+modification operation when the chunk mutex is acquired and if there is,
+repeat the transaction commit. Unfortunately it's not possible to just
+exclude the source device from btrfs_fs_devices::dev_alloc_list since
+this causes ENOSPC to be hit in transaction commit.
+
+Fixing that in another way would need to add special cases to handle the
+last writes and forbid new ones. The looped transaction fix is more
+obvious, and can be easily backported. The runtime of dev-replace is
+long so there's no noticeable delay caused by that.
+
+Reported-by: David Sterba <dsterba@suse.com>
+Fixes: 391cd9df81ac ("Btrfs: fix unprotected alloc list insertion during the finishing procedure of replace")
+CC: stable@vger.kernel.org # 4.4+
+Signed-off-by: Nikolay Borisov <nborisov@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+
+---
+ fs/btrfs/dev-replace.c |   29 +++++++++++++++++++----------
+ fs/btrfs/volumes.c     |    2 ++
+ fs/btrfs/volumes.h     |    5 +++++
+ 3 files changed, 26 insertions(+), 10 deletions(-)
+
+--- a/fs/btrfs/dev-replace.c
++++ b/fs/btrfs/dev-replace.c
+@@ -512,18 +512,27 @@ static int btrfs_dev_replace_finishing(s
+       }
+       btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
+-      trans = btrfs_start_transaction(root, 0);
+-      if (IS_ERR(trans)) {
+-              mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+-              return PTR_ERR(trans);
++      while (1) {
++              trans = btrfs_start_transaction(root, 0);
++              if (IS_ERR(trans)) {
++                      mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
++                      return PTR_ERR(trans);
++              }
++              ret = btrfs_commit_transaction(trans);
++              WARN_ON(ret);
++              mutex_lock(&uuid_mutex);
++              /* keep away write_all_supers() during the finishing procedure */
++              mutex_lock(&fs_info->fs_devices->device_list_mutex);
++              mutex_lock(&fs_info->chunk_mutex);
++              if (src_device->has_pending_chunks) {
++                      mutex_unlock(&root->fs_info->chunk_mutex);
++                      mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
++                      mutex_unlock(&uuid_mutex);
++              } else {
++                      break;
++              }
+       }
+-      ret = btrfs_commit_transaction(trans);
+-      WARN_ON(ret);
+-      mutex_lock(&uuid_mutex);
+-      /* keep away write_all_supers() during the finishing procedure */
+-      mutex_lock(&fs_info->fs_devices->device_list_mutex);
+-      mutex_lock(&fs_info->chunk_mutex);
+       btrfs_dev_replace_lock(dev_replace, 1);
+       dev_replace->replace_state =
+               scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED
+--- a/fs/btrfs/volumes.c
++++ b/fs/btrfs/volumes.c
+@@ -4851,6 +4851,7 @@ static int __btrfs_alloc_chunk(struct bt
+       for (i = 0; i < map->num_stripes; i++) {
+               num_bytes = map->stripes[i].dev->bytes_used + stripe_size;
+               btrfs_device_set_bytes_used(map->stripes[i].dev, num_bytes);
++              map->stripes[i].dev->has_pending_chunks = true;
+       }
+       atomic64_sub(stripe_size * map->num_stripes, &info->free_chunk_space);
+@@ -7310,6 +7311,7 @@ void btrfs_update_commit_device_bytes_us
+               for (i = 0; i < map->num_stripes; i++) {
+                       dev = map->stripes[i].dev;
+                       dev->commit_bytes_used = dev->bytes_used;
++                      dev->has_pending_chunks = false;
+               }
+       }
+       mutex_unlock(&fs_info->chunk_mutex);
+--- a/fs/btrfs/volumes.h
++++ b/fs/btrfs/volumes.h
+@@ -61,6 +61,11 @@ struct btrfs_device {
+       spinlock_t io_lock ____cacheline_aligned;
+       int running_pending;
++      /* When true means this device has pending chunk alloc in
++       * current transaction. Protected by chunk_mutex.
++       */
++      bool has_pending_chunks;
++
+       /* regular prio bios */
+       struct btrfs_pending_bios pending_bios;
+       /* sync bios */
index a6b55d34f7328cda19a1adb22ce0dcf756300cda..fd28cf93728216b28087f7e63ebee27ac796c8d6 100644 (file)
@@ -37,3 +37,11 @@ drm-imx-notify-drm-core-before-sending-event-during-crtc-disable.patch
 drm-imx-only-send-event-on-crtc-disable-if-kept-disabled.patch
 ftrace-x86-remove-possible-deadlock-between-register_kprobe-and-ftrace_run_update_code.patch
 mm-vmscan.c-prevent-useless-kswapd-loops.patch
+btrfs-ensure-replaced-device-doesn-t-have-pending-chunk-allocation.patch
+vhost-net-set-packet-weight-of-tx-polling-to-2-vq-size.patch
+vhost_net-use-packet-weight-for-rx-handler-too.patch
+vhost_net-introduce-vhost_exceeds_weight.patch
+vhost-introduce-vhost_exceeds_weight.patch
+vhost_net-fix-possible-infinite-loop.patch
+vhost-vsock-add-weight-support.patch
+vhost-scsi-add-weight-support.patch
diff --git a/queue-4.14/vhost-introduce-vhost_exceeds_weight.patch b/queue-4.14/vhost-introduce-vhost_exceeds_weight.patch
new file mode 100644 (file)
index 0000000..f73fc8f
--- /dev/null
@@ -0,0 +1,188 @@
+From e82b9b0727ff6d665fff2d326162b460dded554d Mon Sep 17 00:00:00 2001
+From: Jason Wang <jasowang@redhat.com>
+Date: Fri, 17 May 2019 00:29:49 -0400
+Subject: vhost: introduce vhost_exceeds_weight()
+
+From: Jason Wang <jasowang@redhat.com>
+
+commit e82b9b0727ff6d665fff2d326162b460dded554d upstream.
+
+We used to have vhost_exceeds_weight() for vhost-net to:
+
+- prevent vhost kthread from hogging the cpu
+- balance the time spent between TX and RX
+
+This function could be useful for vsock and scsi as well. So move it
+to vhost.c. Device must specify a weight which counts the number of
+requests, or it can also specify a byte_weight which counts the
+number of bytes that have been processed.
+
+Signed-off-by: Jason Wang <jasowang@redhat.com>
+Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
+Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
+Signed-off-by: Balbir Singh <sblbir@amzn.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/vhost/net.c   |   18 +++++-------------
+ drivers/vhost/scsi.c  |    8 +++++++-
+ drivers/vhost/vhost.c |   20 +++++++++++++++++++-
+ drivers/vhost/vhost.h |    6 +++++-
+ drivers/vhost/vsock.c |   11 ++++++++++-
+ 5 files changed, 46 insertions(+), 17 deletions(-)
+
+--- a/drivers/vhost/net.c
++++ b/drivers/vhost/net.c
+@@ -446,12 +446,6 @@ static bool vhost_exceeds_maxpend(struct
+               == nvq->done_idx;
+ }
+-static bool vhost_exceeds_weight(int pkts, int total_len)
+-{
+-      return total_len >= VHOST_NET_WEIGHT ||
+-             pkts >= VHOST_NET_PKT_WEIGHT;
+-}
+-
+ /* Expects to be always run from workqueue - which acts as
+  * read-size critical section for our kind of RCU. */
+ static void handle_tx(struct vhost_net *net)
+@@ -584,10 +578,9 @@ static void handle_tx(struct vhost_net *
+               else
+                       vhost_zerocopy_signal_used(net, vq);
+               vhost_net_tx_packet(net);
+-              if (unlikely(vhost_exceeds_weight(++sent_pkts, total_len))) {
+-                      vhost_poll_queue(&vq->poll);
++              if (unlikely(vhost_exceeds_weight(vq, ++sent_pkts,
++                                                total_len)))
+                       break;
+-              }
+       }
+ out:
+       mutex_unlock(&vq->mutex);
+@@ -867,10 +860,8 @@ static void handle_rx(struct vhost_net *
+                       vhost_log_write(vq, vq_log, log, vhost_len,
+                                       vq->iov, in);
+               total_len += vhost_len;
+-              if (unlikely(vhost_exceeds_weight(++recv_pkts, total_len))) {
+-                      vhost_poll_queue(&vq->poll);
++              if (unlikely(vhost_exceeds_weight(vq, ++recv_pkts, total_len)))
+                       goto out;
+-              }
+       }
+       vhost_net_enable_vq(net, vq);
+ out:
+@@ -949,7 +940,8 @@ static int vhost_net_open(struct inode *
+               n->vqs[i].sock_hlen = 0;
+               vhost_net_buf_init(&n->vqs[i].rxq);
+       }
+-      vhost_dev_init(dev, vqs, VHOST_NET_VQ_MAX);
++      vhost_dev_init(dev, vqs, VHOST_NET_VQ_MAX,
++                     VHOST_NET_WEIGHT, VHOST_NET_PKT_WEIGHT);
+       vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT, dev);
+       vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN, dev);
+--- a/drivers/vhost/scsi.c
++++ b/drivers/vhost/scsi.c
+@@ -58,6 +58,12 @@
+ #define VHOST_SCSI_PREALLOC_UPAGES 2048
+ #define VHOST_SCSI_PREALLOC_PROT_SGLS 512
++/* Max number of requests before requeueing the job.
++ * Using this limit prevents one virtqueue from starving others with
++ * request.
++ */
++#define VHOST_SCSI_WEIGHT 256
++
+ struct vhost_scsi_inflight {
+       /* Wait for the flush operation to finish */
+       struct completion comp;
+@@ -1427,7 +1433,7 @@ static int vhost_scsi_open(struct inode
+               vqs[i] = &vs->vqs[i].vq;
+               vs->vqs[i].vq.handle_kick = vhost_scsi_handle_kick;
+       }
+-      vhost_dev_init(&vs->dev, vqs, VHOST_SCSI_MAX_VQ);
++      vhost_dev_init(&vs->dev, vqs, VHOST_SCSI_MAX_VQ, VHOST_SCSI_WEIGHT, 0);
+       vhost_scsi_init_inflight(vs, NULL);
+--- a/drivers/vhost/vhost.c
++++ b/drivers/vhost/vhost.c
+@@ -412,8 +412,24 @@ static void vhost_dev_free_iovecs(struct
+               vhost_vq_free_iovecs(dev->vqs[i]);
+ }
++bool vhost_exceeds_weight(struct vhost_virtqueue *vq,
++                        int pkts, int total_len)
++{
++      struct vhost_dev *dev = vq->dev;
++
++      if ((dev->byte_weight && total_len >= dev->byte_weight) ||
++          pkts >= dev->weight) {
++              vhost_poll_queue(&vq->poll);
++              return true;
++      }
++
++      return false;
++}
++EXPORT_SYMBOL_GPL(vhost_exceeds_weight);
++
+ void vhost_dev_init(struct vhost_dev *dev,
+-                  struct vhost_virtqueue **vqs, int nvqs)
++                  struct vhost_virtqueue **vqs, int nvqs,
++                  int weight, int byte_weight)
+ {
+       struct vhost_virtqueue *vq;
+       int i;
+@@ -427,6 +443,8 @@ void vhost_dev_init(struct vhost_dev *de
+       dev->iotlb = NULL;
+       dev->mm = NULL;
+       dev->worker = NULL;
++      dev->weight = weight;
++      dev->byte_weight = byte_weight;
+       init_llist_head(&dev->work_list);
+       init_waitqueue_head(&dev->wait);
+       INIT_LIST_HEAD(&dev->read_list);
+--- a/drivers/vhost/vhost.h
++++ b/drivers/vhost/vhost.h
+@@ -173,9 +173,13 @@ struct vhost_dev {
+       struct list_head read_list;
+       struct list_head pending_list;
+       wait_queue_head_t wait;
++      int weight;
++      int byte_weight;
+ };
+-void vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue **vqs, int nvqs);
++bool vhost_exceeds_weight(struct vhost_virtqueue *vq, int pkts, int total_len);
++void vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue **vqs,
++                  int nvqs, int weight, int byte_weight);
+ long vhost_dev_set_owner(struct vhost_dev *dev);
+ bool vhost_dev_has_owner(struct vhost_dev *dev);
+ long vhost_dev_check_owner(struct vhost_dev *);
+--- a/drivers/vhost/vsock.c
++++ b/drivers/vhost/vsock.c
+@@ -21,6 +21,14 @@
+ #include "vhost.h"
+ #define VHOST_VSOCK_DEFAULT_HOST_CID  2
++/* Max number of bytes transferred before requeueing the job.
++ * Using this limit prevents one virtqueue from starving others. */
++#define VHOST_VSOCK_WEIGHT 0x80000
++/* Max number of packets transferred before requeueing the job.
++ * Using this limit prevents one virtqueue from starving others with
++ * small pkts.
++ */
++#define VHOST_VSOCK_PKT_WEIGHT 256
+ enum {
+       VHOST_VSOCK_FEATURES = VHOST_FEATURES,
+@@ -531,7 +539,8 @@ static int vhost_vsock_dev_open(struct i
+       vsock->vqs[VSOCK_VQ_TX].handle_kick = vhost_vsock_handle_tx_kick;
+       vsock->vqs[VSOCK_VQ_RX].handle_kick = vhost_vsock_handle_rx_kick;
+-      vhost_dev_init(&vsock->dev, vqs, ARRAY_SIZE(vsock->vqs));
++      vhost_dev_init(&vsock->dev, vqs, ARRAY_SIZE(vsock->vqs),
++                     VHOST_VSOCK_PKT_WEIGHT, VHOST_VSOCK_WEIGHT);
+       file->private_data = vsock;
+       spin_lock_init(&vsock->send_pkt_list_lock);
diff --git a/queue-4.14/vhost-net-set-packet-weight-of-tx-polling-to-2-vq-size.patch b/queue-4.14/vhost-net-set-packet-weight-of-tx-polling-to-2-vq-size.patch
new file mode 100644 (file)
index 0000000..0cf4d1e
--- /dev/null
@@ -0,0 +1,137 @@
+From a2ac99905f1ea8b15997a6ec39af69aa28a3653b Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?haibinzhang=28=E5=BC=A0=E6=B5=B7=E6=96=8C=29?=
+ <haibinzhang@tencent.com>
+Date: Mon, 9 Apr 2018 07:22:17 +0000
+Subject: vhost-net: set packet weight of tx polling to 2 * vq size
+
+From: haibinzhang(张海斌) <haibinzhang@tencent.com>
+
+commit a2ac99905f1ea8b15997a6ec39af69aa28a3653b upstream.
+
+handle_tx will delay rx for tens or even hundreds of milliseconds when tx busy
+polling udp packets with small length(e.g. 1byte udp payload), because setting
+VHOST_NET_WEIGHT takes into account only sent-bytes but no single packet length.
+
+Ping-Latencies shown below were tested between two Virtual Machines using
+netperf (UDP_STREAM, len=1), and then another machine pinged the client:
+
+vq size=256
+Packet-Weight   Ping-Latencies(millisecond)
+                   min      avg       max
+Origin           3.319   18.489    57.303
+64               1.643    2.021     2.552
+128              1.825    2.600     3.224
+256              1.997    2.710     4.295
+512              1.860    3.171     4.631
+1024             2.002    4.173     9.056
+2048             2.257    5.650     9.688
+4096             2.093    8.508    15.943
+
+vq size=512
+Packet-Weight   Ping-Latencies(millisecond)
+                   min      avg       max
+Origin           6.537   29.177    66.245
+64               2.798    3.614     4.403
+128              2.861    3.820     4.775
+256              3.008    4.018     4.807
+512              3.254    4.523     5.824
+1024             3.079    5.335     7.747
+2048             3.944    8.201    12.762
+4096             4.158   11.057    19.985
+
+Seems pretty consistent, a small dip at 2 VQ sizes.
+Ring size is a hint from device about a burst size it can tolerate. Based on
+benchmarks, set the weight to 2 * vq size.
+
+To evaluate this change, another tests were done using netperf(RR, TX) between
+two machines with Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz, and vq size was
+tweaked through qemu. Results shown below does not show obvious changes.
+
+vq size=256 TCP_RR                vq size=512 TCP_RR
+size/sessions/+thu%/+normalize%   size/sessions/+thu%/+normalize%
+   1/       1/  -7%/        -2%      1/       1/   0%/        -2%
+   1/       4/  +1%/         0%      1/       4/  +1%/         0%
+   1/       8/  +1%/        -2%      1/       8/   0%/        +1%
+  64/       1/  -6%/         0%     64/       1/  +7%/        +3%
+  64/       4/   0%/        +2%     64/       4/  -1%/        +1%
+  64/       8/   0%/         0%     64/       8/  -1%/        -2%
+ 256/       1/  -3%/        -4%    256/       1/  -4%/        -2%
+ 256/       4/  +3%/        +4%    256/       4/  +1%/        +2%
+ 256/       8/  +2%/         0%    256/       8/  +1%/        -1%
+
+vq size=256 UDP_RR                vq size=512 UDP_RR
+size/sessions/+thu%/+normalize%   size/sessions/+thu%/+normalize%
+   1/       1/  -5%/        +1%      1/       1/  -3%/        -2%
+   1/       4/  +4%/        +1%      1/       4/  -2%/        +2%
+   1/       8/  -1%/        -1%      1/       8/  -1%/         0%
+  64/       1/  -2%/        -3%     64/       1/  +1%/        +1%
+  64/       4/  -5%/        -1%     64/       4/  +2%/         0%
+  64/       8/   0%/        -1%     64/       8/  -2%/        +1%
+ 256/       1/  +7%/        +1%    256/       1/  -7%/         0%
+ 256/       4/  +1%/        +1%    256/       4/  -3%/        -4%
+ 256/       8/  +2%/        +2%    256/       8/  +1%/        +1%
+
+vq size=256 TCP_STREAM            vq size=512 TCP_STREAM
+size/sessions/+thu%/+normalize%   size/sessions/+thu%/+normalize%
+  64/       1/   0%/        -3%     64/       1/   0%/         0%
+  64/       4/  +3%/        -1%     64/       4/  -2%/        +4%
+  64/       8/  +9%/        -4%     64/       8/  -1%/        +2%
+ 256/       1/  +1%/        -4%    256/       1/  +1%/        +1%
+ 256/       4/  -1%/        -1%    256/       4/  -3%/         0%
+ 256/       8/  +7%/        +5%    256/       8/  -3%/         0%
+ 512/       1/  +1%/         0%    512/       1/  -1%/        -1%
+ 512/       4/  +1%/        -1%    512/       4/   0%/         0%
+ 512/       8/  +7%/        -5%    512/       8/  +6%/        -1%
+1024/       1/   0%/        -1%   1024/       1/   0%/        +1%
+1024/       4/  +3%/         0%   1024/       4/  +1%/         0%
+1024/       8/  +8%/        +5%   1024/       8/  -1%/         0%
+2048/       1/  +2%/        +2%   2048/       1/  -1%/         0%
+2048/       4/  +1%/         0%   2048/       4/   0%/        -1%
+2048/       8/  -2%/         0%   2048/       8/   5%/        -1%
+4096/       1/  -2%/         0%   4096/       1/  -2%/         0%
+4096/       4/  +2%/         0%   4096/       4/   0%/         0%
+4096/       8/  +9%/        -2%   4096/       8/  -5%/        -1%
+
+Acked-by: Michael S. Tsirkin <mst@redhat.com>
+Signed-off-by: Haibin Zhang <haibinzhang@tencent.com>
+Signed-off-by: Yunfang Tai <yunfangtai@tencent.com>
+Signed-off-by: Lidong Chen <lidongchen@tencent.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Balbir Singh <sblbir@amazon.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/vhost/net.c |    8 +++++++-
+ 1 file changed, 7 insertions(+), 1 deletion(-)
+
+--- a/drivers/vhost/net.c
++++ b/drivers/vhost/net.c
+@@ -44,6 +44,10 @@ MODULE_PARM_DESC(experimental_zcopytx, "
+  * Using this limit prevents one virtqueue from starving others. */
+ #define VHOST_NET_WEIGHT 0x80000
++/* Max number of packets transferred before requeueing the job.
++ * Using this limit prevents one virtqueue from starving rx. */
++#define VHOST_NET_PKT_WEIGHT(vq) ((vq)->num * 2)
++
+ /* MAX number of TX used buffers for outstanding zerocopy */
+ #define VHOST_MAX_PEND 128
+ #define VHOST_GOODCOPY_LEN 256
+@@ -461,6 +465,7 @@ static void handle_tx(struct vhost_net *
+       struct socket *sock;
+       struct vhost_net_ubuf_ref *uninitialized_var(ubufs);
+       bool zcopy, zcopy_used;
++      int sent_pkts = 0;
+       mutex_lock(&vq->mutex);
+       sock = vq->private_data;
+@@ -572,7 +577,8 @@ static void handle_tx(struct vhost_net *
+               else
+                       vhost_zerocopy_signal_used(net, vq);
+               vhost_net_tx_packet(net);
+-              if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
++              if (unlikely(total_len >= VHOST_NET_WEIGHT) ||
++                  unlikely(++sent_pkts >= VHOST_NET_PKT_WEIGHT(vq))) {
+                       vhost_poll_queue(&vq->poll);
+                       break;
+               }
diff --git a/queue-4.14/vhost-scsi-add-weight-support.patch b/queue-4.14/vhost-scsi-add-weight-support.patch
new file mode 100644 (file)
index 0000000..6223f50
--- /dev/null
@@ -0,0 +1,58 @@
+From c1ea02f15ab5efb3e93fc3144d895410bf79fcf2 Mon Sep 17 00:00:00 2001
+From: Jason Wang <jasowang@redhat.com>
+Date: Fri, 17 May 2019 00:29:52 -0400
+Subject: vhost: scsi: add weight support
+
+From: Jason Wang <jasowang@redhat.com>
+
+commit c1ea02f15ab5efb3e93fc3144d895410bf79fcf2 upstream.
+
+This patch will check the weight and exit the loop if we exceed the
+weight. This is useful for preventing scsi kthread from hogging cpu
+which is guest triggerable.
+
+This addresses CVE-2019-3900.
+
+Cc: Paolo Bonzini <pbonzini@redhat.com>
+Cc: Stefan Hajnoczi <stefanha@redhat.com>
+Fixes: 057cbf49a1f0 ("tcm_vhost: Initial merge for vhost level target fabric driver")
+Signed-off-by: Jason Wang <jasowang@redhat.com>
+Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
+Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
+Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
+Signed-off-by: Balbir Singh <sblbir@amzn.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/vhost/scsi.c |    6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+--- a/drivers/vhost/scsi.c
++++ b/drivers/vhost/scsi.c
+@@ -846,7 +846,7 @@ vhost_scsi_handle_vq(struct vhost_scsi *
+       u64 tag;
+       u32 exp_data_len, data_direction;
+       unsigned int out = 0, in = 0;
+-      int head, ret, prot_bytes;
++      int head, ret, prot_bytes, c = 0;
+       size_t req_size, rsp_size = sizeof(struct virtio_scsi_cmd_resp);
+       size_t out_size, in_size;
+       u16 lun;
+@@ -865,7 +865,7 @@ vhost_scsi_handle_vq(struct vhost_scsi *
+       vhost_disable_notify(&vs->dev, vq);
+-      for (;;) {
++      do {
+               head = vhost_get_vq_desc(vq, vq->iov,
+                                        ARRAY_SIZE(vq->iov), &out, &in,
+                                        NULL, NULL);
+@@ -1080,7 +1080,7 @@ vhost_scsi_handle_vq(struct vhost_scsi *
+                */
+               INIT_WORK(&cmd->work, vhost_scsi_submission_work);
+               queue_work(vhost_scsi_workqueue, &cmd->work);
+-      }
++      } while (likely(!vhost_exceeds_weight(vq, ++c, 0)));
+ out:
+       mutex_unlock(&vq->mutex);
+ }
diff --git a/queue-4.14/vhost-vsock-add-weight-support.patch b/queue-4.14/vhost-vsock-add-weight-support.patch
new file mode 100644 (file)
index 0000000..4be3bf7
--- /dev/null
@@ -0,0 +1,92 @@
+From e79b431fb901ba1106670bcc80b9b617b25def7d Mon Sep 17 00:00:00 2001
+From: Jason Wang <jasowang@redhat.com>
+Date: Fri, 17 May 2019 00:29:51 -0400
+Subject: vhost: vsock: add weight support
+
+From: Jason Wang <jasowang@redhat.com>
+
+commit e79b431fb901ba1106670bcc80b9b617b25def7d upstream.
+
+This patch will check the weight and exit the loop if we exceed the
+weight. This is useful for preventing vsock kthread from hogging cpu
+which is guest triggerable. The weight can help to avoid starving the
+request from on direction while another direction is being processed.
+
+The value of weight is picked from vhost-net.
+
+This addresses CVE-2019-3900.
+
+Cc: Stefan Hajnoczi <stefanha@redhat.com>
+Fixes: 433fc58e6bf2 ("VSOCK: Introduce vhost_vsock.ko")
+Signed-off-by: Jason Wang <jasowang@redhat.com>
+Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
+Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
+Signed-off-by: Balbir Singh <sblbir@amzn.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/vhost/vsock.c |   16 ++++++++++------
+ 1 file changed, 10 insertions(+), 6 deletions(-)
+
+--- a/drivers/vhost/vsock.c
++++ b/drivers/vhost/vsock.c
+@@ -86,6 +86,7 @@ vhost_transport_do_send_pkt(struct vhost
+                           struct vhost_virtqueue *vq)
+ {
+       struct vhost_virtqueue *tx_vq = &vsock->vqs[VSOCK_VQ_TX];
++      int pkts = 0, total_len = 0;
+       bool added = false;
+       bool restart_tx = false;
+@@ -97,7 +98,7 @@ vhost_transport_do_send_pkt(struct vhost
+       /* Avoid further vmexits, we're already processing the virtqueue */
+       vhost_disable_notify(&vsock->dev, vq);
+-      for (;;) {
++      do {
+               struct virtio_vsock_pkt *pkt;
+               struct iov_iter iov_iter;
+               unsigned out, in;
+@@ -182,8 +183,9 @@ vhost_transport_do_send_pkt(struct vhost
+                */
+               virtio_transport_deliver_tap_pkt(pkt);
++              total_len += pkt->len;
+               virtio_transport_free_pkt(pkt);
+-      }
++      } while(likely(!vhost_exceeds_weight(vq, ++pkts, total_len)));
+       if (added)
+               vhost_signal(&vsock->dev, vq);
+@@ -358,7 +360,7 @@ static void vhost_vsock_handle_tx_kick(s
+       struct vhost_vsock *vsock = container_of(vq->dev, struct vhost_vsock,
+                                                dev);
+       struct virtio_vsock_pkt *pkt;
+-      int head;
++      int head, pkts = 0, total_len = 0;
+       unsigned int out, in;
+       bool added = false;
+@@ -368,7 +370,7 @@ static void vhost_vsock_handle_tx_kick(s
+               goto out;
+       vhost_disable_notify(&vsock->dev, vq);
+-      for (;;) {
++      do {
+               u32 len;
+               if (!vhost_vsock_more_replies(vsock)) {
+@@ -409,9 +411,11 @@ static void vhost_vsock_handle_tx_kick(s
+               else
+                       virtio_transport_free_pkt(pkt);
+-              vhost_add_used(vq, head, sizeof(pkt->hdr) + len);
++              len += sizeof(pkt->hdr);
++              vhost_add_used(vq, head, len);
++              total_len += len;
+               added = true;
+-      }
++      } while(likely(!vhost_exceeds_weight(vq, ++pkts, total_len)));
+ no_more_replies:
+       if (added)
diff --git a/queue-4.14/vhost_net-fix-possible-infinite-loop.patch b/queue-4.14/vhost_net-fix-possible-infinite-loop.patch
new file mode 100644 (file)
index 0000000..102eb41
--- /dev/null
@@ -0,0 +1,112 @@
+From e2412c07f8f3040593dfb88207865a3cd58680c0 Mon Sep 17 00:00:00 2001
+From: Jason Wang <jasowang@redhat.com>
+Date: Fri, 17 May 2019 00:29:50 -0400
+Subject: vhost_net: fix possible infinite loop
+
+From: Jason Wang <jasowang@redhat.com>
+
+commit e2412c07f8f3040593dfb88207865a3cd58680c0 upstream.
+
+When the rx buffer is too small for a packet, we will discard the vq
+descriptor and retry it for the next packet:
+
+while ((sock_len = vhost_net_rx_peek_head_len(net, sock->sk,
+                                             &busyloop_intr))) {
+...
+       /* On overrun, truncate and discard */
+       if (unlikely(headcount > UIO_MAXIOV)) {
+               iov_iter_init(&msg.msg_iter, READ, vq->iov, 1, 1);
+               err = sock->ops->recvmsg(sock, &msg,
+                                        1, MSG_DONTWAIT | MSG_TRUNC);
+               pr_debug("Discarded rx packet: len %zd\n", sock_len);
+               continue;
+       }
+...
+}
+
+This makes it possible to trigger an infinite while..continue loop
+through the co-operation of two VMs like:
+
+1) Malicious VM1 allocate 1 byte rx buffer and try to slow down the
+   vhost process as much as possible e.g using indirect descriptors or
+   other.
+2) Malicious VM2 generate packets to VM1 as fast as possible
+
+Fixing this by checking against weight at the end of RX and TX
+loop. This also eliminate other similar cases when:
+
+- userspace is consuming the packets in the meanwhile
+- theoretical TOCTOU attack if guest moving avail index back and forth
+  to hit the continue after vhost find guest just add new buffers
+
+This addresses CVE-2019-3900.
+
+Fixes: d8316f3991d20 ("vhost: fix total length when packets are too short")
+Fixes: 3a4d5c94e9593 ("vhost_net: a kernel-level virtio server")
+Signed-off-by: Jason Wang <jasowang@redhat.com>
+Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
+Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Signed-off-by: Balbir Singh <sblbir@amzn.com>
+
+---
+ drivers/vhost/net.c |   20 ++++++++++----------
+ 1 file changed, 10 insertions(+), 10 deletions(-)
+
+--- a/drivers/vhost/net.c
++++ b/drivers/vhost/net.c
+@@ -482,7 +482,7 @@ static void handle_tx(struct vhost_net *
+       hdr_size = nvq->vhost_hlen;
+       zcopy = nvq->ubufs;
+-      for (;;) {
++      do {
+               /* Release DMAs done buffers first */
+               if (zcopy)
+                       vhost_zerocopy_signal_used(net, vq);
+@@ -578,10 +578,7 @@ static void handle_tx(struct vhost_net *
+               else
+                       vhost_zerocopy_signal_used(net, vq);
+               vhost_net_tx_packet(net);
+-              if (unlikely(vhost_exceeds_weight(vq, ++sent_pkts,
+-                                                total_len)))
+-                      break;
+-      }
++      } while (likely(!vhost_exceeds_weight(vq, ++sent_pkts, total_len)));
+ out:
+       mutex_unlock(&vq->mutex);
+ }
+@@ -779,7 +776,11 @@ static void handle_rx(struct vhost_net *
+               vq->log : NULL;
+       mergeable = vhost_has_feature(vq, VIRTIO_NET_F_MRG_RXBUF);
+-      while ((sock_len = vhost_net_rx_peek_head_len(net, sock->sk))) {
++      do {
++              sock_len = vhost_net_rx_peek_head_len(net, sock->sk);
++
++              if (!sock_len)
++                      break;
+               sock_len += sock_hlen;
+               vhost_len = sock_len + vhost_hlen;
+               headcount = get_rx_bufs(vq, vq->heads, vhost_len,
+@@ -860,9 +861,8 @@ static void handle_rx(struct vhost_net *
+                       vhost_log_write(vq, vq_log, log, vhost_len,
+                                       vq->iov, in);
+               total_len += vhost_len;
+-              if (unlikely(vhost_exceeds_weight(vq, ++recv_pkts, total_len)))
+-                      goto out;
+-      }
++      } while (likely(!vhost_exceeds_weight(vq, ++recv_pkts, total_len)));
++
+       vhost_net_enable_vq(net, vq);
+ out:
+       mutex_unlock(&vq->mutex);
+@@ -941,7 +941,7 @@ static int vhost_net_open(struct inode *
+               vhost_net_buf_init(&n->vqs[i].rxq);
+       }
+       vhost_dev_init(dev, vqs, VHOST_NET_VQ_MAX,
+-                     VHOST_NET_WEIGHT, VHOST_NET_PKT_WEIGHT);
++                     VHOST_NET_PKT_WEIGHT, VHOST_NET_WEIGHT);
+       vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT, dev);
+       vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN, dev);
diff --git a/queue-4.14/vhost_net-introduce-vhost_exceeds_weight.patch b/queue-4.14/vhost_net-introduce-vhost_exceeds_weight.patch
new file mode 100644 (file)
index 0000000..7efde1a
--- /dev/null
@@ -0,0 +1,61 @@
+From 272f35cba53d088085e5952fd81d7a133ab90789 Mon Sep 17 00:00:00 2001
+From: Jason Wang <jasowang@redhat.com>
+Date: Fri, 20 Jul 2018 08:15:15 +0800
+Subject: vhost_net: introduce vhost_exceeds_weight()
+
+From: Jason Wang <jasowang@redhat.com>
+
+commit 272f35cba53d088085e5952fd81d7a133ab90789 upstream.
+
+Signed-off-by: Jason Wang <jasowang@redhat.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Balbir Singh <sblbir@amzn.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/vhost/net.c |   13 ++++++++-----
+ 1 file changed, 8 insertions(+), 5 deletions(-)
+
+--- a/drivers/vhost/net.c
++++ b/drivers/vhost/net.c
+@@ -446,6 +446,12 @@ static bool vhost_exceeds_maxpend(struct
+               == nvq->done_idx;
+ }
++static bool vhost_exceeds_weight(int pkts, int total_len)
++{
++      return total_len >= VHOST_NET_WEIGHT ||
++             pkts >= VHOST_NET_PKT_WEIGHT;
++}
++
+ /* Expects to be always run from workqueue - which acts as
+  * read-size critical section for our kind of RCU. */
+ static void handle_tx(struct vhost_net *net)
+@@ -550,7 +556,6 @@ static void handle_tx(struct vhost_net *
+                       msg.msg_control = NULL;
+                       ubufs = NULL;
+               }
+-
+               total_len += len;
+               if (total_len < VHOST_NET_WEIGHT &&
+                   !vhost_vq_avail_empty(&net->dev, vq) &&
+@@ -579,8 +584,7 @@ static void handle_tx(struct vhost_net *
+               else
+                       vhost_zerocopy_signal_used(net, vq);
+               vhost_net_tx_packet(net);
+-              if (unlikely(total_len >= VHOST_NET_WEIGHT) ||
+-                  unlikely(++sent_pkts >= VHOST_NET_PKT_WEIGHT)) {
++              if (unlikely(vhost_exceeds_weight(++sent_pkts, total_len))) {
+                       vhost_poll_queue(&vq->poll);
+                       break;
+               }
+@@ -863,8 +867,7 @@ static void handle_rx(struct vhost_net *
+                       vhost_log_write(vq, vq_log, log, vhost_len,
+                                       vq->iov, in);
+               total_len += vhost_len;
+-              if (unlikely(total_len >= VHOST_NET_WEIGHT) ||
+-                  unlikely(++recv_pkts >= VHOST_NET_PKT_WEIGHT)) {
++              if (unlikely(vhost_exceeds_weight(++recv_pkts, total_len))) {
+                       vhost_poll_queue(&vq->poll);
+                       goto out;
+               }
diff --git a/queue-4.14/vhost_net-use-packet-weight-for-rx-handler-too.patch b/queue-4.14/vhost_net-use-packet-weight-for-rx-handler-too.patch
new file mode 100644 (file)
index 0000000..78025dd
--- /dev/null
@@ -0,0 +1,92 @@
+From db688c24eada63b1efe6d0d7d835e5c3bdd71fd3 Mon Sep 17 00:00:00 2001
+From: Paolo Abeni <pabeni@redhat.com>
+Date: Tue, 24 Apr 2018 10:34:36 +0200
+Subject: vhost_net: use packet weight for rx handler, too
+
+From: Paolo Abeni <pabeni@redhat.com>
+
+commit db688c24eada63b1efe6d0d7d835e5c3bdd71fd3 upstream.
+
+Similar to commit a2ac99905f1e ("vhost-net: set packet weight of
+tx polling to 2 * vq size"), we need a packet-based limit for
+handler_rx, too - elsewhere, under rx flood with small packets,
+tx can be delayed for a very long time, even without busypolling.
+
+The pkt limit applied to handle_rx must be the same applied by
+handle_tx, or we will get unfair scheduling between rx and tx.
+Tying such limit to the queue length makes it less effective for
+large queue length values and can introduce large process
+scheduler latencies, so a constant value is used - likewise
+the existing bytes limit.
+
+The selected limit has been validated with PVP[1] performance
+test with different queue sizes:
+
+queue size             256     512     1024
+
+baseline               366     354     362
+weight 128             715     723     670
+weight 256             740     745     733
+weight 512             600     460     583
+weight 1024            423     427     418
+
+A packet weight of 256 gives peak performance under all the
+tested scenarios.
+
+No measurable regression in unidirectional performance tests has
+been detected.
+
+[1] https://developers.redhat.com/blog/2017/06/05/measuring-and-comparing-open-vswitch-performance/
+
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Acked-by: Jason Wang <jasowang@redhat.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Balbir Singh <sblbir@amazon.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/vhost/net.c |   12 ++++++++----
+ 1 file changed, 8 insertions(+), 4 deletions(-)
+
+--- a/drivers/vhost/net.c
++++ b/drivers/vhost/net.c
+@@ -45,8 +45,10 @@ MODULE_PARM_DESC(experimental_zcopytx, "
+ #define VHOST_NET_WEIGHT 0x80000
+ /* Max number of packets transferred before requeueing the job.
+- * Using this limit prevents one virtqueue from starving rx. */
+-#define VHOST_NET_PKT_WEIGHT(vq) ((vq)->num * 2)
++ * Using this limit prevents one virtqueue from starving others with small
++ * pkts.
++ */
++#define VHOST_NET_PKT_WEIGHT 256
+ /* MAX number of TX used buffers for outstanding zerocopy */
+ #define VHOST_MAX_PEND 128
+@@ -578,7 +580,7 @@ static void handle_tx(struct vhost_net *
+                       vhost_zerocopy_signal_used(net, vq);
+               vhost_net_tx_packet(net);
+               if (unlikely(total_len >= VHOST_NET_WEIGHT) ||
+-                  unlikely(++sent_pkts >= VHOST_NET_PKT_WEIGHT(vq))) {
++                  unlikely(++sent_pkts >= VHOST_NET_PKT_WEIGHT)) {
+                       vhost_poll_queue(&vq->poll);
+                       break;
+               }
+@@ -760,6 +762,7 @@ static void handle_rx(struct vhost_net *
+       struct socket *sock;
+       struct iov_iter fixup;
+       __virtio16 num_buffers;
++      int recv_pkts = 0;
+       mutex_lock_nested(&vq->mutex, 0);
+       sock = vq->private_data;
+@@ -860,7 +863,8 @@ static void handle_rx(struct vhost_net *
+                       vhost_log_write(vq, vq_log, log, vhost_len,
+                                       vq->iov, in);
+               total_len += vhost_len;
+-              if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
++              if (unlikely(total_len >= VHOST_NET_WEIGHT) ||
++                  unlikely(++recv_pkts >= VHOST_NET_PKT_WEIGHT)) {
+                       vhost_poll_queue(&vq->poll);
+                       goto out;
+               }