From: Greg Kroah-Hartman
Date: Fri, 5 Jul 2019 11:15:39 +0000 (+0200)
Subject: 4.14-stable patches
X-Git-Tag: v5.1.17~33
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=ae01845ffe80ddc2d87ad8195025d43eb07ff18d;p=thirdparty%2Fkernel%2Fstable-queue.git

4.14-stable patches

added patches:
	btrfs-ensure-replaced-device-doesn-t-have-pending-chunk-allocation.patch
	vhost-introduce-vhost_exceeds_weight.patch
	vhost-net-set-packet-weight-of-tx-polling-to-2-vq-size.patch
	vhost-scsi-add-weight-support.patch
	vhost-vsock-add-weight-support.patch
	vhost_net-fix-possible-infinite-loop.patch
	vhost_net-introduce-vhost_exceeds_weight.patch
	vhost_net-use-packet-weight-for-rx-handler-too.patch
---

diff --git a/queue-4.14/btrfs-ensure-replaced-device-doesn-t-have-pending-chunk-allocation.patch b/queue-4.14/btrfs-ensure-replaced-device-doesn-t-have-pending-chunk-allocation.patch
new file mode 100644
index 00000000000..0a841a31956
--- /dev/null
+++ b/queue-4.14/btrfs-ensure-replaced-device-doesn-t-have-pending-chunk-allocation.patch
@@ -0,0 +1,123 @@
+From debd1c065d2037919a7da67baf55cc683fee09f0 Mon Sep 17 00:00:00 2001
+From: Nikolay Borisov
+Date: Fri, 17 May 2019 10:44:25 +0300
+Subject: btrfs: Ensure replaced device doesn't have pending chunk allocation
+
+From: Nikolay Borisov
+
+commit debd1c065d2037919a7da67baf55cc683fee09f0 upstream.
+
+Recent FITRIM work, namely bbbf7243d62d ("btrfs: combine device update
+operations during transaction commit") combined the way certain
+operations are recorded in a transaction. As a result an ASSERT was added
+in dev_replace_finish to ensure the new code works correctly.
+Unfortunately I got reports that it's possible to trigger the assert,
+meaning that during a device replace it's possible to have an unfinished
+chunk allocation on the source device.
+
+This is supposed to be prevented by the fact that a transaction is
+committed before finishing the replace operation and later acquiring the
+chunk mutex. This is not sufficient since by the time the transaction is
+committed and the chunk mutex acquired it's possible to allocate a chunk
+depending on the workload being executed on the replaced device. This
+bug has been present ever since device replace was introduced but there
+was never code which checked for it.
+
+The correct way to fix this is to ensure that there is no pending device
+modification operation when the chunk mutex is acquired and, if there is,
+to repeat the transaction commit. Unfortunately it's not possible to just
+exclude the source device from btrfs_fs_devices::dev_alloc_list since
+this causes ENOSPC to be hit in transaction commit.
+
+Fixing that in another way would need to add special cases to handle the
+last writes and forbid new ones. The looped transaction fix is more
+obvious, and can be easily backported. The runtime of dev-replace is
+long so there's no noticeable delay caused by that.
+ +Reported-by: David Sterba +Fixes: 391cd9df81ac ("Btrfs: fix unprotected alloc list insertion during the finishing procedure of replace") +CC: stable@vger.kernel.org # 4.4+ +Signed-off-by: Nikolay Borisov +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + + +--- + fs/btrfs/dev-replace.c | 29 +++++++++++++++++++---------- + fs/btrfs/volumes.c | 2 ++ + fs/btrfs/volumes.h | 5 +++++ + 3 files changed, 26 insertions(+), 10 deletions(-) + +--- a/fs/btrfs/dev-replace.c ++++ b/fs/btrfs/dev-replace.c +@@ -512,18 +512,27 @@ static int btrfs_dev_replace_finishing(s + } + btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); + +- trans = btrfs_start_transaction(root, 0); +- if (IS_ERR(trans)) { +- mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); +- return PTR_ERR(trans); ++ while (1) { ++ trans = btrfs_start_transaction(root, 0); ++ if (IS_ERR(trans)) { ++ mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); ++ return PTR_ERR(trans); ++ } ++ ret = btrfs_commit_transaction(trans); ++ WARN_ON(ret); ++ mutex_lock(&uuid_mutex); ++ /* keep away write_all_supers() during the finishing procedure */ ++ mutex_lock(&fs_info->fs_devices->device_list_mutex); ++ mutex_lock(&fs_info->chunk_mutex); ++ if (src_device->has_pending_chunks) { ++ mutex_unlock(&root->fs_info->chunk_mutex); ++ mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); ++ mutex_unlock(&uuid_mutex); ++ } else { ++ break; ++ } + } +- ret = btrfs_commit_transaction(trans); +- WARN_ON(ret); + +- mutex_lock(&uuid_mutex); +- /* keep away write_all_supers() during the finishing procedure */ +- mutex_lock(&fs_info->fs_devices->device_list_mutex); +- mutex_lock(&fs_info->chunk_mutex); + btrfs_dev_replace_lock(dev_replace, 1); + dev_replace->replace_state = + scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED +--- a/fs/btrfs/volumes.c ++++ b/fs/btrfs/volumes.c +@@ -4851,6 +4851,7 @@ static int __btrfs_alloc_chunk(struct bt + for (i = 0; i < map->num_stripes; i++) { + num_bytes = map->stripes[i].dev->bytes_used + stripe_size; + btrfs_device_set_bytes_used(map->stripes[i].dev, num_bytes); ++ map->stripes[i].dev->has_pending_chunks = true; + } + + atomic64_sub(stripe_size * map->num_stripes, &info->free_chunk_space); +@@ -7310,6 +7311,7 @@ void btrfs_update_commit_device_bytes_us + for (i = 0; i < map->num_stripes; i++) { + dev = map->stripes[i].dev; + dev->commit_bytes_used = dev->bytes_used; ++ dev->has_pending_chunks = false; + } + } + mutex_unlock(&fs_info->chunk_mutex); +--- a/fs/btrfs/volumes.h ++++ b/fs/btrfs/volumes.h +@@ -61,6 +61,11 @@ struct btrfs_device { + + spinlock_t io_lock ____cacheline_aligned; + int running_pending; ++ /* When true means this device has pending chunk alloc in ++ * current transaction. Protected by chunk_mutex. 
++ */
++	bool has_pending_chunks;
++
+ 	/* regular prio bios */
+ 	struct btrfs_pending_bios pending_bios;
+ 	/* sync bios */
diff --git a/queue-4.14/series b/queue-4.14/series
index a6b55d34f73..fd28cf93728 100644
--- a/queue-4.14/series
+++ b/queue-4.14/series
@@ -37,3 +37,11 @@ drm-imx-notify-drm-core-before-sending-event-during-crtc-disable.patch
 drm-imx-only-send-event-on-crtc-disable-if-kept-disabled.patch
 ftrace-x86-remove-possible-deadlock-between-register_kprobe-and-ftrace_run_update_code.patch
 mm-vmscan.c-prevent-useless-kswapd-loops.patch
+btrfs-ensure-replaced-device-doesn-t-have-pending-chunk-allocation.patch
+vhost-net-set-packet-weight-of-tx-polling-to-2-vq-size.patch
+vhost_net-use-packet-weight-for-rx-handler-too.patch
+vhost_net-introduce-vhost_exceeds_weight.patch
+vhost-introduce-vhost_exceeds_weight.patch
+vhost_net-fix-possible-infinite-loop.patch
+vhost-vsock-add-weight-support.patch
+vhost-scsi-add-weight-support.patch
diff --git a/queue-4.14/vhost-introduce-vhost_exceeds_weight.patch b/queue-4.14/vhost-introduce-vhost_exceeds_weight.patch
new file mode 100644
index 00000000000..f73fc8f1274
--- /dev/null
+++ b/queue-4.14/vhost-introduce-vhost_exceeds_weight.patch
@@ -0,0 +1,188 @@
+From e82b9b0727ff6d665fff2d326162b460dded554d Mon Sep 17 00:00:00 2001
+From: Jason Wang
+Date: Fri, 17 May 2019 00:29:49 -0400
+Subject: vhost: introduce vhost_exceeds_weight()
+
+From: Jason Wang
+
+commit e82b9b0727ff6d665fff2d326162b460dded554d upstream.
+
+We used to have vhost_exceeds_weight() for vhost-net to:
+
+- prevent vhost kthread from hogging the cpu
+- balance the time spent between TX and RX
+
+This function could be useful for vsock and scsi as well. So move it
+to vhost.c. Device must specify a weight which counts the number of
+requests, or it can also specify a byte_weight which counts the
+number of bytes that have been processed.
+
+Signed-off-by: Jason Wang
+Reviewed-by: Stefan Hajnoczi
+Signed-off-by: Michael S. Tsirkin
+Signed-off-by: Balbir Singh
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ drivers/vhost/net.c   |   18 +++++-------------
+ drivers/vhost/scsi.c  |    8 +++++++-
+ drivers/vhost/vhost.c |   20 +++++++++++++++++++-
+ drivers/vhost/vhost.h |    6 +++++-
+ drivers/vhost/vsock.c |   11 ++++++++++-
+ 5 files changed, 46 insertions(+), 17 deletions(-)
+
+--- a/drivers/vhost/net.c
++++ b/drivers/vhost/net.c
+@@ -446,12 +446,6 @@ static bool vhost_exceeds_maxpend(struct
+ 	       == nvq->done_idx;
+ }
+ 
+-static bool vhost_exceeds_weight(int pkts, int total_len)
+-{
+-	return total_len >= VHOST_NET_WEIGHT ||
+-	       pkts >= VHOST_NET_PKT_WEIGHT;
+-}
+-
+ /* Expects to be always run from workqueue - which acts as
+ * read-size critical section for our kind of RCU.
*/ + static void handle_tx(struct vhost_net *net) +@@ -584,10 +578,9 @@ static void handle_tx(struct vhost_net * + else + vhost_zerocopy_signal_used(net, vq); + vhost_net_tx_packet(net); +- if (unlikely(vhost_exceeds_weight(++sent_pkts, total_len))) { +- vhost_poll_queue(&vq->poll); ++ if (unlikely(vhost_exceeds_weight(vq, ++sent_pkts, ++ total_len))) + break; +- } + } + out: + mutex_unlock(&vq->mutex); +@@ -867,10 +860,8 @@ static void handle_rx(struct vhost_net * + vhost_log_write(vq, vq_log, log, vhost_len, + vq->iov, in); + total_len += vhost_len; +- if (unlikely(vhost_exceeds_weight(++recv_pkts, total_len))) { +- vhost_poll_queue(&vq->poll); ++ if (unlikely(vhost_exceeds_weight(vq, ++recv_pkts, total_len))) + goto out; +- } + } + vhost_net_enable_vq(net, vq); + out: +@@ -949,7 +940,8 @@ static int vhost_net_open(struct inode * + n->vqs[i].sock_hlen = 0; + vhost_net_buf_init(&n->vqs[i].rxq); + } +- vhost_dev_init(dev, vqs, VHOST_NET_VQ_MAX); ++ vhost_dev_init(dev, vqs, VHOST_NET_VQ_MAX, ++ VHOST_NET_WEIGHT, VHOST_NET_PKT_WEIGHT); + + vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT, dev); + vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN, dev); +--- a/drivers/vhost/scsi.c ++++ b/drivers/vhost/scsi.c +@@ -58,6 +58,12 @@ + #define VHOST_SCSI_PREALLOC_UPAGES 2048 + #define VHOST_SCSI_PREALLOC_PROT_SGLS 512 + ++/* Max number of requests before requeueing the job. ++ * Using this limit prevents one virtqueue from starving others with ++ * request. ++ */ ++#define VHOST_SCSI_WEIGHT 256 ++ + struct vhost_scsi_inflight { + /* Wait for the flush operation to finish */ + struct completion comp; +@@ -1427,7 +1433,7 @@ static int vhost_scsi_open(struct inode + vqs[i] = &vs->vqs[i].vq; + vs->vqs[i].vq.handle_kick = vhost_scsi_handle_kick; + } +- vhost_dev_init(&vs->dev, vqs, VHOST_SCSI_MAX_VQ); ++ vhost_dev_init(&vs->dev, vqs, VHOST_SCSI_MAX_VQ, VHOST_SCSI_WEIGHT, 0); + + vhost_scsi_init_inflight(vs, NULL); + +--- a/drivers/vhost/vhost.c ++++ b/drivers/vhost/vhost.c +@@ -412,8 +412,24 @@ static void vhost_dev_free_iovecs(struct + vhost_vq_free_iovecs(dev->vqs[i]); + } + ++bool vhost_exceeds_weight(struct vhost_virtqueue *vq, ++ int pkts, int total_len) ++{ ++ struct vhost_dev *dev = vq->dev; ++ ++ if ((dev->byte_weight && total_len >= dev->byte_weight) || ++ pkts >= dev->weight) { ++ vhost_poll_queue(&vq->poll); ++ return true; ++ } ++ ++ return false; ++} ++EXPORT_SYMBOL_GPL(vhost_exceeds_weight); ++ + void vhost_dev_init(struct vhost_dev *dev, +- struct vhost_virtqueue **vqs, int nvqs) ++ struct vhost_virtqueue **vqs, int nvqs, ++ int weight, int byte_weight) + { + struct vhost_virtqueue *vq; + int i; +@@ -427,6 +443,8 @@ void vhost_dev_init(struct vhost_dev *de + dev->iotlb = NULL; + dev->mm = NULL; + dev->worker = NULL; ++ dev->weight = weight; ++ dev->byte_weight = byte_weight; + init_llist_head(&dev->work_list); + init_waitqueue_head(&dev->wait); + INIT_LIST_HEAD(&dev->read_list); +--- a/drivers/vhost/vhost.h ++++ b/drivers/vhost/vhost.h +@@ -173,9 +173,13 @@ struct vhost_dev { + struct list_head read_list; + struct list_head pending_list; + wait_queue_head_t wait; ++ int weight; ++ int byte_weight; + }; + +-void vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue **vqs, int nvqs); ++bool vhost_exceeds_weight(struct vhost_virtqueue *vq, int pkts, int total_len); ++void vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue **vqs, ++ int nvqs, int weight, int byte_weight); + long vhost_dev_set_owner(struct vhost_dev *dev); + bool 
vhost_dev_has_owner(struct vhost_dev *dev); + long vhost_dev_check_owner(struct vhost_dev *); +--- a/drivers/vhost/vsock.c ++++ b/drivers/vhost/vsock.c +@@ -21,6 +21,14 @@ + #include "vhost.h" + + #define VHOST_VSOCK_DEFAULT_HOST_CID 2 ++/* Max number of bytes transferred before requeueing the job. ++ * Using this limit prevents one virtqueue from starving others. */ ++#define VHOST_VSOCK_WEIGHT 0x80000 ++/* Max number of packets transferred before requeueing the job. ++ * Using this limit prevents one virtqueue from starving others with ++ * small pkts. ++ */ ++#define VHOST_VSOCK_PKT_WEIGHT 256 + + enum { + VHOST_VSOCK_FEATURES = VHOST_FEATURES, +@@ -531,7 +539,8 @@ static int vhost_vsock_dev_open(struct i + vsock->vqs[VSOCK_VQ_TX].handle_kick = vhost_vsock_handle_tx_kick; + vsock->vqs[VSOCK_VQ_RX].handle_kick = vhost_vsock_handle_rx_kick; + +- vhost_dev_init(&vsock->dev, vqs, ARRAY_SIZE(vsock->vqs)); ++ vhost_dev_init(&vsock->dev, vqs, ARRAY_SIZE(vsock->vqs), ++ VHOST_VSOCK_PKT_WEIGHT, VHOST_VSOCK_WEIGHT); + + file->private_data = vsock; + spin_lock_init(&vsock->send_pkt_list_lock); diff --git a/queue-4.14/vhost-net-set-packet-weight-of-tx-polling-to-2-vq-size.patch b/queue-4.14/vhost-net-set-packet-weight-of-tx-polling-to-2-vq-size.patch new file mode 100644 index 00000000000..0cf4d1e16dd --- /dev/null +++ b/queue-4.14/vhost-net-set-packet-weight-of-tx-polling-to-2-vq-size.patch @@ -0,0 +1,137 @@ +From a2ac99905f1ea8b15997a6ec39af69aa28a3653b Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?haibinzhang=28=E5=BC=A0=E6=B5=B7=E6=96=8C=29?= + +Date: Mon, 9 Apr 2018 07:22:17 +0000 +Subject: vhost-net: set packet weight of tx polling to 2 * vq size + +From: haibinzhang(张海斌) + +commit a2ac99905f1ea8b15997a6ec39af69aa28a3653b upstream. + +handle_tx will delay rx for tens or even hundreds of milliseconds when tx busy +polling udp packets with small length(e.g. 1byte udp payload), because setting +VHOST_NET_WEIGHT takes into account only sent-bytes but no single packet length. + +Ping-Latencies shown below were tested between two Virtual Machines using +netperf (UDP_STREAM, len=1), and then another machine pinged the client: + +vq size=256 +Packet-Weight Ping-Latencies(millisecond) + min avg max +Origin 3.319 18.489 57.303 +64 1.643 2.021 2.552 +128 1.825 2.600 3.224 +256 1.997 2.710 4.295 +512 1.860 3.171 4.631 +1024 2.002 4.173 9.056 +2048 2.257 5.650 9.688 +4096 2.093 8.508 15.943 + +vq size=512 +Packet-Weight Ping-Latencies(millisecond) + min avg max +Origin 6.537 29.177 66.245 +64 2.798 3.614 4.403 +128 2.861 3.820 4.775 +256 3.008 4.018 4.807 +512 3.254 4.523 5.824 +1024 3.079 5.335 7.747 +2048 3.944 8.201 12.762 +4096 4.158 11.057 19.985 + +Seems pretty consistent, a small dip at 2 VQ sizes. +Ring size is a hint from device about a burst size it can tolerate. Based on +benchmarks, set the weight to 2 * vq size. + +To evaluate this change, another tests were done using netperf(RR, TX) between +two machines with Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz, and vq size was +tweaked through qemu. Results shown below does not show obvious changes. 
+ +vq size=256 TCP_RR vq size=512 TCP_RR +size/sessions/+thu%/+normalize% size/sessions/+thu%/+normalize% + 1/ 1/ -7%/ -2% 1/ 1/ 0%/ -2% + 1/ 4/ +1%/ 0% 1/ 4/ +1%/ 0% + 1/ 8/ +1%/ -2% 1/ 8/ 0%/ +1% + 64/ 1/ -6%/ 0% 64/ 1/ +7%/ +3% + 64/ 4/ 0%/ +2% 64/ 4/ -1%/ +1% + 64/ 8/ 0%/ 0% 64/ 8/ -1%/ -2% + 256/ 1/ -3%/ -4% 256/ 1/ -4%/ -2% + 256/ 4/ +3%/ +4% 256/ 4/ +1%/ +2% + 256/ 8/ +2%/ 0% 256/ 8/ +1%/ -1% + +vq size=256 UDP_RR vq size=512 UDP_RR +size/sessions/+thu%/+normalize% size/sessions/+thu%/+normalize% + 1/ 1/ -5%/ +1% 1/ 1/ -3%/ -2% + 1/ 4/ +4%/ +1% 1/ 4/ -2%/ +2% + 1/ 8/ -1%/ -1% 1/ 8/ -1%/ 0% + 64/ 1/ -2%/ -3% 64/ 1/ +1%/ +1% + 64/ 4/ -5%/ -1% 64/ 4/ +2%/ 0% + 64/ 8/ 0%/ -1% 64/ 8/ -2%/ +1% + 256/ 1/ +7%/ +1% 256/ 1/ -7%/ 0% + 256/ 4/ +1%/ +1% 256/ 4/ -3%/ -4% + 256/ 8/ +2%/ +2% 256/ 8/ +1%/ +1% + +vq size=256 TCP_STREAM vq size=512 TCP_STREAM +size/sessions/+thu%/+normalize% size/sessions/+thu%/+normalize% + 64/ 1/ 0%/ -3% 64/ 1/ 0%/ 0% + 64/ 4/ +3%/ -1% 64/ 4/ -2%/ +4% + 64/ 8/ +9%/ -4% 64/ 8/ -1%/ +2% + 256/ 1/ +1%/ -4% 256/ 1/ +1%/ +1% + 256/ 4/ -1%/ -1% 256/ 4/ -3%/ 0% + 256/ 8/ +7%/ +5% 256/ 8/ -3%/ 0% + 512/ 1/ +1%/ 0% 512/ 1/ -1%/ -1% + 512/ 4/ +1%/ -1% 512/ 4/ 0%/ 0% + 512/ 8/ +7%/ -5% 512/ 8/ +6%/ -1% +1024/ 1/ 0%/ -1% 1024/ 1/ 0%/ +1% +1024/ 4/ +3%/ 0% 1024/ 4/ +1%/ 0% +1024/ 8/ +8%/ +5% 1024/ 8/ -1%/ 0% +2048/ 1/ +2%/ +2% 2048/ 1/ -1%/ 0% +2048/ 4/ +1%/ 0% 2048/ 4/ 0%/ -1% +2048/ 8/ -2%/ 0% 2048/ 8/ 5%/ -1% +4096/ 1/ -2%/ 0% 4096/ 1/ -2%/ 0% +4096/ 4/ +2%/ 0% 4096/ 4/ 0%/ 0% +4096/ 8/ +9%/ -2% 4096/ 8/ -5%/ -1% + +Acked-by: Michael S. Tsirkin +Signed-off-by: Haibin Zhang +Signed-off-by: Yunfang Tai +Signed-off-by: Lidong Chen +Signed-off-by: David S. Miller +Signed-off-by: Balbir Singh +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/vhost/net.c | 8 +++++++- + 1 file changed, 7 insertions(+), 1 deletion(-) + +--- a/drivers/vhost/net.c ++++ b/drivers/vhost/net.c +@@ -44,6 +44,10 @@ MODULE_PARM_DESC(experimental_zcopytx, " + * Using this limit prevents one virtqueue from starving others. */ + #define VHOST_NET_WEIGHT 0x80000 + ++/* Max number of packets transferred before requeueing the job. ++ * Using this limit prevents one virtqueue from starving rx. */ ++#define VHOST_NET_PKT_WEIGHT(vq) ((vq)->num * 2) ++ + /* MAX number of TX used buffers for outstanding zerocopy */ + #define VHOST_MAX_PEND 128 + #define VHOST_GOODCOPY_LEN 256 +@@ -461,6 +465,7 @@ static void handle_tx(struct vhost_net * + struct socket *sock; + struct vhost_net_ubuf_ref *uninitialized_var(ubufs); + bool zcopy, zcopy_used; ++ int sent_pkts = 0; + + mutex_lock(&vq->mutex); + sock = vq->private_data; +@@ -572,7 +577,8 @@ static void handle_tx(struct vhost_net * + else + vhost_zerocopy_signal_used(net, vq); + vhost_net_tx_packet(net); +- if (unlikely(total_len >= VHOST_NET_WEIGHT)) { ++ if (unlikely(total_len >= VHOST_NET_WEIGHT) || ++ unlikely(++sent_pkts >= VHOST_NET_PKT_WEIGHT(vq))) { + vhost_poll_queue(&vq->poll); + break; + } diff --git a/queue-4.14/vhost-scsi-add-weight-support.patch b/queue-4.14/vhost-scsi-add-weight-support.patch new file mode 100644 index 00000000000..6223f5021c1 --- /dev/null +++ b/queue-4.14/vhost-scsi-add-weight-support.patch @@ -0,0 +1,58 @@ +From c1ea02f15ab5efb3e93fc3144d895410bf79fcf2 Mon Sep 17 00:00:00 2001 +From: Jason Wang +Date: Fri, 17 May 2019 00:29:52 -0400 +Subject: vhost: scsi: add weight support + +From: Jason Wang + +commit c1ea02f15ab5efb3e93fc3144d895410bf79fcf2 upstream. + +This patch will check the weight and exit the loop if we exceeds the +weight. 
This is useful for preventing scsi kthread from hogging cpu +which is guest triggerable. + +This addresses CVE-2019-3900. + +Cc: Paolo Bonzini +Cc: Stefan Hajnoczi +Fixes: 057cbf49a1f0 ("tcm_vhost: Initial merge for vhost level target fabric driver") +Signed-off-by: Jason Wang +Reviewed-by: Stefan Hajnoczi +Signed-off-by: Michael S. Tsirkin +Reviewed-by: Stefan Hajnoczi +Signed-off-by: Balbir Singh +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/vhost/scsi.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +--- a/drivers/vhost/scsi.c ++++ b/drivers/vhost/scsi.c +@@ -846,7 +846,7 @@ vhost_scsi_handle_vq(struct vhost_scsi * + u64 tag; + u32 exp_data_len, data_direction; + unsigned int out = 0, in = 0; +- int head, ret, prot_bytes; ++ int head, ret, prot_bytes, c = 0; + size_t req_size, rsp_size = sizeof(struct virtio_scsi_cmd_resp); + size_t out_size, in_size; + u16 lun; +@@ -865,7 +865,7 @@ vhost_scsi_handle_vq(struct vhost_scsi * + + vhost_disable_notify(&vs->dev, vq); + +- for (;;) { ++ do { + head = vhost_get_vq_desc(vq, vq->iov, + ARRAY_SIZE(vq->iov), &out, &in, + NULL, NULL); +@@ -1080,7 +1080,7 @@ vhost_scsi_handle_vq(struct vhost_scsi * + */ + INIT_WORK(&cmd->work, vhost_scsi_submission_work); + queue_work(vhost_scsi_workqueue, &cmd->work); +- } ++ } while (likely(!vhost_exceeds_weight(vq, ++c, 0))); + out: + mutex_unlock(&vq->mutex); + } diff --git a/queue-4.14/vhost-vsock-add-weight-support.patch b/queue-4.14/vhost-vsock-add-weight-support.patch new file mode 100644 index 00000000000..4be3bf71609 --- /dev/null +++ b/queue-4.14/vhost-vsock-add-weight-support.patch @@ -0,0 +1,92 @@ +From e79b431fb901ba1106670bcc80b9b617b25def7d Mon Sep 17 00:00:00 2001 +From: Jason Wang +Date: Fri, 17 May 2019 00:29:51 -0400 +Subject: vhost: vsock: add weight support + +From: Jason Wang + +commit e79b431fb901ba1106670bcc80b9b617b25def7d upstream. + +This patch will check the weight and exit the loop if we exceeds the +weight. This is useful for preventing vsock kthread from hogging cpu +which is guest triggerable. The weight can help to avoid starving the +request from on direction while another direction is being processed. + +The value of weight is picked from vhost-net. + +This addresses CVE-2019-3900. + +Cc: Stefan Hajnoczi +Fixes: 433fc58e6bf2 ("VSOCK: Introduce vhost_vsock.ko") +Signed-off-by: Jason Wang +Reviewed-by: Stefan Hajnoczi +Signed-off-by: Michael S. 
Tsirkin
+Signed-off-by: Balbir Singh
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ drivers/vhost/vsock.c |   16 ++++++++++------
+ 1 file changed, 10 insertions(+), 6 deletions(-)
+
+--- a/drivers/vhost/vsock.c
++++ b/drivers/vhost/vsock.c
+@@ -86,6 +86,7 @@ vhost_transport_do_send_pkt(struct vhost
+ 			    struct vhost_virtqueue *vq)
+ {
+ 	struct vhost_virtqueue *tx_vq = &vsock->vqs[VSOCK_VQ_TX];
++	int pkts = 0, total_len = 0;
+ 	bool added = false;
+ 	bool restart_tx = false;
+ 
+@@ -97,7 +98,7 @@ vhost_transport_do_send_pkt(struct vhost
+ 	/* Avoid further vmexits, we're already processing the virtqueue */
+ 	vhost_disable_notify(&vsock->dev, vq);
+ 
+-	for (;;) {
++	do {
+ 		struct virtio_vsock_pkt *pkt;
+ 		struct iov_iter iov_iter;
+ 		unsigned out, in;
+@@ -182,8 +183,9 @@ vhost_transport_do_send_pkt(struct vhost
+ 		 */
+ 		virtio_transport_deliver_tap_pkt(pkt);
+ 
++		total_len += pkt->len;
+ 		virtio_transport_free_pkt(pkt);
+-	}
++	} while(likely(!vhost_exceeds_weight(vq, ++pkts, total_len)));
+ 	if (added)
+ 		vhost_signal(&vsock->dev, vq);
+ 
+@@ -358,7 +360,7 @@ static void vhost_vsock_handle_tx_kick(s
+ 	struct vhost_vsock *vsock = container_of(vq->dev, struct vhost_vsock,
+ 						 dev);
+ 	struct virtio_vsock_pkt *pkt;
+-	int head;
++	int head, pkts = 0, total_len = 0;
+ 	unsigned int out, in;
+ 	bool added = false;
+ 
+@@ -368,7 +370,7 @@ static void vhost_vsock_handle_tx_kick(s
+ 		goto out;
+ 
+ 	vhost_disable_notify(&vsock->dev, vq);
+-	for (;;) {
++	do {
+ 		u32 len;
+ 
+ 		if (!vhost_vsock_more_replies(vsock)) {
+@@ -409,9 +411,11 @@ static void vhost_vsock_handle_tx_kick(s
+ 		else
+ 			virtio_transport_free_pkt(pkt);
+ 
+-		vhost_add_used(vq, head, sizeof(pkt->hdr) + len);
++		len += sizeof(pkt->hdr);
++		vhost_add_used(vq, head, len);
++		total_len += len;
+ 		added = true;
+-	}
++	} while(likely(!vhost_exceeds_weight(vq, ++pkts, total_len)));
+ 
+ no_more_replies:
+ 	if (added)
diff --git a/queue-4.14/vhost_net-fix-possible-infinite-loop.patch b/queue-4.14/vhost_net-fix-possible-infinite-loop.patch
new file mode 100644
index 00000000000..102eb419abd
--- /dev/null
+++ b/queue-4.14/vhost_net-fix-possible-infinite-loop.patch
@@ -0,0 +1,112 @@
+From e2412c07f8f3040593dfb88207865a3cd58680c0 Mon Sep 17 00:00:00 2001
+From: Jason Wang
+Date: Fri, 17 May 2019 00:29:50 -0400
+Subject: vhost_net: fix possible infinite loop
+
+From: Jason Wang
+
+commit e2412c07f8f3040593dfb88207865a3cd58680c0 upstream.
+
+When the rx buffer is too small for a packet, we will discard the vq
+descriptor and retry it for the next packet:
+
+while ((sock_len = vhost_net_rx_peek_head_len(net, sock->sk,
+					      &busyloop_intr))) {
+...
+	/* On overrun, truncate and discard */
+	if (unlikely(headcount > UIO_MAXIOV)) {
+		iov_iter_init(&msg.msg_iter, READ, vq->iov, 1, 1);
+		err = sock->ops->recvmsg(sock, &msg,
+					 1, MSG_DONTWAIT | MSG_TRUNC);
+		pr_debug("Discarded rx packet: len %zd\n", sock_len);
+		continue;
+	}
+...
+}
+
+This makes it possible to trigger an infinite while..continue loop
+through the co-operation of two VMs like:
+
+1) Malicious VM1 allocates a 1 byte rx buffer and tries to slow down the
+   vhost process as much as possible, e.g. using indirect descriptors or
+   others.
+2) Malicious VM2 generates packets to VM1 as fast as possible
+
+Fix this by checking against the weight at the end of the RX and TX
+loop.
This also eliminate other similar cases when: + +- userspace is consuming the packets in the meanwhile +- theoretical TOCTOU attack if guest moving avail index back and forth + to hit the continue after vhost find guest just add new buffers + +This addresses CVE-2019-3900. + +Fixes: d8316f3991d20 ("vhost: fix total length when packets are too short") +Fixes: 3a4d5c94e9593 ("vhost_net: a kernel-level virtio server") +Signed-off-by: Jason Wang +Reviewed-by: Stefan Hajnoczi +Signed-off-by: Michael S. Tsirkin +Signed-off-by: Greg Kroah-Hartman +Signed-off-by: Balbir Singh + +--- + drivers/vhost/net.c | 20 ++++++++++---------- + 1 file changed, 10 insertions(+), 10 deletions(-) + +--- a/drivers/vhost/net.c ++++ b/drivers/vhost/net.c +@@ -482,7 +482,7 @@ static void handle_tx(struct vhost_net * + hdr_size = nvq->vhost_hlen; + zcopy = nvq->ubufs; + +- for (;;) { ++ do { + /* Release DMAs done buffers first */ + if (zcopy) + vhost_zerocopy_signal_used(net, vq); +@@ -578,10 +578,7 @@ static void handle_tx(struct vhost_net * + else + vhost_zerocopy_signal_used(net, vq); + vhost_net_tx_packet(net); +- if (unlikely(vhost_exceeds_weight(vq, ++sent_pkts, +- total_len))) +- break; +- } ++ } while (likely(!vhost_exceeds_weight(vq, ++sent_pkts, total_len))); + out: + mutex_unlock(&vq->mutex); + } +@@ -779,7 +776,11 @@ static void handle_rx(struct vhost_net * + vq->log : NULL; + mergeable = vhost_has_feature(vq, VIRTIO_NET_F_MRG_RXBUF); + +- while ((sock_len = vhost_net_rx_peek_head_len(net, sock->sk))) { ++ do { ++ sock_len = vhost_net_rx_peek_head_len(net, sock->sk); ++ ++ if (!sock_len) ++ break; + sock_len += sock_hlen; + vhost_len = sock_len + vhost_hlen; + headcount = get_rx_bufs(vq, vq->heads, vhost_len, +@@ -860,9 +861,8 @@ static void handle_rx(struct vhost_net * + vhost_log_write(vq, vq_log, log, vhost_len, + vq->iov, in); + total_len += vhost_len; +- if (unlikely(vhost_exceeds_weight(vq, ++recv_pkts, total_len))) +- goto out; +- } ++ } while (likely(!vhost_exceeds_weight(vq, ++recv_pkts, total_len))); ++ + vhost_net_enable_vq(net, vq); + out: + mutex_unlock(&vq->mutex); +@@ -941,7 +941,7 @@ static int vhost_net_open(struct inode * + vhost_net_buf_init(&n->vqs[i].rxq); + } + vhost_dev_init(dev, vqs, VHOST_NET_VQ_MAX, +- VHOST_NET_WEIGHT, VHOST_NET_PKT_WEIGHT); ++ VHOST_NET_PKT_WEIGHT, VHOST_NET_WEIGHT); + + vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT, dev); + vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN, dev); diff --git a/queue-4.14/vhost_net-introduce-vhost_exceeds_weight.patch b/queue-4.14/vhost_net-introduce-vhost_exceeds_weight.patch new file mode 100644 index 00000000000..7efde1adb96 --- /dev/null +++ b/queue-4.14/vhost_net-introduce-vhost_exceeds_weight.patch @@ -0,0 +1,61 @@ +From 272f35cba53d088085e5952fd81d7a133ab90789 Mon Sep 17 00:00:00 2001 +From: Jason Wang +Date: Fri, 20 Jul 2018 08:15:15 +0800 +Subject: vhost_net: introduce vhost_exceeds_weight() + +From: Jason Wang + +commit 272f35cba53d088085e5952fd81d7a133ab90789 upstream. + +Signed-off-by: Jason Wang +Signed-off-by: David S. 
Miller +Signed-off-by: Balbir Singh +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/vhost/net.c | 13 ++++++++----- + 1 file changed, 8 insertions(+), 5 deletions(-) + +--- a/drivers/vhost/net.c ++++ b/drivers/vhost/net.c +@@ -446,6 +446,12 @@ static bool vhost_exceeds_maxpend(struct + == nvq->done_idx; + } + ++static bool vhost_exceeds_weight(int pkts, int total_len) ++{ ++ return total_len >= VHOST_NET_WEIGHT || ++ pkts >= VHOST_NET_PKT_WEIGHT; ++} ++ + /* Expects to be always run from workqueue - which acts as + * read-size critical section for our kind of RCU. */ + static void handle_tx(struct vhost_net *net) +@@ -550,7 +556,6 @@ static void handle_tx(struct vhost_net * + msg.msg_control = NULL; + ubufs = NULL; + } +- + total_len += len; + if (total_len < VHOST_NET_WEIGHT && + !vhost_vq_avail_empty(&net->dev, vq) && +@@ -579,8 +584,7 @@ static void handle_tx(struct vhost_net * + else + vhost_zerocopy_signal_used(net, vq); + vhost_net_tx_packet(net); +- if (unlikely(total_len >= VHOST_NET_WEIGHT) || +- unlikely(++sent_pkts >= VHOST_NET_PKT_WEIGHT)) { ++ if (unlikely(vhost_exceeds_weight(++sent_pkts, total_len))) { + vhost_poll_queue(&vq->poll); + break; + } +@@ -863,8 +867,7 @@ static void handle_rx(struct vhost_net * + vhost_log_write(vq, vq_log, log, vhost_len, + vq->iov, in); + total_len += vhost_len; +- if (unlikely(total_len >= VHOST_NET_WEIGHT) || +- unlikely(++recv_pkts >= VHOST_NET_PKT_WEIGHT)) { ++ if (unlikely(vhost_exceeds_weight(++recv_pkts, total_len))) { + vhost_poll_queue(&vq->poll); + goto out; + } diff --git a/queue-4.14/vhost_net-use-packet-weight-for-rx-handler-too.patch b/queue-4.14/vhost_net-use-packet-weight-for-rx-handler-too.patch new file mode 100644 index 00000000000..78025dd1ff5 --- /dev/null +++ b/queue-4.14/vhost_net-use-packet-weight-for-rx-handler-too.patch @@ -0,0 +1,92 @@ +From db688c24eada63b1efe6d0d7d835e5c3bdd71fd3 Mon Sep 17 00:00:00 2001 +From: Paolo Abeni +Date: Tue, 24 Apr 2018 10:34:36 +0200 +Subject: vhost_net: use packet weight for rx handler, too + +From: Paolo Abeni + +commit db688c24eada63b1efe6d0d7d835e5c3bdd71fd3 upstream. + +Similar to commit a2ac99905f1e ("vhost-net: set packet weight of +tx polling to 2 * vq size"), we need a packet-based limit for +handler_rx, too - elsewhere, under rx flood with small packets, +tx can be delayed for a very long time, even without busypolling. + +The pkt limit applied to handle_rx must be the same applied by +handle_tx, or we will get unfair scheduling between rx and tx. +Tying such limit to the queue length makes it less effective for +large queue length values and can introduce large process +scheduler latencies, so a constant valued is used - likewise +the existing bytes limit. + +The selected limit has been validated with PVP[1] performance +test with different queue sizes: + +queue size 256 512 1024 + +baseline 366 354 362 +weight 128 715 723 670 +weight 256 740 745 733 +weight 512 600 460 583 +weight 1024 423 427 418 + +A packet weight of 256 gives peek performances in under all the +tested scenarios. + +No measurable regression in unidirectional performance tests has +been detected. + +[1] https://developers.redhat.com/blog/2017/06/05/measuring-and-comparing-open-vswitch-performance/ + +Signed-off-by: Paolo Abeni +Acked-by: Jason Wang +Signed-off-by: David S. 
Miller +Signed-off-by: Balbir Singh +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/vhost/net.c | 12 ++++++++---- + 1 file changed, 8 insertions(+), 4 deletions(-) + +--- a/drivers/vhost/net.c ++++ b/drivers/vhost/net.c +@@ -45,8 +45,10 @@ MODULE_PARM_DESC(experimental_zcopytx, " + #define VHOST_NET_WEIGHT 0x80000 + + /* Max number of packets transferred before requeueing the job. +- * Using this limit prevents one virtqueue from starving rx. */ +-#define VHOST_NET_PKT_WEIGHT(vq) ((vq)->num * 2) ++ * Using this limit prevents one virtqueue from starving others with small ++ * pkts. ++ */ ++#define VHOST_NET_PKT_WEIGHT 256 + + /* MAX number of TX used buffers for outstanding zerocopy */ + #define VHOST_MAX_PEND 128 +@@ -578,7 +580,7 @@ static void handle_tx(struct vhost_net * + vhost_zerocopy_signal_used(net, vq); + vhost_net_tx_packet(net); + if (unlikely(total_len >= VHOST_NET_WEIGHT) || +- unlikely(++sent_pkts >= VHOST_NET_PKT_WEIGHT(vq))) { ++ unlikely(++sent_pkts >= VHOST_NET_PKT_WEIGHT)) { + vhost_poll_queue(&vq->poll); + break; + } +@@ -760,6 +762,7 @@ static void handle_rx(struct vhost_net * + struct socket *sock; + struct iov_iter fixup; + __virtio16 num_buffers; ++ int recv_pkts = 0; + + mutex_lock_nested(&vq->mutex, 0); + sock = vq->private_data; +@@ -860,7 +863,8 @@ static void handle_rx(struct vhost_net * + vhost_log_write(vq, vq_log, log, vhost_len, + vq->iov, in); + total_len += vhost_len; +- if (unlikely(total_len >= VHOST_NET_WEIGHT)) { ++ if (unlikely(total_len >= VHOST_NET_WEIGHT) || ++ unlikely(++recv_pkts >= VHOST_NET_PKT_WEIGHT)) { + vhost_poll_queue(&vq->poll); + goto out; + }