From: Greg Kroah-Hartman Date: Sat, 26 Jan 2019 10:07:06 +0000 (+0100) Subject: 4.9-stable patches X-Git-Tag: v4.9.154~70 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=d0bf63bf44247afa779ab87d06e2079f53137bc6;p=thirdparty%2Fkernel%2Fstable-queue.git 4.9-stable patches added patches: net-bridge-fix-ethernet-header-pointer-before-check-skb-forwardable.patch net-fix-usage-of-pskb_trim_rcsum.patch net-ipv4-fix-memory-leak-in-network-namespace-dismantle.patch net_sched-refetch-skb-protocol-for-each-filter.patch openvswitch-avoid-oob-read-when-parsing-flow-nlattrs.patch vhost-log-dirty-page-correctly.patch --- diff --git a/queue-4.9/net-bridge-fix-ethernet-header-pointer-before-check-skb-forwardable.patch b/queue-4.9/net-bridge-fix-ethernet-header-pointer-before-check-skb-forwardable.patch new file mode 100644 index 00000000000..e287e4af670 --- /dev/null +++ b/queue-4.9/net-bridge-fix-ethernet-header-pointer-before-check-skb-forwardable.patch @@ -0,0 +1,69 @@ +From foo@baz Sat Jan 26 10:53:10 CET 2019 +From: Yunjian Wang +Date: Thu, 17 Jan 2019 09:46:41 +0800 +Subject: net: bridge: Fix ethernet header pointer before check skb forwardable + +From: Yunjian Wang + +[ Upstream commit 28c1382fa28f2e2d9d0d6f25ae879b5af2ecbd03 ] + +The skb header should be set to ethernet header before using +is_skb_forwardable. Because the ethernet header length has been +considered in is_skb_forwardable(including dev->hard_header_len +length). + +To reproduce the issue: +1, add 2 ports on linux bridge br using following commands: +$ brctl addbr br +$ brctl addif br eth0 +$ brctl addif br eth1 +2, the MTU of eth0 and eth1 is 1500 +3, send a packet(Data 1480, UDP 8, IP 20, Ethernet 14, VLAN 4) +from eth0 to eth1 + +So the expect result is packet larger than 1500 cannot pass through +eth0 and eth1. But currently, the packet passes through success, it +means eth1's MTU limit doesn't take effect. + +Fixes: f6367b4660dd ("bridge: use is_skb_forwardable in forward path") +Cc: bridge@lists.linux-foundation.org +Cc: Nkolay Aleksandrov +Cc: Roopa Prabhu +Cc: Stephen Hemminger +Signed-off-by: Yunjian Wang +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/bridge/br_forward.c | 9 ++++----- + 1 file changed, 4 insertions(+), 5 deletions(-) + +--- a/net/bridge/br_forward.c ++++ b/net/bridge/br_forward.c +@@ -35,10 +35,10 @@ static inline int should_deliver(const s + + int br_dev_queue_push_xmit(struct net *net, struct sock *sk, struct sk_buff *skb) + { ++ skb_push(skb, ETH_HLEN); + if (!is_skb_forwardable(skb->dev, skb)) + goto drop; + +- skb_push(skb, ETH_HLEN); + br_drop_fake_rtable(skb); + + if (skb->ip_summed == CHECKSUM_PARTIAL && +@@ -96,12 +96,11 @@ static void __br_forward(const struct ne + net = dev_net(indev); + } else { + if (unlikely(netpoll_tx_running(to->br->dev))) { +- if (!is_skb_forwardable(skb->dev, skb)) { ++ skb_push(skb, ETH_HLEN); ++ if (!is_skb_forwardable(skb->dev, skb)) + kfree_skb(skb); +- } else { +- skb_push(skb, ETH_HLEN); ++ else + br_netpoll_send_skb(to, skb); +- } + return; + } + br_hook = NF_BR_LOCAL_OUT; diff --git a/queue-4.9/net-fix-usage-of-pskb_trim_rcsum.patch b/queue-4.9/net-fix-usage-of-pskb_trim_rcsum.patch new file mode 100644 index 00000000000..ddcd674922f --- /dev/null +++ b/queue-4.9/net-fix-usage-of-pskb_trim_rcsum.patch @@ -0,0 +1,75 @@ +From foo@baz Sat Jan 26 10:53:10 CET 2019 +From: Ross Lagerwall +Date: Thu, 17 Jan 2019 15:34:38 +0000 +Subject: net: Fix usage of pskb_trim_rcsum + +From: Ross Lagerwall + +[ Upstream commit 6c57f0458022298e4da1729c67bd33ce41c14e7a ] + +In certain cases, pskb_trim_rcsum() may change skb pointers. +Reinitialize header pointers afterwards to avoid potential +use-after-frees. Add a note in the documentation of +pskb_trim_rcsum(). Found by KASAN. + +Signed-off-by: Ross Lagerwall +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ppp/pppoe.c | 1 + + include/linux/skbuff.h | 1 + + net/bridge/br_netfilter_ipv6.c | 1 + + net/bridge/netfilter/nft_reject_bridge.c | 1 + + net/ipv4/ip_input.c | 1 + + 5 files changed, 5 insertions(+) + +--- a/drivers/net/ppp/pppoe.c ++++ b/drivers/net/ppp/pppoe.c +@@ -442,6 +442,7 @@ static int pppoe_rcv(struct sk_buff *skb + if (pskb_trim_rcsum(skb, len)) + goto drop; + ++ ph = pppoe_hdr(skb); + pn = pppoe_pernet(dev_net(dev)); + + /* Note that get_item does a sock_hold(), so sk_pppox(po) +--- a/include/linux/skbuff.h ++++ b/include/linux/skbuff.h +@@ -2962,6 +2962,7 @@ int pskb_trim_rcsum_slow(struct sk_buff + * + * This is exactly the same as pskb_trim except that it ensures the + * checksum of received packets are still valid after the operation. ++ * It can change skb pointers. 
+ */ + + static inline int pskb_trim_rcsum(struct sk_buff *skb, unsigned int len) +--- a/net/bridge/br_netfilter_ipv6.c ++++ b/net/bridge/br_netfilter_ipv6.c +@@ -131,6 +131,7 @@ int br_validate_ipv6(struct net *net, st + IPSTATS_MIB_INDISCARDS); + goto drop; + } ++ hdr = ipv6_hdr(skb); + } + if (hdr->nexthdr == NEXTHDR_HOP && br_nf_check_hbh_len(skb)) + goto drop; +--- a/net/bridge/netfilter/nft_reject_bridge.c ++++ b/net/bridge/netfilter/nft_reject_bridge.c +@@ -236,6 +236,7 @@ static bool reject6_br_csum_ok(struct sk + pskb_trim_rcsum(skb, ntohs(ip6h->payload_len) + sizeof(*ip6h))) + return false; + ++ ip6h = ipv6_hdr(skb); + thoff = ipv6_skip_exthdr(skb, ((u8*)(ip6h+1) - skb->data), &proto, &fo); + if (thoff < 0 || thoff >= skb->len || (fo & htons(~0x7)) != 0) + return false; +--- a/net/ipv4/ip_input.c ++++ b/net/ipv4/ip_input.c +@@ -475,6 +475,7 @@ int ip_rcv(struct sk_buff *skb, struct n + goto drop; + } + ++ iph = ip_hdr(skb); + skb->transport_header = skb->network_header + iph->ihl*4; + + /* Remove any debris in the socket control block */ diff --git a/queue-4.9/net-ipv4-fix-memory-leak-in-network-namespace-dismantle.patch b/queue-4.9/net-ipv4-fix-memory-leak-in-network-namespace-dismantle.patch new file mode 100644 index 00000000000..52160a488f8 --- /dev/null +++ b/queue-4.9/net-ipv4-fix-memory-leak-in-network-namespace-dismantle.patch @@ -0,0 +1,147 @@ +From foo@baz Sat Jan 26 10:53:10 CET 2019 +From: Ido Schimmel +Date: Wed, 9 Jan 2019 09:57:39 +0000 +Subject: net: ipv4: Fix memory leak in network namespace dismantle + +From: Ido Schimmel + +[ Upstream commit f97f4dd8b3bb9d0993d2491e0f22024c68109184 ] + +IPv4 routing tables are flushed in two cases: + +1. In response to events in the netdev and inetaddr notification chains +2. When a network namespace is being dismantled + +In both cases only routes associated with a dead nexthop group are +flushed. However, a nexthop group will only be marked as dead in case it +is populated with actual nexthops using a nexthop device. This is not +the case when the route in question is an error route (e.g., +'blackhole', 'unreachable'). + +Therefore, when a network namespace is being dismantled such routes are +not flushed and leaked [1]. + +To reproduce: +# ip netns add blue +# ip -n blue route add unreachable 192.0.2.0/24 +# ip netns del blue + +Fix this by not skipping error routes that are not marked with +RTNH_F_DEAD when flushing the routing tables. + +To prevent the flushing of such routes in case #1, add a parameter to +fib_table_flush() that indicates if the table is flushed as part of +namespace dismantle or not. + +Note that this problem does not exist in IPv6 since error routes are +associated with the loopback device. + +[1] +unreferenced object 0xffff888066650338 (size 56): + comm "ip", pid 1206, jiffies 4294786063 (age 26.235s) + hex dump (first 32 bytes): + 00 00 00 00 00 00 00 00 b0 1c 62 61 80 88 ff ff ..........ba.... + e8 8b a1 64 80 88 ff ff 00 07 00 08 fe 00 00 00 ...d............ 
+ backtrace: + [<00000000856ed27d>] inet_rtm_newroute+0x129/0x220 + [<00000000fcdfc00a>] rtnetlink_rcv_msg+0x397/0xa20 + [<00000000cb85801a>] netlink_rcv_skb+0x132/0x380 + [<00000000ebc991d2>] netlink_unicast+0x4c0/0x690 + [<0000000014f62875>] netlink_sendmsg+0x929/0xe10 + [<00000000bac9d967>] sock_sendmsg+0xc8/0x110 + [<00000000223e6485>] ___sys_sendmsg+0x77a/0x8f0 + [<000000002e94f880>] __sys_sendmsg+0xf7/0x250 + [<00000000ccb1fa72>] do_syscall_64+0x14d/0x610 + [<00000000ffbe3dae>] entry_SYSCALL_64_after_hwframe+0x49/0xbe + [<000000003a8b605b>] 0xffffffffffffffff +unreferenced object 0xffff888061621c88 (size 48): + comm "ip", pid 1206, jiffies 4294786063 (age 26.235s) + hex dump (first 32 bytes): + 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b kkkkkkkkkkkkkkkk + 6b 6b 6b 6b 6b 6b 6b 6b d8 8e 26 5f 80 88 ff ff kkkkkkkk..&_.... + backtrace: + [<00000000733609e3>] fib_table_insert+0x978/0x1500 + [<00000000856ed27d>] inet_rtm_newroute+0x129/0x220 + [<00000000fcdfc00a>] rtnetlink_rcv_msg+0x397/0xa20 + [<00000000cb85801a>] netlink_rcv_skb+0x132/0x380 + [<00000000ebc991d2>] netlink_unicast+0x4c0/0x690 + [<0000000014f62875>] netlink_sendmsg+0x929/0xe10 + [<00000000bac9d967>] sock_sendmsg+0xc8/0x110 + [<00000000223e6485>] ___sys_sendmsg+0x77a/0x8f0 + [<000000002e94f880>] __sys_sendmsg+0xf7/0x250 + [<00000000ccb1fa72>] do_syscall_64+0x14d/0x610 + [<00000000ffbe3dae>] entry_SYSCALL_64_after_hwframe+0x49/0xbe + [<000000003a8b605b>] 0xffffffffffffffff + +Fixes: 8cced9eff1d4 ("[NETNS]: Enable routing configuration in non-initial namespace.") +Signed-off-by: Ido Schimmel +Reviewed-by: David Ahern +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + include/net/ip_fib.h | 2 +- + net/ipv4/fib_frontend.c | 4 ++-- + net/ipv4/fib_trie.c | 14 ++++++++++++-- + 3 files changed, 15 insertions(+), 5 deletions(-) + +--- a/include/net/ip_fib.h ++++ b/include/net/ip_fib.h +@@ -242,7 +242,7 @@ int fib_table_insert(struct net *, struc + int fib_table_delete(struct net *, struct fib_table *, struct fib_config *); + int fib_table_dump(struct fib_table *table, struct sk_buff *skb, + struct netlink_callback *cb); +-int fib_table_flush(struct net *net, struct fib_table *table); ++int fib_table_flush(struct net *net, struct fib_table *table, bool flush_all); + struct fib_table *fib_trie_unmerge(struct fib_table *main_tb); + void fib_table_flush_external(struct fib_table *table); + void fib_free_table(struct fib_table *tb); +--- a/net/ipv4/fib_frontend.c ++++ b/net/ipv4/fib_frontend.c +@@ -193,7 +193,7 @@ static void fib_flush(struct net *net) + struct fib_table *tb; + + hlist_for_each_entry_safe(tb, tmp, head, tb_hlist) +- flushed += fib_table_flush(net, tb); ++ flushed += fib_table_flush(net, tb, false); + } + + if (flushed) +@@ -1277,7 +1277,7 @@ static void ip_fib_net_exit(struct net * + + hlist_for_each_entry_safe(tb, tmp, head, tb_hlist) { + hlist_del(&tb->tb_hlist); +- fib_table_flush(net, tb); ++ fib_table_flush(net, tb, true); + fib_free_table(tb); + } + } +--- a/net/ipv4/fib_trie.c ++++ b/net/ipv4/fib_trie.c +@@ -1826,7 +1826,7 @@ void fib_table_flush_external(struct fib + } + + /* Caller must hold RTNL. 
*/ +-int fib_table_flush(struct net *net, struct fib_table *tb) ++int fib_table_flush(struct net *net, struct fib_table *tb, bool flush_all) + { + struct trie *t = (struct trie *)tb->tb_data; + struct key_vector *pn = t->kv; +@@ -1874,7 +1874,17 @@ int fib_table_flush(struct net *net, str + hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) { + struct fib_info *fi = fa->fa_info; + +- if (!fi || !(fi->fib_flags & RTNH_F_DEAD)) { ++ if (!fi || ++ (!(fi->fib_flags & RTNH_F_DEAD) && ++ !fib_props[fa->fa_type].error)) { ++ slen = fa->fa_slen; ++ continue; ++ } ++ ++ /* Do not flush error routes if network namespace is ++ * not being dismantled ++ */ ++ if (!flush_all && fib_props[fa->fa_type].error) { + slen = fa->fa_slen; + continue; + } diff --git a/queue-4.9/net_sched-refetch-skb-protocol-for-each-filter.patch b/queue-4.9/net_sched-refetch-skb-protocol-for-each-filter.patch new file mode 100644 index 00000000000..53fe89a74c2 --- /dev/null +++ b/queue-4.9/net_sched-refetch-skb-protocol-for-each-filter.patch @@ -0,0 +1,60 @@ +From foo@baz Sat Jan 26 10:53:10 CET 2019 +From: Cong Wang +Date: Fri, 11 Jan 2019 18:55:42 -0800 +Subject: net_sched: refetch skb protocol for each filter + +From: Cong Wang + +[ Upstream commit cd0c4e70fc0ccfa705cdf55efb27519ce9337a26 ] + +Martin reported a set of filters don't work after changing +from reclassify to continue. Looking into the code, it +looks like skb protocol is not always fetched for each +iteration of the filters. But, as demonstrated by Martin, +TC actions could modify skb->protocol, for example act_vlan, +this means we have to refetch skb protocol in each iteration, +rather than using the one we fetch in the beginning of the loop. + +This bug is _not_ introduced by commit 3b3ae880266d +("net: sched: consolidate tc_classify{,_compat}"), technically, +if act_vlan is the only action that modifies skb protocol, then +it is commit c7e2b9689ef8 ("sched: introduce vlan action") which +introduced this bug. + +Reported-by: Martin Olsson +Cc: Jamal Hadi Salim +Cc: Jiri Pirko +Signed-off-by: Cong Wang +Acked-by: Jamal Hadi Salim +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/sched/sch_api.c | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +--- a/net/sched/sch_api.c ++++ b/net/sched/sch_api.c +@@ -1850,7 +1850,6 @@ done: + int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp, + struct tcf_result *res, bool compat_mode) + { +- __be16 protocol = tc_skb_protocol(skb); + #ifdef CONFIG_NET_CLS_ACT + const struct tcf_proto *old_tp = tp; + int limit = 0; +@@ -1858,6 +1857,7 @@ int tc_classify(struct sk_buff *skb, con + reclassify: + #endif + for (; tp; tp = rcu_dereference_bh(tp->next)) { ++ __be16 protocol = tc_skb_protocol(skb); + int err; + + if (tp->protocol != protocol && +@@ -1884,7 +1884,6 @@ reset: + } + + tp = old_tp; +- protocol = tc_skb_protocol(skb); + goto reclassify; + #endif + } diff --git a/queue-4.9/openvswitch-avoid-oob-read-when-parsing-flow-nlattrs.patch b/queue-4.9/openvswitch-avoid-oob-read-when-parsing-flow-nlattrs.patch new file mode 100644 index 00000000000..523a48bad0c --- /dev/null +++ b/queue-4.9/openvswitch-avoid-oob-read-when-parsing-flow-nlattrs.patch @@ -0,0 +1,34 @@ +From foo@baz Sat Jan 26 10:53:10 CET 2019 +From: Ross Lagerwall +Date: Mon, 14 Jan 2019 09:16:56 +0000 +Subject: openvswitch: Avoid OOB read when parsing flow nlattrs + +From: Ross Lagerwall + +[ Upstream commit 04a4af334b971814eedf4e4a413343ad3287d9a9 ] + +For nested and variable attributes, the expected length of an attribute +is not known and marked by a negative number. This results in an OOB +read when the expected length is later used to check if the attribute is +all zeros. Fix this by using the actual length of the attribute rather +than the expected length. + +Signed-off-by: Ross Lagerwall +Acked-by: Pravin B Shelar +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/openvswitch/flow_netlink.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/net/openvswitch/flow_netlink.c ++++ b/net/openvswitch/flow_netlink.c +@@ -409,7 +409,7 @@ static int __parse_flow_nlattrs(const st + return -EINVAL; + } + +- if (!nz || !is_all_zero(nla_data(nla), expected_len)) { ++ if (!nz || !is_all_zero(nla_data(nla), nla_len(nla))) { + attrs |= 1 << type; + a[type] = nla; + } diff --git a/queue-4.9/series b/queue-4.9/series new file mode 100644 index 00000000000..ce5814f07af --- /dev/null +++ b/queue-4.9/series @@ -0,0 +1,6 @@ +net-bridge-fix-ethernet-header-pointer-before-check-skb-forwardable.patch +net-fix-usage-of-pskb_trim_rcsum.patch +openvswitch-avoid-oob-read-when-parsing-flow-nlattrs.patch +vhost-log-dirty-page-correctly.patch +net-ipv4-fix-memory-leak-in-network-namespace-dismantle.patch +net_sched-refetch-skb-protocol-for-each-filter.patch diff --git a/queue-4.9/vhost-log-dirty-page-correctly.patch b/queue-4.9/vhost-log-dirty-page-correctly.patch new file mode 100644 index 00000000000..49c537af4d0 --- /dev/null +++ b/queue-4.9/vhost-log-dirty-page-correctly.patch @@ -0,0 +1,202 @@ +From foo@baz Sat Jan 26 10:53:10 CET 2019 +From: Jason Wang +Date: Wed, 16 Jan 2019 16:54:42 +0800 +Subject: vhost: log dirty page correctly + +From: Jason Wang + +[ Upstream commit cc5e710759470bc7f3c61d11fd54586f15fdbdf4 ] + +Vhost dirty page logging API is designed to sync through GPA. But we +try to log GIOVA when device IOTLB is enabled. This is wrong and may +lead to missing data after migration. 
+ +To solve this issue, when logging with device IOTLB enabled, we will: + +1) reuse the device IOTLB translation result of GIOVA->HVA mapping to + get HVA, for writable descriptor, get HVA through iovec. For used + ring update, translate its GIOVA to HVA +2) traverse the GPA->HVA mapping to get the possible GPA and log + through GPA. Pay attention this reverse mapping is not guaranteed + to be unique, so we should log each possible GPA in this case. + +This fix the failure of scp to guest during migration. In -next, we +will probably support passing GIOVA->GPA instead of GIOVA->HVA. + +Fixes: 6b1e6cc7855b ("vhost: new device IOTLB API") +Reported-by: Jintack Lim +Cc: Jintack Lim +Signed-off-by: Jason Wang +Acked-by: Michael S. Tsirkin +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/vhost/net.c | 3 + + drivers/vhost/vhost.c | 97 ++++++++++++++++++++++++++++++++++++++++++-------- + drivers/vhost/vhost.h | 3 + + 3 files changed, 87 insertions(+), 16 deletions(-) + +--- a/drivers/vhost/net.c ++++ b/drivers/vhost/net.c +@@ -751,7 +751,8 @@ static void handle_rx(struct vhost_net * + vhost_add_used_and_signal_n(&net->dev, vq, vq->heads, + headcount); + if (unlikely(vq_log)) +- vhost_log_write(vq, vq_log, log, vhost_len); ++ vhost_log_write(vq, vq_log, log, vhost_len, ++ vq->iov, in); + total_len += vhost_len; + if (unlikely(total_len >= VHOST_NET_WEIGHT)) { + vhost_poll_queue(&vq->poll); +--- a/drivers/vhost/vhost.c ++++ b/drivers/vhost/vhost.c +@@ -1646,13 +1646,87 @@ static int log_write(void __user *log_ba + return r; + } + ++static int log_write_hva(struct vhost_virtqueue *vq, u64 hva, u64 len) ++{ ++ struct vhost_umem *umem = vq->umem; ++ struct vhost_umem_node *u; ++ u64 start, end, l, min; ++ int r; ++ bool hit = false; ++ ++ while (len) { ++ min = len; ++ /* More than one GPAs can be mapped into a single HVA. So ++ * iterate all possible umems here to be safe. ++ */ ++ list_for_each_entry(u, &umem->umem_list, link) { ++ if (u->userspace_addr > hva - 1 + len || ++ u->userspace_addr - 1 + u->size < hva) ++ continue; ++ start = max(u->userspace_addr, hva); ++ end = min(u->userspace_addr - 1 + u->size, ++ hva - 1 + len); ++ l = end - start + 1; ++ r = log_write(vq->log_base, ++ u->start + start - u->userspace_addr, ++ l); ++ if (r < 0) ++ return r; ++ hit = true; ++ min = min(l, min); ++ } ++ ++ if (!hit) ++ return -EFAULT; ++ ++ len -= min; ++ hva += min; ++ } ++ ++ return 0; ++} ++ ++static int log_used(struct vhost_virtqueue *vq, u64 used_offset, u64 len) ++{ ++ struct iovec iov[64]; ++ int i, ret; ++ ++ if (!vq->iotlb) ++ return log_write(vq->log_base, vq->log_addr + used_offset, len); ++ ++ ret = translate_desc(vq, (uintptr_t)vq->used + used_offset, ++ len, iov, 64, VHOST_ACCESS_WO); ++ if (ret) ++ return ret; ++ ++ for (i = 0; i < ret; i++) { ++ ret = log_write_hva(vq, (uintptr_t)iov[i].iov_base, ++ iov[i].iov_len); ++ if (ret) ++ return ret; ++ } ++ ++ return 0; ++} ++ + int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log, +- unsigned int log_num, u64 len) ++ unsigned int log_num, u64 len, struct iovec *iov, int count) + { + int i, r; + + /* Make sure data written is seen before log. 
*/ + smp_wmb(); ++ ++ if (vq->iotlb) { ++ for (i = 0; i < count; i++) { ++ r = log_write_hva(vq, (uintptr_t)iov[i].iov_base, ++ iov[i].iov_len); ++ if (r < 0) ++ return r; ++ } ++ return 0; ++ } ++ + for (i = 0; i < log_num; ++i) { + u64 l = min(log[i].len, len); + r = log_write(vq->log_base, log[i].addr, l); +@@ -1682,9 +1756,8 @@ static int vhost_update_used_flags(struc + smp_wmb(); + /* Log used flag write. */ + used = &vq->used->flags; +- log_write(vq->log_base, vq->log_addr + +- (used - (void __user *)vq->used), +- sizeof vq->used->flags); ++ log_used(vq, (used - (void __user *)vq->used), ++ sizeof vq->used->flags); + if (vq->log_ctx) + eventfd_signal(vq->log_ctx, 1); + } +@@ -1702,9 +1775,8 @@ static int vhost_update_avail_event(stru + smp_wmb(); + /* Log avail event write */ + used = vhost_avail_event(vq); +- log_write(vq->log_base, vq->log_addr + +- (used - (void __user *)vq->used), +- sizeof *vhost_avail_event(vq)); ++ log_used(vq, (used - (void __user *)vq->used), ++ sizeof *vhost_avail_event(vq)); + if (vq->log_ctx) + eventfd_signal(vq->log_ctx, 1); + } +@@ -2103,10 +2175,8 @@ static int __vhost_add_used_n(struct vho + /* Make sure data is seen before log. */ + smp_wmb(); + /* Log used ring entry write. */ +- log_write(vq->log_base, +- vq->log_addr + +- ((void __user *)used - (void __user *)vq->used), +- count * sizeof *used); ++ log_used(vq, ((void __user *)used - (void __user *)vq->used), ++ count * sizeof *used); + } + old = vq->last_used_idx; + new = (vq->last_used_idx += count); +@@ -2148,9 +2218,8 @@ int vhost_add_used_n(struct vhost_virtqu + /* Make sure used idx is seen before log. */ + smp_wmb(); + /* Log used index update. */ +- log_write(vq->log_base, +- vq->log_addr + offsetof(struct vring_used, idx), +- sizeof vq->used->idx); ++ log_used(vq, offsetof(struct vring_used, idx), ++ sizeof vq->used->idx); + if (vq->log_ctx) + eventfd_signal(vq->log_ctx, 1); + } +--- a/drivers/vhost/vhost.h ++++ b/drivers/vhost/vhost.h +@@ -199,7 +199,8 @@ bool vhost_vq_avail_empty(struct vhost_d + bool vhost_enable_notify(struct vhost_dev *, struct vhost_virtqueue *); + + int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log, +- unsigned int log_num, u64 len); ++ unsigned int log_num, u64 len, ++ struct iovec *iov, int count); + int vq_iotlb_prefetch(struct vhost_virtqueue *vq); + + struct vhost_msg_node *vhost_new_msg(struct vhost_virtqueue *vq, int type);
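
The key step in the vhost change above is log_write_hva(): with device IOTLB enabled, dirty logging must go through GPA, and since more than one GPA can be mapped to the same HVA, every GPA->HVA region that overlaps the written host range has to be logged. Below is a minimal, self-contained sketch of that interval walk, under stated assumptions: struct region, log_gpa_range() and log_write_hva_sketch() are hypothetical stand-ins (the kernel walks struct vhost_umem_node on vq->umem and calls log_write()), and it uses plain exclusive-end arithmetic instead of the overflow-safe inclusive bounds used in the patch.

/* Simplified illustration of the reverse HVA->GPA walk in the vhost
 * patch above. Names are hypothetical; not the kernel API.
 */
#include <stdint.h>
#include <stdio.h>

struct region {			/* one GPA->HVA mapping */
	uint64_t gpa_start;	/* guest physical start */
	uint64_t hva_start;	/* host virtual start */
	uint64_t size;		/* length in bytes */
};

/* Stand-in for the real dirty-bitmap update (log_write() in the kernel). */
static void log_gpa_range(uint64_t gpa, uint64_t len)
{
	printf("log GPA 0x%llx len 0x%llx\n",
	       (unsigned long long)gpa, (unsigned long long)len);
}

/* Log every GPA range whose mapping overlaps [hva, hva + len).
 * Returns 0 on success, -1 if part of the range has no backing GPA,
 * mirroring the -EFAULT case in log_write_hva().
 */
static int log_write_hva_sketch(const struct region *r, int nr,
				uint64_t hva, uint64_t len)
{
	while (len) {
		uint64_t min = len;
		int hit = 0;
		int i;

		/* More than one GPA can map the same HVA, so check and
		 * log every overlapping region, not just the first hit.
		 */
		for (i = 0; i < nr; i++) {
			uint64_t r_end = r[i].hva_start + r[i].size; /* exclusive */
			uint64_t start, end, l;

			if (r[i].hva_start >= hva + len || r_end <= hva)
				continue;	/* no overlap with this region */

			start = hva > r[i].hva_start ? hva : r[i].hva_start;
			end = hva + len < r_end ? hva + len : r_end;
			l = end - start;

			log_gpa_range(r[i].gpa_start + (start - r[i].hva_start), l);
			hit = 1;
			if (l < min)
				min = l;
		}

		if (!hit)
			return -1;	/* hole in the GPA->HVA map */

		/* Advance past the shortest overlap and retry the rest. */
		len -= min;
		hva += min;
	}

	return 0;
}

int main(void)
{
	/* Two guest regions backed by adjacent host pages (made-up values). */
	const struct region map[] = {
		{ 0x00000000ull, 0x7f0000000000ull, 0x1000 },
		{ 0x00100000ull, 0x7f0000001000ull, 0x1000 },
	};

	/* A 4 KiB write straddling both regions gets logged under both GPAs. */
	return log_write_hva_sketch(map, 2, 0x7f0000000800ull, 0x1000);
}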