From: Greg Kroah-Hartman Date: Wed, 2 Dec 2020 09:55:20 +0000 (+0100) Subject: 5.9-stable patches X-Git-Tag: v4.14.211~21 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=7920e02aa72adfeb39e81594d5ae97e338e444c7;p=thirdparty%2Fkernel%2Fstable-queue.git 5.9-stable patches added patches: devlink-hold-rtnl-lock-while-reading-netdev-attributes.patch devlink-make-sure-devlink-instance-and-port-are-in-same-net-namespace.patch ipv6-addrlabel-fix-possible-memory-leak-in-ip6addrlbl_net_init.patch mptcp-fix-null-ptr-dereference-on-bad-mpj.patch net-af_iucv-set-correct-sk_protocol-for-child-sockets.patch net-openvswitch-fix-ttl-decrement-action-netlink-message-format.patch net-packet-fix-packet-receive-on-l3-devices-without-visible-hard-header.patch net-tls-missing-received-data-after-fast-remote-close.patch net-tls-protect-from-calling-tls_dev_del-for-tls-rx-twice.patch rose-fix-null-pointer-dereference-in-rose_send_frame.patch sock-set-sk_err-to-ee_errno-on-dequeue-from-errq.patch tcp-set-inet_ecn_xmit-configuration-in-tcp_reinit_congestion_control.patch tun-honor-iocb_nowait-flag.patch usbnet-ipheth-fix-connectivity-with-ios-14.patch vsock-virtio-discard-packets-only-when-socket-is-really-closed.patch --- diff --git a/queue-5.9/devlink-hold-rtnl-lock-while-reading-netdev-attributes.patch b/queue-5.9/devlink-hold-rtnl-lock-while-reading-netdev-attributes.patch new file mode 100644 index 00000000000..f74f5b14ce0 --- /dev/null +++ b/queue-5.9/devlink-hold-rtnl-lock-while-reading-netdev-attributes.patch @@ -0,0 +1,55 @@ +From foo@baz Wed Dec 2 10:40:54 AM CET 2020 +From: Parav Pandit +Date: Wed, 25 Nov 2020 11:16:19 +0200 +Subject: devlink: Hold rtnl lock while reading netdev attributes + +From: Parav Pandit + +[ Upstream commit b187c9b4178b87954dbc94e78a7094715794714f ] + +A netdevice of a devlink port can be moved to different net namespace +than its parent devlink instance. +This scenario occurs when devlink reload is not used. 
+ +When netdevice is undergoing migration to net namespace, its ifindex +and name may change. + +In such use case, devlink port query may read stale netdev attributes. + +Fix it by reading them under rtnl lock. + +Fixes: bfcd3a466172 ("Introduce devlink infrastructure") +Signed-off-by: Parav Pandit +Signed-off-by: Jakub Kicinski +Signed-off-by: Greg Kroah-Hartman +--- + net/core/devlink.c | 4 ++++ + 1 file changed, 4 insertions(+) + +--- a/net/core/devlink.c ++++ b/net/core/devlink.c +@@ -616,6 +616,8 @@ static int devlink_nl_port_fill(struct s + if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index)) + goto nla_put_failure; + ++ /* Hold rtnl lock while accessing port's netdev attributes. */ ++ rtnl_lock(); + spin_lock_bh(&devlink_port->type_lock); + if (nla_put_u16(msg, DEVLINK_ATTR_PORT_TYPE, devlink_port->type)) + goto nla_put_failure_type_locked; +@@ -642,6 +644,7 @@ static int devlink_nl_port_fill(struct s + goto nla_put_failure_type_locked; + } + spin_unlock_bh(&devlink_port->type_lock); ++ rtnl_unlock(); + if (devlink_nl_port_attrs_put(msg, devlink_port)) + goto nla_put_failure; + if (devlink_nl_port_function_attrs_put(msg, devlink_port, extack)) +@@ -652,6 +655,7 @@ static int devlink_nl_port_fill(struct s + + nla_put_failure_type_locked: + spin_unlock_bh(&devlink_port->type_lock); ++ rtnl_unlock(); + nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; diff --git a/queue-5.9/devlink-make-sure-devlink-instance-and-port-are-in-same-net-namespace.patch b/queue-5.9/devlink-make-sure-devlink-instance-and-port-are-in-same-net-namespace.patch new file mode 100644 index 00000000000..b56fb23a6a5 --- /dev/null +++ b/queue-5.9/devlink-make-sure-devlink-instance-and-port-are-in-same-net-namespace.patch @@ -0,0 +1,38 @@ +From foo@baz Wed Dec 2 10:40:54 AM CET 2020 +From: Parav Pandit +Date: Wed, 25 Nov 2020 11:16:20 +0200 +Subject: devlink: Make sure devlink instance and port are in same net namespace + +From: Parav Pandit + +[ Upstream commit 
a7b43649507dae4e55ff0087cad4e4dd1c6d5b99 ] + +When devlink reload operation is not used, netdev of an Ethernet port may +be present in different net namespace than the net namespace of the +devlink instance. + +Ensure that both the devlink instance and devlink port netdev are located +in same net namespace. + +Fixes: 070c63f20f6c ("net: devlink: allow to change namespaces during reload") +Signed-off-by: Parav Pandit +Signed-off-by: Jakub Kicinski +Signed-off-by: Greg Kroah-Hartman +--- + net/core/devlink.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +--- a/net/core/devlink.c ++++ b/net/core/devlink.c +@@ -626,9 +626,10 @@ static int devlink_nl_port_fill(struct s + devlink_port->desired_type)) + goto nla_put_failure_type_locked; + if (devlink_port->type == DEVLINK_PORT_TYPE_ETH) { ++ struct net *net = devlink_net(devlink_port->devlink); + struct net_device *netdev = devlink_port->type_dev; + +- if (netdev && ++ if (netdev && net_eq(net, dev_net(netdev)) && + (nla_put_u32(msg, DEVLINK_ATTR_PORT_NETDEV_IFINDEX, + netdev->ifindex) || + nla_put_string(msg, DEVLINK_ATTR_PORT_NETDEV_NAME, diff --git a/queue-5.9/ipv6-addrlabel-fix-possible-memory-leak-in-ip6addrlbl_net_init.patch b/queue-5.9/ipv6-addrlabel-fix-possible-memory-leak-in-ip6addrlbl_net_init.patch new file mode 100644 index 00000000000..10652955330 --- /dev/null +++ b/queue-5.9/ipv6-addrlabel-fix-possible-memory-leak-in-ip6addrlbl_net_init.patch @@ -0,0 +1,84 @@ +From foo@baz Wed Dec 2 10:40:54 AM CET 2020 +From: Wang Hai +Date: Tue, 24 Nov 2020 15:17:28 +0800 +Subject: ipv6: addrlabel: fix possible memory leak in ip6addrlbl_net_init + +From: Wang Hai + +[ Upstream commit e255e11e66da8281e337e4e352956e8a4999fca4 ] + +kmemleak report a memory leak as follows: + +unreferenced object 0xffff8880059c6a00 (size 64): + comm "ip", pid 23696, jiffies 4296590183 (age 1755.384s) + hex dump (first 32 bytes): + 20 01 00 10 00 00 00 00 00 00 00 00 00 00 00 00 ............... 
+ 1c 00 00 00 00 00 00 00 00 00 00 00 07 00 00 00 ................ + backtrace: + [<00000000aa4e7a87>] ip6addrlbl_add+0x90/0xbb0 + [<0000000070b8d7f1>] ip6addrlbl_net_init+0x109/0x170 + [<000000006a9ca9d4>] ops_init+0xa8/0x3c0 + [<000000002da57bf2>] setup_net+0x2de/0x7e0 + [<000000004e52d573>] copy_net_ns+0x27d/0x530 + [<00000000b07ae2b4>] create_new_namespaces+0x382/0xa30 + [<000000003b76d36f>] unshare_nsproxy_namespaces+0xa1/0x1d0 + [<0000000030653721>] ksys_unshare+0x3a4/0x780 + [<0000000007e82e40>] __x64_sys_unshare+0x2d/0x40 + [<0000000031a10c08>] do_syscall_64+0x33/0x40 + [<0000000099df30e7>] entry_SYSCALL_64_after_hwframe+0x44/0xa9 + +We should free all rules when we catch an error in ip6addrlbl_net_init(). +otherwise a memory leak will occur. + +Fixes: 2a8cc6c89039 ("[IPV6] ADDRCONF: Support RFC3484 configurable address selection policy table.") +Reported-by: Hulk Robot +Signed-off-by: Wang Hai +Link: https://lore.kernel.org/r/20201124071728.8385-1-wanghai38@huawei.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv6/addrlabel.c | 26 +++++++++++++++++--------- + 1 file changed, 17 insertions(+), 9 deletions(-) + +--- a/net/ipv6/addrlabel.c ++++ b/net/ipv6/addrlabel.c +@@ -306,7 +306,9 @@ static int ip6addrlbl_del(struct net *ne + /* add default label */ + static int __net_init ip6addrlbl_net_init(struct net *net) + { +- int err = 0; ++ struct ip6addrlbl_entry *p = NULL; ++ struct hlist_node *n; ++ int err; + int i; + + ADDRLABEL(KERN_DEBUG "%s\n", __func__); +@@ -315,14 +317,20 @@ static int __net_init ip6addrlbl_net_ini + INIT_HLIST_HEAD(&net->ipv6.ip6addrlbl_table.head); + + for (i = 0; i < ARRAY_SIZE(ip6addrlbl_init_table); i++) { +- int ret = ip6addrlbl_add(net, +- ip6addrlbl_init_table[i].prefix, +- ip6addrlbl_init_table[i].prefixlen, +- 0, +- ip6addrlbl_init_table[i].label, 0); +- /* XXX: should we free all rules when we catch an error? 
*/ +- if (ret && (!err || err != -ENOMEM)) +- err = ret; ++ err = ip6addrlbl_add(net, ++ ip6addrlbl_init_table[i].prefix, ++ ip6addrlbl_init_table[i].prefixlen, ++ 0, ++ ip6addrlbl_init_table[i].label, 0); ++ if (err) ++ goto err_ip6addrlbl_add; ++ } ++ return 0; ++ ++err_ip6addrlbl_add: ++ hlist_for_each_entry_safe(p, n, &net->ipv6.ip6addrlbl_table.head, list) { ++ hlist_del_rcu(&p->list); ++ kfree_rcu(p, rcu); + } + return err; + } diff --git a/queue-5.9/mptcp-fix-null-ptr-dereference-on-bad-mpj.patch b/queue-5.9/mptcp-fix-null-ptr-dereference-on-bad-mpj.patch new file mode 100644 index 00000000000..0abf556da01 --- /dev/null +++ b/queue-5.9/mptcp-fix-null-ptr-dereference-on-bad-mpj.patch @@ -0,0 +1,47 @@ +From foo@baz Wed Dec 2 10:40:54 AM CET 2020 +From: Paolo Abeni +Date: Thu, 26 Nov 2020 15:17:53 +0100 +Subject: mptcp: fix NULL ptr dereference on bad MPJ + +From: Paolo Abeni + +[ Upstream commit d3ab78858f1451351221061a1c365495df196500 ] + +If an msk listener receives an MPJ carrying an invalid token, it +will zero the request socket msk entry. That should later +cause fallback and subflow reset - as per RFC - at +subflow_syn_recv_sock() time due to failing hmac validation. + +Since commit 4cf8b7e48a09 ("subflow: introduce and use +mptcp_can_accept_new_subflow()"), we unconditionally dereference +- in mptcp_can_accept_new_subflow - the subflow request msk +before performing hmac validation. In the above scenario we +hit a NULL ptr dereference. + +Address the issue doing the hmac validation earlier. 
+ +Fixes: 4cf8b7e48a09 ("subflow: introduce and use mptcp_can_accept_new_subflow()") +Tested-by: Davide Caratti +Signed-off-by: Paolo Abeni +Reviewed-by: Matthieu Baerts +Link: https://lore.kernel.org/r/03b2cfa3ac80d8fc18272edc6442a9ddf0b1e34e.1606400227.git.pabeni@redhat.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Greg Kroah-Hartman +--- + net/mptcp/subflow.c | 5 ++--- + 1 file changed, 2 insertions(+), 3 deletions(-) + +--- a/net/mptcp/subflow.c ++++ b/net/mptcp/subflow.c +@@ -542,9 +542,8 @@ create_msk: + fallback = true; + } else if (subflow_req->mp_join) { + mptcp_get_options(skb, &mp_opt); +- if (!mp_opt.mp_join || +- !mptcp_can_accept_new_subflow(subflow_req->msk) || +- !subflow_hmac_valid(req, &mp_opt)) { ++ if (!mp_opt.mp_join || !subflow_hmac_valid(req, &mp_opt) || ++ !mptcp_can_accept_new_subflow(subflow_req->msk)) { + SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC); + fallback = true; + } diff --git a/queue-5.9/net-af_iucv-set-correct-sk_protocol-for-child-sockets.patch b/queue-5.9/net-af_iucv-set-correct-sk_protocol-for-child-sockets.patch new file mode 100644 index 00000000000..b40436caab8 --- /dev/null +++ b/queue-5.9/net-af_iucv-set-correct-sk_protocol-for-child-sockets.patch @@ -0,0 +1,45 @@ +From foo@baz Wed Dec 2 10:40:54 AM CET 2020 +From: Julian Wiedmann +Date: Fri, 20 Nov 2020 11:06:57 +0100 +Subject: net/af_iucv: set correct sk_protocol for child sockets + +From: Julian Wiedmann + +[ Upstream commit c5dab0941fcdc9664eb0ec0d4d51433216d91336 ] + +Child sockets erroneously inherit their parent's sk_type (ie. SOCK_*), +instead of the PF_IUCV protocol that the parent was created with in +iucv_sock_create(). + +We're currently not using sk->sk_protocol ourselves, so this shouldn't +have much impact (except eg. getting the output in skb_dump() right). 
+ +Fixes: eac3731bd04c ("[S390]: Add AF_IUCV socket support") +Signed-off-by: Julian Wiedmann +Link: https://lore.kernel.org/r/20201120100657.34407-1-jwi@linux.ibm.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Greg Kroah-Hartman +--- + net/iucv/af_iucv.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/net/iucv/af_iucv.c ++++ b/net/iucv/af_iucv.c +@@ -1645,7 +1645,7 @@ static int iucv_callback_connreq(struct + } + + /* Create the new socket */ +- nsk = iucv_sock_alloc(NULL, sk->sk_type, GFP_ATOMIC, 0); ++ nsk = iucv_sock_alloc(NULL, sk->sk_protocol, GFP_ATOMIC, 0); + if (!nsk) { + err = pr_iucv->path_sever(path, user_data); + iucv_path_free(path); +@@ -1851,7 +1851,7 @@ static int afiucv_hs_callback_syn(struct + goto out; + } + +- nsk = iucv_sock_alloc(NULL, sk->sk_type, GFP_ATOMIC, 0); ++ nsk = iucv_sock_alloc(NULL, sk->sk_protocol, GFP_ATOMIC, 0); + bh_lock_sock(sk); + if ((sk->sk_state != IUCV_LISTEN) || + sk_acceptq_is_full(sk) || diff --git a/queue-5.9/net-openvswitch-fix-ttl-decrement-action-netlink-message-format.patch b/queue-5.9/net-openvswitch-fix-ttl-decrement-action-netlink-message-format.patch new file mode 100644 index 00000000000..6a0c3b3b3a3 --- /dev/null +++ b/queue-5.9/net-openvswitch-fix-ttl-decrement-action-netlink-message-format.patch @@ -0,0 +1,170 @@ +From foo@baz Wed Dec 2 10:40:54 AM CET 2020 +From: Eelco Chaudron +Date: Tue, 24 Nov 2020 07:34:44 -0500 +Subject: net: openvswitch: fix TTL decrement action netlink message format + +From: Eelco Chaudron + +[ Upstream commit 69929d4c49e182f8526d42c43b37b460d562d3a0 ] + +Currently, the openvswitch module is not accepting the correctly formated +netlink message for the TTL decrement action. For both setting and getting +the dec_ttl action, the actions should be nested in the +OVS_DEC_TTL_ATTR_ACTION attribute as mentioned in the openvswitch.h uapi. + +When the original patch was sent, it was tested with a private OVS userspace +implementation. 
This implementation was unfortunately not upstreamed and +reviewed, hence an erroneous version of this patch was sent out. + +Leaving the patch as-is would cause problems as the kernel module could +interpret additional attributes as actions and vice-versa, due to the +actions not being encapsulated/nested within the actual attribute, but +being concatenated after it. + +Fixes: 744676e77720 ("openvswitch: add TTL decrement action") +Signed-off-by: Eelco Chaudron +Link: https://lore.kernel.org/r/160622121495.27296.888010441924340582.stgit@wsfd-netdev64.ntdv.lab.eng.bos.redhat.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Greg Kroah-Hartman +--- + include/uapi/linux/openvswitch.h | 2 + + net/openvswitch/actions.c | 7 +-- + net/openvswitch/flow_netlink.c | 74 ++++++++++++++++++++++++++++----------- + 3 files changed, 60 insertions(+), 23 deletions(-) + +--- a/include/uapi/linux/openvswitch.h ++++ b/include/uapi/linux/openvswitch.h +@@ -1058,4 +1058,6 @@ enum ovs_dec_ttl_attr { + __OVS_DEC_TTL_ATTR_MAX + }; + ++#define OVS_DEC_TTL_ATTR_MAX (__OVS_DEC_TTL_ATTR_MAX - 1) ++ + #endif /* _LINUX_OPENVSWITCH_H */ +--- a/net/openvswitch/actions.c ++++ b/net/openvswitch/actions.c +@@ -970,14 +970,13 @@ static int dec_ttl_exception_handler(str + { + /* The first action is always 'OVS_DEC_TTL_ATTR_ARG'. 
*/ + struct nlattr *dec_ttl_arg = nla_data(attr); +- int rem = nla_len(attr); + + if (nla_len(dec_ttl_arg)) { +- struct nlattr *actions = nla_next(dec_ttl_arg, &rem); ++ struct nlattr *actions = nla_data(dec_ttl_arg); + + if (actions) +- return clone_execute(dp, skb, key, 0, actions, rem, +- last, false); ++ return clone_execute(dp, skb, key, 0, nla_data(actions), ++ nla_len(actions), last, false); + } + consume_skb(skb); + return 0; +--- a/net/openvswitch/flow_netlink.c ++++ b/net/openvswitch/flow_netlink.c +@@ -2503,28 +2503,42 @@ static int validate_and_copy_dec_ttl(str + __be16 eth_type, __be16 vlan_tci, + u32 mpls_label_count, bool log) + { +- int start, err; +- u32 nested = true; ++ const struct nlattr *attrs[OVS_DEC_TTL_ATTR_MAX + 1]; ++ int start, action_start, err, rem; ++ const struct nlattr *a, *actions; ++ ++ memset(attrs, 0, sizeof(attrs)); ++ nla_for_each_nested(a, attr, rem) { ++ int type = nla_type(a); ++ ++ /* Ignore unknown attributes to be future proof. */ ++ if (type > OVS_DEC_TTL_ATTR_MAX) ++ continue; + +- if (!nla_len(attr)) +- return ovs_nla_add_action(sfa, OVS_ACTION_ATTR_DEC_TTL, +- NULL, 0, log); ++ if (!type || attrs[type]) ++ return -EINVAL; ++ ++ attrs[type] = a; ++ } ++ ++ actions = attrs[OVS_DEC_TTL_ATTR_ACTION]; ++ if (rem || !actions || (nla_len(actions) && nla_len(actions) < NLA_HDRLEN)) ++ return -EINVAL; + + start = add_nested_action_start(sfa, OVS_ACTION_ATTR_DEC_TTL, log); + if (start < 0) + return start; + +- err = ovs_nla_add_action(sfa, OVS_DEC_TTL_ATTR_ACTION, &nested, +- sizeof(nested), log); +- +- if (err) +- return err; ++ action_start = add_nested_action_start(sfa, OVS_DEC_TTL_ATTR_ACTION, log); ++ if (action_start < 0) ++ return start; + +- err = __ovs_nla_copy_actions(net, attr, key, sfa, eth_type, ++ err = __ovs_nla_copy_actions(net, actions, key, sfa, eth_type, + vlan_tci, mpls_label_count, log); + if (err) + return err; + ++ add_nested_action_end(*sfa, action_start); + add_nested_action_end(*sfa, start); + return 
0; + } +@@ -3487,20 +3501,42 @@ out: + static int dec_ttl_action_to_attr(const struct nlattr *attr, + struct sk_buff *skb) + { +- int err = 0, rem = nla_len(attr); +- struct nlattr *start; ++ struct nlattr *start, *action_start; ++ const struct nlattr *a; ++ int err = 0, rem; + + start = nla_nest_start_noflag(skb, OVS_ACTION_ATTR_DEC_TTL); +- + if (!start) + return -EMSGSIZE; + +- err = ovs_nla_put_actions(nla_data(attr), rem, skb); +- if (err) +- nla_nest_cancel(skb, start); +- else +- nla_nest_end(skb, start); ++ nla_for_each_attr(a, nla_data(attr), nla_len(attr), rem) { ++ switch (nla_type(a)) { ++ case OVS_DEC_TTL_ATTR_ACTION: ++ ++ action_start = nla_nest_start_noflag(skb, OVS_DEC_TTL_ATTR_ACTION); ++ if (!action_start) { ++ err = -EMSGSIZE; ++ goto out; ++ } + ++ err = ovs_nla_put_actions(nla_data(a), nla_len(a), skb); ++ if (err) ++ goto out; ++ ++ nla_nest_end(skb, action_start); ++ break; ++ ++ default: ++ /* Ignore all other option to be future compatible */ ++ break; ++ } ++ } ++ ++ nla_nest_end(skb, start); ++ return 0; ++ ++out: ++ nla_nest_cancel(skb, start); + return err; + } + diff --git a/queue-5.9/net-packet-fix-packet-receive-on-l3-devices-without-visible-hard-header.patch b/queue-5.9/net-packet-fix-packet-receive-on-l3-devices-without-visible-hard-header.patch new file mode 100644 index 00000000000..177a5bb3511 --- /dev/null +++ b/queue-5.9/net-packet-fix-packet-receive-on-l3-devices-without-visible-hard-header.patch @@ -0,0 +1,136 @@ +From foo@baz Wed Dec 2 10:40:54 AM CET 2020 +From: Eyal Birger +Date: Sat, 21 Nov 2020 08:28:17 +0200 +Subject: net/packet: fix packet receive on L3 devices without visible hard header + +From: Eyal Birger + +[ Upstream commit d549699048b4b5c22dd710455bcdb76966e55aa3 ] + +In the patchset merged by commit b9fcf0a0d826 +("Merge branch 'support-AF_PACKET-for-layer-3-devices'") L3 devices which +did not have header_ops were given one for the purpose of protocol parsing +on af_packet transmit path. 
+ +That change made af_packet receive path regard these devices as having a +visible L3 header and therefore aligned incoming skb->data to point to the +skb's mac_header. Some devices, such as ipip, xfrmi, and others, do not +reset their mac_header prior to ingress and therefore their incoming +packets became malformed. + +Ideally these devices would reset their mac headers, or af_packet would be +able to rely on dev->hard_header_len being 0 for such cases, but it seems +this is not the case. + +Fix by changing af_packet RX ll visibility criteria to include the +existence of a '.create()' header operation, which is used when creating +a device hard header - via dev_hard_header() - by upper layers, and does +not exist in these L3 devices. + +As this predicate may be useful in other situations, add it as a common +dev_has_header() helper in netdevice.h. + +Fixes: b9fcf0a0d826 ("Merge branch 'support-AF_PACKET-for-layer-3-devices'") +Signed-off-by: Eyal Birger +Acked-by: Jason A. Donenfeld +Acked-by: Willem de Bruijn +Link: https://lore.kernel.org/r/20201121062817.3178900-1-eyal.birger@gmail.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/netdevice.h | 5 +++++ + net/packet/af_packet.c | 38 +++++++++++++++++++++----------------- + 2 files changed, 26 insertions(+), 17 deletions(-) + +--- a/include/linux/netdevice.h ++++ b/include/linux/netdevice.h +@@ -3103,6 +3103,11 @@ static inline bool dev_validate_header(c + return false; + } + ++static inline bool dev_has_header(const struct net_device *dev) ++{ ++ return dev->header_ops && dev->header_ops->create; ++} ++ + typedef int gifconf_func_t(struct net_device * dev, char __user * bufptr, + int len, int size); + int register_gifconf(unsigned int family, gifconf_func_t *gifconf); +--- a/net/packet/af_packet.c ++++ b/net/packet/af_packet.c +@@ -93,38 +93,42 @@ + + /* + Assumptions: +- - if device has no dev->hard_header routine, it adds and removes ll header +- inside itself. 
In this case ll header is invisible outside of device, +- but higher levels still should reserve dev->hard_header_len. +- Some devices are enough clever to reallocate skb, when header +- will not fit to reserved space (tunnel), another ones are silly +- (PPP). ++ - If the device has no dev->header_ops->create, there is no LL header ++ visible above the device. In this case, its hard_header_len should be 0. ++ The device may prepend its own header internally. In this case, its ++ needed_headroom should be set to the space needed for it to add its ++ internal header. ++ For example, a WiFi driver pretending to be an Ethernet driver should ++ set its hard_header_len to be the Ethernet header length, and set its ++ needed_headroom to be (the real WiFi header length - the fake Ethernet ++ header length). + - packet socket receives packets with pulled ll header, + so that SOCK_RAW should push it back. + + On receive: + ----------- + +-Incoming, dev->hard_header!=NULL ++Incoming, dev_has_header(dev) == true + mac_header -> ll header + data -> data + +-Outgoing, dev->hard_header!=NULL ++Outgoing, dev_has_header(dev) == true + mac_header -> ll header + data -> ll header + +-Incoming, dev->hard_header==NULL +- mac_header -> UNKNOWN position. It is very likely, that it points to ll +- header. PPP makes it, that is wrong, because introduce +- assymetry between rx and tx paths. ++Incoming, dev_has_header(dev) == false ++ mac_header -> data ++ However drivers often make it point to the ll header. ++ This is incorrect because the ll header should be invisible to us. + data -> data + +-Outgoing, dev->hard_header==NULL +- mac_header -> data. ll header is still not built! ++Outgoing, dev_has_header(dev) == false ++ mac_header -> data. ll header is invisible to us. + data -> data + + Resume +- If dev->hard_header==NULL we are unlikely to restore sensible ll header. ++ If dev_has_header(dev) == false we are unable to restore the ll header, ++ because it is invisible to us. 
+ + + On transmit: +@@ -2066,7 +2070,7 @@ static int packet_rcv(struct sk_buff *sk + + skb->dev = dev; + +- if (dev->header_ops) { ++ if (dev_has_header(dev)) { + /* The device has an explicit notion of ll header, + * exported to higher levels. + * +@@ -2195,7 +2199,7 @@ static int tpacket_rcv(struct sk_buff *s + if (!net_eq(dev_net(dev), sock_net(sk))) + goto drop; + +- if (dev->header_ops) { ++ if (dev_has_header(dev)) { + if (sk->sk_type != SOCK_DGRAM) + skb_push(skb, skb->data - skb_mac_header(skb)); + else if (skb->pkt_type == PACKET_OUTGOING) { diff --git a/queue-5.9/net-tls-missing-received-data-after-fast-remote-close.patch b/queue-5.9/net-tls-missing-received-data-after-fast-remote-close.patch new file mode 100644 index 00000000000..f1540283c99 --- /dev/null +++ b/queue-5.9/net-tls-missing-received-data-after-fast-remote-close.patch @@ -0,0 +1,50 @@ +From foo@baz Wed Dec 2 10:40:54 AM CET 2020 +From: Vadim Fedorenko +Date: Thu, 19 Nov 2020 18:59:48 +0300 +Subject: net/tls: missing received data after fast remote close + +From: Vadim Fedorenko + +[ Upstream commit 20ffc7adf53a5fd3d19751fbff7895bcca66686e ] + +In case when tcp socket received FIN after some data and the +parser haven't started before reading data caller will receive +an empty buffer. This behavior differs from plain TCP socket and +leads to special treating in user-space. +The flow that triggers the race is simple. Server sends small +amount of data right after the connection is configured to use TLS +and closes the connection. In this case receiver sees TLS Handshake +data, configures TLS socket right after Change Cipher Spec record. +While the configuration is in process, TCP socket receives small +Application Data record, Encrypted Alert record and FIN packet. So +the TCP socket changes sk_shutdown to RCV_SHUTDOWN and sk_flag with +SK_DONE bit set. The received data is not parsed upon arrival and is +never sent to user-space. 
+ +Patch unpauses parser directly if we have unparsed data in tcp +receive queue. + +Fixes: fcf4793e278e ("tls: check RCV_SHUTDOWN in tls_wait_data") +Signed-off-by: Vadim Fedorenko +Link: https://lore.kernel.org/r/1605801588-12236-1-git-send-email-vfedorenko@novek.ru +Signed-off-by: Jakub Kicinski +Signed-off-by: Greg Kroah-Hartman +--- + net/tls/tls_sw.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +--- a/net/tls/tls_sw.c ++++ b/net/tls/tls_sw.c +@@ -1295,6 +1295,12 @@ static struct sk_buff *tls_wait_data(str + return NULL; + } + ++ if (!skb_queue_empty(&sk->sk_receive_queue)) { ++ __strp_unpause(&ctx->strp); ++ if (ctx->recv_pkt) ++ return ctx->recv_pkt; ++ } ++ + if (sk->sk_shutdown & RCV_SHUTDOWN) + return NULL; + diff --git a/queue-5.9/net-tls-protect-from-calling-tls_dev_del-for-tls-rx-twice.patch b/queue-5.9/net-tls-protect-from-calling-tls_dev_del-for-tls-rx-twice.patch new file mode 100644 index 00000000000..5346d39fa40 --- /dev/null +++ b/queue-5.9/net-tls-protect-from-calling-tls_dev_del-for-tls-rx-twice.patch @@ -0,0 +1,65 @@ +From foo@baz Wed Dec 2 10:40:54 AM CET 2020 +From: Maxim Mikityanskiy +Date: Wed, 25 Nov 2020 14:18:10 -0800 +Subject: net/tls: Protect from calling tls_dev_del for TLS RX twice + +From: Maxim Mikityanskiy + +[ Upstream commit 025cc2fb6a4e84e9a0552c0017dcd1c24b7ac7da ] + +tls_device_offload_cleanup_rx doesn't clear tls_ctx->netdev after +calling tls_dev_del if TLS TX offload is also enabled. Clearing +tls_ctx->netdev gets postponed until tls_device_gc_task. It leaves a +time frame when tls_device_down may get called and call tls_dev_del for +RX one extra time, confusing the driver, which may lead to a crash. + +This patch corrects this racy behavior by adding a flag to prevent +tls_device_down from calling tls_dev_del the second time. 
+ +Fixes: e8f69799810c ("net/tls: Add generic NIC offload infrastructure") +Signed-off-by: Maxim Mikityanskiy +Signed-off-by: Saeed Mahameed +Link: https://lore.kernel.org/r/20201125221810.69870-1-saeedm@nvidia.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Greg Kroah-Hartman +--- + include/net/tls.h | 6 ++++++ + net/tls/tls_device.c | 5 ++++- + 2 files changed, 10 insertions(+), 1 deletion(-) + +--- a/include/net/tls.h ++++ b/include/net/tls.h +@@ -199,6 +199,12 @@ enum tls_context_flags { + * to be atomic. + */ + TLS_TX_SYNC_SCHED = 1, ++ /* tls_dev_del was called for the RX side, device state was released, ++ * but tls_ctx->netdev might still be kept, because TX-side driver ++ * resources might not be released yet. Used to prevent the second ++ * tls_dev_del call in tls_device_down if it happens simultaneously. ++ */ ++ TLS_RX_DEV_CLOSED = 2, + }; + + struct cipher_context { +--- a/net/tls/tls_device.c ++++ b/net/tls/tls_device.c +@@ -1262,6 +1262,8 @@ void tls_device_offload_cleanup_rx(struc + if (tls_ctx->tx_conf != TLS_HW) { + dev_put(netdev); + tls_ctx->netdev = NULL; ++ } else { ++ set_bit(TLS_RX_DEV_CLOSED, &tls_ctx->flags); + } + out: + up_read(&device_offload_lock); +@@ -1291,7 +1293,8 @@ static int tls_device_down(struct net_de + if (ctx->tx_conf == TLS_HW) + netdev->tlsdev_ops->tls_dev_del(netdev, ctx, + TLS_OFFLOAD_CTX_DIR_TX); +- if (ctx->rx_conf == TLS_HW) ++ if (ctx->rx_conf == TLS_HW && ++ !test_bit(TLS_RX_DEV_CLOSED, &ctx->flags)) + netdev->tlsdev_ops->tls_dev_del(netdev, ctx, + TLS_OFFLOAD_CTX_DIR_RX); + WRITE_ONCE(ctx->netdev, NULL); diff --git a/queue-5.9/rose-fix-null-pointer-dereference-in-rose_send_frame.patch b/queue-5.9/rose-fix-null-pointer-dereference-in-rose_send_frame.patch new file mode 100644 index 00000000000..154266e6503 --- /dev/null +++ b/queue-5.9/rose-fix-null-pointer-dereference-in-rose_send_frame.patch @@ -0,0 +1,64 @@ +From foo@baz Wed Dec 2 10:40:54 AM CET 2020 +From: Anmol Karn +Date: Fri, 20 Nov 2020 00:40:43 +0530 
+Subject: rose: Fix Null pointer dereference in rose_send_frame() + +From: Anmol Karn + +[ Upstream commit 3b3fd068c56e3fbea30090859216a368398e39bf ] + +rose_send_frame() dereferences `neigh->dev` when called from +rose_transmit_clear_request(), and the first occurrence of the +`neigh` is in rose_loopback_timer() as `rose_loopback_neigh`, +and it is initialized in rose_add_loopback_neigh() as NULL. +i.e when `rose_loopback_neigh` used in rose_loopback_timer() +its `->dev` was still NULL and rose_loopback_timer() was calling +rose_rx_call_request() without checking for NULL. + +- net/rose/rose_link.c +This bug seems to get triggered in this line: + +rose_call = (ax25_address *)neigh->dev->dev_addr; + +Fix it by adding NULL checking for `rose_loopback_neigh->dev` +in rose_loopback_timer(). + +Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") +Suggested-by: Jakub Kicinski +Reported-by: syzbot+a1c743815982d9496393@syzkaller.appspotmail.com +Tested-by: syzbot+a1c743815982d9496393@syzkaller.appspotmail.com +Link: https://syzkaller.appspot.com/bug?id=9d2a7ca8c7f2e4b682c97578dfa3f236258300b3 +Signed-off-by: Anmol Karn +Link: https://lore.kernel.org/r/20201119191043.28813-1-anmol.karan123@gmail.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Greg Kroah-Hartman +--- + net/rose/rose_loopback.c | 17 +++++++++++++---- + 1 file changed, 13 insertions(+), 4 deletions(-) + +--- a/net/rose/rose_loopback.c ++++ b/net/rose/rose_loopback.c +@@ -96,10 +96,19 @@ static void rose_loopback_timer(struct t + } + + if (frametype == ROSE_CALL_REQUEST) { +- if ((dev = rose_dev_get(dest)) != NULL) { +- if (rose_rx_call_request(skb, dev, rose_loopback_neigh, lci_o) == 0) +- kfree_skb(skb); +- } else { ++ if (!rose_loopback_neigh->dev) { ++ kfree_skb(skb); ++ continue; ++ } ++ ++ dev = rose_dev_get(dest); ++ if (!dev) { ++ kfree_skb(skb); ++ continue; ++ } ++ ++ if (rose_rx_call_request(skb, dev, rose_loopback_neigh, lci_o) == 0) { ++ dev_put(dev); + kfree_skb(skb); + } + } else { diff --git 
a/queue-5.9/series b/queue-5.9/series new file mode 100644 index 00000000000..9b3575599b1 --- /dev/null +++ b/queue-5.9/series @@ -0,0 +1,15 @@ +devlink-hold-rtnl-lock-while-reading-netdev-attributes.patch +devlink-make-sure-devlink-instance-and-port-are-in-same-net-namespace.patch +ipv6-addrlabel-fix-possible-memory-leak-in-ip6addrlbl_net_init.patch +net-af_iucv-set-correct-sk_protocol-for-child-sockets.patch +net-openvswitch-fix-ttl-decrement-action-netlink-message-format.patch +net-tls-missing-received-data-after-fast-remote-close.patch +net-tls-protect-from-calling-tls_dev_del-for-tls-rx-twice.patch +rose-fix-null-pointer-dereference-in-rose_send_frame.patch +sock-set-sk_err-to-ee_errno-on-dequeue-from-errq.patch +tcp-set-inet_ecn_xmit-configuration-in-tcp_reinit_congestion_control.patch +tun-honor-iocb_nowait-flag.patch +usbnet-ipheth-fix-connectivity-with-ios-14.patch +vsock-virtio-discard-packets-only-when-socket-is-really-closed.patch +mptcp-fix-null-ptr-dereference-on-bad-mpj.patch +net-packet-fix-packet-receive-on-l3-devices-without-visible-hard-header.patch diff --git a/queue-5.9/sock-set-sk_err-to-ee_errno-on-dequeue-from-errq.patch b/queue-5.9/sock-set-sk_err-to-ee_errno-on-dequeue-from-errq.patch new file mode 100644 index 00000000000..7f1702049c6 --- /dev/null +++ b/queue-5.9/sock-set-sk_err-to-ee_errno-on-dequeue-from-errq.patch @@ -0,0 +1,47 @@ +From foo@baz Wed Dec 2 10:40:54 AM CET 2020 +From: Willem de Bruijn +Date: Thu, 26 Nov 2020 10:12:20 -0500 +Subject: sock: set sk_err to ee_errno on dequeue from errq + +From: Willem de Bruijn + +[ Upstream commit 985f7337421a811cb354ca93882f943c8335a6f5 ] + +When setting sk_err, set it to ee_errno, not ee_origin. 
+ +Commit f5f99309fa74 ("sock: do not set sk_err in +sock_dequeue_err_skb") disabled updating sk_err on errq dequeue, +which is correct for most error types (origins): + + - sk->sk_err = err; + +Commit 38b257938ac6 ("sock: reset sk_err when the error queue is +empty") reenabled the behavior for ICMP origins, which do require it: + + + if (icmp_next) + + sk->sk_err = SKB_EXT_ERR(skb_next)->ee.ee_origin; + +But read from ee_errno. + +Fixes: 38b257938ac6 ("sock: reset sk_err when the error queue is empty") +Reported-by: Ayush Ranjan +Signed-off-by: Willem de Bruijn +Acked-by: Soheil Hassas Yeganeh +Link: https://lore.kernel.org/r/20201126151220.2819322-1-willemdebruijn.kernel@gmail.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Greg Kroah-Hartman +--- + net/core/skbuff.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/net/core/skbuff.c ++++ b/net/core/skbuff.c +@@ -4555,7 +4555,7 @@ struct sk_buff *sock_dequeue_err_skb(str + if (skb && (skb_next = skb_peek(q))) { + icmp_next = is_icmp_err_skb(skb_next); + if (icmp_next) +- sk->sk_err = SKB_EXT_ERR(skb_next)->ee.ee_origin; ++ sk->sk_err = SKB_EXT_ERR(skb_next)->ee.ee_errno; + } + spin_unlock_irqrestore(&q->lock, flags); + diff --git a/queue-5.9/tcp-set-inet_ecn_xmit-configuration-in-tcp_reinit_congestion_control.patch b/queue-5.9/tcp-set-inet_ecn_xmit-configuration-in-tcp_reinit_congestion_control.patch new file mode 100644 index 00000000000..948a333a2d9 --- /dev/null +++ b/queue-5.9/tcp-set-inet_ecn_xmit-configuration-in-tcp_reinit_congestion_control.patch @@ -0,0 +1,49 @@ +From foo@baz Wed Dec 2 10:40:54 AM CET 2020 +From: Alexander Duyck +Date: Thu, 19 Nov 2020 13:23:58 -0800 +Subject: tcp: Set INET_ECN_xmit configuration in tcp_reinit_congestion_control + +From: Alexander Duyck + +[ Upstream commit 55472017a4219ca965a957584affdb17549ae4a4 ] + +When setting congestion control via a BPF program it is seen that the +SYN/ACK for packets within a given flow will not include the ECT0 flag. 
A +bit of simple printk debugging shows that when this is configured without +BPF we will see the INET_ECN_xmit value initialized in +tcp_assign_congestion_control however when we configure this via BPF the +socket is in the closed state and as such it isn't configured, and I do not +see it being initialized when we transition the socket into the listen +state. The result of this is that the ECT0 bit is configured based on +whatever the default state is for the socket. + +An easy way to reproduce this is to monitor the following with tcpdump: +tools/testing/selftests/bpf/test_progs -t bpf_tcp_ca + +Without this patch the SYN/ACK will follow whatever the default is. If dctcp +all SYN/ACK packets will have the ECT0 bit set, and if it is not then ECT0 +will be cleared on all SYN/ACK packets. With this patch applied the SYN/ACK +bit matches the value seen on the other packets in the given stream. + +Fixes: 91b5b21c7c16 ("bpf: Add support for changing congestion control") +Signed-off-by: Alexander Duyck +Signed-off-by: Jakub Kicinski +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/tcp_cong.c | 5 +++++ + 1 file changed, 5 insertions(+) + +--- a/net/ipv4/tcp_cong.c ++++ b/net/ipv4/tcp_cong.c +@@ -197,6 +197,11 @@ static void tcp_reinit_congestion_contro + icsk->icsk_ca_setsockopt = 1; + memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); + ++ if (ca->flags & TCP_CONG_NEEDS_ECN) ++ INET_ECN_xmit(sk); ++ else ++ INET_ECN_dontxmit(sk); ++ + if (!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) + tcp_init_congestion_control(sk); + } diff --git a/queue-5.9/tun-honor-iocb_nowait-flag.patch b/queue-5.9/tun-honor-iocb_nowait-flag.patch new file mode 100644 index 00000000000..0554741a652 --- /dev/null +++ b/queue-5.9/tun-honor-iocb_nowait-flag.patch @@ -0,0 +1,59 @@ +From foo@baz Wed Dec 2 10:40:54 AM CET 2020 +From: Jens Axboe +Date: Fri, 20 Nov 2020 07:59:54 -0700 +Subject: tun: honor IOCB_NOWAIT flag + +From: Jens Axboe + +[ Upstream commit
5aac0390a63b8718237a61dd0d24a29201d1c94a ] + +tun only checks the file O_NONBLOCK flag, but it should also be checking +the iocb IOCB_NOWAIT flag. Any fops using ->read/write_iter() should check +both, otherwise it breaks users that correctly expect O_NONBLOCK semantics +if IOCB_NOWAIT is set. + +Signed-off-by: Jens Axboe +Link: https://lore.kernel.org/r/e9451860-96cc-c7c7-47b8-fe42cadd5f4c@kernel.dk +Signed-off-by: Jakub Kicinski +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/tun.c | 14 +++++++++++--- + 1 file changed, 11 insertions(+), 3 deletions(-) + +--- a/drivers/net/tun.c ++++ b/drivers/net/tun.c +@@ -1979,12 +1979,15 @@ static ssize_t tun_chr_write_iter(struct + struct tun_file *tfile = file->private_data; + struct tun_struct *tun = tun_get(tfile); + ssize_t result; ++ int noblock = 0; + + if (!tun) + return -EBADFD; + +- result = tun_get_user(tun, tfile, NULL, from, +- file->f_flags & O_NONBLOCK, false); ++ if ((file->f_flags & O_NONBLOCK) || (iocb->ki_flags & IOCB_NOWAIT)) ++ noblock = 1; ++ ++ result = tun_get_user(tun, tfile, NULL, from, noblock, false); + + tun_put(tun); + return result; +@@ -2203,10 +2206,15 @@ static ssize_t tun_chr_read_iter(struct + struct tun_file *tfile = file->private_data; + struct tun_struct *tun = tun_get(tfile); + ssize_t len = iov_iter_count(to), ret; ++ int noblock = 0; + + if (!tun) + return -EBADFD; +- ret = tun_do_read(tun, tfile, to, file->f_flags & O_NONBLOCK, NULL); ++ ++ if ((file->f_flags & O_NONBLOCK) || (iocb->ki_flags & IOCB_NOWAIT)) ++ noblock = 1; ++ ++ ret = tun_do_read(tun, tfile, to, noblock, NULL); + ret = min_t(ssize_t, ret, len); + if (ret > 0) + iocb->ki_pos = ret; diff --git a/queue-5.9/usbnet-ipheth-fix-connectivity-with-ios-14.patch b/queue-5.9/usbnet-ipheth-fix-connectivity-with-ios-14.patch new file mode 100644 index 00000000000..68e54bdf357 --- /dev/null +++ b/queue-5.9/usbnet-ipheth-fix-connectivity-with-ios-14.patch @@ -0,0 +1,49 @@ +From foo@baz Wed Dec 2 10:40:54 AM CET 2020 +From: 
Yves-Alexis Perez +Date: Thu, 19 Nov 2020 18:24:39 +0100 +Subject: usbnet: ipheth: fix connectivity with iOS 14 + +From: Yves-Alexis Perez + +[ Upstream commit f33d9e2b48a34e1558b67a473a1fc1d6e793f93c ] + +Starting with iOS 14 released in September 2020, connectivity using the +personal hotspot USB tethering function of iOS devices is broken. + +Communication between the host and the device (for example ICMP traffic +or DNS resolution using the DNS service running in the device itself) +works fine, but communication to endpoints further away doesn't work. + +Investigation on the matter shows that no UDP and ICMP traffic from the +tethered host is reaching the Internet at all. For TCP traffic there are +exchanges between tethered host and server but packets are modified in +transit leading to impossible communication. + +After some trials Matti Vuorela discovered that reducing the URB buffer +size by two bytes restored the previous behavior. While a better +solution might exist to fix the issue, since the protocol is not +publicly documented and considering the small size of the fix, let's do +that. 
+ +Tested-by: Matti Vuorela +Signed-off-by: Yves-Alexis Perez +Link: https://lore.kernel.org/linux-usb/CAAn0qaXmysJ9vx3ZEMkViv_B19ju-_ExN8Yn_uSefxpjS6g4Lw@mail.gmail.com/ +Link: https://github.com/libimobiledevice/libimobiledevice/issues/1038 +Link: https://lore.kernel.org/r/20201119172439.94988-1-corsac@corsac.net +Signed-off-by: Jakub Kicinski +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/usb/ipheth.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/drivers/net/usb/ipheth.c ++++ b/drivers/net/usb/ipheth.c +@@ -59,7 +59,7 @@ + #define IPHETH_USBINTF_SUBCLASS 253 + #define IPHETH_USBINTF_PROTO 1 + +-#define IPHETH_BUF_SIZE 1516 ++#define IPHETH_BUF_SIZE 1514 + #define IPHETH_IP_ALIGN 2 /* padding at front of URB */ + #define IPHETH_TX_TIMEOUT (5 * HZ) + diff --git a/queue-5.9/vsock-virtio-discard-packets-only-when-socket-is-really-closed.patch b/queue-5.9/vsock-virtio-discard-packets-only-when-socket-is-really-closed.patch new file mode 100644 index 00000000000..0094056a399 --- /dev/null +++ b/queue-5.9/vsock-virtio-discard-packets-only-when-socket-is-really-closed.patch @@ -0,0 +1,73 @@ +From foo@baz Wed Dec 2 10:40:54 AM CET 2020 +From: Stefano Garzarella +Date: Fri, 20 Nov 2020 11:47:36 +0100 +Subject: vsock/virtio: discard packets only when socket is really closed + +From: Stefano Garzarella + +[ Upstream commit 3fe356d58efae54dade9ec94ea7c919ed20cf4db ] + +Starting from commit 8692cefc433f ("virtio_vsock: Fix race condition +in virtio_transport_recv_pkt"), we discard packets in +virtio_transport_recv_pkt() if the socket has been released. + +When the socket is connected, we schedule a delayed work to wait for the +RST packet from the other peer, also if SHUTDOWN_MASK is set in +sk->sk_shutdown. +This is done to complete the virtio-vsock shutdown algorithm, releasing +the port assigned to the socket definitively only when the other peer +has consumed all the packets.
+ +If we discard the RST packet received, the socket will be closed only +when the VSOCK_CLOSE_TIMEOUT is reached. + +Sergio discovered the issue while running ab(1) HTTP benchmark using +libkrun [1] and observing a latency increase with that commit. + +To avoid this issue, we discard packets only if the socket is really +closed (SOCK_DONE flag is set). +We also set SOCK_DONE in virtio_transport_release() when we don't need +to wait for any packets from the other peer (we didn't schedule the delayed +work). In this case we remove the socket from the vsock lists, releasing +the port assigned. + +[1] https://github.com/containers/libkrun + +Fixes: 8692cefc433f ("virtio_vsock: Fix race condition in virtio_transport_recv_pkt") +Cc: justin.he@arm.com +Reported-by: Sergio Lopez +Tested-by: Sergio Lopez +Signed-off-by: Stefano Garzarella +Acked-by: Jia He +Link: https://lore.kernel.org/r/20201120104736.73749-1-sgarzare@redhat.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Greg Kroah-Hartman +--- + net/vmw_vsock/virtio_transport_common.c | 8 +++++--- + 1 file changed, 5 insertions(+), 3 deletions(-) + +--- a/net/vmw_vsock/virtio_transport_common.c ++++ b/net/vmw_vsock/virtio_transport_common.c +@@ -841,8 +841,10 @@ void virtio_transport_release(struct vso + virtio_transport_free_pkt(pkt); + } + +- if (remove_sock) ++ if (remove_sock) { ++ sock_set_flag(sk, SOCK_DONE); + vsock_remove_sock(vsk); ++ } + } + EXPORT_SYMBOL_GPL(virtio_transport_release); + +@@ -1132,8 +1134,8 @@ void virtio_transport_recv_pkt(struct vi + + lock_sock(sk); + +- /* Check if sk has been released before lock_sock */ +- if (sk->sk_shutdown == SHUTDOWN_MASK) { ++ /* Check if sk has been closed before lock_sock */ ++ if (sock_flag(sk, SOCK_DONE)) { + (void)virtio_transport_reset_no_sock(t, pkt); + release_sock(sk); + sock_put(sk);