From: Greg Kroah-Hartman Date: Thu, 14 Dec 2017 10:47:09 +0000 (+0100) Subject: 4.14-stable patches X-Git-Tag: v3.18.88~23 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=fa55523a3ed86c223eb79cc908d8fb7644b2b682;p=thirdparty%2Fkernel%2Fstable-queue.git 4.14-stable patches added patches: cls_bpf-don-t-decrement-net-s-refcount-when-offload-fails.patch net-accept-ufo-datagrams-from-tuntap-and-packet.patch net-ipv6-fixup-device-for-anycast-routes-during-copy.patch net-openvswitch-datapath-fix-data-type-in-queue_gso_packets.patch net-packet-fix-a-race-in-packet_bind-and-packet_notifier.patch net-qmi_wwan-add-quectel-bg96-2c7c-0296.patch net-realtek-r8169-implement-set_link_ksettings.patch net-remove-hlist_nulls_add_tail_rcu.patch net-sched-cbq-create-block-for-q-link.block.patch net-thunderx-fix-tcp-udp-checksum-offload-for-ipv4-pkts.patch net-thunderx-fix-tcp-udp-checksum-offload-for-ipv6-pkts.patch packet-fix-crash-in-fanout_demux_rollover.patch rds-fix-null-pointer-dereference-in-__rds_rdma_map.patch s390-qeth-build-max-size-gso-skbs-on-l2-devices.patch s390-qeth-fix-early-exit-from-error-path.patch s390-qeth-fix-gso-throughput-regression.patch s390-qeth-fix-thinko-in-ipv4-multicast-address-tracking.patch sctp-use-right-member-as-the-param-of-list_for_each_entry.patch sit-update-frag_off-info.patch stmmac-reset-last-tso-segment-size-after-device-open.patch tap-free-skb-if-flags-error.patch tcp-add-tcp_v4_fill_cb-tcp_v4_restore_cb.patch tcp-dccp-block-bh-before-arming-time_wait-timer.patch tcp-remove-buggy-call-to-tcp_v6_restore_cb.patch tcp-use-current-time-in-tcp_rcv_space_adjust.patch tcp-use-ipcb-instead-of-tcp_skb_cb-in-inet_exact_dif_match.patch tcp-when-scheduling-tlp-time-of-rto-should-account-for-current-ack.patch tipc-call-tipc_rcv-only-if-bearer-is-up-in-tipc_udp_recv.patch tipc-fix-memory-leak-in-tipc_accept_from_sock.patch tun-fix-rcu_read_lock-imbalance-in-tun_build_skb.patch tun-free-skb-in-early-errors.patch usbnet-fix-alignment-for-frames-with-no-ethernet-header.patch vhost-fix-skb-leak-in-handle_rx.patch --- diff --git a/queue-4.14/cls_bpf-don-t-decrement-net-s-refcount-when-offload-fails.patch b/queue-4.14/cls_bpf-don-t-decrement-net-s-refcount-when-offload-fails.patch new file mode 100644 index 00000000000..90d84649f63 --- /dev/null +++ b/queue-4.14/cls_bpf-don-t-decrement-net-s-refcount-when-offload-fails.patch @@ -0,0 +1,88 @@ +From foo@baz Thu Dec 14 11:45:40 CET 2017 +From: Jakub Kicinski +Date: Mon, 27 Nov 2017 11:11:41 -0800 +Subject: cls_bpf: don't decrement net's refcount when offload fails + +From: Jakub Kicinski + + +[ Upstream commit 25415cec502a1232b19fffc85465882b19a90415 ] + +When cls_bpf offload was added it seemed like a good idea to +call cls_bpf_delete_prog() instead of extending the error +handling path, since the software state is fully initialized +at that point. This handling of errors without jumping to +the end of the function is error prone, as proven by a later +commit missing that extra call to __cls_bpf_delete_prog(). + +__cls_bpf_delete_prog() is now expected to be invoked with +a reference on exts->net or the field zeroed out. The call +on the offload's error path does not fulfil this requirement, +leading to each error stealing a reference on the net namespace. + +Create a function undoing what cls_bpf_set_parms() did and +use it from __cls_bpf_delete_prog() and the error path. 
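+
+To illustrate the shape the code below converges on -- a minimal,
+self-contained sketch of the goto-based unwind idiom, with placeholder
+helpers standing in for the real cls_bpf functions (illustrative only,
+not the patch code itself):
+
+    static int setup_parms(void)  { return 0; }  /* stand-in for cls_bpf_set_parms() */
+    static int do_offload(void)   { return -1; } /* stand-in for cls_bpf_offload() */
+    static void free_parms(void)  { }            /* stand-in for the new cleanup helper */
+
+    static int example_change(void)
+    {
+        int err = setup_parms();
+        if (err)
+            goto errout;
+
+        err = do_offload();
+        if (err)
+            goto errout_parms;
+        return 0;
+
+    errout_parms:
+        free_parms();  /* undo exactly what setup_parms() did */
+    errout:
+        return err;
+    }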
+ +Fixes: aae2c35ec892 ("cls_bpf: use tcf_exts_get_net() before call_rcu()") +Signed-off-by: Jakub Kicinski +Reviewed-by: Simon Horman +Acked-by: Daniel Borkmann +Acked-by: Cong Wang +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/sched/cls_bpf.c | 21 +++++++++++++-------- + 1 file changed, 13 insertions(+), 8 deletions(-) + +--- a/net/sched/cls_bpf.c ++++ b/net/sched/cls_bpf.c +@@ -246,11 +246,8 @@ static int cls_bpf_init(struct tcf_proto + return 0; + } + +-static void __cls_bpf_delete_prog(struct cls_bpf_prog *prog) ++static void cls_bpf_free_parms(struct cls_bpf_prog *prog) + { +- tcf_exts_destroy(&prog->exts); +- tcf_exts_put_net(&prog->exts); +- + if (cls_bpf_is_ebpf(prog)) + bpf_prog_put(prog->filter); + else +@@ -258,6 +255,14 @@ static void __cls_bpf_delete_prog(struct + + kfree(prog->bpf_name); + kfree(prog->bpf_ops); ++} ++ ++static void __cls_bpf_delete_prog(struct cls_bpf_prog *prog) ++{ ++ tcf_exts_destroy(&prog->exts); ++ tcf_exts_put_net(&prog->exts); ++ ++ cls_bpf_free_parms(prog); + kfree(prog); + } + +@@ -509,10 +514,8 @@ static int cls_bpf_change(struct net *ne + goto errout; + + ret = cls_bpf_offload(tp, prog, oldprog); +- if (ret) { +- __cls_bpf_delete_prog(prog); +- return ret; +- } ++ if (ret) ++ goto errout_parms; + + if (!tc_in_hw(prog->gen_flags)) + prog->gen_flags |= TCA_CLS_FLAGS_NOT_IN_HW; +@@ -529,6 +532,8 @@ static int cls_bpf_change(struct net *ne + *arg = prog; + return 0; + ++errout_parms: ++ cls_bpf_free_parms(prog); + errout: + tcf_exts_destroy(&prog->exts); + kfree(prog); diff --git a/queue-4.14/net-accept-ufo-datagrams-from-tuntap-and-packet.patch b/queue-4.14/net-accept-ufo-datagrams-from-tuntap-and-packet.patch new file mode 100644 index 00000000000..a1c80caca1b --- /dev/null +++ b/queue-4.14/net-accept-ufo-datagrams-from-tuntap-and-packet.patch @@ -0,0 +1,519 @@ +From foo@baz Thu Dec 14 11:45:40 CET 2017 +From: Willem de Bruijn +Date: Tue, 21 Nov 2017 10:22:25 -0500 +Subject: net: accept UFO datagrams from tuntap and packet + +From: Willem de Bruijn + + +[ Upstream commit 0c19f846d582af919db66a5914a0189f9f92c936 ] + +Tuntap and similar devices can inject GSO packets. Accept type +VIRTIO_NET_HDR_GSO_UDP, even though not generating UFO natively. + +Processes are expected to use feature negotiation such as TUNSETOFFLOAD +to detect supported offload types and refrain from injecting other +packets. This process breaks down with live migration: guest kernels +do not renegotiate flags, so destination hosts need to expose all +features that the source host does. + +Partially revert the UFO removal from 182e0b6b5846~1..d9d30adf5677. +This patch introduces nearly(*) no new code to simplify verification. +It brings back verbatim tuntap UFO negotiation, VIRTIO_NET_HDR_GSO_UDP +insertion and software UFO segmentation. + +It does not reinstate protocol stack support, hardware offload +(NETIF_F_UFO), SKB_GSO_UDP tunneling in SKB_GSO_SOFTWARE or reception +of VIRTIO_NET_HDR_GSO_UDP packets in tuntap. + +To support SKB_GSO_UDP reappearing in the stack, also reinstate +logic in act_csum and openvswitch. Achieve equivalence with v4.13 HEAD +by squashing in commit 939912216fa8 ("net: skb_needs_check() removes +CHECKSUM_UNNECESSARY check for tx.") and reverting commit 8d63bee643f1 +("net: avoid skb_warn_bad_offload false positives on UFO"). + +(*) To avoid having to bring back skb_shinfo(skb)->ip6_frag_id, +ipv6_proxy_select_ident is changed to return a __be32 and this is +assigned directly to the frag_hdr. 
Also, SKB_GSO_UDP is inserted +at the end of the enum to minimize code churn. + +Tested + Booted a v4.13 guest kernel with QEMU. On a host kernel before this + patch `ethtool -k eth0` shows UFO disabled. After the patch, it is + enabled, same as on a v4.13 host kernel. + + A UFO packet sent from the guest appears on the tap device: + host: + nc -l -p -u 8000 & + tcpdump -n -i tap0 + + guest: + dd if=/dev/zero of=payload.txt bs=1 count=2000 + nc -u 192.16.1.1 8000 < payload.txt + + Direct tap to tap transmission of VIRTIO_NET_HDR_GSO_UDP succeeds, + packets arriving fragmented: + + ./with_tap_pair.sh ./tap_send_ufo tap0 tap1 + (from https://github.com/wdebruij/kerneltools/tree/master/tests) + +Changes + v1 -> v2 + - simplified set_offload change (review comment) + - documented test procedure + +Link: http://lkml.kernel.org/r/ +Fixes: fb652fdfe837 ("macvlan/macvtap: Remove NETIF_F_UFO advertisement.") +Reported-by: Michal Kubecek +Signed-off-by: Willem de Bruijn +Acked-by: Jason Wang +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/tap.c | 2 + drivers/net/tun.c | 2 + include/linux/netdev_features.h | 4 + + include/linux/netdevice.h | 1 + include/linux/skbuff.h | 2 + include/linux/virtio_net.h | 5 +- + include/net/ipv6.h | 2 + net/core/dev.c | 3 - + net/ipv4/af_inet.c | 12 ++++- + net/ipv4/udp_offload.c | 49 +++++++++++++++++++++-- + net/ipv6/output_core.c | 6 +- + net/ipv6/udp_offload.c | 85 ++++++++++++++++++++++++++++++++++++++-- + net/openvswitch/datapath.c | 14 ++++++ + net/openvswitch/flow.c | 6 ++ + net/sched/act_csum.c | 6 ++ + 15 files changed, 181 insertions(+), 18 deletions(-) + +--- a/drivers/net/tap.c ++++ b/drivers/net/tap.c +@@ -1080,7 +1080,7 @@ static long tap_ioctl(struct file *file, + case TUNSETOFFLOAD: + /* let the user check for future flags */ + if (arg & ~(TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 | +- TUN_F_TSO_ECN)) ++ TUN_F_TSO_ECN | TUN_F_UFO)) + return -EINVAL; + + rtnl_lock(); +--- a/drivers/net/tun.c ++++ b/drivers/net/tun.c +@@ -2157,6 +2157,8 @@ static int set_offload(struct tun_struct + features |= NETIF_F_TSO6; + arg &= ~(TUN_F_TSO4|TUN_F_TSO6); + } ++ ++ arg &= ~TUN_F_UFO; + } + + /* This gives the user a way to test for new features in future by +--- a/include/linux/netdev_features.h ++++ b/include/linux/netdev_features.h +@@ -54,8 +54,9 @@ enum { + NETIF_F_GSO_TUNNEL_REMCSUM_BIT, /* ... TUNNEL with TSO & REMCSUM */ + NETIF_F_GSO_SCTP_BIT, /* ... SCTP fragmentation */ + NETIF_F_GSO_ESP_BIT, /* ... ESP with TSO */ ++ NETIF_F_GSO_UDP_BIT, /* ... 
UFO, deprecated except tuntap */ + /**/NETIF_F_GSO_LAST = /* last bit, see GSO_MASK */ +- NETIF_F_GSO_ESP_BIT, ++ NETIF_F_GSO_UDP_BIT, + + NETIF_F_FCOE_CRC_BIT, /* FCoE CRC32 */ + NETIF_F_SCTP_CRC_BIT, /* SCTP checksum offload */ +@@ -132,6 +133,7 @@ enum { + #define NETIF_F_GSO_TUNNEL_REMCSUM __NETIF_F(GSO_TUNNEL_REMCSUM) + #define NETIF_F_GSO_SCTP __NETIF_F(GSO_SCTP) + #define NETIF_F_GSO_ESP __NETIF_F(GSO_ESP) ++#define NETIF_F_GSO_UDP __NETIF_F(GSO_UDP) + #define NETIF_F_HW_VLAN_STAG_FILTER __NETIF_F(HW_VLAN_STAG_FILTER) + #define NETIF_F_HW_VLAN_STAG_RX __NETIF_F(HW_VLAN_STAG_RX) + #define NETIF_F_HW_VLAN_STAG_TX __NETIF_F(HW_VLAN_STAG_TX) +--- a/include/linux/netdevice.h ++++ b/include/linux/netdevice.h +@@ -4101,6 +4101,7 @@ static inline bool net_gso_ok(netdev_fea + BUILD_BUG_ON(SKB_GSO_TUNNEL_REMCSUM != (NETIF_F_GSO_TUNNEL_REMCSUM >> NETIF_F_GSO_SHIFT)); + BUILD_BUG_ON(SKB_GSO_SCTP != (NETIF_F_GSO_SCTP >> NETIF_F_GSO_SHIFT)); + BUILD_BUG_ON(SKB_GSO_ESP != (NETIF_F_GSO_ESP >> NETIF_F_GSO_SHIFT)); ++ BUILD_BUG_ON(SKB_GSO_UDP != (NETIF_F_GSO_UDP >> NETIF_F_GSO_SHIFT)); + + return (features & feature) == feature; + } +--- a/include/linux/skbuff.h ++++ b/include/linux/skbuff.h +@@ -569,6 +569,8 @@ enum { + SKB_GSO_SCTP = 1 << 14, + + SKB_GSO_ESP = 1 << 15, ++ ++ SKB_GSO_UDP = 1 << 16, + }; + + #if BITS_PER_LONG > 32 +--- a/include/linux/virtio_net.h ++++ b/include/linux/virtio_net.h +@@ -9,7 +9,7 @@ static inline int virtio_net_hdr_to_skb( + const struct virtio_net_hdr *hdr, + bool little_endian) + { +- unsigned short gso_type = 0; ++ unsigned int gso_type = 0; + + if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) { + switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) { +@@ -19,6 +19,9 @@ static inline int virtio_net_hdr_to_skb( + case VIRTIO_NET_HDR_GSO_TCPV6: + gso_type = SKB_GSO_TCPV6; + break; ++ case VIRTIO_NET_HDR_GSO_UDP: ++ gso_type = SKB_GSO_UDP; ++ break; + default: + return -EINVAL; + } +--- a/include/net/ipv6.h ++++ b/include/net/ipv6.h +@@ -727,7 +727,7 @@ static inline int ipv6_addr_diff(const s + __be32 ipv6_select_ident(struct net *net, + const struct in6_addr *daddr, + const struct in6_addr *saddr); +-void ipv6_proxy_select_ident(struct net *net, struct sk_buff *skb); ++__be32 ipv6_proxy_select_ident(struct net *net, struct sk_buff *skb); + + int ip6_dst_hoplimit(struct dst_entry *dst); + +--- a/net/core/dev.c ++++ b/net/core/dev.c +@@ -2735,7 +2735,8 @@ EXPORT_SYMBOL(skb_mac_gso_segment); + static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path) + { + if (tx_path) +- return skb->ip_summed != CHECKSUM_PARTIAL; ++ return skb->ip_summed != CHECKSUM_PARTIAL && ++ skb->ip_summed != CHECKSUM_UNNECESSARY; + + return skb->ip_summed == CHECKSUM_NONE; + } +--- a/net/ipv4/af_inet.c ++++ b/net/ipv4/af_inet.c +@@ -1221,9 +1221,10 @@ EXPORT_SYMBOL(inet_sk_rebuild_header); + struct sk_buff *inet_gso_segment(struct sk_buff *skb, + netdev_features_t features) + { +- bool fixedid = false, gso_partial, encap; ++ bool udpfrag = false, fixedid = false, gso_partial, encap; + struct sk_buff *segs = ERR_PTR(-EINVAL); + const struct net_offload *ops; ++ unsigned int offset = 0; + struct iphdr *iph; + int proto, tot_len; + int nhoff; +@@ -1258,6 +1259,7 @@ struct sk_buff *inet_gso_segment(struct + segs = ERR_PTR(-EPROTONOSUPPORT); + + if (!skb->encapsulation || encap) { ++ udpfrag = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP); + fixedid = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TCP_FIXEDID); + + /* fixed ID is invalid if DF bit is not set */ +@@ -1277,7 +1279,13 @@ struct sk_buff 
*inet_gso_segment(struct + skb = segs; + do { + iph = (struct iphdr *)(skb_mac_header(skb) + nhoff); +- if (skb_is_gso(skb)) { ++ if (udpfrag) { ++ iph->frag_off = htons(offset >> 3); ++ if (skb->next) ++ iph->frag_off |= htons(IP_MF); ++ offset += skb->len - nhoff - ihl; ++ tot_len = skb->len - nhoff; ++ } else if (skb_is_gso(skb)) { + if (!fixedid) { + iph->id = htons(id); + id += skb_shinfo(skb)->gso_segs; +--- a/net/ipv4/udp_offload.c ++++ b/net/ipv4/udp_offload.c +@@ -187,16 +187,57 @@ out_unlock: + } + EXPORT_SYMBOL(skb_udp_tunnel_segment); + +-static struct sk_buff *udp4_tunnel_segment(struct sk_buff *skb, +- netdev_features_t features) ++static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, ++ netdev_features_t features) + { + struct sk_buff *segs = ERR_PTR(-EINVAL); ++ unsigned int mss; ++ __wsum csum; ++ struct udphdr *uh; ++ struct iphdr *iph; + + if (skb->encapsulation && + (skb_shinfo(skb)->gso_type & +- (SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM))) ++ (SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM))) { + segs = skb_udp_tunnel_segment(skb, features, false); ++ goto out; ++ } ++ ++ if (!pskb_may_pull(skb, sizeof(struct udphdr))) ++ goto out; + ++ mss = skb_shinfo(skb)->gso_size; ++ if (unlikely(skb->len <= mss)) ++ goto out; ++ ++ /* Do software UFO. Complete and fill in the UDP checksum as ++ * HW cannot do checksum of UDP packets sent as multiple ++ * IP fragments. ++ */ ++ ++ uh = udp_hdr(skb); ++ iph = ip_hdr(skb); ++ ++ uh->check = 0; ++ csum = skb_checksum(skb, 0, skb->len, 0); ++ uh->check = udp_v4_check(skb->len, iph->saddr, iph->daddr, csum); ++ if (uh->check == 0) ++ uh->check = CSUM_MANGLED_0; ++ ++ skb->ip_summed = CHECKSUM_UNNECESSARY; ++ ++ /* If there is no outer header we can fake a checksum offload ++ * due to the fact that we have already done the checksum in ++ * software prior to segmenting the frame. ++ */ ++ if (!skb->encap_hdr_csum) ++ features |= NETIF_F_HW_CSUM; ++ ++ /* Fragment the skb. IP headers of the fragments are updated in ++ * inet_gso_segment() ++ */ ++ segs = skb_segment(skb, features); ++out: + return segs; + } + +@@ -330,7 +371,7 @@ static int udp4_gro_complete(struct sk_b + + static const struct net_offload udpv4_offload = { + .callbacks = { +- .gso_segment = udp4_tunnel_segment, ++ .gso_segment = udp4_ufo_fragment, + .gro_receive = udp4_gro_receive, + .gro_complete = udp4_gro_complete, + }, +--- a/net/ipv6/output_core.c ++++ b/net/ipv6/output_core.c +@@ -39,7 +39,7 @@ static u32 __ipv6_select_ident(struct ne + * + * The network header must be set before calling this. 
+ */ +-void ipv6_proxy_select_ident(struct net *net, struct sk_buff *skb) ++__be32 ipv6_proxy_select_ident(struct net *net, struct sk_buff *skb) + { + static u32 ip6_proxy_idents_hashrnd __read_mostly; + struct in6_addr buf[2]; +@@ -51,14 +51,14 @@ void ipv6_proxy_select_ident(struct net + offsetof(struct ipv6hdr, saddr), + sizeof(buf), buf); + if (!addrs) +- return; ++ return 0; + + net_get_random_once(&ip6_proxy_idents_hashrnd, + sizeof(ip6_proxy_idents_hashrnd)); + + id = __ipv6_select_ident(net, ip6_proxy_idents_hashrnd, + &addrs[1], &addrs[0]); +- skb_shinfo(skb)->ip6_frag_id = htonl(id); ++ return htonl(id); + } + EXPORT_SYMBOL_GPL(ipv6_proxy_select_ident); + +--- a/net/ipv6/udp_offload.c ++++ b/net/ipv6/udp_offload.c +@@ -17,15 +17,94 @@ + #include + #include "ip6_offload.h" + +-static struct sk_buff *udp6_tunnel_segment(struct sk_buff *skb, +- netdev_features_t features) ++static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, ++ netdev_features_t features) + { + struct sk_buff *segs = ERR_PTR(-EINVAL); ++ unsigned int mss; ++ unsigned int unfrag_ip6hlen, unfrag_len; ++ struct frag_hdr *fptr; ++ u8 *packet_start, *prevhdr; ++ u8 nexthdr; ++ u8 frag_hdr_sz = sizeof(struct frag_hdr); ++ __wsum csum; ++ int tnl_hlen; ++ int err; ++ ++ mss = skb_shinfo(skb)->gso_size; ++ if (unlikely(skb->len <= mss)) ++ goto out; + + if (skb->encapsulation && skb_shinfo(skb)->gso_type & + (SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM)) + segs = skb_udp_tunnel_segment(skb, features, true); ++ else { ++ const struct ipv6hdr *ipv6h; ++ struct udphdr *uh; ++ ++ if (!pskb_may_pull(skb, sizeof(struct udphdr))) ++ goto out; ++ ++ /* Do software UFO. Complete and fill in the UDP checksum as HW cannot ++ * do checksum of UDP packets sent as multiple IP fragments. ++ */ ++ ++ uh = udp_hdr(skb); ++ ipv6h = ipv6_hdr(skb); ++ ++ uh->check = 0; ++ csum = skb_checksum(skb, 0, skb->len, 0); ++ uh->check = udp_v6_check(skb->len, &ipv6h->saddr, ++ &ipv6h->daddr, csum); ++ if (uh->check == 0) ++ uh->check = CSUM_MANGLED_0; ++ ++ skb->ip_summed = CHECKSUM_UNNECESSARY; ++ ++ /* If there is no outer header we can fake a checksum offload ++ * due to the fact that we have already done the checksum in ++ * software prior to segmenting the frame. ++ */ ++ if (!skb->encap_hdr_csum) ++ features |= NETIF_F_HW_CSUM; ++ ++ /* Check if there is enough headroom to insert fragment header. */ ++ tnl_hlen = skb_tnl_header_len(skb); ++ if (skb->mac_header < (tnl_hlen + frag_hdr_sz)) { ++ if (gso_pskb_expand_head(skb, tnl_hlen + frag_hdr_sz)) ++ goto out; ++ } ++ ++ /* Find the unfragmentable header and shift it left by frag_hdr_sz ++ * bytes to insert fragment header. ++ */ ++ err = ip6_find_1stfragopt(skb, &prevhdr); ++ if (err < 0) ++ return ERR_PTR(err); ++ unfrag_ip6hlen = err; ++ nexthdr = *prevhdr; ++ *prevhdr = NEXTHDR_FRAGMENT; ++ unfrag_len = (skb_network_header(skb) - skb_mac_header(skb)) + ++ unfrag_ip6hlen + tnl_hlen; ++ packet_start = (u8 *) skb->head + SKB_GSO_CB(skb)->mac_offset; ++ memmove(packet_start-frag_hdr_sz, packet_start, unfrag_len); ++ ++ SKB_GSO_CB(skb)->mac_offset -= frag_hdr_sz; ++ skb->mac_header -= frag_hdr_sz; ++ skb->network_header -= frag_hdr_sz; ++ ++ fptr = (struct frag_hdr *)(skb_network_header(skb) + unfrag_ip6hlen); ++ fptr->nexthdr = nexthdr; ++ fptr->reserved = 0; ++ fptr->identification = ipv6_proxy_select_ident(dev_net(skb->dev), skb); ++ ++ /* Fragment the skb. 
ipv6 header and the remaining fields of the ++ * fragment header are updated in ipv6_gso_segment() ++ */ ++ segs = skb_segment(skb, features); ++ } + ++out: + return segs; + } + +@@ -75,7 +154,7 @@ static int udp6_gro_complete(struct sk_b + + static const struct net_offload udpv6_offload = { + .callbacks = { +- .gso_segment = udp6_tunnel_segment, ++ .gso_segment = udp6_ufo_fragment, + .gro_receive = udp6_gro_receive, + .gro_complete = udp6_gro_complete, + }, +--- a/net/openvswitch/datapath.c ++++ b/net/openvswitch/datapath.c +@@ -335,6 +335,8 @@ static int queue_gso_packets(struct data + const struct dp_upcall_info *upcall_info, + uint32_t cutlen) + { ++ unsigned short gso_type = skb_shinfo(skb)->gso_type; ++ struct sw_flow_key later_key; + struct sk_buff *segs, *nskb; + int err; + +@@ -345,9 +347,21 @@ static int queue_gso_packets(struct data + if (segs == NULL) + return -EINVAL; + ++ if (gso_type & SKB_GSO_UDP) { ++ /* The initial flow key extracted by ovs_flow_key_extract() ++ * in this case is for a first fragment, so we need to ++ * properly mark later fragments. ++ */ ++ later_key = *key; ++ later_key.ip.frag = OVS_FRAG_TYPE_LATER; ++ } ++ + /* Queue all of the segments. */ + skb = segs; + do { ++ if (gso_type & SKB_GSO_UDP && skb != segs) ++ key = &later_key; ++ + err = queue_userspace_packet(dp, skb, key, upcall_info, cutlen); + if (err) + break; +--- a/net/openvswitch/flow.c ++++ b/net/openvswitch/flow.c +@@ -584,7 +584,8 @@ static int key_extract(struct sk_buff *s + key->ip.frag = OVS_FRAG_TYPE_LATER; + return 0; + } +- if (nh->frag_off & htons(IP_MF)) ++ if (nh->frag_off & htons(IP_MF) || ++ skb_shinfo(skb)->gso_type & SKB_GSO_UDP) + key->ip.frag = OVS_FRAG_TYPE_FIRST; + else + key->ip.frag = OVS_FRAG_TYPE_NONE; +@@ -700,6 +701,9 @@ static int key_extract(struct sk_buff *s + + if (key->ip.frag == OVS_FRAG_TYPE_LATER) + return 0; ++ if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP) ++ key->ip.frag = OVS_FRAG_TYPE_FIRST; ++ + /* Transport layer. */ + if (key->ip.proto == NEXTHDR_TCP) { + if (tcphdr_ok(skb)) { +--- a/net/sched/act_csum.c ++++ b/net/sched/act_csum.c +@@ -229,6 +229,9 @@ static int tcf_csum_ipv4_udp(struct sk_b + const struct iphdr *iph; + u16 ul; + ++ if (skb_is_gso(skb) && skb_shinfo(skb)->gso_type & SKB_GSO_UDP) ++ return 1; ++ + /* + * Support both UDP and UDPLITE checksum algorithms, Don't use + * udph->len to get the real length without any protocol check, +@@ -282,6 +285,9 @@ static int tcf_csum_ipv6_udp(struct sk_b + const struct ipv6hdr *ip6h; + u16 ul; + ++ if (skb_is_gso(skb) && skb_shinfo(skb)->gso_type & SKB_GSO_UDP) ++ return 1; ++ + /* + * Support both UDP and UDPLITE checksum algorithms, Don't use + * udph->len to get the real length without any protocol check, diff --git a/queue-4.14/net-ipv6-fixup-device-for-anycast-routes-during-copy.patch b/queue-4.14/net-ipv6-fixup-device-for-anycast-routes-during-copy.patch new file mode 100644 index 00000000000..a7ea01da3de --- /dev/null +++ b/queue-4.14/net-ipv6-fixup-device-for-anycast-routes-during-copy.patch @@ -0,0 +1,51 @@ +From foo@baz Thu Dec 14 11:45:40 CET 2017 +From: David Ahern +Date: Tue, 21 Nov 2017 07:08:57 -0800 +Subject: net: ipv6: Fixup device for anycast routes during copy + +From: David Ahern + + +[ Upstream commit 98d11291d189cb5adf49694d0ad1b971c0212697 ] + +Florian reported a breakage with anycast routes due to commit +4832c30d5458 ("net: ipv6: put host and anycast routes on device with +address"). 
Prior to this commit anycast routes were added against the +loopback device causing repetitive route entries with no insight into +why they existed. e.g.: + $ ip -6 ro ls table local type anycast + anycast 2001:db8:1:: dev lo proto kernel metric 0 pref medium + anycast 2001:db8:2:: dev lo proto kernel metric 0 pref medium + anycast fe80:: dev lo proto kernel metric 0 pref medium + anycast fe80:: dev lo proto kernel metric 0 pref medium + +The point of commit 4832c30d5458 is to add the routes using the device +with the address which is causing the route to be added. e.g.,: + $ ip -6 ro ls table local type anycast + anycast 2001:db8:1:: dev eth1 proto kernel metric 0 pref medium + anycast 2001:db8:2:: dev eth2 proto kernel metric 0 pref medium + anycast fe80:: dev eth2 proto kernel metric 0 pref medium + anycast fe80:: dev eth1 proto kernel metric 0 pref medium + +For traffic to work as it did before, the dst device needs to be switched +to the loopback when the copy is created similar to local routes. + +Fixes: 4832c30d5458 ("net: ipv6: put host and anycast routes on device with address") +Signed-off-by: David Ahern +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv6/route.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/net/ipv6/route.c ++++ b/net/ipv6/route.c +@@ -960,7 +960,7 @@ static struct net_device *ip6_rt_get_dev + { + struct net_device *dev = rt->dst.dev; + +- if (rt->rt6i_flags & RTF_LOCAL) { ++ if (rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) { + /* for copies of local routes, dst->dev needs to be the + * device if it is a master device, the master device if + * device is enslaved, and the loopback as the default diff --git a/queue-4.14/net-openvswitch-datapath-fix-data-type-in-queue_gso_packets.patch b/queue-4.14/net-openvswitch-datapath-fix-data-type-in-queue_gso_packets.patch new file mode 100644 index 00000000000..0c43ace4941 --- /dev/null +++ b/queue-4.14/net-openvswitch-datapath-fix-data-type-in-queue_gso_packets.patch @@ -0,0 +1,43 @@ +From foo@baz Thu Dec 14 11:45:40 CET 2017 +From: "Gustavo A. R. Silva" +Date: Sat, 25 Nov 2017 13:14:40 -0600 +Subject: net: openvswitch: datapath: fix data type in queue_gso_packets + +From: "Gustavo A. R. Silva" + + +[ Upstream commit 2734166e89639c973c6e125ac8bcfc2d9db72b70 ] + +gso_type is being used in binary AND operations together with SKB_GSO_UDP. +The issue is that variable gso_type is of type unsigned short and +SKB_GSO_UDP expands to more than 16 bits: + +SKB_GSO_UDP = 1 << 16 + +this makes any binary AND operation between gso_type and SKB_GSO_UDP to +be always zero, hence making some code unreachable and likely causing +undesired behavior. + +Fix this by changing the data type of variable gso_type to unsigned int. + +Addresses-Coverity-ID: 1462223 +Fixes: 0c19f846d582 ("net: accept UFO datagrams from tuntap and packet") +Signed-off-by: Gustavo A. R. Silva +Acked-by: Willem de Bruijn +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/openvswitch/datapath.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/net/openvswitch/datapath.c ++++ b/net/openvswitch/datapath.c +@@ -335,7 +335,7 @@ static int queue_gso_packets(struct data + const struct dp_upcall_info *upcall_info, + uint32_t cutlen) + { +- unsigned short gso_type = skb_shinfo(skb)->gso_type; ++ unsigned int gso_type = skb_shinfo(skb)->gso_type; + struct sw_flow_key later_key; + struct sk_buff *segs, *nskb; + int err; diff --git a/queue-4.14/net-packet-fix-a-race-in-packet_bind-and-packet_notifier.patch b/queue-4.14/net-packet-fix-a-race-in-packet_bind-and-packet_notifier.patch new file mode 100644 index 00000000000..65558a28644 --- /dev/null +++ b/queue-4.14/net-packet-fix-a-race-in-packet_bind-and-packet_notifier.patch @@ -0,0 +1,93 @@ +From foo@baz Thu Dec 14 11:45:40 CET 2017 +From: Eric Dumazet +Date: Tue, 28 Nov 2017 08:03:30 -0800 +Subject: net/packet: fix a race in packet_bind() and packet_notifier() + +From: Eric Dumazet + + +[ Upstream commit 15fe076edea787807a7cdc168df832544b58eba6 ] + +syzbot reported crashes [1] and provided a C repro easing bug hunting. + +When/if packet_do_bind() calls __unregister_prot_hook() and releases +po->bind_lock, another thread can run packet_notifier() and process an +NETDEV_UP event. + +This calls register_prot_hook() and hooks again the socket right before +first thread is able to grab again po->bind_lock. + +Fixes this issue by temporarily setting po->num to 0, as suggested by +David Miller. + +[1] +dev_remove_pack: ffff8801bf16fa80 not found +------------[ cut here ]------------ +kernel BUG at net/core/dev.c:7945! ( BUG_ON(!list_empty(&dev->ptype_all)); ) +invalid opcode: 0000 [#1] SMP KASAN +Dumping ftrace buffer: + (ftrace buffer empty) +Modules linked in: +device syz0 entered promiscuous mode +CPU: 0 PID: 3161 Comm: syzkaller404108 Not tainted 4.14.0+ #190 +Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 +task: ffff8801cc57a500 task.stack: ffff8801cc588000 +RIP: 0010:netdev_run_todo+0x772/0xae0 net/core/dev.c:7945 +RSP: 0018:ffff8801cc58f598 EFLAGS: 00010293 +RAX: ffff8801cc57a500 RBX: dffffc0000000000 RCX: ffffffff841f75b2 +RDX: 0000000000000000 RSI: 1ffff100398b1ede RDI: ffff8801bf1f8810 +device syz0 entered promiscuous mode +RBP: ffff8801cc58f898 R08: 0000000000000001 R09: 0000000000000000 +R10: 0000000000000000 R11: 0000000000000000 R12: ffff8801bf1f8cd8 +R13: ffff8801cc58f870 R14: ffff8801bf1f8780 R15: ffff8801cc58f7f0 +FS: 0000000001716880(0000) GS:ffff8801db400000(0000) knlGS:0000000000000000 +CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 +CR2: 0000000020b13000 CR3: 0000000005e25000 CR4: 00000000001406f0 +DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 +DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 +Call Trace: + rtnl_unlock+0xe/0x10 net/core/rtnetlink.c:106 + tun_detach drivers/net/tun.c:670 [inline] + tun_chr_close+0x49/0x60 drivers/net/tun.c:2845 + __fput+0x333/0x7f0 fs/file_table.c:210 + ____fput+0x15/0x20 fs/file_table.c:244 + task_work_run+0x199/0x270 kernel/task_work.c:113 + exit_task_work include/linux/task_work.h:22 [inline] + do_exit+0x9bb/0x1ae0 kernel/exit.c:865 + do_group_exit+0x149/0x400 kernel/exit.c:968 + SYSC_exit_group kernel/exit.c:979 [inline] + SyS_exit_group+0x1d/0x20 kernel/exit.c:977 + entry_SYSCALL_64_fastpath+0x1f/0x96 +RIP: 0033:0x44ad19 + +Fixes: 30f7ea1c2b5f ("packet: race condition in packet_bind") +Signed-off-by: Eric Dumazet 
+Reported-by: syzbot +Cc: Francesco Ruggeri +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/packet/af_packet.c | 5 +++++ + 1 file changed, 5 insertions(+) + +--- a/net/packet/af_packet.c ++++ b/net/packet/af_packet.c +@@ -3101,6 +3101,10 @@ static int packet_do_bind(struct sock *s + if (need_rehook) { + if (po->running) { + rcu_read_unlock(); ++ /* prevents packet_notifier() from calling ++ * register_prot_hook() ++ */ ++ po->num = 0; + __unregister_prot_hook(sk, true); + rcu_read_lock(); + dev_curr = po->prot_hook.dev; +@@ -3109,6 +3113,7 @@ static int packet_do_bind(struct sock *s + dev->ifindex); + } + ++ BUG_ON(po->running); + po->num = proto; + po->prot_hook.type = proto; + diff --git a/queue-4.14/net-qmi_wwan-add-quectel-bg96-2c7c-0296.patch b/queue-4.14/net-qmi_wwan-add-quectel-bg96-2c7c-0296.patch new file mode 100644 index 00000000000..20e1672102c --- /dev/null +++ b/queue-4.14/net-qmi_wwan-add-quectel-bg96-2c7c-0296.patch @@ -0,0 +1,33 @@ +From foo@baz Thu Dec 14 11:45:40 CET 2017 +From: Sebastian Sjoholm +Date: Mon, 20 Nov 2017 19:05:17 +0100 +Subject: net: qmi_wwan: add Quectel BG96 2c7c:0296 + +From: Sebastian Sjoholm + + +[ Upstream commit f9409e7f086fa6c4623769b4b2f4f17a024d8143 ] + +Quectel BG96 is a Qualcomm MDM9206 based IoT modem, supporting both +CAT-M and NB-IoT. Tested hardware is BG96 mounted on Quectel development +board (EVB). The USB id is added to qmi_wwan.c to allow QMI +communication with the BG96. + +Signed-off-by: Sebastian Sjoholm +Acked-by: Bjørn Mork +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/usb/qmi_wwan.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/drivers/net/usb/qmi_wwan.c ++++ b/drivers/net/usb/qmi_wwan.c +@@ -1239,6 +1239,7 @@ static const struct usb_device_id produc + {QMI_FIXED_INTF(0x1e0e, 0x9001, 5)}, /* SIMCom 7230E */ + {QMI_QUIRK_SET_DTR(0x2c7c, 0x0125, 4)}, /* Quectel EC25, EC20 R2.0 Mini PCIe */ + {QMI_QUIRK_SET_DTR(0x2c7c, 0x0121, 4)}, /* Quectel EC21 Mini PCIe */ ++ {QMI_FIXED_INTF(0x2c7c, 0x0296, 4)}, /* Quectel BG96 */ + + /* 4. Gobi 1000 devices */ + {QMI_GOBI1K_DEVICE(0x05c6, 0x9212)}, /* Acer Gobi Modem Device */ diff --git a/queue-4.14/net-realtek-r8169-implement-set_link_ksettings.patch b/queue-4.14/net-realtek-r8169-implement-set_link_ksettings.patch new file mode 100644 index 00000000000..38b2336343d --- /dev/null +++ b/queue-4.14/net-realtek-r8169-implement-set_link_ksettings.patch @@ -0,0 +1,98 @@ +From foo@baz Thu Dec 14 11:45:40 CET 2017 +From: Tobias Jakobi +Date: Tue, 21 Nov 2017 16:15:57 +0100 +Subject: net: realtek: r8169: implement set_link_ksettings() + +From: Tobias Jakobi + + +[ Upstream commit 9e77d7a5549dc4d4999a60676373ab3fd1dae4db ] + +Commit 6fa1ba61520576cf1346c4ff09a056f2950cb3bf partially +implemented the new ethtool API, by replacing get_settings() +with get_link_ksettings(). This breaks ethtool, since the +userspace tool (according to the new API specs) never tries +the legacy set() call, when the new get() call succeeds. + +All attempts to change some setting from userspace result in: +> Cannot set new settings: Operation not supported + +Implement the missing set() call. + +Signed-off-by: Tobias Jakobi +Tested-by: Holger Hoffstätte +Reviewed-by: Andrew Lunn +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/realtek/r8169.c | 38 ++++++++++++++++++++--------------- + 1 file changed, 22 insertions(+), 16 deletions(-) + +--- a/drivers/net/ethernet/realtek/r8169.c ++++ b/drivers/net/ethernet/realtek/r8169.c +@@ -2025,21 +2025,6 @@ out: + return ret; + } + +-static int rtl8169_set_settings(struct net_device *dev, struct ethtool_cmd *cmd) +-{ +- struct rtl8169_private *tp = netdev_priv(dev); +- int ret; +- +- del_timer_sync(&tp->timer); +- +- rtl_lock_work(tp); +- ret = rtl8169_set_speed(dev, cmd->autoneg, ethtool_cmd_speed(cmd), +- cmd->duplex, cmd->advertising); +- rtl_unlock_work(tp); +- +- return ret; +-} +- + static netdev_features_t rtl8169_fix_features(struct net_device *dev, + netdev_features_t features) + { +@@ -2166,6 +2151,27 @@ static int rtl8169_get_link_ksettings(st + return rc; + } + ++static int rtl8169_set_link_ksettings(struct net_device *dev, ++ const struct ethtool_link_ksettings *cmd) ++{ ++ struct rtl8169_private *tp = netdev_priv(dev); ++ int rc; ++ u32 advertising; ++ ++ if (!ethtool_convert_link_mode_to_legacy_u32(&advertising, ++ cmd->link_modes.advertising)) ++ return -EINVAL; ++ ++ del_timer_sync(&tp->timer); ++ ++ rtl_lock_work(tp); ++ rc = rtl8169_set_speed(dev, cmd->base.autoneg, cmd->base.speed, ++ cmd->base.duplex, advertising); ++ rtl_unlock_work(tp); ++ ++ return rc; ++} ++ + static void rtl8169_get_regs(struct net_device *dev, struct ethtool_regs *regs, + void *p) + { +@@ -2367,7 +2373,6 @@ static const struct ethtool_ops rtl8169_ + .get_drvinfo = rtl8169_get_drvinfo, + .get_regs_len = rtl8169_get_regs_len, + .get_link = ethtool_op_get_link, +- .set_settings = rtl8169_set_settings, + .get_msglevel = rtl8169_get_msglevel, + .set_msglevel = rtl8169_set_msglevel, + .get_regs = rtl8169_get_regs, +@@ -2379,6 +2384,7 @@ static const struct ethtool_ops rtl8169_ + .get_ts_info = ethtool_op_get_ts_info, + .nway_reset = rtl8169_nway_reset, + .get_link_ksettings = rtl8169_get_link_ksettings, ++ .set_link_ksettings = rtl8169_set_link_ksettings, + }; + + static void rtl8169_get_mac_version(struct rtl8169_private *tp, diff --git a/queue-4.14/net-remove-hlist_nulls_add_tail_rcu.patch b/queue-4.14/net-remove-hlist_nulls_add_tail_rcu.patch new file mode 100644 index 00000000000..de5a6a2c269 --- /dev/null +++ b/queue-4.14/net-remove-hlist_nulls_add_tail_rcu.patch @@ -0,0 +1,149 @@ +From foo@baz Thu Dec 14 11:45:40 CET 2017 +From: Eric Dumazet +Date: Tue, 5 Dec 2017 12:45:56 -0800 +Subject: net: remove hlist_nulls_add_tail_rcu() + +From: Eric Dumazet + + +[ Upstream commit d7efc6c11b277d9d80b99b1334a78bfe7d7edf10 ] + +Alexander Potapenko reported use of uninitialized memory [1] + +This happens when inserting a request socket into TCP ehash, +in __sk_nulls_add_node_rcu(), since sk_reuseport is not initialized. + +Bug was added by commit d894ba18d4e4 ("soreuseport: fix ordering for +mixed v4/v6 sockets") + +Note that d296ba60d8e2 ("soreuseport: Resolve merge conflict for v4/v6 +ordering fix") missed the opportunity to get rid of +hlist_nulls_add_tail_rcu() : + +Both UDP sockets and TCP/DCCP listeners no longer use +__sk_nulls_add_node_rcu() for their hash insertion. + +Since all other sockets have unique 4-tuple, the reuseport status +has no special meaning, so we can always use hlist_nulls_add_head_rcu() +for them and save few cycles/instructions. 
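+
+For context, a simplified sketch (not the verbatim rculist_nulls.h
+helpers; plain pointers instead of the RCU/nulls machinery) of why
+dropping the tail variant also saves cycles -- head insertion is O(1),
+while tail insertion must walk the whole chain before linking in:
+
+    struct node { struct node *next; struct node **pprev; };
+
+    /* head insert, O(1) */
+    static void head_insert(struct node **first, struct node *n)
+    {
+        n->next = *first;
+        n->pprev = first;
+        *first = n;  /* the real helper publishes via rcu_assign_pointer() */
+    }
+
+    /* tail insert, O(n): must traverse to the end of the chain first */
+    static void tail_insert(struct node **first, struct node *n)
+    {
+        struct node **pos = first;
+
+        while (*pos)  /* the real list ends in a "nulls" marker instead */
+            pos = &(*pos)->next;
+        n->next = NULL;
+        n->pprev = pos;
+        *pos = n;
+    }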
+ +[1] + +================================================================== +BUG: KMSAN: use of uninitialized memory in inet_ehash_insert+0xd40/0x1050 +CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.13.0+ #3288 +Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 +Call Trace: +  + __dump_stack lib/dump_stack.c:16 + dump_stack+0x185/0x1d0 lib/dump_stack.c:52 + kmsan_report+0x13f/0x1c0 mm/kmsan/kmsan.c:1016 + __msan_warning_32+0x69/0xb0 mm/kmsan/kmsan_instr.c:766 + __sk_nulls_add_node_rcu ./include/net/sock.h:684 + inet_ehash_insert+0xd40/0x1050 net/ipv4/inet_hashtables.c:413 + reqsk_queue_hash_req net/ipv4/inet_connection_sock.c:754 + inet_csk_reqsk_queue_hash_add+0x1cc/0x300 net/ipv4/inet_connection_sock.c:765 + tcp_conn_request+0x31e7/0x36f0 net/ipv4/tcp_input.c:6414 + tcp_v4_conn_request+0x16d/0x220 net/ipv4/tcp_ipv4.c:1314 + tcp_rcv_state_process+0x42a/0x7210 net/ipv4/tcp_input.c:5917 + tcp_v4_do_rcv+0xa6a/0xcd0 net/ipv4/tcp_ipv4.c:1483 + tcp_v4_rcv+0x3de0/0x4ab0 net/ipv4/tcp_ipv4.c:1763 + ip_local_deliver_finish+0x6bb/0xcb0 net/ipv4/ip_input.c:216 + NF_HOOK ./include/linux/netfilter.h:248 + ip_local_deliver+0x3fa/0x480 net/ipv4/ip_input.c:257 + dst_input ./include/net/dst.h:477 + ip_rcv_finish+0x6fb/0x1540 net/ipv4/ip_input.c:397 + NF_HOOK ./include/linux/netfilter.h:248 + ip_rcv+0x10f6/0x15c0 net/ipv4/ip_input.c:488 + __netif_receive_skb_core+0x36f6/0x3f60 net/core/dev.c:4298 + __netif_receive_skb net/core/dev.c:4336 + netif_receive_skb_internal+0x63c/0x19c0 net/core/dev.c:4497 + napi_skb_finish net/core/dev.c:4858 + napi_gro_receive+0x629/0xa50 net/core/dev.c:4889 + e1000_receive_skb drivers/net/ethernet/intel/e1000/e1000_main.c:4018 + e1000_clean_rx_irq+0x1492/0x1d30 +drivers/net/ethernet/intel/e1000/e1000_main.c:4474 + e1000_clean+0x43aa/0x5970 drivers/net/ethernet/intel/e1000/e1000_main.c:3819 + napi_poll net/core/dev.c:5500 + net_rx_action+0x73c/0x1820 net/core/dev.c:5566 + __do_softirq+0x4b4/0x8dd kernel/softirq.c:284 + invoke_softirq kernel/softirq.c:364 + irq_exit+0x203/0x240 kernel/softirq.c:405 + exiting_irq+0xe/0x10 ./arch/x86/include/asm/apic.h:638 + do_IRQ+0x15e/0x1a0 arch/x86/kernel/irq.c:263 + common_interrupt+0x86/0x86 + +Fixes: d894ba18d4e4 ("soreuseport: fix ordering for mixed v4/v6 sockets") +Fixes: d296ba60d8e2 ("soreuseport: Resolve merge conflict for v4/v6 ordering fix") +Signed-off-by: Eric Dumazet +Reported-by: Alexander Potapenko +Acked-by: Craig Gallek +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/rculist_nulls.h | 38 -------------------------------------- + include/net/sock.h | 6 +----- + 2 files changed, 1 insertion(+), 43 deletions(-) + +--- a/include/linux/rculist_nulls.h ++++ b/include/linux/rculist_nulls.h +@@ -101,44 +101,6 @@ static inline void hlist_nulls_add_head_ + } + + /** +- * hlist_nulls_add_tail_rcu +- * @n: the element to add to the hash list. +- * @h: the list to add to. +- * +- * Description: +- * Adds the specified element to the end of the specified hlist_nulls, +- * while permitting racing traversals. NOTE: tail insertion requires +- * list traversal. +- * +- * The caller must take whatever precautions are necessary +- * (such as holding appropriate locks) to avoid racing +- * with another list-mutation primitive, such as hlist_nulls_add_head_rcu() +- * or hlist_nulls_del_rcu(), running on this same list. 
+- * However, it is perfectly legal to run concurrently with +- * the _rcu list-traversal primitives, such as +- * hlist_nulls_for_each_entry_rcu(), used to prevent memory-consistency +- * problems on Alpha CPUs. Regardless of the type of CPU, the +- * list-traversal primitive must be guarded by rcu_read_lock(). +- */ +-static inline void hlist_nulls_add_tail_rcu(struct hlist_nulls_node *n, +- struct hlist_nulls_head *h) +-{ +- struct hlist_nulls_node *i, *last = NULL; +- +- for (i = hlist_nulls_first_rcu(h); !is_a_nulls(i); +- i = hlist_nulls_next_rcu(i)) +- last = i; +- +- if (last) { +- n->next = last->next; +- n->pprev = &last->next; +- rcu_assign_pointer(hlist_nulls_next_rcu(last), n); +- } else { +- hlist_nulls_add_head_rcu(n, h); +- } +-} +- +-/** + * hlist_nulls_for_each_entry_rcu - iterate over rcu list of given type + * @tpos: the type * to use as a loop cursor. + * @pos: the &struct hlist_nulls_node to use as a loop cursor. +--- a/include/net/sock.h ++++ b/include/net/sock.h +@@ -683,11 +683,7 @@ static inline void sk_add_node_rcu(struc + + static inline void __sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list) + { +- if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport && +- sk->sk_family == AF_INET6) +- hlist_nulls_add_tail_rcu(&sk->sk_nulls_node, list); +- else +- hlist_nulls_add_head_rcu(&sk->sk_nulls_node, list); ++ hlist_nulls_add_head_rcu(&sk->sk_nulls_node, list); + } + + static inline void sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list) diff --git a/queue-4.14/net-sched-cbq-create-block-for-q-link.block.patch b/queue-4.14/net-sched-cbq-create-block-for-q-link.block.patch new file mode 100644 index 00000000000..f104d59bae5 --- /dev/null +++ b/queue-4.14/net-sched-cbq-create-block-for-q-link.block.patch @@ -0,0 +1,56 @@ +From foo@baz Thu Dec 14 11:45:40 CET 2017 +From: Jiri Pirko +Date: Mon, 27 Nov 2017 18:37:21 +0100 +Subject: net: sched: cbq: create block for q->link.block + +From: Jiri Pirko + + +[ Upstream commit d51aae68b142f48232257e96ce317db25445418d ] + +q->link.block is not initialized, that leads to EINVAL when one tries to +add filter there. So initialize it properly. + +This can be reproduced by: +$ tc qdisc add dev eth0 root handle 1: cbq avpkt 1000 rate 1000Mbit bandwidth 1000Mbit +$ tc filter add dev eth0 parent 1: protocol ip prio 100 u32 match ip protocol 0 0x00 flowid 1:1 + +Reported-by: Jaroslav Aster +Reported-by: Ivan Vecera +Fixes: 6529eaba33f0 ("net: sched: introduce tcf block infractructure") +Signed-off-by: Jiri Pirko +Acked-by: Eelco Chaudron +Reviewed-by: Ivan Vecera +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/sched/sch_cbq.c | 9 ++++++++- + 1 file changed, 8 insertions(+), 1 deletion(-) + +--- a/net/sched/sch_cbq.c ++++ b/net/sched/sch_cbq.c +@@ -1157,9 +1157,13 @@ static int cbq_init(struct Qdisc *sch, s + if ((q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB])) == NULL) + return -EINVAL; + ++ err = tcf_block_get(&q->link.block, &q->link.filter_list); ++ if (err) ++ goto put_rtab; ++ + err = qdisc_class_hash_init(&q->clhash); + if (err < 0) +- goto put_rtab; ++ goto put_block; + + q->link.sibling = &q->link; + q->link.common.classid = sch->handle; +@@ -1193,6 +1197,9 @@ static int cbq_init(struct Qdisc *sch, s + cbq_addprio(q, &q->link); + return 0; + ++put_block: ++ tcf_block_put(q->link.block); ++ + put_rtab: + qdisc_put_rtab(q->link.R_tab); + return err; diff --git a/queue-4.14/net-thunderx-fix-tcp-udp-checksum-offload-for-ipv4-pkts.patch b/queue-4.14/net-thunderx-fix-tcp-udp-checksum-offload-for-ipv4-pkts.patch new file mode 100644 index 00000000000..a50ba1fa076 --- /dev/null +++ b/queue-4.14/net-thunderx-fix-tcp-udp-checksum-offload-for-ipv4-pkts.patch @@ -0,0 +1,47 @@ +From foo@baz Thu Dec 14 11:45:40 CET 2017 +From: Florian Westphal +Date: Wed, 6 Dec 2017 01:04:50 +0100 +Subject: net: thunderx: Fix TCP/UDP checksum offload for IPv4 pkts + +From: Florian Westphal + + +[ Upstream commit 134059fd2775be79e26c2dff87d25cc2f6ea5626 ] + +Offload IP header checksum to NIC. + +This fixes a previous patch which disabled checksum offloading +for both IPv4 and IPv6 packets. So L3 checksum offload was +getting disabled for IPv4 pkts. And HW is dropping these pkts +for some reason. + +Without this patch, IPv4 TSO appears to be broken: + +Without this patch I get ~16kbyte/s, with patch close to 2mbyte/s +when copying files via scp from test box to my home workstation. + +Looking at tcpdump on sender it looks like hardware drops IPv4 TSO skbs. +This patch restores performance for me, ipv6 looks good too. + +Fixes: fa6d7cb5d76c ("net: thunderx: Fix TCP/UDP checksum offload for IPv6 pkts") +Cc: Sunil Goutham +Cc: Aleksey Makarov +Cc: Eric Dumazet +Signed-off-by: Florian Westphal +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/cavium/thunder/nicvf_queues.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/drivers/net/ethernet/cavium/thunder/nicvf_queues.c ++++ b/drivers/net/ethernet/cavium/thunder/nicvf_queues.c +@@ -1355,6 +1355,8 @@ nicvf_sq_add_hdr_subdesc(struct nicvf *n + + /* Offload checksum calculation to HW */ + if (skb->ip_summed == CHECKSUM_PARTIAL) { ++ if (ip.v4->version == 4) ++ hdr->csum_l3 = 1; /* Enable IP csum calculation */ + hdr->l3_offset = skb_network_offset(skb); + hdr->l4_offset = skb_transport_offset(skb); + diff --git a/queue-4.14/net-thunderx-fix-tcp-udp-checksum-offload-for-ipv6-pkts.patch b/queue-4.14/net-thunderx-fix-tcp-udp-checksum-offload-for-ipv6-pkts.patch new file mode 100644 index 00000000000..103d0661cb8 --- /dev/null +++ b/queue-4.14/net-thunderx-fix-tcp-udp-checksum-offload-for-ipv6-pkts.patch @@ -0,0 +1,38 @@ +From foo@baz Thu Dec 14 11:45:40 CET 2017 +From: Sunil Goutham +Date: Thu, 23 Nov 2017 22:34:31 +0300 +Subject: net: thunderx: Fix TCP/UDP checksum offload for IPv6 pkts + +From: Sunil Goutham + + +[ Upstream commit fa6d7cb5d76cf0467c61420fc9238045aedfd379 ] + +Don't offload IP header checksum to NIC. + +This fixes a previous patch which enabled checksum offloading +for both IPv4 and IPv6 packets. So L3 checksum offload was +getting enabled for IPv6 pkts. 
And HW is dropping these pkts +as it assumes the pkt is IPv4 when IP csum offload is set +in the SQ descriptor. + +Fixes: 3a9024f52c2e ("net: thunderx: Enable TSO and checksum offloads for ipv6") +Signed-off-by: Sunil Goutham +Signed-off-by: Aleksey Makarov +Reviewed-by: Eric Dumazet +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/cavium/thunder/nicvf_queues.c | 1 - + 1 file changed, 1 deletion(-) + +--- a/drivers/net/ethernet/cavium/thunder/nicvf_queues.c ++++ b/drivers/net/ethernet/cavium/thunder/nicvf_queues.c +@@ -1355,7 +1355,6 @@ nicvf_sq_add_hdr_subdesc(struct nicvf *n + + /* Offload checksum calculation to HW */ + if (skb->ip_summed == CHECKSUM_PARTIAL) { +- hdr->csum_l3 = 1; /* Enable IP csum calculation */ + hdr->l3_offset = skb_network_offset(skb); + hdr->l4_offset = skb_transport_offset(skb); + diff --git a/queue-4.14/packet-fix-crash-in-fanout_demux_rollover.patch b/queue-4.14/packet-fix-crash-in-fanout_demux_rollover.patch new file mode 100644 index 00000000000..44b1d0885bc --- /dev/null +++ b/queue-4.14/packet-fix-crash-in-fanout_demux_rollover.patch @@ -0,0 +1,155 @@ +From foo@baz Thu Dec 14 11:45:40 CET 2017 +From: Mike Maloney +Date: Tue, 28 Nov 2017 10:44:29 -0500 +Subject: packet: fix crash in fanout_demux_rollover() + +From: Mike Maloney + + +syzkaller found a race condition in fanout_demux_rollover() while removing +a packet socket from a fanout group. + +po->rollover is read and operated on during packet_rcv_fanout(), via +fanout_demux_rollover(), but the pointer is currently cleared before the +synchronization in packet_release(). It is safer to delay the cleanup +until after synchronize_net() has been called, ensuring all calls to +packet_rcv_fanout() for this socket have finished. + +To further simplify synchronization around the rollover structure, set +po->rollover in fanout_add() only if there are no errors. This removes +the need for rcu in the struct and in the call to +packet_getsockopt(..., PACKET_ROLLOVER_STATS, ...). 
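+
+Conceptually, the release-path ordering after this change is (simplified
+sketch restating the diff below, not the literal code):
+
+    f = fanout_release(sk);   /* unlink the socket from its fanout group */
+    synchronize_net();        /* wait out in-flight packet_rcv_fanout() calls */
+    if (f) {
+        kfree(po->rollover);  /* now safe: no reader can still reach it */
+        fanout_release_data(f);
+        kfree(f);
+    }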
+ +Crashing stack trace: + fanout_demux_rollover+0xb6/0x4d0 net/packet/af_packet.c:1392 + packet_rcv_fanout+0x649/0x7c8 net/packet/af_packet.c:1487 + dev_queue_xmit_nit+0x835/0xc10 net/core/dev.c:1953 + xmit_one net/core/dev.c:2975 [inline] + dev_hard_start_xmit+0x16b/0xac0 net/core/dev.c:2995 + __dev_queue_xmit+0x17a4/0x2050 net/core/dev.c:3476 + dev_queue_xmit+0x17/0x20 net/core/dev.c:3509 + neigh_connected_output+0x489/0x720 net/core/neighbour.c:1379 + neigh_output include/net/neighbour.h:482 [inline] + ip6_finish_output2+0xad1/0x22a0 net/ipv6/ip6_output.c:120 + ip6_finish_output+0x2f9/0x920 net/ipv6/ip6_output.c:146 + NF_HOOK_COND include/linux/netfilter.h:239 [inline] + ip6_output+0x1f4/0x850 net/ipv6/ip6_output.c:163 + dst_output include/net/dst.h:459 [inline] + NF_HOOK.constprop.35+0xff/0x630 include/linux/netfilter.h:250 + mld_sendpack+0x6a8/0xcc0 net/ipv6/mcast.c:1660 + mld_send_initial_cr.part.24+0x103/0x150 net/ipv6/mcast.c:2072 + mld_send_initial_cr net/ipv6/mcast.c:2056 [inline] + ipv6_mc_dad_complete+0x99/0x130 net/ipv6/mcast.c:2079 + addrconf_dad_completed+0x595/0x970 net/ipv6/addrconf.c:4039 + addrconf_dad_work+0xac9/0x1160 net/ipv6/addrconf.c:3971 + process_one_work+0xbf0/0x1bc0 kernel/workqueue.c:2113 + worker_thread+0x223/0x1990 kernel/workqueue.c:2247 + kthread+0x35e/0x430 kernel/kthread.c:231 + ret_from_fork+0x2a/0x40 arch/x86/entry/entry_64.S:432 + +Fixes: 0648ab70afe6 ("packet: rollover prepare: per-socket state") +Fixes: 509c7a1ecc860 ("packet: avoid panic in packet_getsockopt()") +Reported-by: syzbot +Signed-off-by: Mike Maloney +Reviewed-by: Eric Dumazet +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/packet/af_packet.c | 32 ++++++++++---------------------- + net/packet/internal.h | 1 - + 2 files changed, 10 insertions(+), 23 deletions(-) + +--- a/net/packet/af_packet.c ++++ b/net/packet/af_packet.c +@@ -1697,7 +1697,6 @@ static int fanout_add(struct sock *sk, u + atomic_long_set(&rollover->num, 0); + atomic_long_set(&rollover->num_huge, 0); + atomic_long_set(&rollover->num_failed, 0); +- po->rollover = rollover; + } + + if (type_flags & PACKET_FANOUT_FLAG_UNIQUEID) { +@@ -1755,6 +1754,8 @@ static int fanout_add(struct sock *sk, u + if (refcount_read(&match->sk_ref) < PACKET_FANOUT_MAX) { + __dev_remove_pack(&po->prot_hook); + po->fanout = match; ++ po->rollover = rollover; ++ rollover = NULL; + refcount_set(&match->sk_ref, refcount_read(&match->sk_ref) + 1); + __fanout_link(sk, po); + err = 0; +@@ -1768,10 +1769,7 @@ static int fanout_add(struct sock *sk, u + } + + out: +- if (err && rollover) { +- kfree_rcu(rollover, rcu); +- po->rollover = NULL; +- } ++ kfree(rollover); + mutex_unlock(&fanout_mutex); + return err; + } +@@ -1795,11 +1793,6 @@ static struct packet_fanout *fanout_rele + list_del(&f->list); + else + f = NULL; +- +- if (po->rollover) { +- kfree_rcu(po->rollover, rcu); +- po->rollover = NULL; +- } + } + mutex_unlock(&fanout_mutex); + +@@ -3039,6 +3032,7 @@ static int packet_release(struct socket + synchronize_net(); + + if (f) { ++ kfree(po->rollover); + fanout_release_data(f); + kfree(f); + } +@@ -3853,7 +3847,6 @@ static int packet_getsockopt(struct sock + void *data = &val; + union tpacket_stats_u st; + struct tpacket_rollover_stats rstats; +- struct packet_rollover *rollover; + + if (level != SOL_PACKET) + return -ENOPROTOOPT; +@@ -3932,18 +3925,13 @@ static int packet_getsockopt(struct sock + 0); + break; + case PACKET_ROLLOVER_STATS: +- rcu_read_lock(); +- rollover = rcu_dereference(po->rollover); +- if (rollover) 
{ +- rstats.tp_all = atomic_long_read(&rollover->num); +- rstats.tp_huge = atomic_long_read(&rollover->num_huge); +- rstats.tp_failed = atomic_long_read(&rollover->num_failed); +- data = &rstats; +- lv = sizeof(rstats); +- } +- rcu_read_unlock(); +- if (!rollover) ++ if (!po->rollover) + return -EINVAL; ++ rstats.tp_all = atomic_long_read(&po->rollover->num); ++ rstats.tp_huge = atomic_long_read(&po->rollover->num_huge); ++ rstats.tp_failed = atomic_long_read(&po->rollover->num_failed); ++ data = &rstats; ++ lv = sizeof(rstats); + break; + case PACKET_TX_HAS_OFF: + val = po->tp_tx_has_off; +--- a/net/packet/internal.h ++++ b/net/packet/internal.h +@@ -95,7 +95,6 @@ struct packet_fanout { + + struct packet_rollover { + int sock; +- struct rcu_head rcu; + atomic_long_t num; + atomic_long_t num_huge; + atomic_long_t num_failed; diff --git a/queue-4.14/rds-fix-null-pointer-dereference-in-__rds_rdma_map.patch b/queue-4.14/rds-fix-null-pointer-dereference-in-__rds_rdma_map.patch new file mode 100644 index 00000000000..917087116bc --- /dev/null +++ b/queue-4.14/rds-fix-null-pointer-dereference-in-__rds_rdma_map.patch @@ -0,0 +1,89 @@ +From foo@baz Thu Dec 14 11:45:40 CET 2017 +From: Håkon Bugge +Date: Wed, 6 Dec 2017 17:18:28 +0100 +Subject: rds: Fix NULL pointer dereference in __rds_rdma_map + +From: Håkon Bugge + + +[ Upstream commit f3069c6d33f6ae63a1668737bc78aaaa51bff7ca ] + +This is a fix for syzkaller719569, where memory registration was +attempted without any underlying transport being loaded. + +Analysis of the case reveals that it is the setsockopt() RDS_GET_MR +(2) and RDS_GET_MR_FOR_DEST (7) that are vulnerable. + +Here is an example stack trace when the bug is hit: + +BUG: unable to handle kernel NULL pointer dereference at 00000000000000c0 +IP: __rds_rdma_map+0x36/0x440 [rds] +PGD 2f93d03067 P4D 2f93d03067 PUD 2f93d02067 PMD 0 +Oops: 0000 [#1] SMP +Modules linked in: bridge stp llc tun rpcsec_gss_krb5 nfsv4 +dns_resolver nfs fscache rds binfmt_misc sb_edac intel_powerclamp +coretemp kvm_intel kvm irqbypass crct10dif_pclmul c rc32_pclmul +ghash_clmulni_intel pcbc aesni_intel crypto_simd glue_helper cryptd +iTCO_wdt mei_me sg iTCO_vendor_support ipmi_si mei ipmi_devintf nfsd +shpchp pcspkr i2c_i801 ioatd ma ipmi_msghandler wmi lpc_ich mfd_core +auth_rpcgss nfs_acl lockd grace sunrpc ip_tables ext4 mbcache jbd2 +mgag200 i2c_algo_bit drm_kms_helper ixgbe syscopyarea ahci sysfillrect +sysimgblt libahci mdio fb_sys_fops ttm ptp libata sd_mod mlx4_core drm +crc32c_intel pps_core megaraid_sas i2c_core dca dm_mirror +dm_region_hash dm_log dm_mod +CPU: 48 PID: 45787 Comm: repro_set2 Not tainted 4.14.2-3.el7uek.x86_64 #2 +Hardware name: Oracle Corporation ORACLE SERVER X5-2L/ASM,MOBO TRAY,2U, BIOS 31110000 03/03/2017 +task: ffff882f9190db00 task.stack: ffffc9002b994000 +RIP: 0010:__rds_rdma_map+0x36/0x440 [rds] +RSP: 0018:ffffc9002b997df0 EFLAGS: 00010202 +RAX: 0000000000000000 RBX: ffff882fa2182580 RCX: 0000000000000000 +RDX: 0000000000000000 RSI: ffffc9002b997e40 RDI: ffff882fa2182580 +RBP: ffffc9002b997e30 R08: 0000000000000000 R09: 0000000000000002 +R10: ffff885fb29e3838 R11: 0000000000000000 R12: ffff882fa2182580 +R13: ffff882fa2182580 R14: 0000000000000002 R15: 0000000020000ffc +FS: 00007fbffa20b700(0000) GS:ffff882fbfb80000(0000) knlGS:0000000000000000 +CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 +CR2: 00000000000000c0 CR3: 0000002f98a66006 CR4: 00000000001606e0 +Call Trace: + rds_get_mr+0x56/0x80 [rds] + rds_setsockopt+0x172/0x340 [rds] + ? __fget_light+0x25/0x60 + ? 
__fdget+0x13/0x20 + SyS_setsockopt+0x80/0xe0 + do_syscall_64+0x67/0x1b0 + entry_SYSCALL64_slow_path+0x25/0x25 +RIP: 0033:0x7fbff9b117f9 +RSP: 002b:00007fbffa20aed8 EFLAGS: 00000293 ORIG_RAX: 0000000000000036 +RAX: ffffffffffffffda RBX: 00000000000c84a4 RCX: 00007fbff9b117f9 +RDX: 0000000000000002 RSI: 0000400000000114 RDI: 000000000000109b +RBP: 00007fbffa20af10 R08: 0000000000000020 R09: 00007fbff9dd7860 +R10: 0000000020000ffc R11: 0000000000000293 R12: 0000000000000000 +R13: 00007fbffa20b9c0 R14: 00007fbffa20b700 R15: 0000000000000021 + +Code: 41 56 41 55 49 89 fd 41 54 53 48 83 ec 18 8b 87 f0 02 00 00 48 +89 55 d0 48 89 4d c8 85 c0 0f 84 2d 03 00 00 48 8b 87 00 03 00 00 <48> +83 b8 c0 00 00 00 00 0f 84 25 03 00 00 48 8b 06 48 8b 56 08 + +The fix is to check the existence of an underlying transport in +__rds_rdma_map(). + +Signed-off-by: Håkon Bugge +Reported-by: syzbot +Acked-by: Santosh Shilimkar +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/rds/rdma.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/net/rds/rdma.c ++++ b/net/rds/rdma.c +@@ -183,7 +183,7 @@ static int __rds_rdma_map(struct rds_soc + long i; + int ret; + +- if (rs->rs_bound_addr == 0) { ++ if (rs->rs_bound_addr == 0 || !rs->rs_transport) { + ret = -ENOTCONN; /* XXX not a great errno */ + goto out; + } diff --git a/queue-4.14/s390-qeth-build-max-size-gso-skbs-on-l2-devices.patch b/queue-4.14/s390-qeth-build-max-size-gso-skbs-on-l2-devices.patch new file mode 100644 index 00000000000..ceeff57964b --- /dev/null +++ b/queue-4.14/s390-qeth-build-max-size-gso-skbs-on-l2-devices.patch @@ -0,0 +1,49 @@ +From foo@baz Thu Dec 14 11:45:40 CET 2017 +From: Julian Wiedmann +Date: Fri, 1 Dec 2017 10:14:51 +0100 +Subject: s390/qeth: build max size GSO skbs on L2 devices + +From: Julian Wiedmann + + +[ Upstream commit 0cbff6d4546613330a1c5f139f5c368e4ce33ca1 ] + +The current GSO skb size limit was copy&pasted over from the L3 path, +where it is needed due to a TSO limitation. +As L2 devices don't offer TSO support (and thus all GSO skbs are +segmented before they reach the driver), there's no reason to restrict +the stack in how large it may build the GSO skbs. + +Fixes: d52aec97e5bc ("qeth: enable scatter/gather in layer 2 mode") +Signed-off-by: Julian Wiedmann +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/s390/net/qeth_l2_main.c | 2 -- + drivers/s390/net/qeth_l3_main.c | 4 ++-- + 2 files changed, 2 insertions(+), 4 deletions(-) + +--- a/drivers/s390/net/qeth_l2_main.c ++++ b/drivers/s390/net/qeth_l2_main.c +@@ -1027,8 +1027,6 @@ static int qeth_l2_setup_netdev(struct q + + card->info.broadcast_capable = 1; + qeth_l2_request_initial_mac(card); +- card->dev->gso_max_size = (QETH_MAX_BUFFER_ELEMENTS(card) - 1) * +- PAGE_SIZE; + SET_NETDEV_DEV(card->dev, &card->gdev->dev); + netif_napi_add(card->dev, &card->napi, qeth_poll, QETH_NAPI_WEIGHT); + netif_carrier_off(card->dev); +--- a/drivers/s390/net/qeth_l3_main.c ++++ b/drivers/s390/net/qeth_l3_main.c +@@ -2989,8 +2989,8 @@ static int qeth_l3_setup_netdev(struct q + NETIF_F_HW_VLAN_CTAG_RX | + NETIF_F_HW_VLAN_CTAG_FILTER; + netif_keep_dst(card->dev); +- card->dev->gso_max_size = (QETH_MAX_BUFFER_ELEMENTS(card) - 1) * +- PAGE_SIZE; ++ netif_set_gso_max_size(card->dev, (QETH_MAX_BUFFER_ELEMENTS(card) - 1) * ++ PAGE_SIZE); + + SET_NETDEV_DEV(card->dev, &card->gdev->dev); + netif_napi_add(card->dev, &card->napi, qeth_poll, QETH_NAPI_WEIGHT); diff --git a/queue-4.14/s390-qeth-fix-early-exit-from-error-path.patch b/queue-4.14/s390-qeth-fix-early-exit-from-error-path.patch new file mode 100644 index 00000000000..0950611bc77 --- /dev/null +++ b/queue-4.14/s390-qeth-fix-early-exit-from-error-path.patch @@ -0,0 +1,56 @@ +From foo@baz Thu Dec 14 11:45:40 CET 2017 +From: Julian Wiedmann +Date: Wed, 18 Oct 2017 17:40:17 +0200 +Subject: s390/qeth: fix early exit from error path + +From: Julian Wiedmann + + +[ Upstream commit 83cf79a2fec3cf499eb6cb9eb608656fc2a82776 ] + +When the allocation of the addr buffer fails, we need to free +our refcount on the inetdevice before returning. + +Signed-off-by: Julian Wiedmann +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/s390/net/qeth_l3_main.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +--- a/drivers/s390/net/qeth_l3_main.c ++++ b/drivers/s390/net/qeth_l3_main.c +@@ -1553,7 +1553,7 @@ static void qeth_l3_free_vlan_addresses4 + + addr = qeth_l3_get_addr_buffer(QETH_PROT_IPV4); + if (!addr) +- return; ++ goto out; + + spin_lock_bh(&card->ip_lock); + +@@ -1567,6 +1567,7 @@ static void qeth_l3_free_vlan_addresses4 + spin_unlock_bh(&card->ip_lock); + + kfree(addr); ++out: + in_dev_put(in_dev); + } + +@@ -1591,7 +1592,7 @@ static void qeth_l3_free_vlan_addresses6 + + addr = qeth_l3_get_addr_buffer(QETH_PROT_IPV6); + if (!addr) +- return; ++ goto out; + + spin_lock_bh(&card->ip_lock); + +@@ -1606,6 +1607,7 @@ static void qeth_l3_free_vlan_addresses6 + spin_unlock_bh(&card->ip_lock); + + kfree(addr); ++out: + in6_dev_put(in6_dev); + #endif /* CONFIG_QETH_IPV6 */ + } diff --git a/queue-4.14/s390-qeth-fix-gso-throughput-regression.patch b/queue-4.14/s390-qeth-fix-gso-throughput-regression.patch new file mode 100644 index 00000000000..ec2de2687b7 --- /dev/null +++ b/queue-4.14/s390-qeth-fix-gso-throughput-regression.patch @@ -0,0 +1,140 @@ +From foo@baz Thu Dec 14 11:45:40 CET 2017 +From: Julian Wiedmann +Date: Fri, 1 Dec 2017 10:14:50 +0100 +Subject: s390/qeth: fix GSO throughput regression + +From: Julian Wiedmann + + +[ Upstream commit 6d69b1f1eb7a2edf8a3547f361c61f2538e054bb ] + +Using GSO with small MTUs currently results in a substantial throughput +regression - which is caused by how qeth needs to map non-linear skbs +into its IO buffer elements: +compared to a linear skb, each GSO-segmented skb effectively consumes +twice as many buffer elements (ie two instead of one) due to the +additional header-only part. This causes the Output Queue to be +congested with low-utilized IO buffers. + +Fix this as follows: +If the MSS is low enough so that a non-SG GSO segmentation produces +order-0 skbs (currently ~3500 byte), opt out from NETIF_F_SG. This is +where we anticipate the biggest savings, since an SG-enabled +GSO segmentation produces skbs that always consume at least two +buffer elements. + +Larger MSS values continue to get a SG-enabled GSO segmentation, since +1) the relative overhead of the additional header-only buffer element +becomes less noticeable, and +2) the linearization overhead increases. + +With the throughput regression fixed, re-enable NETIF_F_SG by default to +reap the significant CPU savings of GSO. + +Fixes: 5722963a8e83 ("qeth: do not turn on SG per default") +Reported-by: Nils Hoppmann +Signed-off-by: Julian Wiedmann +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/s390/net/qeth_core.h | 3 +++ + drivers/s390/net/qeth_core_main.c | 31 +++++++++++++++++++++++++++++++ + drivers/s390/net/qeth_l2_main.c | 2 ++ + drivers/s390/net/qeth_l3_main.c | 2 ++ + 4 files changed, 38 insertions(+) + +--- a/drivers/s390/net/qeth_core.h ++++ b/drivers/s390/net/qeth_core.h +@@ -985,6 +985,9 @@ struct qeth_cmd_buffer *qeth_get_setassp + int qeth_set_features(struct net_device *, netdev_features_t); + int qeth_recover_features(struct net_device *); + netdev_features_t qeth_fix_features(struct net_device *, netdev_features_t); ++netdev_features_t qeth_features_check(struct sk_buff *skb, ++ struct net_device *dev, ++ netdev_features_t features); + int qeth_vm_request_mac(struct qeth_card *card); + int qeth_push_hdr(struct sk_buff *skb, struct qeth_hdr **hdr, unsigned int len); + +--- a/drivers/s390/net/qeth_core_main.c ++++ b/drivers/s390/net/qeth_core_main.c +@@ -19,6 +19,11 @@ + #include + #include + #include ++#include ++#include ++#include ++#include ++ + #include + #include + +@@ -6505,6 +6510,32 @@ netdev_features_t qeth_fix_features(stru + } + EXPORT_SYMBOL_GPL(qeth_fix_features); + ++netdev_features_t qeth_features_check(struct sk_buff *skb, ++ struct net_device *dev, ++ netdev_features_t features) ++{ ++ /* GSO segmentation builds skbs with ++ * a (small) linear part for the headers, and ++ * page frags for the data. ++ * Compared to a linear skb, the header-only part consumes an ++ * additional buffer element. This reduces buffer utilization, and ++ * hurts throughput. So compress small segments into one element. ++ */ ++ if (netif_needs_gso(skb, features)) { ++ /* match skb_segment(): */ ++ unsigned int doffset = skb->data - skb_mac_header(skb); ++ unsigned int hsize = skb_shinfo(skb)->gso_size; ++ unsigned int hroom = skb_headroom(skb); ++ ++ /* linearize only if resulting skb allocations are order-0: */ ++ if (SKB_DATA_ALIGN(hroom + doffset + hsize) <= SKB_MAX_HEAD(0)) ++ features &= ~NETIF_F_SG; ++ } ++ ++ return vlan_features_check(skb, features); ++} ++EXPORT_SYMBOL_GPL(qeth_features_check); ++ + static int __init qeth_core_init(void) + { + int rc; +--- a/drivers/s390/net/qeth_l2_main.c ++++ b/drivers/s390/net/qeth_l2_main.c +@@ -963,6 +963,7 @@ static const struct net_device_ops qeth_ + .ndo_stop = qeth_l2_stop, + .ndo_get_stats = qeth_get_stats, + .ndo_start_xmit = qeth_l2_hard_start_xmit, ++ .ndo_features_check = qeth_features_check, + .ndo_validate_addr = eth_validate_addr, + .ndo_set_rx_mode = qeth_l2_set_rx_mode, + .ndo_do_ioctl = qeth_do_ioctl, +@@ -1009,6 +1010,7 @@ static int qeth_l2_setup_netdev(struct q + if (card->info.type == QETH_CARD_TYPE_OSD && !card->info.guestlan) { + card->dev->hw_features = NETIF_F_SG; + card->dev->vlan_features = NETIF_F_SG; ++ card->dev->features |= NETIF_F_SG; + /* OSA 3S and earlier has no RX/TX support */ + if (qeth_is_supported(card, IPA_OUTBOUND_CHECKSUM)) { + card->dev->hw_features |= NETIF_F_IP_CSUM; +--- a/drivers/s390/net/qeth_l3_main.c ++++ b/drivers/s390/net/qeth_l3_main.c +@@ -2923,6 +2923,7 @@ static const struct net_device_ops qeth_ + .ndo_stop = qeth_l3_stop, + .ndo_get_stats = qeth_get_stats, + .ndo_start_xmit = qeth_l3_hard_start_xmit, ++ .ndo_features_check = qeth_features_check, + .ndo_validate_addr = eth_validate_addr, + .ndo_set_rx_mode = qeth_l3_set_multicast_list, + .ndo_do_ioctl = qeth_do_ioctl, +@@ -2963,6 +2964,7 @@ static int qeth_l3_setup_netdev(struct q + card->dev->vlan_features = NETIF_F_SG | + NETIF_F_RXCSUM | NETIF_F_IP_CSUM | + 
NETIF_F_TSO;
++ card->dev->features |= NETIF_F_SG;
+ }
+ }
+ } else if (card->info.type == QETH_CARD_TYPE_IQD) { diff --git a/queue-4.14/s390-qeth-fix-thinko-in-ipv4-multicast-address-tracking.patch b/queue-4.14/s390-qeth-fix-thinko-in-ipv4-multicast-address-tracking.patch new file mode 100644 index 00000000000..ce890c45afc --- /dev/null +++ b/queue-4.14/s390-qeth-fix-thinko-in-ipv4-multicast-address-tracking.patch @@ -0,0 +1,46 @@ +From foo@baz Thu Dec 14 11:45:40 CET 2017 +From: Julian Wiedmann +Date: Fri, 1 Dec 2017 10:14:49 +0100 +Subject: s390/qeth: fix thinko in IPv4 multicast address tracking
+
+From: Julian Wiedmann
+
+
+[ Upstream commit bc3ab70584696cb798b9e1e0ac8e6ced5fd4c3b8 ]
+
+Commit 5f78e29ceebf ("qeth: optimize IP handling in rx_mode callback")
+reworked how secondary addresses are managed for qeth devices.
+Instead of dropping & subsequently re-adding all addresses on every
+ndo_set_rx_mode() call, qeth now keeps track of the addresses that are
+currently registered with the HW.
+On a ndo_set_rx_mode(), we thus only need to do (de-)registration
+requests for the addresses that have actually changed.
+
+On L3 devices, the lookup for IPv4 Multicast addresses checks the wrong
+hashtable - and thus never finds a match. As a result, we first delete
+*all* such addresses, and then re-add them again. So each set_rx_mode()
+causes a short period where the IPv4 Multicast addresses are not
+registered, and the card stops forwarding inbound traffic for them.
+
+Fix this by setting the ->is_multicast flag on the lookup object, thus
+enabling qeth_l3_ip_from_hash() to search the correct hashtable and
+find a match there.
+
+Fixes: 5f78e29ceebf ("qeth: optimize IP handling in rx_mode callback")
+Signed-off-by: Julian Wiedmann
+Signed-off-by: David S. Miller
+Signed-off-by: Greg Kroah-Hartman
+---
+ drivers/s390/net/qeth_l3_main.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/drivers/s390/net/qeth_l3_main.c
++++ b/drivers/s390/net/qeth_l3_main.c
+@@ -1376,6 +1376,7 @@ qeth_l3_add_mc_to_hash(struct qeth_card
+
+ tmp->u.a4.addr = be32_to_cpu(im4->multiaddr);
+ memcpy(tmp->mac, buf, sizeof(tmp->mac));
++ tmp->is_multicast = 1;
+
+ ipm = qeth_l3_ip_from_hash(card, tmp);
+ if (ipm) { diff --git a/queue-4.14/sctp-use-right-member-as-the-param-of-list_for_each_entry.patch b/queue-4.14/sctp-use-right-member-as-the-param-of-list_for_each_entry.patch new file mode 100644 index 00000000000..4d2f2300c09 --- /dev/null +++ b/queue-4.14/sctp-use-right-member-as-the-param-of-list_for_each_entry.patch @@ -0,0 +1,50 @@ +From foo@baz Thu Dec 14 11:45:40 CET 2017 +From: Xin Long +Date: Sun, 26 Nov 2017 20:56:07 +0800 +Subject: sctp: use right member as the param of list_for_each_entry
+
+From: Xin Long
+
+
+[ Upstream commit a8dd397903a6e57157f6265911f7d35681364427 ]
+
+Commit d04adf1b3551 ("sctp: reset owner sk for data chunks on out queues
+when migrating a sock") made the mistake of using 'list' as the param of
+list_for_each_entry to traverse the retransmit, sacked and abandoned
+queues, while chunks are using 'transmitted_list' to link into these
+queues.
+
+It could cause NULL dereference panic if there are chunks in any of these
+queues when peeling off one asoc.
+
+So use the chunk member 'transmitted_list' instead in this patch.
+
+Fixes: d04adf1b3551 ("sctp: reset owner sk for data chunks on out queues when migrating a sock")
+Signed-off-by: Xin Long
+Acked-by: Marcelo Ricardo Leitner
+Acked-by: Neil Horman
+Signed-off-by: David S.
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/sctp/socket.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +--- a/net/sctp/socket.c ++++ b/net/sctp/socket.c +@@ -187,13 +187,13 @@ static void sctp_for_each_tx_datachunk(s + list_for_each_entry(chunk, &t->transmitted, transmitted_list) + cb(chunk); + +- list_for_each_entry(chunk, &q->retransmit, list) ++ list_for_each_entry(chunk, &q->retransmit, transmitted_list) + cb(chunk); + +- list_for_each_entry(chunk, &q->sacked, list) ++ list_for_each_entry(chunk, &q->sacked, transmitted_list) + cb(chunk); + +- list_for_each_entry(chunk, &q->abandoned, list) ++ list_for_each_entry(chunk, &q->abandoned, transmitted_list) + cb(chunk); + + list_for_each_entry(chunk, &q->out_chunk_list, list) diff --git a/queue-4.14/series b/queue-4.14/series new file mode 100644 index 00000000000..5d0f32a38c2 --- /dev/null +++ b/queue-4.14/series @@ -0,0 +1,33 @@ +net-qmi_wwan-add-quectel-bg96-2c7c-0296.patch +net-thunderx-fix-tcp-udp-checksum-offload-for-ipv6-pkts.patch +net-thunderx-fix-tcp-udp-checksum-offload-for-ipv4-pkts.patch +net-realtek-r8169-implement-set_link_ksettings.patch +s390-qeth-fix-early-exit-from-error-path.patch +tipc-fix-memory-leak-in-tipc_accept_from_sock.patch +vhost-fix-skb-leak-in-handle_rx.patch +rds-fix-null-pointer-dereference-in-__rds_rdma_map.patch +sit-update-frag_off-info.patch +tcp-add-tcp_v4_fill_cb-tcp_v4_restore_cb.patch +packet-fix-crash-in-fanout_demux_rollover.patch +net-packet-fix-a-race-in-packet_bind-and-packet_notifier.patch +tcp-remove-buggy-call-to-tcp_v6_restore_cb.patch +usbnet-fix-alignment-for-frames-with-no-ethernet-header.patch +net-remove-hlist_nulls_add_tail_rcu.patch +stmmac-reset-last-tso-segment-size-after-device-open.patch +tcp-dccp-block-bh-before-arming-time_wait-timer.patch +s390-qeth-build-max-size-gso-skbs-on-l2-devices.patch +s390-qeth-fix-thinko-in-ipv4-multicast-address-tracking.patch +s390-qeth-fix-gso-throughput-regression.patch +tcp-use-ipcb-instead-of-tcp_skb_cb-in-inet_exact_dif_match.patch +tipc-call-tipc_rcv-only-if-bearer-is-up-in-tipc_udp_recv.patch +tcp-use-current-time-in-tcp_rcv_space_adjust.patch +net-sched-cbq-create-block-for-q-link.block.patch +tap-free-skb-if-flags-error.patch +tcp-when-scheduling-tlp-time-of-rto-should-account-for-current-ack.patch +tun-free-skb-in-early-errors.patch +net-ipv6-fixup-device-for-anycast-routes-during-copy.patch +tun-fix-rcu_read_lock-imbalance-in-tun_build_skb.patch +net-accept-ufo-datagrams-from-tuntap-and-packet.patch +net-openvswitch-datapath-fix-data-type-in-queue_gso_packets.patch +cls_bpf-don-t-decrement-net-s-refcount-when-offload-fails.patch +sctp-use-right-member-as-the-param-of-list_for_each_entry.patch diff --git a/queue-4.14/sit-update-frag_off-info.patch b/queue-4.14/sit-update-frag_off-info.patch new file mode 100644 index 00000000000..b1728dc82f1 --- /dev/null +++ b/queue-4.14/sit-update-frag_off-info.patch @@ -0,0 +1,32 @@ +From foo@baz Thu Dec 14 11:45:40 CET 2017 +From: Hangbin Liu +Date: Thu, 30 Nov 2017 10:41:14 +0800 +Subject: sit: update frag_off info + +From: Hangbin Liu + + +[ Upstream commit f859b4af1c52493ec21173ccc73d0b60029b5b88 ] + +After parsing the sit netlink change info, we forget to update frag_off in +ipip6_tunnel_update(). Fix it by assigning frag_off with new value. + +Reported-by: Jianlin Shi +Signed-off-by: Hangbin Liu +Acked-by: Nicolas Dichtel +Signed-off-by: David S. 
Miller
+Signed-off-by: Greg Kroah-Hartman
+---
+ net/ipv6/sit.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/net/ipv6/sit.c
++++ b/net/ipv6/sit.c
+@@ -1087,6 +1087,7 @@ static void ipip6_tunnel_update(struct i
+ ipip6_tunnel_link(sitn, t);
+ t->parms.iph.ttl = p->iph.ttl;
+ t->parms.iph.tos = p->iph.tos;
++ t->parms.iph.frag_off = p->iph.frag_off;
+ if (t->parms.link != p->link || t->fwmark != fwmark) {
+ t->parms.link = p->link;
+ t->fwmark = fwmark; diff --git a/queue-4.14/stmmac-reset-last-tso-segment-size-after-device-open.patch b/queue-4.14/stmmac-reset-last-tso-segment-size-after-device-open.patch new file mode 100644 index 00000000000..d98146acf8c --- /dev/null +++ b/queue-4.14/stmmac-reset-last-tso-segment-size-after-device-open.patch @@ -0,0 +1,41 @@ +From foo@baz Thu Dec 14 11:45:40 CET 2017 +From: Lars Persson +Date: Fri, 1 Dec 2017 11:12:44 +0100 +Subject: stmmac: reset last TSO segment size after device open
+
+From: Lars Persson
+
+
+[ Upstream commit 45ab4b13e46325d00f4acdb365d406e941a15f81 ]
+
+The mss variable tracks the last max segment size sent to the TSO
+engine. We do not update the hardware as long as we receive skb:s with
+the same value in gso_size.
+
+During a network device down/up cycle (mapped to stmmac_release() and
+stmmac_open() callbacks) we issue a reset to the hardware and it
+forgets the setting for mss. However we did not zero out our mss
+variable so the next transmission of a gso packet happens with an
+undefined hardware setting.
+
+This triggers a hang in the TSO engine and eventually the netdev
+watchdog will bark.
+
+Fixes: f748be531d70 ("stmmac: support new GMAC4")
+Signed-off-by: Lars Persson
+Signed-off-by: David S. Miller
+Signed-off-by: Greg Kroah-Hartman
+---
+ drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
++++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+@@ -2564,6 +2564,7 @@ static int stmmac_open(struct net_device
+
+ priv->dma_buf_sz = STMMAC_ALIGN(buf_sz);
+ priv->rx_copybreak = STMMAC_RX_COPYBREAK;
++ priv->mss = 0;
+
+ ret = alloc_dma_desc_resources(priv);
+ if (ret < 0) { diff --git a/queue-4.14/tap-free-skb-if-flags-error.patch b/queue-4.14/tap-free-skb-if-flags-error.patch new file mode 100644 index 00000000000..2e5e4145095 --- /dev/null +++ b/queue-4.14/tap-free-skb-if-flags-error.patch @@ -0,0 +1,57 @@ +From foo@baz Thu Dec 14 11:45:40 CET 2017 +From: Wei Xu +Date: Fri, 1 Dec 2017 05:10:38 -0500 +Subject: tap: free skb if flags error
+
+From: Wei Xu
+
+
+[ Upstream commit 61d78537843e676e7f56ac6db333db0c0529b892 ]
+
+tap_recvmsg() supports accepting skb by msg_control after
+commit 3b4ba04acca8 ("tap: support receiving skb from msg_control"),
+the skb, if present, should be freed within the function, otherwise
+it would be leaked.
+
+Signed-off-by: Wei Xu
+Reported-by: Matthew Rosato
+Acked-by: Michael S. Tsirkin
+Signed-off-by: David S.
Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/tap.c | 14 ++++++++++---- + 1 file changed, 10 insertions(+), 4 deletions(-) + +--- a/drivers/net/tap.c ++++ b/drivers/net/tap.c +@@ -829,8 +829,11 @@ static ssize_t tap_do_read(struct tap_qu + DEFINE_WAIT(wait); + ssize_t ret = 0; + +- if (!iov_iter_count(to)) ++ if (!iov_iter_count(to)) { ++ if (skb) ++ kfree_skb(skb); + return 0; ++ } + + if (skb) + goto put; +@@ -1154,11 +1157,14 @@ static int tap_recvmsg(struct socket *so + size_t total_len, int flags) + { + struct tap_queue *q = container_of(sock, struct tap_queue, sock); ++ struct sk_buff *skb = m->msg_control; + int ret; +- if (flags & ~(MSG_DONTWAIT|MSG_TRUNC)) ++ if (flags & ~(MSG_DONTWAIT|MSG_TRUNC)) { ++ if (skb) ++ kfree_skb(skb); + return -EINVAL; +- ret = tap_do_read(q, &m->msg_iter, flags & MSG_DONTWAIT, +- m->msg_control); ++ } ++ ret = tap_do_read(q, &m->msg_iter, flags & MSG_DONTWAIT, skb); + if (ret > total_len) { + m->msg_flags |= MSG_TRUNC; + ret = flags & MSG_TRUNC ? ret : total_len; diff --git a/queue-4.14/tcp-add-tcp_v4_fill_cb-tcp_v4_restore_cb.patch b/queue-4.14/tcp-add-tcp_v4_fill_cb-tcp_v4_restore_cb.patch new file mode 100644 index 00000000000..b23e5f59122 --- /dev/null +++ b/queue-4.14/tcp-add-tcp_v4_fill_cb-tcp_v4_restore_cb.patch @@ -0,0 +1,256 @@ +From foo@baz Thu Dec 14 11:45:40 CET 2017 +From: Eric Dumazet +Date: Sun, 3 Dec 2017 09:32:59 -0800 +Subject: tcp: add tcp_v4_fill_cb()/tcp_v4_restore_cb() + +From: Eric Dumazet + + +[ Upstream commit eeea10b83a139451130df1594f26710c8fa390c8 ] + +James Morris reported kernel stack corruption bug [1] while +running the SELinux testsuite, and bisected to a recent +commit bffa72cf7f9d ("net: sk_buff rbnode reorg") + +We believe this commit is fine, but exposes an older bug. + +SELinux code runs from tcp_filter() and might send an ICMP, +expecting IP options to be found in skb->cb[] using regular IPCB placement. + +We need to defer TCP mangling of skb->cb[] after tcp_filter() calls. + +This patch adds tcp_v4_fill_cb()/tcp_v4_restore_cb() in a very +similar way we added them for IPv6. + +[1] +[ 339.806024] SELinux: failure in selinux_parse_skb(), unable to parse packet +[ 339.822505] Kernel panic - not syncing: stack-protector: Kernel stack is corrupted in: ffffffff81745af5 +[ 339.822505] +[ 339.852250] CPU: 4 PID: 3642 Comm: client Not tainted 4.15.0-rc1-test #15 +[ 339.868498] Hardware name: LENOVO 10FGS0VA1L/30BC, BIOS FWKT68A 01/19/2017 +[ 339.885060] Call Trace: +[ 339.896875] +[ 339.908103] dump_stack+0x63/0x87 +[ 339.920645] panic+0xe8/0x248 +[ 339.932668] ? ip_push_pending_frames+0x33/0x40 +[ 339.946328] ? icmp_send+0x525/0x530 +[ 339.958861] ? kfree_skbmem+0x60/0x70 +[ 339.971431] __stack_chk_fail+0x1b/0x20 +[ 339.984049] icmp_send+0x525/0x530 +[ 339.996205] ? netlbl_skbuff_err+0x36/0x40 +[ 340.008997] ? selinux_netlbl_err+0x11/0x20 +[ 340.021816] ? selinux_socket_sock_rcv_skb+0x211/0x230 +[ 340.035529] ? security_sock_rcv_skb+0x3b/0x50 +[ 340.048471] ? sk_filter_trim_cap+0x44/0x1c0 +[ 340.061246] ? tcp_v4_inbound_md5_hash+0x69/0x1b0 +[ 340.074562] ? tcp_filter+0x2c/0x40 +[ 340.086400] ? tcp_v4_rcv+0x820/0xa20 +[ 340.098329] ? ip_local_deliver_finish+0x71/0x1a0 +[ 340.111279] ? ip_local_deliver+0x6f/0xe0 +[ 340.123535] ? ip_rcv_finish+0x3a0/0x3a0 +[ 340.135523] ? ip_rcv_finish+0xdb/0x3a0 +[ 340.147442] ? ip_rcv+0x27c/0x3c0 +[ 340.158668] ? inet_del_offload+0x40/0x40 +[ 340.170580] ? __netif_receive_skb_core+0x4ac/0x900 +[ 340.183285] ? rcu_accelerate_cbs+0x5b/0x80 +[ 340.195282] ? 
__netif_receive_skb+0x18/0x60 +[ 340.207288] ? process_backlog+0x95/0x140 +[ 340.218948] ? net_rx_action+0x26c/0x3b0 +[ 340.230416] ? __do_softirq+0xc9/0x26a +[ 340.241625] ? do_softirq_own_stack+0x2a/0x40 +[ 340.253368] +[ 340.262673] ? do_softirq+0x50/0x60 +[ 340.273450] ? __local_bh_enable_ip+0x57/0x60 +[ 340.285045] ? ip_finish_output2+0x175/0x350 +[ 340.296403] ? ip_finish_output+0x127/0x1d0 +[ 340.307665] ? nf_hook_slow+0x3c/0xb0 +[ 340.318230] ? ip_output+0x72/0xe0 +[ 340.328524] ? ip_fragment.constprop.54+0x80/0x80 +[ 340.340070] ? ip_local_out+0x35/0x40 +[ 340.350497] ? ip_queue_xmit+0x15c/0x3f0 +[ 340.361060] ? __kmalloc_reserve.isra.40+0x31/0x90 +[ 340.372484] ? __skb_clone+0x2e/0x130 +[ 340.382633] ? tcp_transmit_skb+0x558/0xa10 +[ 340.393262] ? tcp_connect+0x938/0xad0 +[ 340.403370] ? ktime_get_with_offset+0x4c/0xb0 +[ 340.414206] ? tcp_v4_connect+0x457/0x4e0 +[ 340.424471] ? __inet_stream_connect+0xb3/0x300 +[ 340.435195] ? inet_stream_connect+0x3b/0x60 +[ 340.445607] ? SYSC_connect+0xd9/0x110 +[ 340.455455] ? __audit_syscall_entry+0xaf/0x100 +[ 340.466112] ? syscall_trace_enter+0x1d0/0x2b0 +[ 340.476636] ? __audit_syscall_exit+0x209/0x290 +[ 340.487151] ? SyS_connect+0xe/0x10 +[ 340.496453] ? do_syscall_64+0x67/0x1b0 +[ 340.506078] ? entry_SYSCALL64_slow_path+0x25/0x25 + +Fixes: 971f10eca186 ("tcp: better TCP_SKB_CB layout to reduce cache line misses") +Signed-off-by: Eric Dumazet +Reported-by: James Morris +Tested-by: James Morris +Tested-by: Casey Schaufler +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/tcp_ipv4.c | 59 +++++++++++++++++++++++++++++++++++----------------- + net/ipv6/tcp_ipv6.c | 10 +++++--- + 2 files changed, 46 insertions(+), 23 deletions(-) + +--- a/net/ipv4/tcp_ipv4.c ++++ b/net/ipv4/tcp_ipv4.c +@@ -1587,6 +1587,34 @@ int tcp_filter(struct sock *sk, struct s + } + EXPORT_SYMBOL(tcp_filter); + ++static void tcp_v4_restore_cb(struct sk_buff *skb) ++{ ++ memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4, ++ sizeof(struct inet_skb_parm)); ++} ++ ++static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph, ++ const struct tcphdr *th) ++{ ++ /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB() ++ * barrier() makes sure compiler wont play fool^Waliasing games. ++ */ ++ memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb), ++ sizeof(struct inet_skb_parm)); ++ barrier(); ++ ++ TCP_SKB_CB(skb)->seq = ntohl(th->seq); ++ TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + ++ skb->len - th->doff * 4); ++ TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); ++ TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th); ++ TCP_SKB_CB(skb)->tcp_tw_isn = 0; ++ TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); ++ TCP_SKB_CB(skb)->sacked = 0; ++ TCP_SKB_CB(skb)->has_rxtstamp = ++ skb->tstamp || skb_hwtstamps(skb)->hwtstamp; ++} ++ + /* + * From tcp_input.c + */ +@@ -1627,24 +1655,6 @@ int tcp_v4_rcv(struct sk_buff *skb) + + th = (const struct tcphdr *)skb->data; + iph = ip_hdr(skb); +- /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB() +- * barrier() makes sure compiler wont play fool^Waliasing games. 
+- */ +- memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb), +- sizeof(struct inet_skb_parm)); +- barrier(); +- +- TCP_SKB_CB(skb)->seq = ntohl(th->seq); +- TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + +- skb->len - th->doff * 4); +- TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); +- TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th); +- TCP_SKB_CB(skb)->tcp_tw_isn = 0; +- TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); +- TCP_SKB_CB(skb)->sacked = 0; +- TCP_SKB_CB(skb)->has_rxtstamp = +- skb->tstamp || skb_hwtstamps(skb)->hwtstamp; +- + lookup: + sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source, + th->dest, sdif, &refcounted); +@@ -1675,14 +1685,19 @@ process: + sock_hold(sk); + refcounted = true; + nsk = NULL; +- if (!tcp_filter(sk, skb)) ++ if (!tcp_filter(sk, skb)) { ++ th = (const struct tcphdr *)skb->data; ++ iph = ip_hdr(skb); ++ tcp_v4_fill_cb(skb, iph, th); + nsk = tcp_check_req(sk, skb, req, false); ++ } + if (!nsk) { + reqsk_put(req); + goto discard_and_relse; + } + if (nsk == sk) { + reqsk_put(req); ++ tcp_v4_restore_cb(skb); + } else if (tcp_child_process(sk, nsk, skb)) { + tcp_v4_send_reset(nsk, skb); + goto discard_and_relse; +@@ -1708,6 +1723,7 @@ process: + goto discard_and_relse; + th = (const struct tcphdr *)skb->data; + iph = ip_hdr(skb); ++ tcp_v4_fill_cb(skb, iph, th); + + skb->dev = NULL; + +@@ -1738,6 +1754,8 @@ no_tcp_socket: + if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) + goto discard_it; + ++ tcp_v4_fill_cb(skb, iph, th); ++ + if (tcp_checksum_complete(skb)) { + csum_error: + __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS); +@@ -1764,6 +1782,8 @@ do_time_wait: + goto discard_it; + } + ++ tcp_v4_fill_cb(skb, iph, th); ++ + if (tcp_checksum_complete(skb)) { + inet_twsk_put(inet_twsk(sk)); + goto csum_error; +@@ -1780,6 +1800,7 @@ do_time_wait: + if (sk2) { + inet_twsk_deschedule_put(inet_twsk(sk)); + sk = sk2; ++ tcp_v4_restore_cb(skb); + refcounted = false; + goto process; + } +--- a/net/ipv6/tcp_ipv6.c ++++ b/net/ipv6/tcp_ipv6.c +@@ -1448,7 +1448,6 @@ process: + struct sock *nsk; + + sk = req->rsk_listener; +- tcp_v6_fill_cb(skb, hdr, th); + if (tcp_v6_inbound_md5_hash(sk, skb)) { + sk_drops_add(sk, skb); + reqsk_put(req); +@@ -1461,8 +1460,12 @@ process: + sock_hold(sk); + refcounted = true; + nsk = NULL; +- if (!tcp_filter(sk, skb)) ++ if (!tcp_filter(sk, skb)) { ++ th = (const struct tcphdr *)skb->data; ++ hdr = ipv6_hdr(skb); ++ tcp_v6_fill_cb(skb, hdr, th); + nsk = tcp_check_req(sk, skb, req, false); ++ } + if (!nsk) { + reqsk_put(req); + goto discard_and_relse; +@@ -1486,8 +1489,6 @@ process: + if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) + goto discard_and_relse; + +- tcp_v6_fill_cb(skb, hdr, th); +- + if (tcp_v6_inbound_md5_hash(sk, skb)) + goto discard_and_relse; + +@@ -1495,6 +1496,7 @@ process: + goto discard_and_relse; + th = (const struct tcphdr *)skb->data; + hdr = ipv6_hdr(skb); ++ tcp_v6_fill_cb(skb, hdr, th); + + skb->dev = NULL; + diff --git a/queue-4.14/tcp-dccp-block-bh-before-arming-time_wait-timer.patch b/queue-4.14/tcp-dccp-block-bh-before-arming-time_wait-timer.patch new file mode 100644 index 00000000000..828587a1eaa --- /dev/null +++ b/queue-4.14/tcp-dccp-block-bh-before-arming-time_wait-timer.patch @@ -0,0 +1,79 @@ +From foo@baz Thu Dec 14 11:45:40 CET 2017 +From: Eric Dumazet +Date: Fri, 1 Dec 2017 10:06:56 -0800 +Subject: tcp/dccp: block bh before arming time_wait timer + +From: Eric Dumazet + + +[ Upstream commit cfac7f836a715b91f08c851df915d401a4d52783 ] + +Maciej Å»enczykowski 
reported some panics in tcp_twsk_destructor()
+that might be caused by the following bug.
+
+timewait timer is pinned to the cpu, because we want to transition
+timewait refcount from 0 to 4 in one go, once everything has been
+initialized.
+
+At the time commit ed2e92394589 ("tcp/dccp: fix timewait races in timer
+handling") was merged, TCP was always running from BH handler.
+
+After commit 5413d1babe8f ("net: do not block BH while processing
+socket backlog") we definitely can run tcp_time_wait() from process
+context.
+
+We need to block BH in the critical section so that the pinned timer
+still has its purpose.
+
+This bug is more likely to happen under stress and when very small RTOs
+are used in datacenter flows.
+
+Fixes: 5413d1babe8f ("net: do not block BH while processing socket backlog")
+Signed-off-by: Eric Dumazet
+Reported-by: Maciej Żenczykowski
+Acked-by: Maciej Żenczykowski
+Signed-off-by: David S. Miller
+Signed-off-by: Greg Kroah-Hartman
+---
+ net/dccp/minisocks.c | 6 ++++++
+ net/ipv4/tcp_minisocks.c | 6 ++++++
+ 2 files changed, 12 insertions(+)
+
+--- a/net/dccp/minisocks.c
++++ b/net/dccp/minisocks.c
+@@ -57,10 +57,16 @@ void dccp_time_wait(struct sock *sk, int
+ if (state == DCCP_TIME_WAIT)
+ timeo = DCCP_TIMEWAIT_LEN;
+
++ /* tw_timer is pinned, so we need to make sure BH are disabled
++ * in following section, otherwise timer handler could run before
++ * we complete the initialization.
++ */
++ local_bh_disable();
+ inet_twsk_schedule(tw, timeo);
+ /* Linkage updates. */
+ __inet_twsk_hashdance(tw, sk, &dccp_hashinfo);
+ inet_twsk_put(tw);
++ local_bh_enable();
+ } else {
+ /* Sorry, if we're out of memory, just CLOSE this
+ * socket up. We've got bigger problems than
+--- a/net/ipv4/tcp_minisocks.c
++++ b/net/ipv4/tcp_minisocks.c
+@@ -312,10 +312,16 @@ void tcp_time_wait(struct sock *sk, int
+ if (state == TCP_TIME_WAIT)
+ timeo = TCP_TIMEWAIT_LEN;
+
++ /* tw_timer is pinned, so we need to make sure BH are disabled
++ * in following section, otherwise timer handler could run before
++ * we complete the initialization.
++ */
++ local_bh_disable();
+ inet_twsk_schedule(tw, timeo);
+ /* Linkage updates. */
+ __inet_twsk_hashdance(tw, sk, &tcp_hashinfo);
+ inet_twsk_put(tw);
++ local_bh_enable();
+ } else {
+ /* Sorry, if we're out of memory, just CLOSE this
+ * socket up. We've got bigger problems than diff --git a/queue-4.14/tcp-remove-buggy-call-to-tcp_v6_restore_cb.patch b/queue-4.14/tcp-remove-buggy-call-to-tcp_v6_restore_cb.patch new file mode 100644 index 00000000000..8e02c8d6417 --- /dev/null +++ b/queue-4.14/tcp-remove-buggy-call-to-tcp_v6_restore_cb.patch @@ -0,0 +1,41 @@ +From foo@baz Thu Dec 14 11:45:40 CET 2017 +From: Eric Dumazet +Date: Wed, 29 Nov 2017 17:43:57 -0800 +Subject: tcp: remove buggy call to tcp_v6_restore_cb()
+
+From: Eric Dumazet
+
+
+[ Upstream commit 3016dad75b48279e579117ee3ed566ba90a3b023 ]
+
+tcp_v6_send_reset() expects to receive an skb with skb->cb[] layout as
+used in TCP stack.
+MD5 lookup uses tcp_v6_iif() and tcp_v6_sdif() and thus
+TCP_SKB_CB(skb)->header.h6
+
+This patch probably fixes RST packets sent on behalf of a timewait md5
+ipv6 socket.
+
+Before Florian's patch, tcp_v6_restore_cb() was needed before jumping to
+no_tcp_socket label.
+
+Fixes: 271c3b9b7bda ("tcp: honour SO_BINDTODEVICE for TW_RST case too")
+Signed-off-by: Eric Dumazet
+Cc: Florian Westphal
+Acked-by: Florian Westphal
+Signed-off-by: David S.
Miller
+Signed-off-by: Greg Kroah-Hartman
+---
+ net/ipv6/tcp_ipv6.c | 1 -
+ 1 file changed, 1 deletion(-)
+
+--- a/net/ipv6/tcp_ipv6.c
++++ b/net/ipv6/tcp_ipv6.c
+@@ -1585,7 +1585,6 @@ do_time_wait:
+ tcp_v6_timewait_ack(sk, skb);
+ break;
+ case TCP_TW_RST:
+- tcp_v6_restore_cb(skb);
+ tcp_v6_send_reset(sk, skb);
+ inet_twsk_deschedule_put(inet_twsk(sk));
+ goto discard_it; diff --git a/queue-4.14/tcp-use-current-time-in-tcp_rcv_space_adjust.patch b/queue-4.14/tcp-use-current-time-in-tcp_rcv_space_adjust.patch new file mode 100644 index 00000000000..929c7f74099 --- /dev/null +++ b/queue-4.14/tcp-use-current-time-in-tcp_rcv_space_adjust.patch @@ -0,0 +1,37 @@ +From foo@baz Thu Dec 14 11:45:40 CET 2017 +From: Eric Dumazet +Date: Wed, 6 Dec 2017 11:08:19 -0800 +Subject: tcp: use current time in tcp_rcv_space_adjust()
+
+From: Eric Dumazet
+
+
+[ Upstream commit 8632385022f2b05a6ca0b9e0f95575865de0e2ce ]
+
+When I switched rcv_rtt_est to high resolution timestamps, I forgot
+that tp->tcp_mstamp needed to be refreshed in tcp_rcv_space_adjust()
+
+Using an old timestamp leads to autotuning lags.
+
+Fixes: 645f4c6f2ebd ("tcp: switch rcv_rtt_est and rcvq_space to high resolution timestamps")
+Signed-off-by: Eric Dumazet
+Cc: Wei Wang
+Cc: Neal Cardwell
+Cc: Yuchung Cheng
+Acked-by: Neal Cardwell
+Signed-off-by: David S. Miller
+Signed-off-by: Greg Kroah-Hartman
+---
+ net/ipv4/tcp_input.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/net/ipv4/tcp_input.c
++++ b/net/ipv4/tcp_input.c
+@@ -592,6 +592,7 @@ void tcp_rcv_space_adjust(struct sock *s
+ int time;
+ int copied;
+
++ tcp_mstamp_refresh(tp);
+ time = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcvq_space.time);
+ if (time < (tp->rcv_rtt_est.rtt_us >> 3) || tp->rcv_rtt_est.rtt_us == 0)
+ return; diff --git a/queue-4.14/tcp-use-ipcb-instead-of-tcp_skb_cb-in-inet_exact_dif_match.patch b/queue-4.14/tcp-use-ipcb-instead-of-tcp_skb_cb-in-inet_exact_dif_match.patch new file mode 100644 index 00000000000..4ef3a1614b8 --- /dev/null +++ b/queue-4.14/tcp-use-ipcb-instead-of-tcp_skb_cb-in-inet_exact_dif_match.patch @@ -0,0 +1,38 @@ +From foo@baz Thu Dec 14 11:45:40 CET 2017 +From: David Ahern +Date: Sun, 3 Dec 2017 09:33:00 -0800 +Subject: tcp: use IPCB instead of TCP_SKB_CB in inet_exact_dif_match()
+
+From: David Ahern
+
+
+[ Upstream commit b4d1605a8ea608fd7dc45b926a05d75d340bde4b ]
+
+After this fix: ("tcp: add tcp_v4_fill_cb()/tcp_v4_restore_cb()"),
+socket lookups happen while skb->cb[] has not been mangled yet by TCP.
+
+Fixes: a04a480d4392 ("net: Require exact match for TCP socket lookups if dif is l3mdev")
+Signed-off-by: David Ahern
+Signed-off-by: Eric Dumazet
+Signed-off-by: David S.
Miller
+Signed-off-by: Greg Kroah-Hartman
+---
+ include/net/tcp.h | 3 +--
+ 1 file changed, 1 insertion(+), 2 deletions(-)
+
+--- a/include/net/tcp.h
++++ b/include/net/tcp.h
+@@ -874,12 +874,11 @@ static inline int tcp_v6_sdif(const stru
+ }
+ #endif
+
+-/* TCP_SKB_CB reference means this can not be used from early demux */
+ static inline bool inet_exact_dif_match(struct net *net, struct sk_buff *skb)
+ {
+ #if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
+ if (!net->ipv4.sysctl_tcp_l3mdev_accept &&
+- skb && ipv4_l3mdev_skb(TCP_SKB_CB(skb)->header.h4.flags))
++ skb && ipv4_l3mdev_skb(IPCB(skb)->flags))
+ return true;
+ #endif
+ return false; diff --git a/queue-4.14/tcp-when-scheduling-tlp-time-of-rto-should-account-for-current-ack.patch b/queue-4.14/tcp-when-scheduling-tlp-time-of-rto-should-account-for-current-ack.patch new file mode 100644 index 00000000000..9867d3ab3c0 --- /dev/null +++ b/queue-4.14/tcp-when-scheduling-tlp-time-of-rto-should-account-for-current-ack.patch @@ -0,0 +1,128 @@ +From foo@baz Thu Dec 14 11:45:40 CET 2017 +From: Neal Cardwell +Date: Fri, 17 Nov 2017 21:06:14 -0500 +Subject: tcp: when scheduling TLP, time of RTO should account for current ACK
+
+From: Neal Cardwell
+
+
+[ Upstream commit ed66dfaf236c04d414de1d218441296e57fb2bd2 ]
+
+Fix the TLP scheduling logic so that when scheduling a TLP probe, we
+ensure that the estimated time at which an RTO would fire accounts for
+the fact that ACKs indicating forward progress should push back RTO
+times.
+
+After the following fix:
+
+df92c8394e6e ("tcp: fix xmit timer to only be reset if data ACKed/SACKed")
+
+we had an unintentional behavior change in the following kind of
+scenario: suppose the RTT variance has been very low recently. Then
+suppose we send out a flight of N packets and our RTT is 100ms:
+
+t=0: send a flight of N packets
+t=100ms: receive an ACK for N-1 packets
+
+The response before df92c8394e6e was:
+ -> schedule a TLP for now + RTO_interval
+
+The response after df92c8394e6e is:
+ -> schedule a TLP for t=0 + RTO_interval
+
+Since RTO_interval = srtt + RTT_variance, this means that we have
+scheduled a TLP timer at a point in the future that only accounts for
+RTT_variance. If the RTT_variance term is small, this means that the
+timer fires soon.
+
+Before df92c8394e6e this would not happen, because in that code, when
+we receive an ACK for a prefix of flight, we did:
+
+ 1) Near the top of tcp_ack(), switch from TLP timer to RTO
+ at write_queue_head->packet_tx_time + RTO_interval:
+ if (icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
+ tcp_rearm_rto(sk);
+
+ 2) In tcp_clean_rtx_queue(), update the RTO to now + RTO_interval:
+ if (flag & FLAG_ACKED) {
+ tcp_rearm_rto(sk);
+
+ 3) In tcp_ack() after tcp_fastretrans_alert() switch from RTO
+ to TLP at now + RTO_interval:
+ if (icsk->icsk_pending == ICSK_TIME_RETRANS)
+ tcp_schedule_loss_probe(sk);
+
+In df92c8394e6e we removed that 3-phase dance, and instead directly
+set the TLP timer once: we set the TLP timer in cases like this to
+write_queue_head->packet_tx_time + RTO_interval. So if the RTT
+variance is small, then this means that this is setting the TLP timer
+to fire quite soon. This means if the ACK for the tail of the flight
+takes longer than an RTT to arrive (often due to delayed ACKs), then
+the TLP timer fires too quickly.
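
A minimal user-space sketch of the scheduling rule this change introduces
(illustrative only: the function, names and numbers below are assumptions
for the example, not the kernel implementation). It computes the cap on
the TLP timer before and after the fix, for the scenario above:

#include <stdio.h>

/* Cap the TLP probe timeout (PTO) by the time left until the RTO would
 * fire. Pre-fix, the cap is measured from the head packet's transmit
 * time; post-fix, an ACK that advances the window restarts the full RTO
 * from "now". */
static unsigned int tlp_cap_us(unsigned int pto_us, unsigned int rto_us,
			       unsigned int head_tx_age_us, int ack_advanced)
{
	unsigned int rto_delta_us = ack_advanced ?
				    rto_us : rto_us - head_tx_age_us;

	return pto_us < rto_delta_us ? pto_us : rto_delta_us;
}

int main(void)
{
	/* srtt = 100ms with tiny variance: RTO ~ 108ms, PTO = 200ms,
	 * and the head of the flight was sent 100ms ago. */
	printf("pre-fix cap: %u us\n", tlp_cap_us(200000, 108000, 100000, 0));
	printf("post-fix cap: %u us\n", tlp_cap_us(200000, 108000, 100000, 1));
	return 0;
}

The pre-fix cap of 8000 us is what makes the timer fire too soon; with the
fix the cap is a full RTO, 108000 us, measured from the current ACK.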
+ +Fixes: df92c8394e6e ("tcp: fix xmit timer to only be reset if data ACKed/SACKed") +Signed-off-by: Neal Cardwell +Signed-off-by: Yuchung Cheng +Signed-off-by: Eric Dumazet +Acked-by: Soheil Hassas Yeganeh +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + include/net/tcp.h | 2 +- + net/ipv4/tcp_input.c | 2 +- + net/ipv4/tcp_output.c | 8 +++++--- + 3 files changed, 7 insertions(+), 5 deletions(-) + +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -563,7 +563,7 @@ void tcp_push_one(struct sock *, unsigne + void tcp_send_ack(struct sock *sk); + void tcp_send_delayed_ack(struct sock *sk); + void tcp_send_loss_probe(struct sock *sk); +-bool tcp_schedule_loss_probe(struct sock *sk); ++bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto); + void tcp_skb_collapse_tstamp(struct sk_buff *skb, + const struct sk_buff *next_skb); + +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -3021,7 +3021,7 @@ void tcp_rearm_rto(struct sock *sk) + /* Try to schedule a loss probe; if that doesn't work, then schedule an RTO. */ + static void tcp_set_xmit_timer(struct sock *sk) + { +- if (!tcp_schedule_loss_probe(sk)) ++ if (!tcp_schedule_loss_probe(sk, true)) + tcp_rearm_rto(sk); + } + +--- a/net/ipv4/tcp_output.c ++++ b/net/ipv4/tcp_output.c +@@ -2337,7 +2337,7 @@ repair: + + /* Send one loss probe per tail loss episode. */ + if (push_one != 2) +- tcp_schedule_loss_probe(sk); ++ tcp_schedule_loss_probe(sk, false); + is_cwnd_limited |= (tcp_packets_in_flight(tp) >= tp->snd_cwnd); + tcp_cwnd_validate(sk, is_cwnd_limited); + return false; +@@ -2345,7 +2345,7 @@ repair: + return !tp->packets_out && tcp_send_head(sk); + } + +-bool tcp_schedule_loss_probe(struct sock *sk) ++bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto) + { + struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); +@@ -2384,7 +2384,9 @@ bool tcp_schedule_loss_probe(struct sock + } + + /* If the RTO formula yields an earlier time, then use that time. */ +- rto_delta_us = tcp_rto_delta_us(sk); /* How far in future is RTO? */ ++ rto_delta_us = advancing_rto ? ++ jiffies_to_usecs(inet_csk(sk)->icsk_rto) : ++ tcp_rto_delta_us(sk); /* How far in future is RTO? */ + if (rto_delta_us > 0) + timeout = min_t(u32, timeout, usecs_to_jiffies(rto_delta_us)); + diff --git a/queue-4.14/tipc-call-tipc_rcv-only-if-bearer-is-up-in-tipc_udp_recv.patch b/queue-4.14/tipc-call-tipc_rcv-only-if-bearer-is-up-in-tipc_udp_recv.patch new file mode 100644 index 00000000000..529e9169893 --- /dev/null +++ b/queue-4.14/tipc-call-tipc_rcv-only-if-bearer-is-up-in-tipc_udp_recv.patch @@ -0,0 +1,126 @@ +From foo@baz Thu Dec 14 11:45:40 CET 2017 +From: Tommi Rantala +Date: Wed, 29 Nov 2017 12:48:42 +0200 +Subject: tipc: call tipc_rcv() only if bearer is up in tipc_udp_recv() + +From: Tommi Rantala + + +[ Upstream commit c7799c067c2ae33e348508c8afec354f3257ff25 ] + +Remove the second tipc_rcv() call in tipc_udp_recv(). We have just +checked that the bearer is not up, and calling tipc_rcv() with a bearer +that is not up leads to a TIPC div-by-zero crash in +tipc_node_calculate_timer(). The crash is rare in practice, but can +happen like this: + + We're enabling a bearer, but it's not yet up and fully initialized. + At the same time we receive a discovery packet, and in tipc_udp_recv() + we end up calling tipc_rcv() with the not-yet-initialized bearer, + causing later the div-by-zero crash in tipc_node_calculate_timer(). 
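
A stand-alone sketch of that race (illustrative assumptions only: the
struct, field names and constants below are hypothetical, not the TIPC
code). The divisor is a link tolerance that is still zero while the
bearer is being enabled, and the fix is to drop the packet rather than
enter the receive path:

#include <stdio.h>

struct bearer {
	int up;                    /* set once initialization completes */
	unsigned int tolerance_ms; /* still 0 while the bearer comes up */
};

/* Hypothetical stand-in for tipc_node_calculate_timer(): derives a
 * keepalive interval by dividing by the link tolerance. */
static unsigned int keepalive_ms(const struct bearer *b)
{
	return 3750 / b->tolerance_ms; /* div-by-zero if tolerance is 0 */
}

static void udp_recv(struct bearer *b)
{
	if (!b->up) {              /* the fix: bail out and free the skb */
		printf("bearer not up, dropping packet\n");
		return;
	}
	printf("keepalive: %u ms\n", keepalive_ms(b));
}

int main(void)
{
	struct bearer b = { .up = 0, .tolerance_ms = 0 };
	udp_recv(&b);              /* discovery packet during bring-up */
	b.tolerance_ms = 1500;
	b.up = 1;
	udp_recv(&b);              /* normal path */
	return 0;
}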
+ +Jon Maloy explains the impact of removing the second tipc_rcv() call: + "link setup in the worst case will be delayed until the next arriving + discovery messages, 1 sec later, and this is an acceptable delay." + +As the tipc_rcv() call is removed, just leave the function via the +rcu_out label, so that we will kfree_skb(). + +[ 12.590450] Own node address <1.1.1>, network identity 1 +[ 12.668088] divide error: 0000 [#1] SMP +[ 12.676952] CPU: 2 PID: 0 Comm: swapper/2 Not tainted 4.14.2-dirty #1 +[ 12.679225] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-2.fc27 04/01/2014 +[ 12.682095] task: ffff8c2a761edb80 task.stack: ffffa41cc0cac000 +[ 12.684087] RIP: 0010:tipc_node_calculate_timer.isra.12+0x45/0x60 [tipc] +[ 12.686486] RSP: 0018:ffff8c2a7fc838a0 EFLAGS: 00010246 +[ 12.688451] RAX: 0000000000000000 RBX: ffff8c2a5b382600 RCX: 0000000000000000 +[ 12.691197] RDX: 0000000000000000 RSI: ffff8c2a5b382600 RDI: ffff8c2a5b382600 +[ 12.693945] RBP: ffff8c2a7fc838b0 R08: 0000000000000001 R09: 0000000000000001 +[ 12.696632] R10: 0000000000000000 R11: 0000000000000000 R12: ffff8c2a5d8949d8 +[ 12.699491] R13: ffffffff95ede400 R14: 0000000000000000 R15: ffff8c2a5d894800 +[ 12.702338] FS: 0000000000000000(0000) GS:ffff8c2a7fc80000(0000) knlGS:0000000000000000 +[ 12.705099] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 +[ 12.706776] CR2: 0000000001bb9440 CR3: 00000000bd009001 CR4: 00000000003606e0 +[ 12.708847] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 +[ 12.711016] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 +[ 12.712627] Call Trace: +[ 12.713390] +[ 12.714011] tipc_node_check_dest+0x2e8/0x350 [tipc] +[ 12.715286] tipc_disc_rcv+0x14d/0x1d0 [tipc] +[ 12.716370] tipc_rcv+0x8b0/0xd40 [tipc] +[ 12.717396] ? minmax_running_min+0x2f/0x60 +[ 12.718248] ? dst_alloc+0x4c/0xa0 +[ 12.718964] ? tcp_ack+0xaf1/0x10b0 +[ 12.719658] ? tipc_udp_is_known_peer+0xa0/0xa0 [tipc] +[ 12.720634] tipc_udp_recv+0x71/0x1d0 [tipc] +[ 12.721459] ? dst_alloc+0x4c/0xa0 +[ 12.722130] udp_queue_rcv_skb+0x264/0x490 +[ 12.722924] __udp4_lib_rcv+0x21e/0x990 +[ 12.723670] ? ip_route_input_rcu+0x2dd/0xbf0 +[ 12.724442] ? tcp_v4_rcv+0x958/0xa40 +[ 12.725039] udp_rcv+0x1a/0x20 +[ 12.725587] ip_local_deliver_finish+0x97/0x1d0 +[ 12.726323] ip_local_deliver+0xaf/0xc0 +[ 12.726959] ? ip_route_input_noref+0x19/0x20 +[ 12.727689] ip_rcv_finish+0xdd/0x3b0 +[ 12.728307] ip_rcv+0x2ac/0x360 +[ 12.728839] __netif_receive_skb_core+0x6fb/0xa90 +[ 12.729580] ? udp4_gro_receive+0x1a7/0x2c0 +[ 12.730274] __netif_receive_skb+0x1d/0x60 +[ 12.730953] ? 
__netif_receive_skb+0x1d/0x60 +[ 12.731637] netif_receive_skb_internal+0x37/0xd0 +[ 12.732371] napi_gro_receive+0xc7/0xf0 +[ 12.732920] receive_buf+0x3c3/0xd40 +[ 12.733441] virtnet_poll+0xb1/0x250 +[ 12.733944] net_rx_action+0x23e/0x370 +[ 12.734476] __do_softirq+0xc5/0x2f8 +[ 12.734922] irq_exit+0xfa/0x100 +[ 12.735315] do_IRQ+0x4f/0xd0 +[ 12.735680] common_interrupt+0xa2/0xa2 +[ 12.736126] +[ 12.736416] RIP: 0010:native_safe_halt+0x6/0x10 +[ 12.736925] RSP: 0018:ffffa41cc0cafe90 EFLAGS: 00000246 ORIG_RAX: ffffffffffffff4d +[ 12.737756] RAX: 0000000000000000 RBX: ffff8c2a761edb80 RCX: 0000000000000000 +[ 12.738504] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 +[ 12.739258] RBP: ffffa41cc0cafe90 R08: 0000014b5b9795e5 R09: ffffa41cc12c7e88 +[ 12.740118] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000002 +[ 12.740964] R13: ffff8c2a761edb80 R14: 0000000000000000 R15: 0000000000000000 +[ 12.741831] default_idle+0x2a/0x100 +[ 12.742323] arch_cpu_idle+0xf/0x20 +[ 12.742796] default_idle_call+0x28/0x40 +[ 12.743312] do_idle+0x179/0x1f0 +[ 12.743761] cpu_startup_entry+0x1d/0x20 +[ 12.744291] start_secondary+0x112/0x120 +[ 12.744816] secondary_startup_64+0xa5/0xa5 +[ 12.745367] Code: b9 f4 01 00 00 48 89 c2 48 c1 ea 02 48 3d d3 07 00 +00 48 0f 47 d1 49 8b 0c 24 48 39 d1 76 07 49 89 14 24 48 89 d1 31 d2 48 +89 df <48> f7 f1 89 c6 e8 81 6e ff ff 5b 41 5c 5d c3 66 90 66 2e 0f 1f +[ 12.747527] RIP: tipc_node_calculate_timer.isra.12+0x45/0x60 [tipc] RSP: ffff8c2a7fc838a0 +[ 12.748555] ---[ end trace 1399ab83390650fd ]--- +[ 12.749296] Kernel panic - not syncing: Fatal exception in interrupt +[ 12.750123] Kernel Offset: 0x13200000 from 0xffffffff82000000 +(relocation range: 0xffffffff80000000-0xffffffffbfffffff) +[ 12.751215] Rebooting in 60 seconds.. + +Fixes: c9b64d492b1f ("tipc: add replicast peer discovery") +Signed-off-by: Tommi Rantala +Cc: Jon Maloy +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/tipc/udp_media.c | 4 ---- + 1 file changed, 4 deletions(-) + +--- a/net/tipc/udp_media.c ++++ b/net/tipc/udp_media.c +@@ -371,10 +371,6 @@ static int tipc_udp_recv(struct sock *sk + goto rcu_out; + } + +- tipc_rcv(sock_net(sk), skb, b); +- rcu_read_unlock(); +- return 0; +- + rcu_out: + rcu_read_unlock(); + out: diff --git a/queue-4.14/tipc-fix-memory-leak-in-tipc_accept_from_sock.patch b/queue-4.14/tipc-fix-memory-leak-in-tipc_accept_from_sock.patch new file mode 100644 index 00000000000..af65cbe8bad --- /dev/null +++ b/queue-4.14/tipc-fix-memory-leak-in-tipc_accept_from_sock.patch @@ -0,0 +1,34 @@ +From foo@baz Thu Dec 14 11:45:40 CET 2017 +From: Jon Maloy +Date: Mon, 4 Dec 2017 22:00:20 +0100 +Subject: tipc: fix memory leak in tipc_accept_from_sock() + +From: Jon Maloy + + +[ Upstream commit a7d5f107b4978e08eeab599ee7449af34d034053 ] + +When the function tipc_accept_from_sock() fails to create an instance of +struct tipc_subscriber it omits to free the already created instance of +struct tipc_conn instance before it returns. + +We fix that with this commit. + +Reported-by: David S. Miller +Signed-off-by: Jon Maloy +Signed-off-by: David S. 
Miller
+Signed-off-by: Greg Kroah-Hartman
+---
+ net/tipc/server.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/net/tipc/server.c
++++ b/net/tipc/server.c
+@@ -313,6 +313,7 @@ static int tipc_accept_from_sock(struct
+ newcon->usr_data = s->tipc_conn_new(newcon->conid);
+ if (!newcon->usr_data) {
+ sock_release(newsock);
++ conn_put(newcon);
+ return -ENOMEM;
+ }
+ diff --git a/queue-4.14/tun-fix-rcu_read_lock-imbalance-in-tun_build_skb.patch b/queue-4.14/tun-fix-rcu_read_lock-imbalance-in-tun_build_skb.patch new file mode 100644 index 00000000000..34b96fb9adf --- /dev/null +++ b/queue-4.14/tun-fix-rcu_read_lock-imbalance-in-tun_build_skb.patch @@ -0,0 +1,46 @@ +From foo@baz Thu Dec 14 11:45:40 CET 2017 +From: Xin Long +Date: Sun, 19 Nov 2017 19:31:04 +0800 +Subject: tun: fix rcu_read_lock imbalance in tun_build_skb
+
+From: Xin Long
+
+
+[ Upstream commit 654d573845f35017dc397840fa03610fef3d08b0 ]
+
+rcu_read_lock in tun_build_skb is used to rcu_dereference tun->xdp_prog
+safely, rcu_read_unlock should be done in every return path.
+
+Now I could see one place missing it, where it returns NULL in switch-case
+XDP_REDIRECT, another place using rcu_read_lock wrongly, where it returns
+NULL in if (xdp_xmit) chunk.
+
+So fix both in this patch.
+
+Fixes: 761876c857cb ("tap: XDP support")
+Signed-off-by: Xin Long
+Signed-off-by: David S. Miller
+Signed-off-by: Greg Kroah-Hartman
+---
+ drivers/net/tun.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/drivers/net/tun.c
++++ b/drivers/net/tun.c
+@@ -1326,6 +1326,7 @@ static struct sk_buff *tun_build_skb(str
+ err = xdp_do_redirect(tun->dev, &xdp, xdp_prog);
+ if (err)
+ goto err_redirect;
++ rcu_read_unlock();
+ return NULL;
+ case XDP_TX:
+ xdp_xmit = true;
+@@ -1358,7 +1359,7 @@ static struct sk_buff *tun_build_skb(str
+ if (xdp_xmit) {
+ skb->dev = tun->dev;
+ generic_xdp_tx(skb, xdp_prog);
+- rcu_read_lock();
++ rcu_read_unlock();
+ return NULL;
+ }
+ diff --git a/queue-4.14/tun-free-skb-in-early-errors.patch b/queue-4.14/tun-free-skb-in-early-errors.patch new file mode 100644 index 00000000000..74e705253fc --- /dev/null +++ b/queue-4.14/tun-free-skb-in-early-errors.patch @@ -0,0 +1,85 @@ +From foo@baz Thu Dec 14 11:45:40 CET 2017 +From: Wei Xu +Date: Fri, 1 Dec 2017 05:10:37 -0500 +Subject: tun: free skb in early errors
+
+From: Wei Xu
+
+
+[ Upstream commit c33ee15b3820a03cf8229ba9415084197b827f8c ]
+
+tun_recvmsg() supports accepting skb by msg_control after
+commit ac77cfd4258f ("tun: support receiving skb through msg_control"),
+the skb, if present, should be freed no matter how far it can go
+along, otherwise it would be leaked.
+
+This patch fixes several missed cases.
+
+Signed-off-by: Wei Xu
+Reported-by: Matthew Rosato
+Acked-by: Michael S. Tsirkin
+Signed-off-by: David S.
Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/tun.c | 24 ++++++++++++++++++------ + 1 file changed, 18 insertions(+), 6 deletions(-) + +--- a/drivers/net/tun.c ++++ b/drivers/net/tun.c +@@ -1734,8 +1734,11 @@ static ssize_t tun_do_read(struct tun_st + + tun_debug(KERN_INFO, tun, "tun_do_read\n"); + +- if (!iov_iter_count(to)) ++ if (!iov_iter_count(to)) { ++ if (skb) ++ kfree_skb(skb); + return 0; ++ } + + if (!skb) { + /* Read frames from ring */ +@@ -1851,22 +1854,24 @@ static int tun_recvmsg(struct socket *so + { + struct tun_file *tfile = container_of(sock, struct tun_file, socket); + struct tun_struct *tun = __tun_get(tfile); ++ struct sk_buff *skb = m->msg_control; + int ret; + +- if (!tun) +- return -EBADFD; ++ if (!tun) { ++ ret = -EBADFD; ++ goto out_free_skb; ++ } + + if (flags & ~(MSG_DONTWAIT|MSG_TRUNC|MSG_ERRQUEUE)) { + ret = -EINVAL; +- goto out; ++ goto out_put_tun; + } + if (flags & MSG_ERRQUEUE) { + ret = sock_recv_errqueue(sock->sk, m, total_len, + SOL_PACKET, TUN_TX_TIMESTAMP); + goto out; + } +- ret = tun_do_read(tun, tfile, &m->msg_iter, flags & MSG_DONTWAIT, +- m->msg_control); ++ ret = tun_do_read(tun, tfile, &m->msg_iter, flags & MSG_DONTWAIT, skb); + if (ret > (ssize_t)total_len) { + m->msg_flags |= MSG_TRUNC; + ret = flags & MSG_TRUNC ? ret : total_len; +@@ -1874,6 +1879,13 @@ static int tun_recvmsg(struct socket *so + out: + tun_put(tun); + return ret; ++ ++out_put_tun: ++ tun_put(tun); ++out_free_skb: ++ if (skb) ++ kfree_skb(skb); ++ return ret; + } + + static int tun_peek_len(struct socket *sock) diff --git a/queue-4.14/usbnet-fix-alignment-for-frames-with-no-ethernet-header.patch b/queue-4.14/usbnet-fix-alignment-for-frames-with-no-ethernet-header.patch new file mode 100644 index 00000000000..c79092069c4 --- /dev/null +++ b/queue-4.14/usbnet-fix-alignment-for-frames-with-no-ethernet-header.patch @@ -0,0 +1,67 @@ +From foo@baz Thu Dec 14 11:45:40 CET 2017 +From: Bjørn Mork +Date: Wed, 6 Dec 2017 20:21:24 +0100 +Subject: usbnet: fix alignment for frames with no ethernet header + +From: Bjørn Mork + + +[ Upstream commit a4abd7a80addb4a9547f7dfc7812566b60ec505c ] + +The qmi_wwan minidriver support a 'raw-ip' mode where frames are +received without any ethernet header. This causes alignment issues +because the skbs allocated by usbnet are "IP aligned". + +Fix by allowing minidrivers to disable the additional alignment +offset. This is implemented using a per-device flag, since the same +minidriver also supports 'ethernet' mode. + +Fixes: 32f7adf633b9 ("net: qmi_wwan: support "raw IP" mode") +Reported-and-tested-by: Jay Foster +Signed-off-by: Bjørn Mork +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/usb/qmi_wwan.c | 2 ++ + drivers/net/usb/usbnet.c | 5 ++++- + include/linux/usb/usbnet.h | 1 + + 3 files changed, 7 insertions(+), 1 deletion(-) + +--- a/drivers/net/usb/qmi_wwan.c ++++ b/drivers/net/usb/qmi_wwan.c +@@ -261,9 +261,11 @@ static void qmi_wwan_netdev_setup(struct + net->hard_header_len = 0; + net->addr_len = 0; + net->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST; ++ set_bit(EVENT_NO_IP_ALIGN, &dev->flags); + netdev_dbg(net, "mode: raw IP\n"); + } else if (!net->header_ops) { /* don't bother if already set */ + ether_setup(net); ++ clear_bit(EVENT_NO_IP_ALIGN, &dev->flags); + netdev_dbg(net, "mode: Ethernet\n"); + } + +--- a/drivers/net/usb/usbnet.c ++++ b/drivers/net/usb/usbnet.c +@@ -484,7 +484,10 @@ static int rx_submit (struct usbnet *dev + return -ENOLINK; + } + +- skb = __netdev_alloc_skb_ip_align(dev->net, size, flags); ++ if (test_bit(EVENT_NO_IP_ALIGN, &dev->flags)) ++ skb = __netdev_alloc_skb(dev->net, size, flags); ++ else ++ skb = __netdev_alloc_skb_ip_align(dev->net, size, flags); + if (!skb) { + netif_dbg(dev, rx_err, dev->net, "no rx skb\n"); + usbnet_defer_kevent (dev, EVENT_RX_MEMORY); +--- a/include/linux/usb/usbnet.h ++++ b/include/linux/usb/usbnet.h +@@ -81,6 +81,7 @@ struct usbnet { + # define EVENT_RX_KILL 10 + # define EVENT_LINK_CHANGE 11 + # define EVENT_SET_RX_MODE 12 ++# define EVENT_NO_IP_ALIGN 13 + }; + + static inline struct usb_driver *driver_of(struct usb_interface *intf) diff --git a/queue-4.14/vhost-fix-skb-leak-in-handle_rx.patch b/queue-4.14/vhost-fix-skb-leak-in-handle_rx.patch new file mode 100644 index 00000000000..8f773c652b4 --- /dev/null +++ b/queue-4.14/vhost-fix-skb-leak-in-handle_rx.patch @@ -0,0 +1,71 @@ +From foo@baz Thu Dec 14 11:45:40 CET 2017 +From: Wei Xu +Date: Fri, 1 Dec 2017 05:10:36 -0500 +Subject: vhost: fix skb leak in handle_rx() + +From: Wei Xu + + +[ Upstream commit 6e474083f3daf3a3546737f5d7d502ad12eb257c ] + +Matthew found a roughly 40% tcp throughput regression with commit +c67df11f(vhost_net: try batch dequing from skb array) as discussed +in the following thread: +https://www.mail-archive.com/netdev@vger.kernel.org/msg187936.html + +Eventually we figured out that it was a skb leak in handle_rx() +when sending packets to the VM. This usually happens when a guest +can not drain out vq as fast as vhost fills in, afterwards it sets +off the traffic jam and leaks skb(s) which occurs as no headcount +to send on the vq from vhost side. + +This can be avoided by making sure we have got enough headcount +before actually consuming a skb from the batched rx array while +transmitting, which is simply done by moving checking the zero +headcount a bit ahead. + +Signed-off-by: Wei Xu +Reported-by: Matthew Rosato +Acked-by: Michael S. Tsirkin +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/vhost/net.c | 20 ++++++++++---------- + 1 file changed, 10 insertions(+), 10 deletions(-) + +--- a/drivers/vhost/net.c ++++ b/drivers/vhost/net.c +@@ -782,16 +782,6 @@ static void handle_rx(struct vhost_net * + /* On error, stop handling until the next kick. 
*/ + if (unlikely(headcount < 0)) + goto out; +- if (nvq->rx_array) +- msg.msg_control = vhost_net_buf_consume(&nvq->rxq); +- /* On overrun, truncate and discard */ +- if (unlikely(headcount > UIO_MAXIOV)) { +- iov_iter_init(&msg.msg_iter, READ, vq->iov, 1, 1); +- err = sock->ops->recvmsg(sock, &msg, +- 1, MSG_DONTWAIT | MSG_TRUNC); +- pr_debug("Discarded rx packet: len %zd\n", sock_len); +- continue; +- } + /* OK, now we need to know about added descriptors. */ + if (!headcount) { + if (unlikely(vhost_enable_notify(&net->dev, vq))) { +@@ -804,6 +794,16 @@ static void handle_rx(struct vhost_net * + * they refilled. */ + goto out; + } ++ if (nvq->rx_array) ++ msg.msg_control = vhost_net_buf_consume(&nvq->rxq); ++ /* On overrun, truncate and discard */ ++ if (unlikely(headcount > UIO_MAXIOV)) { ++ iov_iter_init(&msg.msg_iter, READ, vq->iov, 1, 1); ++ err = sock->ops->recvmsg(sock, &msg, ++ 1, MSG_DONTWAIT | MSG_TRUNC); ++ pr_debug("Discarded rx packet: len %zd\n", sock_len); ++ continue; ++ } + /* We don't need to be notified again. */ + iov_iter_init(&msg.msg_iter, READ, vq->iov, in, vhost_len); + fixup = msg.msg_iter;