]> git.ipfire.org Git - thirdparty/kernel/stable-queue.git/blob - releases/3.4.83/net-ip-ipv6-handle-gso-skbs-in-forwarding-path.patch
5.1-stable patches
[thirdparty/kernel/stable-queue.git] / releases / 3.4.83 / net-ip-ipv6-handle-gso-skbs-in-forwarding-path.patch
1 From foo@baz Wed Feb 26 20:38:29 PST 2014
2 From: Florian Westphal <fw@strlen.de>
3 Date: Sat, 22 Feb 2014 10:33:26 +0100
4 Subject: net: ip, ipv6: handle gso skbs in forwarding path
5
6 From: Florian Westphal <fw@strlen.de>
7
8 commit fe6cc55f3a9a053482a76f5a6b2257cee51b4663 upstream.
9
10 [ use zero netdev_feature mask to avoid backport of
11 netif_skb_dev_features function ]
12
13 Marcelo Ricardo Leitner reported problems when the forwarding link path
14 has a lower mtu than the incoming one if the inbound interface supports GRO.
15
16 Given:
17 Host <mtu1500> R1 <mtu1200> R2
18
19 Host sends tcp stream which is routed via R1 and R2. R1 performs GRO.
20
21 In this case, the kernel will fail to send ICMP fragmentation needed
22 messages (or pkt too big for ipv6), as GSO packets currently bypass dstmtu
23 checks in forward path. Instead, Linux tries to send out packets exceeding
24 the mtu.
25
26 When locking route MTU on Host (i.e., no ipv4 DF bit set), R1 does
27 not fragment the packets when forwarding, and again tries to send out
28 packets exceeding R1-R2 link mtu.
29
30 This alters the forwarding dstmtu checks to take the individual gso
31 segment lengths into account.
32
33 For ipv6, we send out pkt too big error for gso if the individual
34 segments are too big.
35
36 For ipv4, we either send icmp fragmentation needed, or, if the DF bit
37 is not set, perform software segmentation and let the output path
38 create fragments when the packet is leaving the machine.
39 It is not 100% correct as the error message will contain the headers of
40 the GRO skb instead of the original/segmented one, but it seems to
41 work fine in my (limited) tests.
42
43 Eric Dumazet suggested to simply shrink mss via ->gso_size to avoid
44 software segmentation.
45
46 However it turns out that skb_segment() assumes skb nr_frags is related
47 to mss size so we would BUG there. I don't want to mess with it considering
48 Herbert and Eric disagree on what the correct behavior should be.
49
50 Hannes Frederic Sowa notes that when we would shrink gso_size
51 skb_segment would then also need to deal with the case where
52 SKB_MAX_FRAGS would be exceeded.
53
54 This uses software segmentation in the forward path when we hit ipv4
55 non-DF packets and the outgoing link mtu is too small. It's not perfect,
56 but given the lack of bug reports wrt. GRO fwd being broken this is a
57 rare case anyway. Also it's not like this could not be improved later
58 once the dust settles.
59
60 Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
61 Reported-by: Marcelo Ricardo Leitner <mleitner@redhat.com>
62 Signed-off-by: Florian Westphal <fw@strlen.de>
63 Signed-off-by: David S. Miller <davem@davemloft.net>
64 Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
65 ---
66 include/linux/skbuff.h | 17 ++++++++++++
67 net/ipv4/ip_forward.c | 68 +++++++++++++++++++++++++++++++++++++++++++++++--
68 net/ipv6/ip6_output.c | 13 ++++++++-
69 3 files changed, 95 insertions(+), 3 deletions(-)
70
71 --- a/include/linux/skbuff.h
72 +++ b/include/linux/skbuff.h
73 @@ -2582,5 +2582,22 @@ static inline bool skb_is_recycleable(co
74
75 return true;
76 }
77 +
78 +/**
79 + * skb_gso_network_seglen - Return length of individual segments of a gso packet
80 + *
81 + * @skb: GSO skb
82 + *
83 + * skb_gso_network_seglen is used to determine the real size of the
84 + * individual segments, including Layer3 (IP, IPv6) and L4 headers (TCP/UDP).
85 + *
86 + * The MAC/L2 header is not accounted for.
87 + */
88 +static inline unsigned int skb_gso_network_seglen(const struct sk_buff *skb)
89 +{
90 + unsigned int hdr_len = skb_transport_header(skb) -
91 + skb_network_header(skb);
92 + return hdr_len + skb_gso_transport_seglen(skb);
93 +}
94 #endif /* __KERNEL__ */
95 #endif /* _LINUX_SKBUFF_H */
96 --- a/net/ipv4/ip_forward.c
97 +++ b/net/ipv4/ip_forward.c
98 @@ -39,6 +39,68 @@
99 #include <net/route.h>
100 #include <net/xfrm.h>
101
102 +static bool ip_may_fragment(const struct sk_buff *skb)
103 +{
104 + return unlikely((ip_hdr(skb)->frag_off & htons(IP_DF)) == 0) ||
105 + !skb->local_df;
106 +}
107 +
108 +static bool ip_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
109 +{
110 + if (skb->len <= mtu || skb->local_df)
111 + return false;
112 +
113 + if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu)
114 + return false;
115 +
116 + return true;
117 +}
118 +
119 +static bool ip_gso_exceeds_dst_mtu(const struct sk_buff *skb)
120 +{
121 + unsigned int mtu;
122 +
123 + if (skb->local_df || !skb_is_gso(skb))
124 + return false;
125 +
126 + mtu = dst_mtu(skb_dst(skb));
127 +
128 + /* if seglen > mtu, do software segmentation for IP fragmentation on
129 + * output. DF bit cannot be set since ip_forward would have sent
130 + * icmp error.
131 + */
132 + return skb_gso_network_seglen(skb) > mtu;
133 +}
134 +
135 +/* called if GSO skb needs to be fragmented on forward */
136 +static int ip_forward_finish_gso(struct sk_buff *skb)
137 +{
138 + struct sk_buff *segs;
139 + int ret = 0;
140 +
141 + segs = skb_gso_segment(skb, 0);
142 + if (IS_ERR(segs)) {
143 + kfree_skb(skb);
144 + return -ENOMEM;
145 + }
146 +
147 + consume_skb(skb);
148 +
149 + do {
150 + struct sk_buff *nskb = segs->next;
151 + int err;
152 +
153 + segs->next = NULL;
154 + err = dst_output(segs);
155 +
156 + if (err && ret == 0)
157 + ret = err;
158 + segs = nskb;
159 + } while (segs);
160 +
161 + return ret;
162 +}
163 +
164 static int ip_forward_finish(struct sk_buff *skb)
165 {
166 struct ip_options * opt = &(IPCB(skb)->opt);
167 @@ -48,6 +110,9 @@ static int ip_forward_finish(struct sk_b
168 if (unlikely(opt->optlen))
169 ip_forward_options(skb);
170
171 + if (ip_gso_exceeds_dst_mtu(skb))
172 + return ip_forward_finish_gso(skb);
173 +
174 return dst_output(skb);
175 }
176
177 @@ -87,8 +152,7 @@ int ip_forward(struct sk_buff *skb)
178 if (opt->is_strictroute && opt->nexthop != rt->rt_gateway)
179 goto sr_failed;
180
181 - if (unlikely(skb->len > dst_mtu(&rt->dst) && !skb_is_gso(skb) &&
182 - (ip_hdr(skb)->frag_off & htons(IP_DF))) && !skb->local_df) {
183 + if (!ip_may_fragment(skb) && ip_exceeds_mtu(skb, dst_mtu(&rt->dst))) {
184 IP_INC_STATS(dev_net(rt->dst.dev), IPSTATS_MIB_FRAGFAILS);
185 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
186 htonl(dst_mtu(&rt->dst)));
187 --- a/net/ipv6/ip6_output.c
188 +++ b/net/ipv6/ip6_output.c
189 @@ -382,6 +382,17 @@ static inline int ip6_forward_finish(str
190 return dst_output(skb);
191 }
192
193 +static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
194 +{
195 + if (skb->len <= mtu || skb->local_df)
196 + return false;
197 +
198 + if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu)
199 + return false;
200 +
201 + return true;
202 +}
203 +
204 int ip6_forward(struct sk_buff *skb)
205 {
206 struct dst_entry *dst = skb_dst(skb);
207 @@ -503,7 +514,7 @@ int ip6_forward(struct sk_buff *skb)
208 if (mtu < IPV6_MIN_MTU)
209 mtu = IPV6_MIN_MTU;
210
211 - if (skb->len > mtu && !skb_is_gso(skb)) {
212 + if (ip6_pkt_too_big(skb, mtu)) {
213 /* Again, force OUTPUT device used as source address */
214 skb->dev = dst->dev;
215 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);