git.ipfire.org Git - thirdparty/kernel/stable-queue.git/blob

   1 From foo@baz Wed May 28 21:03:54 PDT 2014
   2 From: Daniel Borkmann <dborkman@redhat.com>
   3 Date: Mon, 14 Apr 2014 21:45:17 +0200
   4 Subject: Revert "net: sctp: Fix a_rwnd/rwnd management to reflect real state of the receiver's buffer"
   5
   6 From: Daniel Borkmann <dborkman@redhat.com>
   7
   8 [ Upstream commit 362d52040c71f6e8d8158be48c812d7729cb8df1 ]
   9
  10 This reverts commit ef2820a735f7 ("net: sctp: Fix a_rwnd/rwnd management
  11 to reflect real state of the receiver's buffer") as it introduced a
  12 serious performance regression on SCTP over IPv4 and IPv6, though not
  13 as dramatic on the latter. Measurements are on 10Gbit/s with ixgbe NICs.
  14
  15 Current state:
  16
  17 [root@Lab200slot2 ~]# iperf3 --sctp -4 -c 192.168.241.3 -V -l 1452 -t 60
  18 iperf version 3.0.1 (10 January 2014)
  19 Linux Lab200slot2 3.14.0 #1 SMP Thu Apr 3 23:18:29 EDT 2014 x86_64
  20 Time: Fri, 11 Apr 2014 17:56:21 GMT
  21 Connecting to host 192.168.241.3, port 5201
  22       Cookie: Lab200slot2.1397238981.812898.548918
  23 [  4] local 192.168.241.2 port 38616 connected to 192.168.241.3 port 5201
  24 Starting Test: protocol: SCTP, 1 streams, 1452 byte blocks, omitting 0 seconds, 60 second test
  25 [ ID] Interval           Transfer     Bandwidth
  26 [  4]   0.00-1.09   sec  20.8 MBytes   161 Mbits/sec
  27 [  4]   1.09-2.13   sec  10.8 MBytes  86.8 Mbits/sec
  28 [  4]   2.13-3.15   sec  3.57 MBytes  29.5 Mbits/sec
  29 [  4]   3.15-4.16   sec  4.33 MBytes  35.7 Mbits/sec
  30 [  4]   4.16-6.21   sec  10.4 MBytes  42.7 Mbits/sec
  31 [  4]   6.21-6.21   sec  0.00 Bytes    0.00 bits/sec
  32 [  4]   6.21-7.35   sec  34.6 MBytes   253 Mbits/sec
  33 [  4]   7.35-11.45  sec  22.0 MBytes  45.0 Mbits/sec
  34 [  4]  11.45-11.45  sec  0.00 Bytes    0.00 bits/sec
  35 [  4]  11.45-11.45  sec  0.00 Bytes    0.00 bits/sec
  36 [  4]  11.45-11.45  sec  0.00 Bytes    0.00 bits/sec
  37 [  4]  11.45-12.51  sec  16.0 MBytes   126 Mbits/sec
  38 [  4]  12.51-13.59  sec  20.3 MBytes   158 Mbits/sec
  39 [  4]  13.59-14.65  sec  13.4 MBytes   107 Mbits/sec
  40 [  4]  14.65-16.79  sec  33.3 MBytes   130 Mbits/sec
  41 [  4]  16.79-16.79  sec  0.00 Bytes    0.00 bits/sec
  42 [  4]  16.79-17.82  sec  5.94 MBytes  48.7 Mbits/sec
  43 (etc)
  44
  45 [root@Lab200slot2 ~]#  iperf3 --sctp -6 -c 2001:db8:0:f101::1 -V -l 1400 -t 60
  46 iperf version 3.0.1 (10 January 2014)
  47 Linux Lab200slot2 3.14.0 #1 SMP Thu Apr 3 23:18:29 EDT 2014 x86_64
  48 Time: Fri, 11 Apr 2014 19:08:41 GMT
  49 Connecting to host 2001:db8:0:f101::1, port 5201
  50       Cookie: Lab200slot2.1397243321.714295.2b3f7c
  51 [  4] local 2001:db8:0:f101::2 port 55804 connected to 2001:db8:0:f101::1 port 5201
  52 Starting Test: protocol: SCTP, 1 streams, 1400 byte blocks, omitting 0 seconds, 60 second test
  53 [ ID] Interval           Transfer     Bandwidth
  54 [  4]   0.00-1.00   sec   169 MBytes  1.42 Gbits/sec
  55 [  4]   1.00-2.00   sec   201 MBytes  1.69 Gbits/sec
  56 [  4]   2.00-3.00   sec   188 MBytes  1.58 Gbits/sec
  57 [  4]   3.00-4.00   sec   174 MBytes  1.46 Gbits/sec
  58 [  4]   4.00-5.00   sec   165 MBytes  1.39 Gbits/sec
  59 [  4]   5.00-6.00   sec   199 MBytes  1.67 Gbits/sec
  60 [  4]   6.00-7.00   sec   163 MBytes  1.36 Gbits/sec
  61 [  4]   7.00-8.00   sec   174 MBytes  1.46 Gbits/sec
  62 [  4]   8.00-9.00   sec   193 MBytes  1.62 Gbits/sec
  63 [  4]   9.00-10.00  sec   196 MBytes  1.65 Gbits/sec
  64 [  4]  10.00-11.00  sec   157 MBytes  1.31 Gbits/sec
  65 [  4]  11.00-12.00  sec   175 MBytes  1.47 Gbits/sec
  66 [  4]  12.00-13.00  sec   192 MBytes  1.61 Gbits/sec
  67 [  4]  13.00-14.00  sec   199 MBytes  1.67 Gbits/sec
  68 (etc)
  69
  70 After patch:
  71
  72 [root@Lab200slot2 ~]#  iperf3 --sctp -4 -c 192.168.240.3 -V -l 1452 -t 60
  73 iperf version 3.0.1 (10 January 2014)
  74 Linux Lab200slot2 3.14.0+ #1 SMP Mon Apr 14 12:06:40 EDT 2014 x86_64
  75 Time: Mon, 14 Apr 2014 16:40:48 GMT
  76 Connecting to host 192.168.240.3, port 5201
  77       Cookie: Lab200slot2.1397493648.413274.65e131
  78 [  4] local 192.168.240.2 port 50548 connected to 192.168.240.3 port 5201
  79 Starting Test: protocol: SCTP, 1 streams, 1452 byte blocks, omitting 0 seconds, 60 second test
  80 [ ID] Interval           Transfer     Bandwidth
  81 [  4]   0.00-1.00   sec   240 MBytes  2.02 Gbits/sec
  82 [  4]   1.00-2.00   sec   239 MBytes  2.01 Gbits/sec
  83 [  4]   2.00-3.00   sec   240 MBytes  2.01 Gbits/sec
  84 [  4]   3.00-4.00   sec   239 MBytes  2.00 Gbits/sec
  85 [  4]   4.00-5.00   sec   245 MBytes  2.05 Gbits/sec
  86 [  4]   5.00-6.00   sec   240 MBytes  2.01 Gbits/sec
  87 [  4]   6.00-7.00   sec   240 MBytes  2.02 Gbits/sec
  88 [  4]   7.00-8.00   sec   239 MBytes  2.01 Gbits/sec
  89
  90 With the revert applied, SCTP performance is back to normal on
  91 latest upstream for IPv4 and IPv6 and has the same throughput as the
  92 3.4.2 test kernel; it is steady and interval reports are smooth again.
  93
  94 Fixes: ef2820a735f7 ("net: sctp: Fix a_rwnd/rwnd management to reflect real state of the receiver's buffer")
  95 Reported-by: Peter Butler <pbutler@sonusnet.com>
  96 Reported-by: Dongsheng Song <dongsheng.song@gmail.com>
  97 Reported-by: Fengguang Wu <fengguang.wu@intel.com>
  98 Tested-by: Peter Butler <pbutler@sonusnet.com>
  99 Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
 100 Cc: Matija Glavinic Pecotic <matija.glavinic-pecotic.ext@nsn.com>
 101 Cc: Alexander Sverdlin <alexander.sverdlin@nsn.com>
 102 Cc: Vlad Yasevich <vyasevich@gmail.com>
 103 Acked-by: Vlad Yasevich <vyasevich@gmail.com>
 104 Signed-off-by: David S. Miller <davem@davemloft.net>
 105 Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
 106 ---
 107  include/net/sctp/structs.h |   14 +++++++
 108  net/sctp/associola.c       |   82 +++++++++++++++++++++++++++++++++++----------
 109  net/sctp/sm_statefuns.c    |    2 -
 110  net/sctp/socket.c          |    6 +++
 111  net/sctp/ulpevent.c        |    8 +---
 112  5 files changed, 87 insertions(+), 25 deletions(-)
 113
 114 --- a/include/net/sctp/structs.h
 115 +++ b/include/net/sctp/structs.h
 116 @@ -1653,6 +1653,17 @@ struct sctp_association {
 117         /* This is the last advertised value of rwnd over a SACK chunk. */
 118         __u32 a_rwnd;
 119
 120 +       /* Number of bytes by which the rwnd has slopped.  The rwnd is allowed
 121 +        * to slop over a maximum of the association's frag_point.
 122 +        */
 123 +       __u32 rwnd_over;
 124 +
 125 +       /* Keeps track of rwnd pressure.  This happens when we have
 126 +        * a window, but no receive buffer (i.e. small packets).  This one
 127 +        * is released slowly (1 PMTU at a time).
 128 +        */
 129 +       __u32 rwnd_press;
 130 +
 131         /* This is the sndbuf size in use for the association.
 132          * This corresponds to the sndbuf size for the association,
 133          * as specified in the sk->sndbuf.
 134 @@ -1881,7 +1892,8 @@ void sctp_assoc_update(struct sctp_assoc
 135  __u32 sctp_association_get_next_tsn(struct sctp_association *);
 136
 137  void sctp_assoc_sync_pmtu(struct sock *, struct sctp_association *);
 138 -void sctp_assoc_rwnd_update(struct sctp_association *, bool);
 139 +void sctp_assoc_rwnd_increase(struct sctp_association *, unsigned int);
 140 +void sctp_assoc_rwnd_decrease(struct sctp_association *, unsigned int);
 141  void sctp_assoc_set_primary(struct sctp_association *,
 142                             struct sctp_transport *);
 143  void sctp_assoc_del_nonprimary_peers(struct sctp_association *,
 144 --- a/net/sctp/associola.c
 145 +++ b/net/sctp/associola.c
 146 @@ -1396,35 +1396,44 @@ static inline bool sctp_peer_needs_updat
 147         return false;
 148  }
 149
 150 -/* Update asoc's rwnd for the approximated state in the buffer,
 151 - * and check whether SACK needs to be sent.
 152 - */
 153 -void sctp_assoc_rwnd_update(struct sctp_association *asoc, bool update_peer)
 154 +/* Increase asoc's rwnd by len and send any window update SACK if needed. */
 155 +void sctp_assoc_rwnd_increase(struct sctp_association *asoc, unsigned int len)
 156  {
 157 -       int rx_count;
 158         struct sctp_chunk *sack;
 159         struct timer_list *timer;
 160
 161 -       if (asoc->ep->rcvbuf_policy)
 162 -               rx_count = atomic_read(&asoc->rmem_alloc);
 163 -       else
 164 -               rx_count = atomic_read(&asoc->base.sk->sk_rmem_alloc);
 165 +       if (asoc->rwnd_over) {
 166 +               if (asoc->rwnd_over >= len) {
 167 +                       asoc->rwnd_over -= len;
 168 +               } else {
 169 +                       asoc->rwnd += (len - asoc->rwnd_over);
 170 +                       asoc->rwnd_over = 0;
 171 +               }
 172 +       } else {
 173 +               asoc->rwnd += len;
 174 +       }
 175
 176 -       if ((asoc->base.sk->sk_rcvbuf - rx_count) > 0)
 177 -               asoc->rwnd = (asoc->base.sk->sk_rcvbuf - rx_count) >> 1;
 178 -       else
 179 -               asoc->rwnd = 0;
 180 +       /* If we had window pressure, start recovering it
 181 +        * once our rwnd had reached the accumulated pressure
 182 +        * threshold.  The idea is to recover slowly, but up
 183 +        * to the initial advertised window.
 184 +        */
 185 +       if (asoc->rwnd_press && asoc->rwnd >= asoc->rwnd_press) {
 186 +               int change = min(asoc->pathmtu, asoc->rwnd_press);
 187 +               asoc->rwnd += change;
 188 +               asoc->rwnd_press -= change;
 189 +       }
 190
 191 -       pr_debug("%s: asoc:%p rwnd=%u, rx_count=%d, sk_rcvbuf=%d\n",
 192 -                __func__, asoc, asoc->rwnd, rx_count,
 193 -                asoc->base.sk->sk_rcvbuf);
 194 +       pr_debug("%s: asoc:%p rwnd increased by %d to (%u, %u) - %u\n",
 195 +                __func__, asoc, len, asoc->rwnd, asoc->rwnd_over,
 196 +                asoc->a_rwnd);
 197
 198         /* Send a window update SACK if the rwnd has increased by at least the
 199          * minimum of the association's PMTU and half of the receive buffer.
 200          * The algorithm used is similar to the one described in
 201          * Section 4.2.3.3 of RFC 1122.
 202          */
 203 -       if (update_peer && sctp_peer_needs_update(asoc)) {
 204 +       if (sctp_peer_needs_update(asoc)) {
 205                 asoc->a_rwnd = asoc->rwnd;
 206
 207                 pr_debug("%s: sending window update SACK- asoc:%p rwnd:%u "
 208 @@ -1446,6 +1455,45 @@ void sctp_assoc_rwnd_update(struct sctp_
 209         }
 210  }
 211
 212 +/* Decrease asoc's rwnd by len. */
 213 +void sctp_assoc_rwnd_decrease(struct sctp_association *asoc, unsigned int len)
 214 +{
 215 +       int rx_count;
 216 +       int over = 0;
 217 +
 218 +       if (unlikely(!asoc->rwnd || asoc->rwnd_over))
 219 +               pr_debug("%s: association:%p has asoc->rwnd:%u, "
 220 +                        "asoc->rwnd_over:%u!\n", __func__, asoc,
 221 +                        asoc->rwnd, asoc->rwnd_over);
 222 +
 223 +       if (asoc->ep->rcvbuf_policy)
 224 +               rx_count = atomic_read(&asoc->rmem_alloc);
 225 +       else
 226 +               rx_count = atomic_read(&asoc->base.sk->sk_rmem_alloc);
 227 +
 228 +       /* If we've reached or overflowed our receive buffer, announce
 229 +        * a 0 rwnd if rwnd would still be positive.  Store the
 230 +        * potential pressure overflow so that the window can be restored
 231 +        * back to its original value.
 232 +        */
 233 +       if (rx_count >= asoc->base.sk->sk_rcvbuf)
 234 +               over = 1;
 235 +
 236 +       if (asoc->rwnd >= len) {
 237 +               asoc->rwnd -= len;
 238 +               if (over) {
 239 +                       asoc->rwnd_press += asoc->rwnd;
 240 +                       asoc->rwnd = 0;
 241 +               }
 242 +       } else {
 243 +               asoc->rwnd_over = len - asoc->rwnd;
 244 +               asoc->rwnd = 0;
 245 +       }
 246 +
 247 +       pr_debug("%s: asoc:%p rwnd decreased by %d to (%u, %u, %u)\n",
 248 +                __func__, asoc, len, asoc->rwnd, asoc->rwnd_over,
 249 +                asoc->rwnd_press);
 250 +}
 251
 252  /* Build the bind address list for the association based on info from the
 253   * local endpoint and the remote peer.
 254 --- a/net/sctp/sm_statefuns.c
 255 +++ b/net/sctp/sm_statefuns.c
 256 @@ -6178,7 +6178,7 @@ static int sctp_eat_data(const struct sc
 257          * PMTU.  In cases, such as loopback, this might be a rather
 258          * large spill over.
 259          */
 260 -       if ((!chunk->data_accepted) && (!asoc->rwnd ||
 261 +       if ((!chunk->data_accepted) && (!asoc->rwnd || asoc->rwnd_over ||
 262             (datalen > asoc->rwnd + asoc->frag_point))) {
 263
 264                 /* If this is the next TSN, consider reneging to make
 265 --- a/net/sctp/socket.c
 266 +++ b/net/sctp/socket.c
 267 @@ -2115,6 +2115,12 @@ static int sctp_recvmsg(struct kiocb *io
 268                 sctp_skb_pull(skb, copied);
 269                 skb_queue_head(&sk->sk_receive_queue, skb);
 270
 271 +               /* When only partial message is copied to the user, increase
 272 +                * rwnd by that amount. If all the data in the skb is read,
 273 +                * rwnd is updated when the event is freed.
 274 +                */
 275 +               if (!sctp_ulpevent_is_notification(event))
 276 +                       sctp_assoc_rwnd_increase(event->asoc, copied);
 277                 goto out;
 278         } else if ((event->msg_flags & MSG_NOTIFICATION) ||
 279                    (event->msg_flags & MSG_EOR))
 280 --- a/net/sctp/ulpevent.c
 281 +++ b/net/sctp/ulpevent.c
 282 @@ -989,7 +989,7 @@ static void sctp_ulpevent_receive_data(s
 283         skb = sctp_event2skb(event);
 284         /* Set the owner and charge rwnd for bytes received.  */
 285         sctp_ulpevent_set_owner(event, asoc);
 286 -       sctp_assoc_rwnd_update(asoc, false);
 287 +       sctp_assoc_rwnd_decrease(asoc, skb_headlen(skb));
 288
 289         if (!skb->data_len)
 290                 return;
 291 @@ -1011,7 +1011,6 @@ static void sctp_ulpevent_release_data(s
 292  {
 293         struct sk_buff *skb, *frag;
 294         unsigned int    len;
 295 -       struct sctp_association *asoc;
 296
 297         /* Current stack structures assume that the rcv buffer is
 298          * per socket.   For UDP style sockets this is not true as
 299 @@ -1036,11 +1035,8 @@ static void sctp_ulpevent_release_data(s
 300         }
 301
 302  done:
 303 -       asoc = event->asoc;
 304 -       sctp_association_hold(asoc);
 305 +       sctp_assoc_rwnd_increase(event->asoc, len);
 306         sctp_ulpevent_release_owner(event);
 307 -       sctp_assoc_rwnd_update(asoc, true);
 308 -       sctp_association_put(asoc);
 309  }
 310
 311  static void sctp_ulpevent_release_frag_data(struct sctp_ulpevent *event)