From 965ffac67c3b89f2a70d9315dc575d80b6973560 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 3 Jan 2012 12:50:48 -0800 Subject: [PATCH] 3.0-stable patches added patches: ipv4-flush-route-cache-after-change-accept_local.patch ipv4-reintroduce-route-cache-garbage-collector.patch ipv4-using-prefetch-requires-including-prefetch.h.patch llc-llc_cmsg_rcv-was-getting-called-after-sk_eat_skb.patch mqprio-avoid-panic-if-no-options-are-provided.patch net-bpf_jit-fix-an-off-one-bug-in-x86_64-cond-jump-target.patch net-have-ipconfig-not-wait-if-no-dev-is-available.patch ppp-fix-pptp-double-release_sock-in-pptp_bind.patch sch_gred-should-not-use-gfp_kernel-while-holding-a-spinlock.patch sctp-do-not-account-for-sizeof-struct-sk_buff-in-estimated-rwnd.patch sctp-fix-incorrect-overflow-check-on-autoclose.patch --- ...oute-cache-after-change-accept_local.patch | 43 +++++ ...roduce-route-cache-garbage-collector.patch | 165 ++++++++++++++++++ ...efetch-requires-including-prefetch.h.patch | 29 +++ ...-was-getting-called-after-sk_eat_skb.patch | 57 ++++++ ...oid-panic-if-no-options-are-provided.patch | 32 ++++ ...f-one-bug-in-x86_64-cond-jump-target.patch | 37 ++++ ...nfig-not-wait-if-no-dev-is-available.patch | 40 +++++ ...ptp-double-release_sock-in-pptp_bind.patch | 32 ++++ ...-gfp_kernel-while-holding-a-spinlock.patch | 36 ++++ ...eof-struct-sk_buff-in-estimated-rwnd.patch | 97 ++++++++++ ...ncorrect-overflow-check-on-autoclose.patch | 121 +++++++++++++ queue-3.0/series | 11 ++ 12 files changed, 700 insertions(+) create mode 100644 queue-3.0/ipv4-flush-route-cache-after-change-accept_local.patch create mode 100644 queue-3.0/ipv4-reintroduce-route-cache-garbage-collector.patch create mode 100644 queue-3.0/ipv4-using-prefetch-requires-including-prefetch.h.patch create mode 100644 queue-3.0/llc-llc_cmsg_rcv-was-getting-called-after-sk_eat_skb.patch create mode 100644 queue-3.0/mqprio-avoid-panic-if-no-options-are-provided.patch create mode 100644 
queue-3.0/net-bpf_jit-fix-an-off-one-bug-in-x86_64-cond-jump-target.patch create mode 100644 queue-3.0/net-have-ipconfig-not-wait-if-no-dev-is-available.patch create mode 100644 queue-3.0/ppp-fix-pptp-double-release_sock-in-pptp_bind.patch create mode 100644 queue-3.0/sch_gred-should-not-use-gfp_kernel-while-holding-a-spinlock.patch create mode 100644 queue-3.0/sctp-do-not-account-for-sizeof-struct-sk_buff-in-estimated-rwnd.patch create mode 100644 queue-3.0/sctp-fix-incorrect-overflow-check-on-autoclose.patch diff --git a/queue-3.0/ipv4-flush-route-cache-after-change-accept_local.patch b/queue-3.0/ipv4-flush-route-cache-after-change-accept_local.patch new file mode 100644 index 00000000000..5fcbde09c1a --- /dev/null +++ b/queue-3.0/ipv4-flush-route-cache-after-change-accept_local.patch @@ -0,0 +1,43 @@ +From f9e1e7e59d60d85684a05366d282b6d6bdfdb687 Mon Sep 17 00:00:00 2001 +From: "Peter Pan(潘卫平)" +Date: Thu, 1 Dec 2011 15:47:06 +0000 +Subject: ipv4: flush route cache after change accept_local + + +From: Weiping Pan + +[ Upstream commit d01ff0a049f749e0bf10a35bb23edd012718c8c2 ] + +After reset ipv4_devconf->data[IPV4_DEVCONF_ACCEPT_LOCAL] to 0, +we should flush route cache, or it will continue receive packets with local +source address, which should be dropped. + +Signed-off-by: Weiping Pan +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/devinet.c | 5 +++++ + 1 file changed, 5 insertions(+) + +--- a/net/ipv4/devinet.c ++++ b/net/ipv4/devinet.c +@@ -1490,7 +1490,9 @@ static int devinet_conf_proc(ctl_table * + void __user *buffer, + size_t *lenp, loff_t *ppos) + { ++ int old_value = *(int *)ctl->data; + int ret = proc_dointvec(ctl, write, buffer, lenp, ppos); ++ int new_value = *(int *)ctl->data; + + if (write) { + struct ipv4_devconf *cnf = ctl->extra1; +@@ -1501,6 +1503,9 @@ static int devinet_conf_proc(ctl_table * + + if (cnf == net->ipv4.devconf_dflt) + devinet_copy_dflt_conf(net, i); ++ if (i == IPV4_DEVCONF_ACCEPT_LOCAL - 1) ++ if ((new_value == 0) && (old_value != 0)) ++ rt_cache_flush(net, 0); + } + + return ret; diff --git a/queue-3.0/ipv4-reintroduce-route-cache-garbage-collector.patch b/queue-3.0/ipv4-reintroduce-route-cache-garbage-collector.patch new file mode 100644 index 00000000000..7d8f85280b7 --- /dev/null +++ b/queue-3.0/ipv4-reintroduce-route-cache-garbage-collector.patch @@ -0,0 +1,165 @@ +From 92023c1e1af8d82e1dce92a17efd0f1adffa2dd2 Mon Sep 17 00:00:00 2001 +From: Eric Dumazet +Date: Wed, 21 Dec 2011 15:47:16 -0500 +Subject: ipv4: reintroduce route cache garbage collector + + +From: Eric Dumazet + +[ Upstream commit 9f28a2fc0bd77511f649c0a788c7bf9a5fd04edb ] + +Commit 2c8cec5c10b (ipv4: Cache learned PMTU information in inetpeer) +removed IP route cache garbage collector a bit too soon, as this gc was +responsible for expired routes cleanup, releasing their neighbour +reference. + +As pointed out by Robert Gladewitz, recent kernels can fill and exhaust +their neighbour cache. + +Reintroduce the garbage collection, since we'll have to wait our +neighbour lookups become refcount-less to not depend on this stuff. + +Reported-by: Robert Gladewitz +Signed-off-by: Eric Dumazet +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/route.c | 106 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 106 insertions(+) + +--- a/net/ipv4/route.c ++++ b/net/ipv4/route.c +@@ -132,6 +132,9 @@ static int ip_rt_min_pmtu __read_mostly + static int ip_rt_min_advmss __read_mostly = 256; + static int rt_chain_length_max __read_mostly = 20; + ++static struct delayed_work expires_work; ++static unsigned long expires_ljiffies; ++ + /* + * Interface to generic destination cache. + */ +@@ -821,6 +824,97 @@ static int has_noalias(const struct rtab + return ONE; + } + ++static void rt_check_expire(void) ++{ ++ static unsigned int rover; ++ unsigned int i = rover, goal; ++ struct rtable *rth; ++ struct rtable __rcu **rthp; ++ unsigned long samples = 0; ++ unsigned long sum = 0, sum2 = 0; ++ unsigned long delta; ++ u64 mult; ++ ++ delta = jiffies - expires_ljiffies; ++ expires_ljiffies = jiffies; ++ mult = ((u64)delta) << rt_hash_log; ++ if (ip_rt_gc_timeout > 1) ++ do_div(mult, ip_rt_gc_timeout); ++ goal = (unsigned int)mult; ++ if (goal > rt_hash_mask) ++ goal = rt_hash_mask + 1; ++ for (; goal > 0; goal--) { ++ unsigned long tmo = ip_rt_gc_timeout; ++ unsigned long length; ++ ++ i = (i + 1) & rt_hash_mask; ++ rthp = &rt_hash_table[i].chain; ++ ++ if (need_resched()) ++ cond_resched(); ++ ++ samples++; ++ ++ if (rcu_dereference_raw(*rthp) == NULL) ++ continue; ++ length = 0; ++ spin_lock_bh(rt_hash_lock_addr(i)); ++ while ((rth = rcu_dereference_protected(*rthp, ++ lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) { ++ prefetch(rth->dst.rt_next); ++ if (rt_is_expired(rth)) { ++ *rthp = rth->dst.rt_next; ++ rt_free(rth); ++ continue; ++ } ++ if (rth->dst.expires) { ++ /* Entry is expired even if it is in use */ ++ if (time_before_eq(jiffies, rth->dst.expires)) { ++nofree: ++ tmo >>= 1; ++ rthp = &rth->dst.rt_next; ++ /* ++ * We only count entries on ++ * a chain with equal hash inputs once ++ * so that entries for different QOS ++ * 
levels, and other non-hash input ++ * attributes don't unfairly skew ++ * the length computation ++ */ ++ length += has_noalias(rt_hash_table[i].chain, rth); ++ continue; ++ } ++ } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) ++ goto nofree; ++ ++ /* Cleanup aged off entries. */ ++ *rthp = rth->dst.rt_next; ++ rt_free(rth); ++ } ++ spin_unlock_bh(rt_hash_lock_addr(i)); ++ sum += length; ++ sum2 += length*length; ++ } ++ if (samples) { ++ unsigned long avg = sum / samples; ++ unsigned long sd = int_sqrt(sum2 / samples - avg*avg); ++ rt_chain_length_max = max_t(unsigned long, ++ ip_rt_gc_elasticity, ++ (avg + 4*sd) >> FRACT_BITS); ++ } ++ rover = i; ++} ++ ++/* ++ * rt_worker_func() is run in process context. ++ * we call rt_check_expire() to scan part of the hash table ++ */ ++static void rt_worker_func(struct work_struct *work) ++{ ++ rt_check_expire(); ++ schedule_delayed_work(&expires_work, ip_rt_gc_interval); ++} ++ + /* + * Perturbation of rt_genid by a small quantity [1..256] + * Using 8 bits of shuffling ensure we can call rt_cache_invalidate() +@@ -3088,6 +3182,13 @@ static ctl_table ipv4_route_table[] = { + .proc_handler = proc_dointvec_jiffies, + }, + { ++ .procname = "gc_interval", ++ .data = &ip_rt_gc_interval, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = proc_dointvec_jiffies, ++ }, ++ { + .procname = "redirect_load", + .data = &ip_rt_redirect_load, + .maxlen = sizeof(int), +@@ -3297,6 +3398,11 @@ int __init ip_rt_init(void) + devinet_init(); + ip_fib_init(); + ++ INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func); ++ expires_ljiffies = jiffies; ++ schedule_delayed_work(&expires_work, ++ net_random() % ip_rt_gc_interval + ip_rt_gc_interval); ++ + if (ip_rt_proc_init()) + printk(KERN_ERR "Unable to create route proc files\n"); + #ifdef CONFIG_XFRM diff --git a/queue-3.0/ipv4-using-prefetch-requires-including-prefetch.h.patch b/queue-3.0/ipv4-using-prefetch-requires-including-prefetch.h.patch new file mode 100644 index 
00000000000..1a7124baa41 --- /dev/null +++ b/queue-3.0/ipv4-using-prefetch-requires-including-prefetch.h.patch @@ -0,0 +1,29 @@ +From 31a4e30815354c5e49eb20752409ef7c721fc6b0 Mon Sep 17 00:00:00 2001 +From: Stephen Rothwell +Date: Thu, 22 Dec 2011 17:03:29 +1100 +Subject: ipv4: using prefetch requires including prefetch.h + + +From: Stephen Rothwell + +[ Upstream commit b9eda06f80b0db61a73bd87c6b0eb67d8aca55ad ] + +Signed-off-by: Stephen Rothwell +Acked-by: Eric Dumazet +Acked-by: David Miller +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/route.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/net/ipv4/route.c ++++ b/net/ipv4/route.c +@@ -91,6 +91,7 @@ + #include + #include + #include ++#include + #include + #include + #include diff --git a/queue-3.0/llc-llc_cmsg_rcv-was-getting-called-after-sk_eat_skb.patch b/queue-3.0/llc-llc_cmsg_rcv-was-getting-called-after-sk_eat_skb.patch new file mode 100644 index 00000000000..16f3288b5f5 --- /dev/null +++ b/queue-3.0/llc-llc_cmsg_rcv-was-getting-called-after-sk_eat_skb.patch @@ -0,0 +1,57 @@ +From 76bcc2af1348ccf5a40421e1181f2547718a1e51 Mon Sep 17 00:00:00 2001 +From: Alex Juncu +Date: Thu, 15 Dec 2011 23:01:25 +0000 +Subject: llc: llc_cmsg_rcv was getting called after sk_eat_skb. + + +From: Alex Juncu + +[ Upstream commit 9cef310fcdee12b49b8b4c96fd8f611c8873d284 ] + +Received non stream protocol packets were calling llc_cmsg_rcv that used a +skb after that skb was released by sk_eat_skb. This caused received STP +packets to generate kernel panics. + +Signed-off-by: Alexandru Juncu +Signed-off-by: Kunjan Naik +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/llc/af_llc.c | 14 ++++++++++---- + 1 file changed, 10 insertions(+), 4 deletions(-) + +--- a/net/llc/af_llc.c ++++ b/net/llc/af_llc.c +@@ -833,15 +833,15 @@ static int llc_ui_recvmsg(struct kiocb * + copied += used; + len -= used; + ++ /* For non stream protcols we get one packet per recvmsg call */ ++ if (sk->sk_type != SOCK_STREAM) ++ goto copy_uaddr; ++ + if (!(flags & MSG_PEEK)) { + sk_eat_skb(sk, skb, 0); + *seq = 0; + } + +- /* For non stream protcols we get one packet per recvmsg call */ +- if (sk->sk_type != SOCK_STREAM) +- goto copy_uaddr; +- + /* Partial read */ + if (used + offset < skb->len) + continue; +@@ -857,6 +857,12 @@ copy_uaddr: + } + if (llc_sk(sk)->cmsg_flags) + llc_cmsg_rcv(msg, skb); ++ ++ if (!(flags & MSG_PEEK)) { ++ sk_eat_skb(sk, skb, 0); ++ *seq = 0; ++ } ++ + goto out; + } + diff --git a/queue-3.0/mqprio-avoid-panic-if-no-options-are-provided.patch b/queue-3.0/mqprio-avoid-panic-if-no-options-are-provided.patch new file mode 100644 index 00000000000..655666e2fce --- /dev/null +++ b/queue-3.0/mqprio-avoid-panic-if-no-options-are-provided.patch @@ -0,0 +1,32 @@ +From 5e59a51890a259701718b9328560934407176b46 Mon Sep 17 00:00:00 2001 +From: Thomas Graf +Date: Thu, 22 Dec 2011 02:05:07 +0000 +Subject: mqprio: Avoid panic if no options are provided + + +From: Thomas Graf + +[ Upstream commit 7838f2ce36b6ab5c13ef20b1857e3bbd567f1759 ] + +Userspace may not provide TCA_OPTIONS, in fact tc currently does +not do so if no arguments are specified on the command line. +Return EINVAL instead of panicking. + +Signed-off-by: Thomas Graf +Signed-off-by: David S.
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/sched/sch_mqprio.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/net/sched/sch_mqprio.c ++++ b/net/sched/sch_mqprio.c +@@ -106,7 +106,7 @@ static int mqprio_init(struct Qdisc *sch + if (!netif_is_multiqueue(dev)) + return -EOPNOTSUPP; + +- if (nla_len(opt) < sizeof(*qopt)) ++ if (!opt || nla_len(opt) < sizeof(*qopt)) + return -EINVAL; + + qopt = nla_data(opt); diff --git a/queue-3.0/net-bpf_jit-fix-an-off-one-bug-in-x86_64-cond-jump-target.patch b/queue-3.0/net-bpf_jit-fix-an-off-one-bug-in-x86_64-cond-jump-target.patch new file mode 100644 index 00000000000..9b3d69a4940 --- /dev/null +++ b/queue-3.0/net-bpf_jit-fix-an-off-one-bug-in-x86_64-cond-jump-target.patch @@ -0,0 +1,37 @@ +From c9443fa522619b70cc9ea12d7f58e4cf633500c3 Mon Sep 17 00:00:00 2001 +From: Markus Kötter +Date: Sat, 17 Dec 2011 11:39:08 +0000 +Subject: net: bpf_jit: fix an off-one bug in x86_64 cond jump target + + +From: Markus Kötter + +[ Upstream commit a03ffcf873fe0f2565386ca8ef832144c42e67fa ] + +x86 jump instruction size is 2 or 5 bytes (near/long jump), not 2 or 6 +bytes. + +In case a conditional jump is followed by a long jump, conditional jump +target is one byte past the start of target instruction. + +Signed-off-by: Markus Kötter +Signed-off-by: Eric Dumazet +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/net/bpf_jit_comp.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/arch/x86/net/bpf_jit_comp.c ++++ b/arch/x86/net/bpf_jit_comp.c +@@ -568,8 +568,8 @@ cond_branch: f_offset = addrs[i + filt + break; + } + if (filter[i].jt != 0) { +- if (filter[i].jf) +- t_offset += is_near(f_offset) ? 2 : 6; ++ if (filter[i].jf && f_offset) ++ t_offset += is_near(f_offset) ? 
2 : 5; + EMIT_COND_JMP(t_op, t_offset); + if (filter[i].jf) + EMIT_JMP(f_offset); diff --git a/queue-3.0/net-have-ipconfig-not-wait-if-no-dev-is-available.patch b/queue-3.0/net-have-ipconfig-not-wait-if-no-dev-is-available.patch new file mode 100644 index 00000000000..f1adb9cdd0b --- /dev/null +++ b/queue-3.0/net-have-ipconfig-not-wait-if-no-dev-is-available.patch @@ -0,0 +1,40 @@ +From 2ec3264c7298c2326f33c482e787ec449638ed85 Mon Sep 17 00:00:00 2001 +From: Gerlando Falauto +Date: Mon, 19 Dec 2011 22:58:04 +0000 +Subject: net: have ipconfig not wait if no dev is available + + +From: Gerlando Falauto + +[ Upstream commit cd7816d14953c8af910af5bb92f488b0b277e29d ] + +previous commit 3fb72f1e6e6165c5f495e8dc11c5bbd14c73385c +makes IP-Config wait for carrier on at least one network device. + +Before waiting (predefined value 120s), check that at least one device +was successfully brought up. Otherwise (e.g. buggy bootloader +which does not set the MAC address) there is no point in waiting +for carrier. + +Cc: Micha Nelissen +Cc: Holger Brunck +Signed-off-by: Gerlando Falauto +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/ipconfig.c | 4 ++++ + 1 file changed, 4 insertions(+) + +--- a/net/ipv4/ipconfig.c ++++ b/net/ipv4/ipconfig.c +@@ -252,6 +252,10 @@ static int __init ic_open_devs(void) + } + } + ++ /* no point in waiting if we could not bring up at least one device */ ++ if (!ic_first_dev) ++ goto have_carrier; ++ + /* wait for a carrier on at least one device */ + start = jiffies; + while (jiffies - start < msecs_to_jiffies(CONF_CARRIER_TIMEOUT)) { diff --git a/queue-3.0/ppp-fix-pptp-double-release_sock-in-pptp_bind.patch b/queue-3.0/ppp-fix-pptp-double-release_sock-in-pptp_bind.patch new file mode 100644 index 00000000000..3910d66b76b --- /dev/null +++ b/queue-3.0/ppp-fix-pptp-double-release_sock-in-pptp_bind.patch @@ -0,0 +1,32 @@ +From 884aad5cf67fed3cc15003dbb9c8cdf9b6833b57 Mon Sep 17 00:00:00 2001 +From: Djalal Harouni +Date: Tue, 6 Dec 2011 15:47:12 +0000 +Subject: ppp: fix pptp double release_sock in pptp_bind() + + +From: Djalal Harouni + +[ Upstream commit a454daceb78844a09c08b6e2d8badcb76a5d73b9 ] + +Signed-off-by: Djalal Harouni +Acked-by: Eric Dumazet +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/pptp.c | 4 +--- + 1 file changed, 1 insertion(+), 3 deletions(-) + +--- a/drivers/net/pptp.c ++++ b/drivers/net/pptp.c +@@ -418,10 +418,8 @@ static int pptp_bind(struct socket *sock + lock_sock(sk); + + opt->src_addr = sp->sa_addr.pptp; +- if (add_chan(po)) { +- release_sock(sk); ++ if (add_chan(po)) + error = -EBUSY; +- } + + release_sock(sk); + return error; diff --git a/queue-3.0/sch_gred-should-not-use-gfp_kernel-while-holding-a-spinlock.patch b/queue-3.0/sch_gred-should-not-use-gfp_kernel-while-holding-a-spinlock.patch new file mode 100644 index 00000000000..e65ef494900 --- /dev/null +++ b/queue-3.0/sch_gred-should-not-use-gfp_kernel-while-holding-a-spinlock.patch @@ -0,0 +1,36 @@ +From 8c7968a17315bcfa4b907334a35675129d84025f Mon Sep 17 00:00:00 2001 +From: Eric Dumazet +Date: Sun, 11 Dec 2011 23:42:53 +0000 +Subject: sch_gred: should not use GFP_KERNEL while holding a spinlock + + +From: Eric Dumazet + +[ Upstream commit 3f1e6d3fd37bd4f25e5b19f1c7ca21850426c33f ] + +gred_change_vq() is called under sch_tree_lock(sch). + +This means a spinlock is held, and we are not allowed to sleep in this +context. + +We might pre-allocate memory using GFP_KERNEL before taking spinlock, +but this is not suitable for stable material. + +Signed-off-by: Eric Dumazet +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/sched/sch_gred.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/net/sched/sch_gred.c ++++ b/net/sched/sch_gred.c +@@ -385,7 +385,7 @@ static inline int gred_change_vq(struct + struct gred_sched_data *q; + + if (table->tab[dp] == NULL) { +- table->tab[dp] = kzalloc(sizeof(*q), GFP_KERNEL); ++ table->tab[dp] = kzalloc(sizeof(*q), GFP_ATOMIC); + if (table->tab[dp] == NULL) + return -ENOMEM; + } diff --git a/queue-3.0/sctp-do-not-account-for-sizeof-struct-sk_buff-in-estimated-rwnd.patch b/queue-3.0/sctp-do-not-account-for-sizeof-struct-sk_buff-in-estimated-rwnd.patch new file mode 100644 index 00000000000..665021d837b --- /dev/null +++ b/queue-3.0/sctp-do-not-account-for-sizeof-struct-sk_buff-in-estimated-rwnd.patch @@ -0,0 +1,97 @@ +From 094b242b70f67ac3996934432f0a0d564e791cf6 Mon Sep 17 00:00:00 2001 +From: Thomas Graf +Date: Mon, 19 Dec 2011 04:11:40 +0000 +Subject: sctp: Do not account for sizeof(struct sk_buff) in estimated rwnd + + +From: Thomas Graf + +[ Upstream commit a76c0adf60f6ca5ff3481992e4ea0383776b24d2 ] + +When checking whether a DATA chunk fits into the estimated rwnd a +full sizeof(struct sk_buff) is added to the needed chunk size. This +quickly exhausts the available rwnd space and leads to packets being +sent which are much below the PMTU limit. This can lead to much worse +performance. + +The reason for this behaviour was to avoid putting too much memory +pressure on the receiver. The concept is not completely irrational +because a Linux receiver does in fact clone an skb for each DATA chunk +delivered. However, Linux also reserves half the available socket +buffer space for data structures therefore usage of it is already +accounted for. + +When proposing to change this the last time it was noted that this +behaviour was introduced to solve a performance issue caused by rwnd +overusage in combination with small DATA chunks.
+ +Trying to reproduce this I found that with the sk_buff overhead removed, +the performance would improve significantly unless socket buffer limits +are increased. + +The following numbers have been gathered using a patched iperf +supporting SCTP over a live 1 Gbit ethernet network. The -l option +was used to limit DATA chunk sizes. The numbers listed are based on +the average of 3 test runs each. Default values have been used for +sk_(r|w)mem. + +Chunk +Size Unpatched No Overhead +------------------------------------- + 4 15.2 Kbit [!] 12.2 Mbit [!] + 8 35.8 Kbit [!] 26.0 Mbit [!] + 16 95.5 Kbit [!] 54.4 Mbit [!] + 32 106.7 Mbit 102.3 Mbit + 64 189.2 Mbit 188.3 Mbit + 128 331.2 Mbit 334.8 Mbit + 256 537.7 Mbit 536.0 Mbit + 512 766.9 Mbit 766.6 Mbit +1024 810.1 Mbit 808.6 Mbit + +Signed-off-by: Thomas Graf +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/sctp/output.c | 8 +------- + net/sctp/outqueue.c | 6 ++---- + 2 files changed, 3 insertions(+), 11 deletions(-) + +--- a/net/sctp/output.c ++++ b/net/sctp/output.c +@@ -697,13 +697,7 @@ static void sctp_packet_append_data(stru + /* Keep track of how many bytes are in flight to the receiver. */ + asoc->outqueue.outstanding_bytes += datasize; + +- /* Update our view of the receiver's rwnd. Include sk_buff overhead +- * while updating peer.rwnd so that it reduces the chances of a +- * receiver running out of receive buffer space even when receive +- * window is still open. This can happen when a sender is sending +- * sending small messages. +- */ +- datasize += sizeof(struct sk_buff); ++ /* Update our view of the receiver's rwnd. 
*/ + if (datasize < rwnd) + rwnd -= datasize; + else +--- a/net/sctp/outqueue.c ++++ b/net/sctp/outqueue.c +@@ -411,8 +411,7 @@ void sctp_retransmit_mark(struct sctp_ou + chunk->transport->flight_size -= + sctp_data_size(chunk); + q->outstanding_bytes -= sctp_data_size(chunk); +- q->asoc->peer.rwnd += (sctp_data_size(chunk) + +- sizeof(struct sk_buff)); ++ q->asoc->peer.rwnd += sctp_data_size(chunk); + } + continue; + } +@@ -432,8 +431,7 @@ void sctp_retransmit_mark(struct sctp_ou + * (Section 7.2.4)), add the data size of those + * chunks to the rwnd. + */ +- q->asoc->peer.rwnd += (sctp_data_size(chunk) + +- sizeof(struct sk_buff)); ++ q->asoc->peer.rwnd += sctp_data_size(chunk); + q->outstanding_bytes -= sctp_data_size(chunk); + if (chunk->transport) + transport->flight_size -= sctp_data_size(chunk); diff --git a/queue-3.0/sctp-fix-incorrect-overflow-check-on-autoclose.patch b/queue-3.0/sctp-fix-incorrect-overflow-check-on-autoclose.patch new file mode 100644 index 00000000000..dc982f8cd02 --- /dev/null +++ b/queue-3.0/sctp-fix-incorrect-overflow-check-on-autoclose.patch @@ -0,0 +1,121 @@ +From 4dff6c35f9ff1cf62dd4435517adae40a3dd2d8a Mon Sep 17 00:00:00 2001 +From: Xi Wang +Date: Fri, 16 Dec 2011 12:44:15 +0000 +Subject: sctp: fix incorrect overflow check on autoclose + + +From: Xi Wang + +[ Upstream commit 2692ba61a82203404abd7dd2a027bda962861f74 ] + +Commit 8ffd3208 voids the previous patches f6778aab and 810c0719 for +limiting the autoclose value. If userspace passes in -1 on 32-bit +platform, the overflow check didn't work and autoclose would be set +to 0xffffffff. + +This patch defines a max_autoclose (in seconds) for limiting the value +and exposes it through sysctl, with the following intentions. + +1) Avoid overflowing autoclose * HZ. + +2) Keep the default autoclose bound consistent across 32- and 64-bit + platforms (INT_MAX / HZ in this patch). + +3) Keep the autoclose value consistent between setsockopt() and + getsockopt() calls. 
+ +Suggested-by: Vlad Yasevich +Signed-off-by: Xi Wang +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + include/net/sctp/structs.h | 4 ++++ + net/sctp/associola.c | 2 +- + net/sctp/protocol.c | 3 +++ + net/sctp/socket.c | 2 -- + net/sctp/sysctl.c | 13 +++++++++++++ + 5 files changed, 21 insertions(+), 3 deletions(-) + +--- a/include/net/sctp/structs.h ++++ b/include/net/sctp/structs.h +@@ -236,6 +236,9 @@ extern struct sctp_globals { + * bits is an indicator of when to send and window update SACK. + */ + int rwnd_update_shift; ++ ++ /* Threshold for autoclose timeout, in seconds. */ ++ unsigned long max_autoclose; + } sctp_globals; + + #define sctp_rto_initial (sctp_globals.rto_initial) +@@ -271,6 +274,7 @@ extern struct sctp_globals { + #define sctp_auth_enable (sctp_globals.auth_enable) + #define sctp_checksum_disable (sctp_globals.checksum_disable) + #define sctp_rwnd_upd_shift (sctp_globals.rwnd_update_shift) ++#define sctp_max_autoclose (sctp_globals.max_autoclose) + + /* SCTP Socket type: UDP or TCP style. */ + typedef enum { +--- a/net/sctp/associola.c ++++ b/net/sctp/associola.c +@@ -173,7 +173,7 @@ static struct sctp_association *sctp_ass + asoc->timeouts[SCTP_EVENT_TIMEOUT_HEARTBEAT] = 0; + asoc->timeouts[SCTP_EVENT_TIMEOUT_SACK] = asoc->sackdelay; + asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] = +- (unsigned long)sp->autoclose * HZ; ++ min_t(unsigned long, sp->autoclose, sctp_max_autoclose) * HZ; + + /* Initializes the timers */ + for (i = SCTP_EVENT_TIMEOUT_NONE; i < SCTP_NUM_TIMEOUT_TYPES; ++i) +--- a/net/sctp/protocol.c ++++ b/net/sctp/protocol.c +@@ -1144,6 +1144,9 @@ SCTP_STATIC __init int sctp_init(void) + sctp_max_instreams = SCTP_DEFAULT_INSTREAMS; + sctp_max_outstreams = SCTP_DEFAULT_OUTSTREAMS; + ++ /* Initialize maximum autoclose timeout. */ ++ sctp_max_autoclose = INT_MAX / HZ; ++ + /* Initialize handle used for association ids. 
*/ + idr_init(&sctp_assocs_id); + +--- a/net/sctp/socket.c ++++ b/net/sctp/socket.c +@@ -2129,8 +2129,6 @@ static int sctp_setsockopt_autoclose(str + return -EINVAL; + if (copy_from_user(&sp->autoclose, optval, optlen)) + return -EFAULT; +- /* make sure it won't exceed MAX_SCHEDULE_TIMEOUT */ +- sp->autoclose = min_t(long, sp->autoclose, MAX_SCHEDULE_TIMEOUT / HZ); + + return 0; + } +--- a/net/sctp/sysctl.c ++++ b/net/sctp/sysctl.c +@@ -53,6 +53,10 @@ static int sack_timer_min = 1; + static int sack_timer_max = 500; + static int addr_scope_max = 3; /* check sctp_scope_policy_t in include/net/sctp/constants.h for max entries */ + static int rwnd_scale_max = 16; ++static unsigned long max_autoclose_min = 0; ++static unsigned long max_autoclose_max = ++ (MAX_SCHEDULE_TIMEOUT / HZ > UINT_MAX) ++ ? UINT_MAX : MAX_SCHEDULE_TIMEOUT / HZ; + + extern long sysctl_sctp_mem[3]; + extern int sysctl_sctp_rmem[3]; +@@ -251,6 +255,15 @@ static ctl_table sctp_table[] = { + .extra1 = &one, + .extra2 = &rwnd_scale_max, + }, ++ { ++ .procname = "max_autoclose", ++ .data = &sctp_max_autoclose, ++ .maxlen = sizeof(unsigned long), ++ .mode = 0644, ++ .proc_handler = &proc_doulongvec_minmax, ++ .extra1 = &max_autoclose_min, ++ .extra2 = &max_autoclose_max, ++ }, + + { /* sentinel */ } + }; diff --git a/queue-3.0/series b/queue-3.0/series index 08a268a0ece..15874a21c1b 100644 --- a/queue-3.0/series +++ b/queue-3.0/series @@ -48,3 +48,14 @@ sparc32-remove-uses-of-g7-in-memcpy-implementation.patch sparc32-correct-the-return-value-of-memcpy.patch sparc64-fix-masking-and-shifting-in-vis-fpcmp-emulation.patch sparc-fix-handling-of-orig_i0-wrt.-debugging-when-restarting-syscalls.patch +net-bpf_jit-fix-an-off-one-bug-in-x86_64-cond-jump-target.patch +ppp-fix-pptp-double-release_sock-in-pptp_bind.patch +llc-llc_cmsg_rcv-was-getting-called-after-sk_eat_skb.patch +mqprio-avoid-panic-if-no-options-are-provided.patch +net-have-ipconfig-not-wait-if-no-dev-is-available.patch 
+sch_gred-should-not-use-gfp_kernel-while-holding-a-spinlock.patch +sctp-fix-incorrect-overflow-check-on-autoclose.patch +sctp-do-not-account-for-sizeof-struct-sk_buff-in-estimated-rwnd.patch +ipv4-flush-route-cache-after-change-accept_local.patch +ipv4-reintroduce-route-cache-garbage-collector.patch +ipv4-using-prefetch-requires-including-prefetch.h.patch -- 2.47.3