From aa8edcb997ff605bd1424630356866b78ef06cdc Mon Sep 17 00:00:00 2001 From: Chris Wright Date: Mon, 9 Jun 2008 14:36:09 -0700 Subject: [PATCH] Add networking patches to 2.6.25 queue --- ...y-fix-selector-family-initialization.patch | 34 ++ ...-null-pointer-dereference-and-lockup.patch | 50 +++ ...n-the-rfcomm-socket-cleanup-handling.patch | 58 ++++ ...opy_from_user-results-interpretation.patch | 77 +++++ ...y-use-chip-checksum-for-ipv4-packets.patch | 54 +++ ...se-the-correct-ip_local_out-function.patch | 54 +++ ...ruesize-bug-if-headroom-is-increased.patch | 75 ++++ ...g-or-receiving-when-tunnel-goes-down.patch | 323 ++++++++++++++++++ ...ocket-code-when-udp-socket-is-closed.patch | 66 ++++ ...-change_rx_flags-in-dev_change_flags.patch | 36 ++ ...n-value-for-non-existant-classifiers.patch | 31 ++ ...ed_compat-to-call-nla_parse-directly.patch | 50 +++ queue-2.6.25/series | 20 ++ ...ow-up-to-max_burst-when-gso-disabled.patch | 42 +++ ...-vs-fack_count-out-of-sync-condition.patch | 70 ++++ ...ix-fallback-to-conventional-recovery.patch | 56 +++ ...nt-is-errorneously-used-with-newreno.patch | 105 ++++++ ...p-frto-work-around-inorder-receivers.patch | 68 ++++ ...t-cwnd-growth-when-deferring-for-gso.patch | 34 ++ ...s-out-if-icmp-frag-needed-is-delayed.patch | 74 ++++ ...tifications-for-layered-vlan-devices.patch | 38 +++ 21 files changed, 1415 insertions(+) create mode 100644 queue-2.6.25/af_key-fix-selector-family-initialization.patch create mode 100644 queue-2.6.25/ax25-fix-null-pointer-dereference-and-lockup.patch create mode 100644 queue-2.6.25/bluetooth-fix-locking-bug-in-the-rfcomm-socket-cleanup-handling.patch create mode 100644 queue-2.6.25/can-fix-copy_from_user-results-interpretation.patch create mode 100644 queue-2.6.25/cassini-only-use-chip-checksum-for-ipv4-packets.patch create mode 100644 queue-2.6.25/ipsec-use-the-correct-ip_local_out-function.patch create mode 100644 queue-2.6.25/l2tp-avoid-skb-truesize-bug-if-headroom-is-increased.patch create mode 
100644 queue-2.6.25/l2tp-fix-possible-oops-if-transmitting-or-receiving-when-tunnel-goes-down.patch create mode 100644 queue-2.6.25/l2tp-fix-possible-warn_on-from-socket-code-when-udp-socket-is-closed.patch create mode 100644 queue-2.6.25/net-fix-call-to-change_rx_flags-in-dev_change_flags.patch create mode 100644 queue-2.6.25/net_sched-cls_api-fix-return-value-for-non-existant-classifiers.patch create mode 100644 queue-2.6.25/netlink-fix-nla_parse_nested_compat-to-call-nla_parse-directly.patch create mode 100644 queue-2.6.25/tcp-allow-send-limited-cwnd-to-grow-up-to-max_burst-when-gso-disabled.patch create mode 100644 queue-2.6.25/tcp-fix-skb-vs-fack_count-out-of-sync-condition.patch create mode 100644 queue-2.6.25/tcp-frto-fix-fallback-to-conventional-recovery.patch create mode 100644 queue-2.6.25/tcp-frto-sack-variant-is-errorneously-used-with-newreno.patch create mode 100644 queue-2.6.25/tcp-frto-work-around-inorder-receivers.patch create mode 100644 queue-2.6.25/tcp-limit-cwnd-growth-when-deferring-for-gso.patch create mode 100644 queue-2.6.25/tcp-tcp-connection-times-out-if-icmp-frag-needed-is-delayed.patch create mode 100644 queue-2.6.25/vlan-correctly-handle-device-notifications-for-layered-vlan-devices.patch diff --git a/queue-2.6.25/af_key-fix-selector-family-initialization.patch b/queue-2.6.25/af_key-fix-selector-family-initialization.patch new file mode 100644 index 0000000000..693987e1e7 --- /dev/null +++ b/queue-2.6.25/af_key-fix-selector-family-initialization.patch @@ -0,0 +1,34 @@ +From ead8aded50c2a631955fcf0fb6c477c02f7b9e46 Mon Sep 17 00:00:00 2001 +Message-Id: <20080609.134337.193698173.davem@davemloft.net> +From: Kazunori MIYAZAWA +Date: Wed, 21 May 2008 13:26:11 -0700 +Subject: af_key: Fix selector family initialization. 
+ +From: Kazunori MIYAZAWA + +[ upstream commit: 4da5105687e0993a3bbdcffd89b2b94d9377faab ] + +This propagates the xfrm_user fix made in commit +bcf0dda8d2408fe1c1040cdec5a98e5fcad2ac72 ("[XFRM]: xfrm_user: fix +selector family initialization") + +Based upon a bug report from, and tested by, Alan Swanson. + +Signed-off-by: Kazunori MIYAZAWA +Signed-off-by: David S. Miller +Signed-off-by: Chris Wright +--- + net/key/af_key.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/net/key/af_key.c ++++ b/net/key/af_key.c +@@ -1219,7 +1219,7 @@ static struct xfrm_state * pfkey_msg2xfr + x->sel.prefixlen_s = addr->sadb_address_prefixlen; + } + +- if (x->props.mode == XFRM_MODE_TRANSPORT) ++ if (!x->sel.family) + x->sel.family = x->props.family; + + if (ext_hdrs[SADB_X_EXT_NAT_T_TYPE-1]) { diff --git a/queue-2.6.25/ax25-fix-null-pointer-dereference-and-lockup.patch b/queue-2.6.25/ax25-fix-null-pointer-dereference-and-lockup.patch new file mode 100644 index 0000000000..be4ed91127 --- /dev/null +++ b/queue-2.6.25/ax25-fix-null-pointer-dereference-and-lockup.patch @@ -0,0 +1,50 @@ +From 0d8322352083476cb62367887ecf0722549c92f2 Mon Sep 17 00:00:00 2001 +Message-Id: <20080609.134337.193698173.davem@davemloft.net> +From: Jarek Poplawski +Date: Tue, 3 Jun 2008 14:53:46 -0700 +Subject: ax25: Fix NULL pointer dereference and lockup. + +From: Jarek Poplawski + +[ Upstream commit: 7dccf1f4e1696c79bff064c3770867cc53cbc71c ] + +There is only one function in AX25 calling skb_append(), and it really +looks suspicious: appends skb after previously enqueued one, but in +the meantime this previous skb could be removed from the queue. + +This patch fixes it the simple way, so this is not fully compatible with +the current method, but testing hasn't shown any problems. + +Signed-off-by: Ralf Baechle +Signed-off-by: David S. 
Miller +Signed-off-by: Chris Wright +--- + net/ax25/ax25_subr.c | 11 +++-------- + 1 file changed, 3 insertions(+), 8 deletions(-) + +--- a/net/ax25/ax25_subr.c ++++ b/net/ax25/ax25_subr.c +@@ -64,20 +64,15 @@ void ax25_frames_acked(ax25_cb *ax25, un + + void ax25_requeue_frames(ax25_cb *ax25) + { +- struct sk_buff *skb, *skb_prev = NULL; ++ struct sk_buff *skb; + + /* + * Requeue all the un-ack-ed frames on the output queue to be picked + * up by ax25_kick called from the timer. This arrangement handles the + * possibility of an empty output queue. + */ +- while ((skb = skb_dequeue(&ax25->ack_queue)) != NULL) { +- if (skb_prev == NULL) +- skb_queue_head(&ax25->write_queue, skb); +- else +- skb_append(skb_prev, skb, &ax25->write_queue); +- skb_prev = skb; +- } ++ while ((skb = skb_dequeue_tail(&ax25->ack_queue)) != NULL) ++ skb_queue_head(&ax25->write_queue, skb); + } + + /* diff --git a/queue-2.6.25/bluetooth-fix-locking-bug-in-the-rfcomm-socket-cleanup-handling.patch b/queue-2.6.25/bluetooth-fix-locking-bug-in-the-rfcomm-socket-cleanup-handling.patch new file mode 100644 index 0000000000..abb3d283d2 --- /dev/null +++ b/queue-2.6.25/bluetooth-fix-locking-bug-in-the-rfcomm-socket-cleanup-handling.patch @@ -0,0 +1,58 @@ +From 59cec518a8109d2c696210fc6c761174d9b42df9 Mon Sep 17 00:00:00 2001 +Message-Id: <20080609.134337.193698173.davem@davemloft.net> +From: Arjan van de Ven +Date: Thu, 29 May 2008 01:32:47 -0700 +Subject: bluetooth: fix locking bug in the rfcomm socket cleanup handling + +From: Arjan van de Ven + +[ Upstream commit: 1905f6c736cb618e07eca0c96e60e3c024023428 ] + +in net/bluetooth/rfcomm/sock.c, rfcomm_sk_state_change() does the +following operation: + + if (parent && sock_flag(sk, SOCK_ZAPPED)) { + /* We have to drop DLC lock here, otherwise + * rfcomm_sock_destruct() will dead lock. 
*/ + rfcomm_dlc_unlock(d); + rfcomm_sock_kill(sk); + rfcomm_dlc_lock(d); + } +} + +which is fine, since rfcomm_sock_kill() will call sk_free() which will call +rfcomm_sock_destruct() which takes the rfcomm_dlc_lock()... so far so good. + +HOWEVER, this assumes that the rfcomm_sk_state_change() function always gets +called with the rfcomm_dlc_lock() taken. This is the case for all but one +case, and in that case where we don't have the lock, we do a double unlock +followed by an attempt to take the lock, which due to underflow isn't +going anywhere fast. + +This patch fixes this by moving the straggling case inside the lock, like +the other usages of the same call are doing in this code. + +This was found with the help of the www.kerneloops.org project, where this +deadlock was observed 51 times at this point in time: +http://www.kerneloops.org/search.php?search=rfcomm_sock_destruct + +Signed-off-by: Arjan van de Ven +Acked-by: Marcel Holtmann +Signed-off-by: David S. Miller +Signed-off-by: Chris Wright +--- + net/bluetooth/rfcomm/core.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/net/bluetooth/rfcomm/core.c ++++ b/net/bluetooth/rfcomm/core.c +@@ -423,8 +423,8 @@ static int __rfcomm_dlc_close(struct rfc + + rfcomm_dlc_lock(d); + d->state = BT_CLOSED; +- rfcomm_dlc_unlock(d); + d->state_change(d, err); ++ rfcomm_dlc_unlock(d); + + skb_queue_purge(&d->tx_queue); + rfcomm_dlc_unlink(d); diff --git a/queue-2.6.25/can-fix-copy_from_user-results-interpretation.patch b/queue-2.6.25/can-fix-copy_from_user-results-interpretation.patch new file mode 100644 index 0000000000..3f4c756833 --- /dev/null +++ b/queue-2.6.25/can-fix-copy_from_user-results-interpretation.patch @@ -0,0 +1,77 @@ +From 3966365cee3c6322936248050eddb10f765b2032 Mon Sep 17 00:00:00 2001 +Message-Id: <20080609.134337.193698173.davem@davemloft.net> +From: Sam Ravnborg +Date: Mon, 9 Jun 2008 11:22:01 -0700 +Subject: can: Fix copy_from_user() results interpretation + +From: Sam Ravnborg + +[ 
Upstream commit: 3f91bd420a955803421f2db17b2e04aacfbb2bb8 ] + +Both copy_to_ and _from_user return the number of bytes, that failed to +reach their destination, not the 0/-EXXX values. + +Based on patch from Pavel Emelyanov + +Signed-off-by: Sam Ravnborg +Acked-by: Oliver Hartkopp +Signed-off-by: David S. Miller +Signed-off-by: Chris Wright +--- + net/can/raw.c | 21 ++++++++++----------- + 1 file changed, 10 insertions(+), 11 deletions(-) + +--- a/net/can/raw.c ++++ b/net/can/raw.c +@@ -435,15 +435,13 @@ static int raw_setsockopt(struct socket + if (!filter) + return -ENOMEM; + +- err = copy_from_user(filter, optval, optlen); +- if (err) { ++ if (copy_from_user(filter, optval, optlen)) { + kfree(filter); +- return err; ++ return -EFAULT; + } + } else if (count == 1) { +- err = copy_from_user(&sfilter, optval, optlen); +- if (err) +- return err; ++ if (copy_from_user(&sfilter, optval, optlen)) ++ return -EFAULT; + } + + lock_sock(sk); +@@ -493,9 +491,8 @@ static int raw_setsockopt(struct socket + if (optlen != sizeof(err_mask)) + return -EINVAL; + +- err = copy_from_user(&err_mask, optval, optlen); +- if (err) +- return err; ++ if (copy_from_user(&err_mask, optval, optlen)) ++ return -EFAULT; + + err_mask &= CAN_ERR_MASK; + +@@ -531,7 +528,8 @@ static int raw_setsockopt(struct socket + if (optlen != sizeof(ro->loopback)) + return -EINVAL; + +- err = copy_from_user(&ro->loopback, optval, optlen); ++ if (copy_from_user(&ro->loopback, optval, optlen)) ++ return -EFAULT; + + break; + +@@ -539,7 +537,8 @@ static int raw_setsockopt(struct socket + if (optlen != sizeof(ro->recv_own_msgs)) + return -EINVAL; + +- err = copy_from_user(&ro->recv_own_msgs, optval, optlen); ++ if (copy_from_user(&ro->recv_own_msgs, optval, optlen)) ++ return -EFAULT; + + break; + diff --git a/queue-2.6.25/cassini-only-use-chip-checksum-for-ipv4-packets.patch b/queue-2.6.25/cassini-only-use-chip-checksum-for-ipv4-packets.patch new file mode 100644 index 0000000000..1f2ced6f85 --- /dev/null +++ 
b/queue-2.6.25/cassini-only-use-chip-checksum-for-ipv4-packets.patch @@ -0,0 +1,54 @@ +From 3be7c5ab93cf875aff5d91974d1df0851c329fd7 Mon Sep 17 00:00:00 2001 +Message-Id: <20080609.134337.193698173.davem@davemloft.net> +From: David S. Miller +Date: Wed, 21 May 2008 17:05:34 -0700 +Subject: cassini: Only use chip checksum for ipv4 packets. + +From: David S. Miller + +[ upstream commit: b1443e2f6501f06930a162ff1ff08382a98bf23e ] + +According to David Monro, at least with Natsemi Saturn chips the +cassini driver has some trouble with ipv6 checksums. + +Until we have more information about what's going on here, only +use the chip checksums for ipv4. + +This workaround was suggested and tested by David. + +Update version and release date. + +Signed-off-by: David S. Miller +Signed-off-by: Chris Wright +--- + drivers/net/cassini.c | 11 +++++++---- + 1 file changed, 7 insertions(+), 4 deletions(-) + +--- a/drivers/net/cassini.c ++++ b/drivers/net/cassini.c +@@ -142,8 +142,8 @@ + + #define DRV_MODULE_NAME "cassini" + #define PFX DRV_MODULE_NAME ": " +-#define DRV_MODULE_VERSION "1.5" +-#define DRV_MODULE_RELDATE "4 Jan 2008" ++#define DRV_MODULE_VERSION "1.6" ++#define DRV_MODULE_RELDATE "21 May 2008" + + #define CAS_DEF_MSG_ENABLE \ + (NETIF_MSG_DRV | \ +@@ -2140,9 +2140,12 @@ end_copy_pkt: + if (addr) + cas_page_unmap(addr); + } +- skb->csum = csum_unfold(~csum); +- skb->ip_summed = CHECKSUM_COMPLETE; + skb->protocol = eth_type_trans(skb, cp->dev); ++ if (skb->protocol == htons(ETH_P_IP)) { ++ skb->csum = csum_unfold(~csum); ++ skb->ip_summed = CHECKSUM_COMPLETE; ++ } else ++ skb->ip_summed = CHECKSUM_NONE; + return len; + } + diff --git a/queue-2.6.25/ipsec-use-the-correct-ip_local_out-function.patch b/queue-2.6.25/ipsec-use-the-correct-ip_local_out-function.patch new file mode 100644 index 0000000000..fbf72f9624 --- /dev/null +++ b/queue-2.6.25/ipsec-use-the-correct-ip_local_out-function.patch @@ -0,0 +1,54 @@ +From 2c2864845e1348c1e04919130c3152d60301fee2 Mon Sep 17 
00:00:00 2001 +Message-Id: <20080609.134337.193698173.davem@davemloft.net> +From: Herbert Xu +Date: Tue, 20 May 2008 14:32:14 -0700 +Subject: ipsec: Use the correct ip_local_out function + +From: Herbert Xu + +[ upstream commit: 1ac06e0306d0192a7a4d9ea1c9e06d355ce7e7d3 ] + +Because the IPsec output function xfrm_output_resume does its +own dst_output call it should always call __ip_local_output +instead of ip_local_output as the latter may invoke dst_output +directly. Otherwise the return values from nf_hook and dst_output +may clash as they both use the value 1 but for different purposes. + +When that clash occurs this can cause a packet to be used after +it has been freed which usually leads to a crash. Because the +offending value is only returned from dst_output with qdiscs +such as HTB, this bug is normally not visible. + +Thanks to Marco Berizzi for his perseverance in tracking this +down. + +Signed-off-by: Herbert Xu +Signed-off-by: David S. Miller +Signed-off-by: Chris Wright +--- + net/ipv4/route.c | 2 +- + net/ipv6/route.c | 2 +- + 2 files changed, 2 insertions(+), 2 deletions(-) + +--- a/net/ipv4/route.c ++++ b/net/ipv4/route.c +@@ -162,7 +162,7 @@ static struct dst_ops ipv4_dst_ops = { + .negative_advice = ipv4_negative_advice, + .link_failure = ipv4_link_failure, + .update_pmtu = ip_rt_update_pmtu, +- .local_out = ip_local_out, ++ .local_out = __ip_local_out, + .entry_size = sizeof(struct rtable), + .entries = ATOMIC_INIT(0), + }; +--- a/net/ipv6/route.c ++++ b/net/ipv6/route.c +@@ -105,7 +105,7 @@ static struct dst_ops ip6_dst_ops = { + .negative_advice = ip6_negative_advice, + .link_failure = ip6_link_failure, + .update_pmtu = ip6_rt_update_pmtu, +- .local_out = ip6_local_out, ++ .local_out = __ip6_local_out, + .entry_size = sizeof(struct rt6_info), + .entries = ATOMIC_INIT(0), + }; diff --git a/queue-2.6.25/l2tp-avoid-skb-truesize-bug-if-headroom-is-increased.patch b/queue-2.6.25/l2tp-avoid-skb-truesize-bug-if-headroom-is-increased.patch new file 
mode 100644 index 0000000000..200cdf32e2 --- /dev/null +++ b/queue-2.6.25/l2tp-avoid-skb-truesize-bug-if-headroom-is-increased.patch @@ -0,0 +1,75 @@ +From 6ea010d4b9ba1e701f170ad769df8764876f1fd5 Mon Sep 17 00:00:00 2001 +Message-Id: <20080609.134337.193698173.davem@davemloft.net> +From: James Chapman +Date: Mon, 19 May 2008 14:10:01 -0700 +Subject: l2tp: avoid skb truesize bug if headroom is increased + +From: James Chapman + +[ upstream commit: 090c48d3dd5ea90b37350334aaed9a93b0c1e0a1 ] + +A user reported seeing occasional bugs such as the following when +using the L2TP driver. + + SKB BUG: Invalid truesize (272) len=72, sizeof(sk_buff)=208 + +When L2TP adds its header in the transmit path, it might need to +increase the headroom of the skb. In some cases, the increased +headroom trips a kernel bug when the skb is freed because the skb has +grown beyond its truesize value. The fix is to increase the truesize +by the amount of headroom added, after orphaning the skb. + +While here, fix a misleading comment. + +Thanks to Iouri Kharon for the initial +report and testing the fix. + +Signed-off-by: James Chapman +Signed-off-by: David S. Miller +Signed-off-by: Chris Wright +--- + drivers/net/pppol2tp.c | 13 ++++++++----- + 1 file changed, 8 insertions(+), 5 deletions(-) + +--- a/drivers/net/pppol2tp.c ++++ b/drivers/net/pppol2tp.c +@@ -980,6 +980,8 @@ static int pppol2tp_xmit(struct ppp_chan + __wsum csum = 0; + struct udphdr *uh; + unsigned int len; ++ int old_headroom; ++ int new_headroom; + + if (sock_flag(sk, SOCK_DEAD) || !(sk->sk_state & PPPOX_CONNECTED)) + goto abort; +@@ -1001,16 +1003,18 @@ static int pppol2tp_xmit(struct ppp_chan + + /* Check that there's enough headroom in the skb to insert IP, + * UDP and L2TP and PPP headers. If not enough, expand it to +- * make room. Note that a new skb (or a clone) is +- * allocated. 
If we return an error from this point on, make +- * sure we free the new skb but do not free the original skb +- * since that is done by the caller for the error case. ++ * make room. Adjust truesize. + */ + headroom = NET_SKB_PAD + sizeof(struct iphdr) + + sizeof(struct udphdr) + hdr_len + sizeof(ppph); ++ old_headroom = skb_headroom(skb); + if (skb_cow_head(skb, headroom)) + goto abort; + ++ new_headroom = skb_headroom(skb); ++ skb_orphan(skb); ++ skb->truesize += new_headroom - old_headroom; ++ + /* Setup PPP header */ + __skb_push(skb, sizeof(ppph)); + skb->data[0] = ppph[0]; +@@ -1065,7 +1069,6 @@ static int pppol2tp_xmit(struct ppp_chan + /* Get routing info from the tunnel socket */ + dst_release(skb->dst); + skb->dst = dst_clone(__sk_dst_get(sk_tun)); +- skb_orphan(skb); + skb->sk = sk_tun; + + /* Queue the packet to IP for output */ diff --git a/queue-2.6.25/l2tp-fix-possible-oops-if-transmitting-or-receiving-when-tunnel-goes-down.patch b/queue-2.6.25/l2tp-fix-possible-oops-if-transmitting-or-receiving-when-tunnel-goes-down.patch new file mode 100644 index 0000000000..525290b7be --- /dev/null +++ b/queue-2.6.25/l2tp-fix-possible-oops-if-transmitting-or-receiving-when-tunnel-goes-down.patch @@ -0,0 +1,323 @@ +From b3e1a39be32ec30dacfa545b4e019180c390d141 Mon Sep 17 00:00:00 2001 +Message-Id: <20080609.134337.193698173.davem@davemloft.net> +From: James Chapman +Date: Mon, 9 Jun 2008 13:35:41 -0700 +Subject: l2tp: Fix possible oops if transmitting or receiving when tunnel goes down + +From: James Chapman + +[ upstream commit: 24b95685ffcdb3dc28f64b9e8af6ea3e8360fbc5 ] + +Some problems have been experienced in the field which cause an oops +in the pppol2tp driver if L2TP tunnels fail while passing data. + +The pppol2tp driver uses private data that is referenced via the +sk->sk_user_data of its UDP and PPPoL2TP sockets. This patch makes +sure that the driver uses sock_hold() when it holds a reference to the +sk pointer. 
This affects its sendmsg(), recvmsg(), getname(), +[gs]etsockopt() and ioctl() handlers. + +Tested by ISP where problem was seen. System has been up 10 days with +no oops since running this patch. Without the patch, an oops would +occur every 1-2 days. + +Signed-off-by: James Chapman +Signed-off-by: David S. Miller +Signed-off-by: Chris Wright +--- + drivers/net/pppol2tp.c | 101 +++++++++++++++++++++++++++++++++++++------------ + 1 file changed, 78 insertions(+), 23 deletions(-) + +--- a/drivers/net/pppol2tp.c ++++ b/drivers/net/pppol2tp.c +@@ -240,12 +240,15 @@ static inline struct pppol2tp_session *p + if (sk == NULL) + return NULL; + ++ sock_hold(sk); + session = (struct pppol2tp_session *)(sk->sk_user_data); +- if (session == NULL) +- return NULL; ++ if (session == NULL) { ++ sock_put(sk); ++ goto out; ++ } + + BUG_ON(session->magic != L2TP_SESSION_MAGIC); +- ++out: + return session; + } + +@@ -256,12 +259,15 @@ static inline struct pppol2tp_tunnel *pp + if (sk == NULL) + return NULL; + ++ sock_hold(sk); + tunnel = (struct pppol2tp_tunnel *)(sk->sk_user_data); +- if (tunnel == NULL) +- return NULL; ++ if (tunnel == NULL) { ++ sock_put(sk); ++ goto out; ++ } + + BUG_ON(tunnel->magic != L2TP_TUNNEL_MAGIC); +- ++out: + return tunnel; + } + +@@ -716,12 +722,14 @@ discard: + session->stats.rx_errors++; + kfree_skb(skb); + sock_put(session->sock); ++ sock_put(sock); + + return 0; + + error: + /* Put UDP header back */ + __skb_push(skb, sizeof(struct udphdr)); ++ sock_put(sock); + + no_tunnel: + return 1; +@@ -745,10 +753,13 @@ static int pppol2tp_udp_encap_recv(struc + "%s: received %d bytes\n", tunnel->name, skb->len); + + if (pppol2tp_recv_core(sk, skb)) +- goto pass_up; ++ goto pass_up_put; + ++ sock_put(sk); + return 0; + ++pass_up_put: ++ sock_put(sk); + pass_up: + return 1; + } +@@ -858,7 +869,7 @@ static int pppol2tp_sendmsg(struct kiocb + + tunnel = pppol2tp_sock_to_tunnel(session->tunnel_sock); + if (tunnel == NULL) +- goto error; ++ goto error_put_sess; + + 
/* What header length is configured for this session? */ + hdr_len = pppol2tp_l2tp_header_len(session); +@@ -870,7 +881,7 @@ static int pppol2tp_sendmsg(struct kiocb + sizeof(ppph) + total_len, + 0, GFP_KERNEL); + if (!skb) +- goto error; ++ goto error_put_sess_tun; + + /* Reserve space for headers. */ + skb_reserve(skb, NET_SKB_PAD); +@@ -900,7 +911,7 @@ static int pppol2tp_sendmsg(struct kiocb + error = memcpy_fromiovec(skb->data, m->msg_iov, total_len); + if (error < 0) { + kfree_skb(skb); +- goto error; ++ goto error_put_sess_tun; + } + skb_put(skb, total_len); + +@@ -947,10 +958,33 @@ static int pppol2tp_sendmsg(struct kiocb + session->stats.tx_errors++; + } + ++ return error; ++ ++error_put_sess_tun: ++ sock_put(session->tunnel_sock); ++error_put_sess: ++ sock_put(sk); + error: + return error; + } + ++/* Automatically called when the skb is freed. ++ */ ++static void pppol2tp_sock_wfree(struct sk_buff *skb) ++{ ++ sock_put(skb->sk); ++} ++ ++/* For data skbs that we transmit, we associate with the tunnel socket ++ * but don't do accounting. ++ */ ++static inline void pppol2tp_skb_set_owner_w(struct sk_buff *skb, struct sock *sk) ++{ ++ sock_hold(sk); ++ skb->sk = sk; ++ skb->destructor = pppol2tp_sock_wfree; ++} ++ + /* Transmit function called by generic PPP driver. Sends PPP frame + * over PPPoL2TP socket. + * +@@ -993,10 +1027,10 @@ static int pppol2tp_xmit(struct ppp_chan + + sk_tun = session->tunnel_sock; + if (sk_tun == NULL) +- goto abort; ++ goto abort_put_sess; + tunnel = pppol2tp_sock_to_tunnel(sk_tun); + if (tunnel == NULL) +- goto abort; ++ goto abort_put_sess; + + /* What header length is configured for this session? 
*/ + hdr_len = pppol2tp_l2tp_header_len(session); +@@ -1009,7 +1043,7 @@ static int pppol2tp_xmit(struct ppp_chan + sizeof(struct udphdr) + hdr_len + sizeof(ppph); + old_headroom = skb_headroom(skb); + if (skb_cow_head(skb, headroom)) +- goto abort; ++ goto abort_put_sess_tun; + + new_headroom = skb_headroom(skb); + skb_orphan(skb); +@@ -1069,7 +1103,7 @@ static int pppol2tp_xmit(struct ppp_chan + /* Get routing info from the tunnel socket */ + dst_release(skb->dst); + skb->dst = dst_clone(__sk_dst_get(sk_tun)); +- skb->sk = sk_tun; ++ pppol2tp_skb_set_owner_w(skb, sk_tun); + + /* Queue the packet to IP for output */ + len = skb->len; +@@ -1086,8 +1120,14 @@ static int pppol2tp_xmit(struct ppp_chan + session->stats.tx_errors++; + } + ++ sock_put(sk_tun); ++ sock_put(sk); + return 1; + ++abort_put_sess_tun: ++ sock_put(sk_tun); ++abort_put_sess: ++ sock_put(sk); + abort: + /* Free the original skb */ + kfree_skb(skb); +@@ -1191,7 +1231,7 @@ static void pppol2tp_tunnel_destruct(str + { + struct pppol2tp_tunnel *tunnel; + +- tunnel = pppol2tp_sock_to_tunnel(sk); ++ tunnel = sk->sk_user_data; + if (tunnel == NULL) + goto end; + +@@ -1230,10 +1270,12 @@ static void pppol2tp_session_destruct(st + if (sk->sk_user_data != NULL) { + struct pppol2tp_tunnel *tunnel; + +- session = pppol2tp_sock_to_session(sk); ++ session = sk->sk_user_data; + if (session == NULL) + goto out; + ++ BUG_ON(session->magic != L2TP_SESSION_MAGIC); ++ + /* Don't use pppol2tp_sock_to_tunnel() here to + * get the tunnel context because the tunnel + * socket might have already been closed (its +@@ -1611,7 +1653,7 @@ static int pppol2tp_connect(struct socke + + error = ppp_register_channel(&po->chan); + if (error) +- goto end; ++ goto end_put_tun; + + /* This is how we get the session context from the socket. 
*/ + sk->sk_user_data = session; +@@ -1631,6 +1673,8 @@ out_no_ppp: + PRINTK(session->debug, PPPOL2TP_MSG_CONTROL, KERN_INFO, + "%s: created\n", session->name); + ++end_put_tun: ++ sock_put(tunnel_sock); + end: + release_sock(sk); + +@@ -1671,6 +1715,7 @@ static int pppol2tp_getname(struct socke + *usockaddr_len = len; + + error = 0; ++ sock_put(sock->sk); + + end: + return error; +@@ -1909,14 +1954,17 @@ static int pppol2tp_ioctl(struct socket + err = -EBADF; + tunnel = pppol2tp_sock_to_tunnel(session->tunnel_sock); + if (tunnel == NULL) +- goto end; ++ goto end_put_sess; + + err = pppol2tp_tunnel_ioctl(tunnel, cmd, arg); +- goto end; ++ sock_put(session->tunnel_sock); ++ goto end_put_sess; + } + + err = pppol2tp_session_ioctl(session, cmd, arg); + ++end_put_sess: ++ sock_put(sk); + end: + return err; + } +@@ -2062,14 +2110,17 @@ static int pppol2tp_setsockopt(struct so + err = -EBADF; + tunnel = pppol2tp_sock_to_tunnel(session->tunnel_sock); + if (tunnel == NULL) +- goto end; ++ goto end_put_sess; + + err = pppol2tp_tunnel_setsockopt(sk, tunnel, optname, val); ++ sock_put(session->tunnel_sock); + } else + err = pppol2tp_session_setsockopt(sk, session, optname, val); + + err = 0; + ++end_put_sess: ++ sock_put(sk); + end: + return err; + } +@@ -2184,20 +2235,24 @@ static int pppol2tp_getsockopt(struct so + err = -EBADF; + tunnel = pppol2tp_sock_to_tunnel(session->tunnel_sock); + if (tunnel == NULL) +- goto end; ++ goto end_put_sess; + + err = pppol2tp_tunnel_getsockopt(sk, tunnel, optname, &val); ++ sock_put(session->tunnel_sock); + } else + err = pppol2tp_session_getsockopt(sk, session, optname, &val); + + err = -EFAULT; + if (put_user(len, (int __user *) optlen)) +- goto end; ++ goto end_put_sess; + + if (copy_to_user((void __user *) optval, &val, len)) +- goto end; ++ goto end_put_sess; + + err = 0; ++ ++end_put_sess: ++ sock_put(sk); + end: + return err; + } diff --git a/queue-2.6.25/l2tp-fix-possible-warn_on-from-socket-code-when-udp-socket-is-closed.patch 
b/queue-2.6.25/l2tp-fix-possible-warn_on-from-socket-code-when-udp-socket-is-closed.patch new file mode 100644 index 0000000000..6db929b9b5 --- /dev/null +++ b/queue-2.6.25/l2tp-fix-possible-warn_on-from-socket-code-when-udp-socket-is-closed.patch @@ -0,0 +1,66 @@ +From 6000afc70ddd62a24cf3aa636b066f518157998c Mon Sep 17 00:00:00 2001 +Message-Id: <20080609.134337.193698173.davem@davemloft.net> +From: James Chapman +Date: Mon, 9 Jun 2008 13:34:39 -0700 +Subject: l2tp: Fix possible WARN_ON from socket code when UDP socket is closed + +From: James Chapman + +[ upstream commit: 199f7d24ae59894243687a234a909f44a8724506 ] + +If an L2TP daemon closes a tunnel socket while packets are queued in +the tunnel's reorder queue, a kernel warning is logged because the +socket is closed while skbs are still referencing it. The fix is to +purge the queue in the socket's release handler. + +WARNING: at include/net/sock.h:351 udp_lib_unhash+0x41/0x68() +Pid: 12998, comm: openl2tpd Not tainted 2.6.25 #8 + [] warn_on_slowpath+0x41/0x51 + [] udp_lib_unhash+0x41/0x68 + [] sk_common_release+0x23/0x90 + [] udp_lib_close+0x8/0xa + [] inet_release+0x42/0x48 + [] sock_release+0x14/0x60 + [] sock_close+0x29/0x30 + [] __fput+0xad/0x15b + [] fput+0x17/0x19 + [] filp_close+0x50/0x5a + [] sys_close+0x69/0x9f + [] syscall_call+0x7/0xb + +Signed-off-by: James Chapman +Signed-off-by: David S. 
Miller +Signed-off-by: Chris Wright +--- + drivers/net/pppol2tp.c | 10 ++++++++++ + 1 file changed, 10 insertions(+) + +--- a/drivers/net/pppol2tp.c ++++ b/drivers/net/pppol2tp.c +@@ -1279,6 +1279,7 @@ out: + static int pppol2tp_release(struct socket *sock) + { + struct sock *sk = sock->sk; ++ struct pppol2tp_session *session; + int error; + + if (!sk) +@@ -1296,9 +1297,18 @@ static int pppol2tp_release(struct socke + sock_orphan(sk); + sock->sk = NULL; + ++ session = pppol2tp_sock_to_session(sk); ++ + /* Purge any queued data */ + skb_queue_purge(&sk->sk_receive_queue); + skb_queue_purge(&sk->sk_write_queue); ++ if (session != NULL) { ++ struct sk_buff *skb; ++ while ((skb = skb_dequeue(&session->reorder_q))) { ++ kfree_skb(skb); ++ sock_put(sk); ++ } ++ } + + release_sock(sk); + diff --git a/queue-2.6.25/net-fix-call-to-change_rx_flags-in-dev_change_flags.patch b/queue-2.6.25/net-fix-call-to-change_rx_flags-in-dev_change_flags.patch new file mode 100644 index 0000000000..273cc577b8 --- /dev/null +++ b/queue-2.6.25/net-fix-call-to-change_rx_flags-in-dev_change_flags.patch @@ -0,0 +1,36 @@ +From b9b704db3c0cde4a25b85501d2c9d650d8be9e1d Mon Sep 17 00:00:00 2001 +Message-Id: <20080609.134337.193698173.davem@davemloft.net> +From: David Woodhouse +Date: Tue, 20 May 2008 14:36:14 -0700 +Subject: net: Fix call to ->change_rx_flags(dev, IFF_MULTICAST) in dev_change_flags() + +From: David Woodhouse + +[ upstream commit: 0e91796eb46e29edc791131c832a2232bcaed9dd ] + +Am I just being particularly dim today, or can the call to +dev->change_rx_flags(dev, IFF_MULTICAST) in dev_change_flags() never +happen? + +We've just set dev->flags = flags & IFF_MULTICAST, effectively. So the +condition '(dev->flags ^ flags) & IFF_MULTICAST' is _never_ going to be +true. + +Signed-off-by: David Woodhouse +Signed-off-by: David S. 
Miller +Signed-off-by: Chris Wright +--- + net/core/dev.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/net/core/dev.c ++++ b/net/core/dev.c +@@ -3132,7 +3132,7 @@ int dev_change_flags(struct net_device * + * Load in the correct multicast list now the flags have changed. + */ + +- if (dev->change_rx_flags && (dev->flags ^ flags) & IFF_MULTICAST) ++ if (dev->change_rx_flags && (old_flags ^ flags) & IFF_MULTICAST) + dev->change_rx_flags(dev, IFF_MULTICAST); + + dev_set_rx_mode(dev); diff --git a/queue-2.6.25/net_sched-cls_api-fix-return-value-for-non-existant-classifiers.patch b/queue-2.6.25/net_sched-cls_api-fix-return-value-for-non-existant-classifiers.patch new file mode 100644 index 0000000000..df6dd34b9c --- /dev/null +++ b/queue-2.6.25/net_sched-cls_api-fix-return-value-for-non-existant-classifiers.patch @@ -0,0 +1,31 @@ +From 44a02f11d1d7f26e1ae811009b7ef5a657c2056d Mon Sep 17 00:00:00 2001 +Message-Id: <20080609.134337.193698173.davem@davemloft.net> +From: Patrick McHardy +Date: Tue, 20 May 2008 14:34:46 -0700 +Subject: net_sched: cls_api: fix return value for non-existant classifiers + +From: Patrick McHardy + +[ upstream commit: f2df824948d559ea818e03486a8583e42ea6ab37 ] + +cls_api should return ENOENT when the requested classifier doesn't +exist. + +Signed-off-by: Patrick McHardy +Signed-off-by: David S. 
Miller +Signed-off-by: Chris Wright +--- + net/sched/cls_api.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/net/sched/cls_api.c ++++ b/net/sched/cls_api.c +@@ -220,7 +220,7 @@ replay: + tp = kzalloc(sizeof(*tp), GFP_KERNEL); + if (tp == NULL) + goto errout; +- err = -EINVAL; ++ err = -ENOENT; + tp_ops = tcf_proto_lookup_ops(tca[TCA_KIND]); + if (tp_ops == NULL) { + #ifdef CONFIG_KMOD diff --git a/queue-2.6.25/netlink-fix-nla_parse_nested_compat-to-call-nla_parse-directly.patch b/queue-2.6.25/netlink-fix-nla_parse_nested_compat-to-call-nla_parse-directly.patch new file mode 100644 index 0000000000..42719dc508 --- /dev/null +++ b/queue-2.6.25/netlink-fix-nla_parse_nested_compat-to-call-nla_parse-directly.patch @@ -0,0 +1,50 @@ +From f48a77e02822767a51915454a6cc5feba39e0c53 Mon Sep 17 00:00:00 2001 +Message-Id: <20080609.134337.193698173.davem@davemloft.net> +From: Thomas Graf +Date: Thu, 22 May 2008 10:48:59 -0700 +Subject: netlink: Fix nla_parse_nested_compat() to call nla_parse() directly + +From: Thomas Graf + +[ upstream commit: b9a2f2e450b0f770bb4347ae8d48eb2dea701e24 ] + +The purpose of nla_parse_nested_compat() is to parse attributes which +contain a struct followed by a stream of nested attributes. So far, +it called nla_parse_nested() to parse the stream of nested attributes +which was wrong, as nla_parse_nested() expects a container attribute +as data which holds the attribute stream. It needs to call +nla_parse() directly while pointing at the next possible alignment +point after the struct in the beginning of the attribute. + +With this patch, I can no longer reproduce the reported leftover +warnings. + +Signed-off-by: Thomas Graf +Acked-by: Patrick McHardy +Signed-off-by: David S. 
Miller +Signed-off-by: Chris Wright +--- + include/net/netlink.h | 11 ++++++----- + 1 file changed, 6 insertions(+), 5 deletions(-) + +--- a/include/net/netlink.h ++++ b/include/net/netlink.h +@@ -772,12 +772,13 @@ static inline int __nla_parse_nested_com + const struct nla_policy *policy, + int len) + { +- if (nla_len(nla) < len) ++ int nested_len = nla_len(nla) - NLA_ALIGN(len); ++ ++ if (nested_len < 0) + return -1; +- if (nla_len(nla) >= NLA_ALIGN(len) + sizeof(struct nlattr)) +- return nla_parse_nested(tb, maxtype, +- nla_data(nla) + NLA_ALIGN(len), +- policy); ++ if (nested_len >= nla_attr_size(0)) ++ return nla_parse(tb, maxtype, nla_data(nla) + NLA_ALIGN(len), ++ nested_len, policy); + memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1)); + return 0; + } diff --git a/queue-2.6.25/series b/queue-2.6.25/series index 58213d7b57..a9610071ab 100644 --- a/queue-2.6.25/series +++ b/queue-2.6.25/series @@ -11,3 +11,23 @@ ecryptfs-clean-up-lock_parent.patch ecryptfs-fix-missed-mutex_unlock.patch ps3-fix-frame-buffer-build-error.patch sunhv-fix-locking-in-non-paged-i-o-case.patch +af_key-fix-selector-family-initialization.patch +ax25-fix-null-pointer-dereference-and-lockup.patch +bluetooth-fix-locking-bug-in-the-rfcomm-socket-cleanup-handling.patch +can-fix-copy_from_user-results-interpretation.patch +cassini-only-use-chip-checksum-for-ipv4-packets.patch +net-fix-call-to-change_rx_flags-in-dev_change_flags.patch +net_sched-cls_api-fix-return-value-for-non-existant-classifiers.patch +ipsec-use-the-correct-ip_local_out-function.patch +netlink-fix-nla_parse_nested_compat-to-call-nla_parse-directly.patch +l2tp-avoid-skb-truesize-bug-if-headroom-is-increased.patch +vlan-correctly-handle-device-notifications-for-layered-vlan-devices.patch +tcp-tcp-connection-times-out-if-icmp-frag-needed-is-delayed.patch +tcp-allow-send-limited-cwnd-to-grow-up-to-max_burst-when-gso-disabled.patch +tcp-limit-cwnd-growth-when-deferring-for-gso.patch 
+l2tp-fix-possible-warn_on-from-socket-code-when-udp-socket-is-closed.patch +l2tp-fix-possible-oops-if-transmitting-or-receiving-when-tunnel-goes-down.patch +tcp-fix-skb-vs-fack_count-out-of-sync-condition.patch +tcp-frto-fix-fallback-to-conventional-recovery.patch +tcp-frto-sack-variant-is-errorneously-used-with-newreno.patch +tcp-frto-work-around-inorder-receivers.patch diff --git a/queue-2.6.25/tcp-allow-send-limited-cwnd-to-grow-up-to-max_burst-when-gso-disabled.patch b/queue-2.6.25/tcp-allow-send-limited-cwnd-to-grow-up-to-max_burst-when-gso-disabled.patch new file mode 100644 index 0000000000..80f4a03636 --- /dev/null +++ b/queue-2.6.25/tcp-allow-send-limited-cwnd-to-grow-up-to-max_burst-when-gso-disabled.patch @@ -0,0 +1,42 @@ +From 94f3671135bdc0d573dd4f7731dda9072b25a23a Mon Sep 17 00:00:00 2001 +Message-Id: <20080609.134337.193698173.davem@davemloft.net> +From: John Heffner +Date: Tue, 29 Apr 2008 03:13:02 -0700 +Subject: tcp: Allow send-limited cwnd to grow up to max_burst when gso disabled + +From: John Heffner + +[ upstream commit: ce447eb91409225f8a488f6b7b2a1bdf7b2d884f ] + +This changes the logic in tcp_is_cwnd_limited() so that cwnd may grow +up to tcp_max_burst() even when sk_can_gso() is false, or when +sysctl_tcp_tso_win_divisor != 0. + +Signed-off-by: John Heffner +Signed-off-by: David S. 
Miller +Signed-off-by: Chris Wright +--- + net/ipv4/tcp_cong.c | 11 ++++------- + 1 file changed, 4 insertions(+), 7 deletions(-) + +--- a/net/ipv4/tcp_cong.c ++++ b/net/ipv4/tcp_cong.c +@@ -285,14 +285,11 @@ int tcp_is_cwnd_limited(const struct soc + if (in_flight >= tp->snd_cwnd) + return 1; + +- if (!sk_can_gso(sk)) +- return 0; +- + left = tp->snd_cwnd - in_flight; +- if (sysctl_tcp_tso_win_divisor) +- return left * sysctl_tcp_tso_win_divisor < tp->snd_cwnd; +- else +- return left <= tcp_max_burst(tp); ++ if (sk_can_gso(sk) && ++ left * sysctl_tcp_tso_win_divisor < tp->snd_cwnd) ++ return 1; ++ return left <= tcp_max_burst(tp); + } + EXPORT_SYMBOL_GPL(tcp_is_cwnd_limited); + diff --git a/queue-2.6.25/tcp-fix-skb-vs-fack_count-out-of-sync-condition.patch b/queue-2.6.25/tcp-fix-skb-vs-fack_count-out-of-sync-condition.patch new file mode 100644 index 0000000000..b5ebcb948a --- /dev/null +++ b/queue-2.6.25/tcp-fix-skb-vs-fack_count-out-of-sync-condition.patch @@ -0,0 +1,70 @@ +From 623af2c3a3cd84450306051a8fcba0a962868942 Mon Sep 17 00:00:00 2001 +Message-Id: <20080609.134337.193698173.davem@davemloft.net> +From: Ilpo Järvinen +Date: Wed, 4 Jun 2008 12:07:44 -0700 +Subject: tcp: fix skb vs fack_count out-of-sync condition + +From: Ilpo Järvinen + +[ upstream commit: a6604471db5e7a33474a7f16c64d6b118fae3e74 ] + +This bug is able to corrupt fackets_out in very rare cases. +In order for this to cause corruption: + 1) DSACK in the middle of previous SACK block must be generated. + 2) In order to take that particular branch, part or all of the + DSACKed segment must already be SACKed so that we have that + in cache in the first place. + 3) The new info must be top enough so that fackets_out will be + updated on this iteration. +...then fack_count is updated while skb wasn't, then we walk again +that particular segment thus updating fack_count twice for +a single skb and finally that value is assigned to fackets_out +by tcp_sacktag_one. 
+ +It is safe to call tcp_sacktag_one just once for a segment (at +DSACK), no need to call again for plain SACK. + +Potential problems from the miscount are limited to premature entry +to recovery and to inflated reordering metric (which could even +cancel each other out in the luckiest scenarios :-)). +Both are quite insignificant in worst case too and there exists +also code to reset them (fackets_out once sacked_out becomes zero +and reordering metric on RTO). + +This has been reported by a number of people; because it occurred +quite rarely, it has been very evasive. Andy Furniss was able to +get it to occur a couple of times so that a bit more info was +collected about the problem using a debug patch, though it still +required a lot of checking around. Thanks also to others who have +tried to help here. + +This is listed as Bugzilla #10346. The bug was introduced by +me in commit 68f8353b48 ([TCP]: Rewrite SACK block processing & +sack_recv_cache use), I probably thought back then that there's +need to scan that entry twice or didn't dare to make it go +through it just once there. Going through twice would have +required restoring fack_count after the walk but as noted above, +I chose to drop the additional walk step altogether here. + +Signed-off-by: Ilpo Järvinen +Signed-off-by: David S.
Miller +Signed-off-by: Chris Wright +--- + net/ipv4/tcp_input.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -1393,9 +1393,9 @@ static struct sk_buff *tcp_maybe_skippin + + if (before(next_dup->start_seq, skip_to_seq)) { + skb = tcp_sacktag_skip(skb, sk, next_dup->start_seq, fack_count); +- tcp_sacktag_walk(skb, sk, NULL, +- next_dup->start_seq, next_dup->end_seq, +- 1, fack_count, reord, flag); ++ skb = tcp_sacktag_walk(skb, sk, NULL, ++ next_dup->start_seq, next_dup->end_seq, ++ 1, fack_count, reord, flag); + } + + return skb; diff --git a/queue-2.6.25/tcp-frto-fix-fallback-to-conventional-recovery.patch b/queue-2.6.25/tcp-frto-fix-fallback-to-conventional-recovery.patch new file mode 100644 index 0000000000..981edc6961 --- /dev/null +++ b/queue-2.6.25/tcp-frto-fix-fallback-to-conventional-recovery.patch @@ -0,0 +1,56 @@ +From ab453ba3be3db9b46c8f37c0bf55d6b6811a2b78 Mon Sep 17 00:00:00 2001 +Message-Id: <20080609.134337.193698173.davem@davemloft.net> +From: Ilpo Järvinen +Date: Tue, 13 May 2008 02:53:26 -0700 +Subject: tcp FRTO: Fix fallback to conventional recovery + +From: Ilpo Järvinen + +[ upstream commit: a1c1f281b84a751fdb5ff919da3b09df7297619f ] + +It seems that commit 009a2e3e4ec ("[TCP] FRTO: Improve +interoperability with other undo_marker users") run into +another land-mine which caused fallback to conventional +recovery to break: + +1. Cumulative ACK arrives after FRTO retransmission +2. tcp_try_to_open sees zero retrans_out, clears retrans_stamp + which should be kept like in CA_Loss state it would be +3. undo_marker change allowed tcp_packet_delayed to return + true because of the cleared retrans_stamp once FRTO is + terminated causing LossUndo to occur, which means all loss + markings FRTO made are reverted. + +This means that the conventional recovery basically recovered +one loss per RTT, which is not that efficient. 
It was quite +unobvious that the undo_marker change broken something like +this, I had a quite long session to track it down because of +the non-intuitiviness of the bug (luckily I had a trivial +reproducer at hand and I was also able to learn to use kprobes +in the process as well :-)). + +This together with the NewReno+FRTO fix and FRTO in-order +workaround this fixes Damon's problems, this and the first +mentioned are enough to fix Bugzilla #10063. + +Signed-off-by: Ilpo Järvinen +Reported-by: Damon L. Chesser +Tested-by: Damon L. Chesser +Tested-by: Sebastian Hyrwall +Signed-off-by: David S. Miller +Signed-off-by: Chris Wright +--- + net/ipv4/tcp_input.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -2465,7 +2465,7 @@ static void tcp_try_to_open(struct sock + + tcp_verify_left_out(tp); + +- if (tp->retrans_out == 0) ++ if (!tp->frto_counter && tp->retrans_out == 0) + tp->retrans_stamp = 0; + + if (flag & FLAG_ECE) diff --git a/queue-2.6.25/tcp-frto-sack-variant-is-errorneously-used-with-newreno.patch b/queue-2.6.25/tcp-frto-sack-variant-is-errorneously-used-with-newreno.patch new file mode 100644 index 0000000000..f8a5208533 --- /dev/null +++ b/queue-2.6.25/tcp-frto-sack-variant-is-errorneously-used-with-newreno.patch @@ -0,0 +1,105 @@ +From 014a7ae5645af49ada9b3ad9aaef57487d1a29ba Mon Sep 17 00:00:00 2001 +Message-Id: <20080609.134337.193698173.davem@davemloft.net> +From: Ilpo Järvinen +Date: Thu, 8 May 2008 01:09:11 -0700 +Subject: tcp FRTO: SACK variant is errorneously used with NewReno + +From: Ilpo Järvinen + +[ upstream commit: 62ab22278308a40bcb7f4079e9719ab8b7fe11b5 ] + +Note: there's actually another bug in FRTO's SACK variant, which +is the causing failure in NewReno case because of the error +that's fixed here. I'll fix the SACK case separately (it's +a separate bug really, though related, but in order to fix that +I need to audit tp->snd_nxt usage a bit). 
+ +There were two places where SACK variant of FRTO is getting +incorrectly used even if SACK wasn't negotiated by the TCP flow. +This leads to incorrect setting of frto_highmark with NewReno +if a previous recovery was interrupted by another RTO. + +An eventual fallback to conventional recovery then incorrectly +considers one or couple of segments as forward transmissions +though they weren't, which then are not LOST marked during +fallback making them "non-retransmittable" until the next RTO. +In a bad case, those segments are really lost and are the only +one left in the window. Thus TCP needs another RTO to continue. +The next FRTO, however, could again repeat the same events +making the progress of the TCP flow extremely slow. + +In order for these events to occur at all, FRTO must occur +again in FRTOs step 3 while the key segments must be lost as +well, which is not too likely in practice. It seems to most +frequently with some small devices such as network printers +that *seem* to accept TCP segments only in-order. In cases +were key segments weren't lost, things get automatically +resolved because those wrongly marked segments don't need to be +retransmitted in order to continue. + +I found a reproducer after digging up relevant reports (few +reports in total, none at netdev or lkml I know of), some +cases seemed to indicate middlebox issues which seems now +to be a false assumption some people had made. Bugzilla +#10063 _might_ be related. Damon L. Chesser +had a reproducable case and was kind enough to tcpdump it +for me. With the tcpdump log it was quite trivial to figure +out. + +Signed-off-by: Ilpo Järvinen +Signed-off-by: David S. 
Miller +Signed-off-by: Chris Wright +--- + net/ipv4/tcp_input.c | 13 ++++++++----- + 1 file changed, 8 insertions(+), 5 deletions(-) + +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -113,8 +113,6 @@ int sysctl_tcp_abc __read_mostly; + #define FLAG_FORWARD_PROGRESS (FLAG_ACKED|FLAG_DATA_SACKED) + #define FLAG_ANY_PROGRESS (FLAG_FORWARD_PROGRESS|FLAG_SND_UNA_ADVANCED) + +-#define IsSackFrto() (sysctl_tcp_frto == 0x2) +- + #define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH) + #define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH)) + +@@ -1685,6 +1683,11 @@ static inline void tcp_reset_reno_sack(s + tp->sacked_out = 0; + } + ++static int tcp_is_sackfrto(const struct tcp_sock *tp) ++{ ++ return (sysctl_tcp_frto == 0x2) && !tcp_is_reno(tp); ++} ++ + /* F-RTO can only be used if TCP has never retransmitted anything other than + * head (SACK enhanced variant from Appendix B of RFC4138 is more robust here) + */ +@@ -1701,7 +1704,7 @@ int tcp_use_frto(struct sock *sk) + if (icsk->icsk_mtup.probe_size) + return 0; + +- if (IsSackFrto()) ++ if (tcp_is_sackfrto(tp)) + return 1; + + /* Avoid expensive walking of rexmit queue if possible */ +@@ -1791,7 +1794,7 @@ void tcp_enter_frto(struct sock *sk) + /* Earlier loss recovery underway (see RFC4138; Appendix B). + * The last condition is necessary at least in tp->frto_counter case. 
+ */ +- if (IsSackFrto() && (tp->frto_counter || ++ if (tcp_is_sackfrto(tp) && (tp->frto_counter || + ((1 << icsk->icsk_ca_state) & (TCPF_CA_Recovery|TCPF_CA_Loss))) && + after(tp->high_seq, tp->snd_una)) { + tp->frto_highmark = tp->high_seq; +@@ -3110,7 +3113,7 @@ static int tcp_process_frto(struct sock + return 1; + } + +- if (!IsSackFrto() || tcp_is_reno(tp)) { ++ if (!tcp_is_sackfrto(tp)) { + /* RFC4138 shortcoming in step 2; should also have case c): + * ACK isn't duplicate nor advances window, e.g., opposite dir + * data, winupdate diff --git a/queue-2.6.25/tcp-frto-work-around-inorder-receivers.patch b/queue-2.6.25/tcp-frto-work-around-inorder-receivers.patch new file mode 100644 index 0000000000..8459149a38 --- /dev/null +++ b/queue-2.6.25/tcp-frto-work-around-inorder-receivers.patch @@ -0,0 +1,68 @@ +From 497157281d4368c61f53846646ec89ce4175839c Mon Sep 17 00:00:00 2001 +Message-Id: <20080609.134337.193698173.davem@davemloft.net> +From: Ilpo Järvinen +Date: Tue, 13 May 2008 02:54:19 -0700 +Subject: tcp FRTO: work-around inorder receivers + +From: Ilpo Järvinen + +[ upstream commit: 79d44516b4b178ffb6e2159c75584cfcfc097914 ] + +If receiver consumes segments successfully only in-order, FRTO +fallback to conventional recovery produces RTO loop because +FRTO's forward transmissions will always get dropped and need to +be resent, yet by default they're not marked as lost (which are +the only segments we will retransmit in CA_Loss). + +Price to pay about this is occassionally unnecessarily +retransmitting the forward transmission(s). SACK blocks help +a bit to avoid this, so it's mainly a concern for NewReno case +though SACK is not fully immune either. 
+ +This change has a side-effect of fixing a SACKFRTO problem where +it didn't have snd_nxt of the RTO time available anymore when +fallback became necessary (this problem would have only occurred +when RTO would occur for two or more segments and ECE arrives +in step 3; no need to figure out how to fix that unless the +TODO item of selective behavior is considered in future). + +Signed-off-by: Ilpo Järvinen +Reported-by: Damon L. Chesser +Tested-by: Damon L. Chesser +Signed-off-by: David S. Miller +Signed-off-by: Chris Wright +--- + net/ipv4/tcp_input.c | 15 +++++++++++---- + 1 file changed, 11 insertions(+), 4 deletions(-) + +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -1841,9 +1841,16 @@ static void tcp_enter_frto_loss(struct s + TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; + } + +- /* Don't lost mark skbs that were fwd transmitted after RTO */ +- if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) && +- !after(TCP_SKB_CB(skb)->end_seq, tp->frto_highmark)) { ++ /* Marking forward transmissions that were made after RTO lost ++ * can cause unnecessary retransmissions in some scenarios, ++ * SACK blocks will mitigate that in some but not in all cases. ++ * We used to not mark them but it was causing break-ups with ++ * receivers that do only in-order receival. ++ * ++ * TODO: we could detect presence of such receiver and select ++ * different behavior per flow.
++ */ ++ if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) { + TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; + tp->lost_out += tcp_skb_pcount(skb); + } +@@ -1859,7 +1866,7 @@ static void tcp_enter_frto_loss(struct s + tp->reordering = min_t(unsigned int, tp->reordering, + sysctl_tcp_reordering); + tcp_set_ca_state(sk, TCP_CA_Loss); +- tp->high_seq = tp->frto_highmark; ++ tp->high_seq = tp->snd_nxt; + TCP_ECN_queue_cwr(tp); + + tcp_clear_retrans_hints_partial(tp); diff --git a/queue-2.6.25/tcp-limit-cwnd-growth-when-deferring-for-gso.patch b/queue-2.6.25/tcp-limit-cwnd-growth-when-deferring-for-gso.patch new file mode 100644 index 0000000000..656d9ff87f --- /dev/null +++ b/queue-2.6.25/tcp-limit-cwnd-growth-when-deferring-for-gso.patch @@ -0,0 +1,34 @@ +From fdd040a9f8f7fb8152d8101a34f147daf0b3a003 Mon Sep 17 00:00:00 2001 +Message-Id: <20080609.134337.193698173.davem@davemloft.net> +From: John Heffner +Date: Tue, 29 Apr 2008 03:13:52 -0700 +Subject: tcp: Limit cwnd growth when deferring for GSO + +From: John Heffner + +[ upstream commit: 246eb2af060fc32650f07203c02bdc0456ad76c7 ] + +This fixes inappropriately large cwnd growth on sender-limited flows +when GSO is enabled, limiting cwnd growth to 64k. + +[ Backport to 2.6.25 by replacing sk->sk_gso_max_size with 65536 -DaveM ] + +Signed-off-by: John Heffner +Signed-off-by: David S. 
Miller +Signed-off-by: Chris Wright +--- + net/ipv4/tcp_cong.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +--- a/net/ipv4/tcp_cong.c ++++ b/net/ipv4/tcp_cong.c +@@ -287,7 +287,8 @@ int tcp_is_cwnd_limited(const struct soc + + left = tp->snd_cwnd - in_flight; + if (sk_can_gso(sk) && +- left * sysctl_tcp_tso_win_divisor < tp->snd_cwnd) ++ left * sysctl_tcp_tso_win_divisor < tp->snd_cwnd && ++ left * tp->mss_cache < 65536) + return 1; + return left <= tcp_max_burst(tp); + } diff --git a/queue-2.6.25/tcp-tcp-connection-times-out-if-icmp-frag-needed-is-delayed.patch b/queue-2.6.25/tcp-tcp-connection-times-out-if-icmp-frag-needed-is-delayed.patch new file mode 100644 index 0000000000..ac2d98ffcc --- /dev/null +++ b/queue-2.6.25/tcp-tcp-connection-times-out-if-icmp-frag-needed-is-delayed.patch @@ -0,0 +1,74 @@ +From a999cd82c502e723f2504782050d6792d0503ac9 Mon Sep 17 00:00:00 2001 +Message-Id: <20080609.134337.193698173.davem@davemloft.net> +From: Sridhar Samudrala +Date: Wed, 21 May 2008 16:42:20 -0700 +Subject: tcp: TCP connection times out if ICMP frag needed is delayed + +From: Sridhar Samudrala + +[ upstream commit: 7d227cd235c809c36c847d6a597956ad9e9d2bae ] + +We are seeing an issue with TCP in handling an ICMP frag needed +message that is received after net.ipv4.tcp_retries1 retransmits. +The default value of retries1 is 3. So if the path mtu changes +and ICMP frag needed is lost for the first 3 retransmits or if +it gets delayed until 3 retransmits are done, TCP doesn't update +MSS correctly and continues to retransmit the orginal message +until it timesout after tcp_retries2 retransmits. + +I am seeing this issue even with the latest 2.6.25.4 kernel. + +In tcp_retransmit_timer(), when retransmits counter exceeds +tcp_retries1 value, the dst cache entry of the socket is reset. +At this time, if we receive an ICMP frag needed message, the +dst entry gets updated with the new MTU, but the TCP sockets +dst_cache entry remains NULL. 
+ +So the next time when we try to retransmit after the ICMP frag +needed is received, tcp_retransmit_skb() gets called. Here the +cur_mss value is calculated at the start of the routine with +a NULL sk_dst_cache. Instead we should call tcp_current_mss after +the rebuild_header that caches the dst entry with the updated mtu. +Also the rebuild_header should be called before tcp_fragment +so that skb is fragmented if the mss goes down. + +Signed-off-by: Sridhar Samudrala +Signed-off-by: David S. Miller +Signed-off-by: Chris Wright +--- + net/ipv4/tcp_output.c | 10 ++++++---- + 1 file changed, 6 insertions(+), 4 deletions(-) + +--- a/net/ipv4/tcp_output.c ++++ b/net/ipv4/tcp_output.c +@@ -1836,7 +1836,7 @@ int tcp_retransmit_skb(struct sock *sk, + { + struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); +- unsigned int cur_mss = tcp_current_mss(sk, 0); ++ unsigned int cur_mss; + int err; + + /* Inconslusive MTU probe */ +@@ -1858,6 +1858,11 @@ int tcp_retransmit_skb(struct sock *sk, + return -ENOMEM; + } + ++ if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk)) ++ return -EHOSTUNREACH; /* Routing failure or similar. */ ++ ++ cur_mss = tcp_current_mss(sk, 0); ++ + /* If receiver has shrunk his window, and skb is out of + * new window, do not retransmit it. The exception is the + * case, when window is shrunk to zero. In this case +@@ -1884,9 +1889,6 @@ int tcp_retransmit_skb(struct sock *sk, + (sysctl_tcp_retrans_collapse != 0)) + tcp_retrans_try_collapse(sk, skb, cur_mss); + +- if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk)) +- return -EHOSTUNREACH; /* Routing failure or similar. */ +- + /* Some Solaris stacks overoptimize and ignore the FIN on a + * retransmit when old data is attached. So strip it off + * since it is cheap to do so and saves bytes on the network. 
diff --git a/queue-2.6.25/vlan-correctly-handle-device-notifications-for-layered-vlan-devices.patch b/queue-2.6.25/vlan-correctly-handle-device-notifications-for-layered-vlan-devices.patch new file mode 100644 index 0000000000..3854fc36f5 --- /dev/null +++ b/queue-2.6.25/vlan-correctly-handle-device-notifications-for-layered-vlan-devices.patch @@ -0,0 +1,38 @@ +From 50e7aa60b098edaceb29c24fa4194427a29719ea Mon Sep 17 00:00:00 2001 +Message-Id: <20080609.134337.193698173.davem@davemloft.net> +From: Patrick McHardy +Date: Mon, 9 Jun 2008 11:42:44 -0700 +Subject: vlan: Correctly handle device notifications for layered VLAN devices + +From: Patrick McHardy + +[ upstream commit: 81d85346b3fcd8b3167eac8b5fb415a210bd4345 ] + +Commit 30688a9 ([VLAN]: Handle vlan devices net namespace changing) +changed the device notifier to special-case notifications for VLAN +devices, effectively disabling state propagation to underlying VLAN +devices. This is needed for layered VLANs though, so restore the +original behaviour. + +Signed-off-by: Patrick McHardy +Acked-by: Pavel Emelyanov +Signed-off-by: David S. Miller +Signed-off-by: Chris Wright +--- + net/8021q/vlan.c | 4 +--- + 1 file changed, 1 insertion(+), 3 deletions(-) + +--- a/net/8021q/vlan.c ++++ b/net/8021q/vlan.c +@@ -397,10 +397,8 @@ static int vlan_device_event(struct noti + if (dev->nd_net != &init_net) + return NOTIFY_DONE; + +- if (is_vlan_dev(dev)) { ++ if (is_vlan_dev(dev)) + __vlan_device_event(dev, event); +- goto out; +- } + + grp = __vlan_find_group(dev->ifindex); + if (!grp) -- 2.39.2