From: Greg Kroah-Hartman
Date: Thu, 25 Jul 2013 23:05:56 +0000 (-0700)
Subject: 3.10-stable patches
X-Git-Tag: v3.0.88~11
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=5fcfbc0bdd3f747f1f258065c4be4624c66fb5ff;p=thirdparty%2Fkernel%2Fstable-queue.git

3.10-stable patches

added patches:
      9p-fix-off-by-one-causing-access-violations-and-memory-corruption.patch
      alx-fix-lockdep-annotation.patch
      atl1e-fix-dma-mapping-warnings.patch
      atl1e-unmap-partially-mapped-skb-on-dma-error-and-free-skb.patch
      be2net-fix-to-avoid-hardware-workaround-when-not-needed.patch
      dummy-fix-oops-when-loading-the-dummy-failed.patch
      gre-fix-a-regression-in-ioctl.patch
      gre-fix-mtu-sizing-check-for-gretap-tunnels.patch
      hyperv-fix-the-netif_f_sg-flag-setting-in-netvsc.patch
      ifb-fix-oops-when-loading-the-ifb-failed.patch
      ifb-fix-rcu_sched-self-detected-stalls.patch
      ipip-fix-a-regression-in-ioctl.patch
      ip_tunnels-use-skb-len-to-pmtu-check.patch
      ipv4-set-transport-header-earlier.patch
      ipv6-call-udp_push_pending_frames-when-uncorking-a-socket-with-af_inet-pending-data.patch
      ipv6-fix-route-selection-if-kernel-is-not-compiled-with-config_ipv6_router_pref.patch
      ipv6-in-case-of-link-failure-remove-route-directly-instead-of-letting-it-expire.patch
      ipv6-ip6_append_data_mtu-did-not-care-about-pmtudisc-and-frag_size.patch
      ipv6-mcast-always-hold-idev-lock-before-mca_lock.patch
      ipv6-only-apply-anti-spoofing-checks-to-not-pointopoint-tunnels.patch
      ipv6-only-static-routes-qualify-for-equal-cost-multipathing.patch
      ipv6-rt6_check_neigh-should-successfully-verify-neigh-if-no-nud-information-are-available.patch
      l2tp-add-missing-.owner-to-struct-pppox_proto.patch
      macvtap-correctly-linearize-skb-when-zerocopy-is-used.patch
      macvtap-do-not-zerocopy-if-iov-needs-more-pages-than-max_skb_frags.patch
      neighbour-fix-a-race-in-neigh_destroy.patch
      net-cadence-macb-fix-bug-typo-in-extracting-gem_irq_read_clear-bit.patch
      net-swap-ver-and-type-in-pppoe_hdr.patch
      pkt_sched-sch_qfq-remove-a-source-of-high-packet-delay-jitter.patch
      sfc-fix-memory-leak-when-discarding-scattered-packets.patch
      sunvnet-vnet_port_remove-must-call-unregister_netdev.patch
      tuntap-correctly-linearize-skb-when-zerocopy-is-used.patch
      tuntap-do-not-zerocopy-if-iov-needs-more-pages-than-max_skb_frags.patch
      vhost-net-fix-use-after-free-in-vhost_net_flush.patch
      virtio_net-fix-race-in-rx-vq-processing.patch
      virtio-support-unlocked-queue-poll.patch
      vlan-fix-a-race-in-egress-prio-management.patch
      vlan-mask-vlan-prio-bits.patch
      vti-remove-duplicated-code-to-fix-a-memory-leak.patch
      x25-fix-broken-locking-in-ioctl-error-paths.patch
---

diff --git a/queue-3.10/9p-fix-off-by-one-causing-access-violations-and-memory-corruption.patch b/queue-3.10/9p-fix-off-by-one-causing-access-violations-and-memory-corruption.patch
new file mode 100644
index 00000000000..d2abf63a6f7
--- /dev/null
+++ b/queue-3.10/9p-fix-off-by-one-causing-access-violations-and-memory-corruption.patch
@@ -0,0 +1,76 @@
+From db75617408ddf6d4fa8a65c030861ad0cd7e92ea Mon Sep 17 00:00:00 2001
+From: Sasha Levin
+Date: Thu, 11 Jul 2013 13:16:54 -0400
+Subject: 9p: fix off by one causing access violations and memory corruption
+
+From: Sasha Levin
+
+[ Upstream commit 110ecd69a9feea82a152bbf9b12aba57e6396883 ]
+
+p9_release_pages() would attempt to dereference one value past the end of
+pages[].
This would cause the following crashes: + +[ 6293.171817] BUG: unable to handle kernel paging request at ffff8807c96f3000 +[ 6293.174146] IP: [] p9_release_pages+0x3b/0x60 +[ 6293.176447] PGD 79c5067 PUD 82c1e3067 PMD 82c197067 PTE 80000007c96f3060 +[ 6293.180060] Oops: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC +[ 6293.180060] Modules linked in: +[ 6293.180060] CPU: 62 PID: 174043 Comm: modprobe Tainted: G W 3.10.0-next-20130710-sasha #3954 +[ 6293.180060] task: ffff8807b803b000 ti: ffff880787dde000 task.ti: ffff880787dde000 +[ 6293.180060] RIP: 0010:[] [] p9_release_pages+0x3b/0x60 +[ 6293.214316] RSP: 0000:ffff880787ddfc28 EFLAGS: 00010202 +[ 6293.214316] RAX: 0000000000000001 RBX: ffff8807c96f2ff8 RCX: 0000000000000000 +[ 6293.222017] RDX: ffff8807b803b000 RSI: 0000000000000001 RDI: ffffea001c7e3d40 +[ 6293.222017] RBP: ffff880787ddfc48 R08: 0000000000000000 R09: 0000000000000000 +[ 6293.222017] R10: 0000000000000001 R11: 0000000000000000 R12: 0000000000000001 +[ 6293.222017] R13: 0000000000000001 R14: ffff8807cc50c070 R15: ffff8807cc50c070 +[ 6293.222017] FS: 00007f572641d700(0000) GS:ffff8807f3600000(0000) knlGS:0000000000000000 +[ 6293.256784] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b +[ 6293.256784] CR2: ffff8807c96f3000 CR3: 00000007c8e81000 CR4: 00000000000006e0 +[ 6293.256784] Stack: +[ 6293.256784] ffff880787ddfcc8 ffff880787ddfcc8 0000000000000000 ffff880787ddfcc8 +[ 6293.256784] ffff880787ddfd48 ffffffff84128be8 ffff880700000002 0000000000000001 +[ 6293.256784] ffff8807b803b000 ffff880787ddfce0 0000100000000000 0000000000000000 +[ 6293.256784] Call Trace: +[ 6293.256784] [] p9_virtio_zc_request+0x598/0x630 +[ 6293.256784] [] ? wake_up_bit+0x40/0x40 +[ 6293.256784] [] p9_client_zc_rpc+0x111/0x3a0 +[ 6293.256784] [] ? sched_clock_cpu+0x108/0x120 +[ 6293.256784] [] p9_client_read+0xe1/0x2c0 +[ 6293.256784] [] v9fs_file_read+0x90/0xc0 +[ 6293.256784] [] vfs_read+0xc3/0x130 +[ 6293.256784] [] ? trace_hardirqs_on+0xd/0x10 +[ 6293.256784] [] SyS_read+0x62/0xa0 +[ 6293.256784] [] tracesys+0xdd/0xe2 +[ 6293.256784] Code: 66 90 48 89 fb 41 89 f5 48 8b 3f 48 85 ff 74 29 85 f6 74 25 45 31 e4 66 0f 1f 84 00 00 00 00 00 e8 eb 14 12 fd 41 ff c4 49 63 c4 <48> 8b 3c c3 48 85 ff 74 05 45 39 e5 75 e7 48 83 c4 08 5b 41 5c +[ 6293.256784] RIP [] p9_release_pages+0x3b/0x60 +[ 6293.256784] RSP +[ 6293.256784] CR2: ffff8807c96f3000 +[ 6293.256784] ---[ end trace 50822ee72cd360fc ]--- + +Signed-off-by: Sasha Levin +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/9p/trans_common.c | 10 +++++----- + 1 file changed, 5 insertions(+), 5 deletions(-) + +--- a/net/9p/trans_common.c ++++ b/net/9p/trans_common.c +@@ -24,11 +24,11 @@ + */ + void p9_release_pages(struct page **pages, int nr_pages) + { +- int i = 0; +- while (pages[i] && nr_pages--) { +- put_page(pages[i]); +- i++; +- } ++ int i; ++ ++ for (i = 0; i < nr_pages; i++) ++ if (pages[i]) ++ put_page(pages[i]); + } + EXPORT_SYMBOL(p9_release_pages); + diff --git a/queue-3.10/alx-fix-lockdep-annotation.patch b/queue-3.10/alx-fix-lockdep-annotation.patch new file mode 100644 index 00000000000..ee823dabdae --- /dev/null +++ b/queue-3.10/alx-fix-lockdep-annotation.patch @@ -0,0 +1,39 @@ +From d0772a6314c2ed4d04ab0163c50b3ef6ff9eba40 Mon Sep 17 00:00:00 2001 +From: Maarten Lankhorst +Date: Thu, 11 Jul 2013 15:53:21 +0200 +Subject: alx: fix lockdep annotation + +From: Maarten Lankhorst + +[ Upstream commit a8798a5c77c9981e88caef1373a3310bf8aed219 ] + +Move spin_lock_init to be called before the spinlocks are used, preventing a lockdep splat. + +Signed-off-by: Maarten Lankhorst +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/atheros/alx/main.c | 5 ++--- + 1 file changed, 2 insertions(+), 3 deletions(-) + +--- a/drivers/net/ethernet/atheros/alx/main.c ++++ b/drivers/net/ethernet/atheros/alx/main.c +@@ -1303,6 +1303,8 @@ static int alx_probe(struct pci_dev *pde + + SET_NETDEV_DEV(netdev, &pdev->dev); + alx = netdev_priv(netdev); ++ spin_lock_init(&alx->hw.mdio_lock); ++ spin_lock_init(&alx->irq_lock); + alx->dev = netdev; + alx->hw.pdev = pdev; + alx->msg_enable = NETIF_MSG_LINK | NETIF_MSG_HW | NETIF_MSG_IFUP | +@@ -1385,9 +1387,6 @@ static int alx_probe(struct pci_dev *pde + + INIT_WORK(&alx->link_check_wk, alx_link_check); + INIT_WORK(&alx->reset_wk, alx_reset); +- spin_lock_init(&alx->hw.mdio_lock); +- spin_lock_init(&alx->irq_lock); +- + netif_carrier_off(netdev); + + err = register_netdev(netdev); diff --git a/queue-3.10/atl1e-fix-dma-mapping-warnings.patch b/queue-3.10/atl1e-fix-dma-mapping-warnings.patch new file mode 100644 index 00000000000..5d2b01656dd --- /dev/null +++ b/queue-3.10/atl1e-fix-dma-mapping-warnings.patch @@ -0,0 +1,150 @@ +From bf6a9aa8649eefee6a93b18d827bd2bbee2dd1ae Mon Sep 17 00:00:00 2001 +From: Neil Horman +Date: Fri, 12 Jul 2013 10:58:48 -0400 +Subject: atl1e: fix dma mapping warnings + +From: Neil Horman + +[ Upstream commit 352900b583b2852152a1e05ea0e8b579292e731e ] + +Recently had this backtrace reported: +WARNING: at lib/dma-debug.c:937 check_unmap+0x47d/0x930() +Hardware name: System Product Name +ATL1E 0000:02:00.0: DMA-API: device driver failed to check map error[device +address=0x00000000cbfd1000] [size=90 bytes] [mapped as single] +Modules linked in: xt_conntrack nf_conntrack ebtable_filter ebtables +ip6table_filter ip6_tables snd_hda_codec_hdmi snd_hda_codec_realtek iTCO_wdt +iTCO_vendor_support snd_hda_intel acpi_cpufreq mperf coretemp btrfs zlib_deflate +snd_hda_codec snd_hwdep microcode raid6_pq libcrc32c snd_seq usblp serio_raw xor +snd_seq_device joydev snd_pcm snd_page_alloc snd_timer snd lpc_ich i2c_i801 +soundcore mfd_core atl1e asus_atk0110 ata_generic pata_acpi radeon i2c_algo_bit +drm_kms_helper ttm drm i2c_core pata_marvell uinput +Pid: 314, comm: systemd-journal Not tainted 3.9.0-0.rc6.git2.3.fc19.x86_64 #1 +Call Trace: + [] warn_slowpath_common+0x66/0x80 + [] warn_slowpath_fmt+0x4c/0x50 + [] check_unmap+0x47d/0x930 + [] ? 
sched_clock_cpu+0xa8/0x100 + [] debug_dma_unmap_page+0x5f/0x70 + [] ? unmap_single+0x20/0x30 + [] atl1e_intr+0x3a1/0x5b0 [atl1e] + [] ? trace_hardirqs_off+0xd/0x10 + [] handle_irq_event_percpu+0x56/0x390 + [] handle_irq_event+0x3d/0x60 + [] handle_fasteoi_irq+0x5a/0x100 + [] handle_irq+0xbf/0x150 + [] ? file_sb_list_del+0x3f/0x50 + [] ? irq_enter+0x50/0xa0 + [] do_IRQ+0x4d/0xc0 + [] ? file_sb_list_del+0x3f/0x50 + [] common_interrupt+0x72/0x72 + [] ? lock_release+0xc2/0x310 + [] lg_local_unlock_cpu+0x24/0x50 + [] file_sb_list_del+0x3f/0x50 + [] fput+0x2d/0xc0 + [] filp_close+0x61/0x90 + [] __close_fd+0x8d/0x150 + [] sys_close+0x20/0x50 + [] system_call_fastpath+0x16/0x1b + +The usual straighforward failure to check for dma_mapping_error after a map +operation is completed. + +This patch should fix it, the reporter wandered off after filing this bz: +https://bugzilla.redhat.com/show_bug.cgi?id=954170 + +and I don't have hardware to test, but the fix is pretty straightforward, so I +figured I'd post it for review. + +Signed-off-by: Neil Horman +CC: Jay Cliburn +CC: Chris Snook +CC: "David S. Miller" +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/atheros/atl1e/atl1e_main.c | 28 +++++++++++++++++++++--- + 1 file changed, 25 insertions(+), 3 deletions(-) + +--- a/drivers/net/ethernet/atheros/atl1e/atl1e_main.c ++++ b/drivers/net/ethernet/atheros/atl1e/atl1e_main.c +@@ -1665,8 +1665,8 @@ check_sum: + return 0; + } + +-static void atl1e_tx_map(struct atl1e_adapter *adapter, +- struct sk_buff *skb, struct atl1e_tpd_desc *tpd) ++static int atl1e_tx_map(struct atl1e_adapter *adapter, ++ struct sk_buff *skb, struct atl1e_tpd_desc *tpd) + { + struct atl1e_tpd_desc *use_tpd = NULL; + struct atl1e_tx_buffer *tx_buffer = NULL; +@@ -1677,6 +1677,7 @@ static void atl1e_tx_map(struct atl1e_ad + u16 nr_frags; + u16 f; + int segment; ++ int ring_start = adapter->tx_ring.next_to_use; + + nr_frags = skb_shinfo(skb)->nr_frags; + segment = (tpd->word3 >> TPD_SEGMENT_EN_SHIFT) & TPD_SEGMENT_EN_MASK; +@@ -1689,6 +1690,9 @@ static void atl1e_tx_map(struct atl1e_ad + tx_buffer->length = map_len; + tx_buffer->dma = pci_map_single(adapter->pdev, + skb->data, hdr_len, PCI_DMA_TODEVICE); ++ if (dma_mapping_error(&adapter->pdev->dev, tx_buffer->dma)) ++ return -ENOSPC; ++ + ATL1E_SET_PCIMAP_TYPE(tx_buffer, ATL1E_TX_PCIMAP_SINGLE); + mapped_len += map_len; + use_tpd->buffer_addr = cpu_to_le64(tx_buffer->dma); +@@ -1715,6 +1719,13 @@ static void atl1e_tx_map(struct atl1e_ad + tx_buffer->dma = + pci_map_single(adapter->pdev, skb->data + mapped_len, + map_len, PCI_DMA_TODEVICE); ++ ++ if (dma_mapping_error(&adapter->pdev->dev, tx_buffer->dma)) { ++ /* Reset the tx rings next pointer */ ++ adapter->tx_ring.next_to_use = ring_start; ++ return -ENOSPC; ++ } ++ + ATL1E_SET_PCIMAP_TYPE(tx_buffer, ATL1E_TX_PCIMAP_SINGLE); + mapped_len += map_len; + use_tpd->buffer_addr = cpu_to_le64(tx_buffer->dma); +@@ -1750,6 +1761,13 @@ static void atl1e_tx_map(struct atl1e_ad + (i * MAX_TX_BUF_LEN), + tx_buffer->length, + DMA_TO_DEVICE); ++ ++ if (dma_mapping_error(&adapter->pdev->dev, tx_buffer->dma)) { ++ /* Reset the ring next to use pointer */ ++ adapter->tx_ring.next_to_use = ring_start; ++ return -ENOSPC; ++ } ++ + ATL1E_SET_PCIMAP_TYPE(tx_buffer, ATL1E_TX_PCIMAP_PAGE); + use_tpd->buffer_addr = cpu_to_le64(tx_buffer->dma); + use_tpd->word2 = (use_tpd->word2 & (~TPD_BUFLEN_MASK)) | +@@ -1767,6 +1785,7 @@ static void atl1e_tx_map(struct atl1e_ad + /* The last buffer info contain the skb 
address, + so it will be free after unmap */ + tx_buffer->skb = skb; ++ return 0; + } + + static void atl1e_tx_queue(struct atl1e_adapter *adapter, u16 count, +@@ -1834,10 +1853,13 @@ static netdev_tx_t atl1e_xmit_frame(stru + return NETDEV_TX_OK; + } + +- atl1e_tx_map(adapter, skb, tpd); ++ if (atl1e_tx_map(adapter, skb, tpd)) ++ goto out; ++ + atl1e_tx_queue(adapter, tpd_req, tpd); + + netdev->trans_start = jiffies; /* NETIF_F_LLTX driver :( */ ++out: + spin_unlock_irqrestore(&adapter->tx_lock, flags); + return NETDEV_TX_OK; + } diff --git a/queue-3.10/atl1e-unmap-partially-mapped-skb-on-dma-error-and-free-skb.patch b/queue-3.10/atl1e-unmap-partially-mapped-skb-on-dma-error-and-free-skb.patch new file mode 100644 index 00000000000..b8176f1a45e --- /dev/null +++ b/queue-3.10/atl1e-unmap-partially-mapped-skb-on-dma-error-and-free-skb.patch @@ -0,0 +1,86 @@ +From 326eb306b8445bccf894e99ccde478eb4731b726 Mon Sep 17 00:00:00 2001 +From: Neil Horman +Date: Tue, 16 Jul 2013 10:49:41 -0400 +Subject: atl1e: unmap partially mapped skb on dma error and free skb + +From: Neil Horman + +[ Upstream commit 584ec4355355ffac43571b02a314d43eb2f7fcbf ] + +Ben Hutchings pointed out that my recent update to atl1e +in commit 352900b583b2852152a1e05ea0e8b579292e731e +("atl1e: fix dma mapping warnings") was missing a bit of code. + +Specifically it reset the hardware tx ring to its origional state when +we hit a dma error, but didn't unmap any exiting mappings from the +operation. This patch fixes that up. It also remembers to free the +skb in the event that an error occurs, so we don't leak. Untested, as +I don't have hardware. I think its pretty straightforward, but please +review closely. + +Signed-off-by: Neil Horman +CC: Ben Hutchings +CC: Jay Cliburn +CC: Chris Snook +CC: "David S. Miller" +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/atheros/atl1e/atl1e_main.c | 24 +++++++++++++++++++++++- + 1 file changed, 23 insertions(+), 1 deletion(-) + +--- a/drivers/net/ethernet/atheros/atl1e/atl1e_main.c ++++ b/drivers/net/ethernet/atheros/atl1e/atl1e_main.c +@@ -1678,6 +1678,7 @@ static int atl1e_tx_map(struct atl1e_ada + u16 f; + int segment; + int ring_start = adapter->tx_ring.next_to_use; ++ int ring_end; + + nr_frags = skb_shinfo(skb)->nr_frags; + segment = (tpd->word3 >> TPD_SEGMENT_EN_SHIFT) & TPD_SEGMENT_EN_MASK; +@@ -1721,6 +1722,15 @@ static int atl1e_tx_map(struct atl1e_ada + map_len, PCI_DMA_TODEVICE); + + if (dma_mapping_error(&adapter->pdev->dev, tx_buffer->dma)) { ++ /* We need to unwind the mappings we've done */ ++ ring_end = adapter->tx_ring.next_to_use; ++ adapter->tx_ring.next_to_use = ring_start; ++ while (adapter->tx_ring.next_to_use != ring_end) { ++ tpd = atl1e_get_tpd(adapter); ++ tx_buffer = atl1e_get_tx_buffer(adapter, tpd); ++ pci_unmap_single(adapter->pdev, tx_buffer->dma, ++ tx_buffer->length, PCI_DMA_TODEVICE); ++ } + /* Reset the tx rings next pointer */ + adapter->tx_ring.next_to_use = ring_start; + return -ENOSPC; +@@ -1763,6 +1773,16 @@ static int atl1e_tx_map(struct atl1e_ada + DMA_TO_DEVICE); + + if (dma_mapping_error(&adapter->pdev->dev, tx_buffer->dma)) { ++ /* We need to unwind the mappings we've done */ ++ ring_end = adapter->tx_ring.next_to_use; ++ adapter->tx_ring.next_to_use = ring_start; ++ while (adapter->tx_ring.next_to_use != ring_end) { ++ tpd = atl1e_get_tpd(adapter); ++ tx_buffer = atl1e_get_tx_buffer(adapter, tpd); ++ dma_unmap_page(&adapter->pdev->dev, tx_buffer->dma, ++ tx_buffer->length, DMA_TO_DEVICE); ++ } ++ + /* Reset the ring next to use pointer */ + adapter->tx_ring.next_to_use = ring_start; + return -ENOSPC; +@@ -1853,8 +1873,10 @@ static netdev_tx_t atl1e_xmit_frame(stru + return NETDEV_TX_OK; + } + +- if (atl1e_tx_map(adapter, skb, tpd)) ++ if (atl1e_tx_map(adapter, skb, tpd)) { ++ dev_kfree_skb_any(skb); + goto out; ++ } + + atl1e_tx_queue(adapter, tpd_req, tpd); + diff --git a/queue-3.10/be2net-fix-to-avoid-hardware-workaround-when-not-needed.patch b/queue-3.10/be2net-fix-to-avoid-hardware-workaround-when-not-needed.patch new file mode 100644 index 00000000000..e35c4f4e885 --- /dev/null +++ b/queue-3.10/be2net-fix-to-avoid-hardware-workaround-when-not-needed.patch @@ -0,0 +1,49 @@ +From b88b4272651cb4ee68c7a32cfc256fd4e8fdf735 Mon Sep 17 00:00:00 2001 +From: Sarveshwar Bandi +Date: Tue, 16 Jul 2013 12:44:02 +0530 +Subject: be2net: Fix to avoid hardware workaround when not needed + +From: Sarveshwar Bandi + +[ Upstream commit 52fe29e4bb614367c108b717c6d7fe5953eb7af3 ] + +Hardware workaround requesting hardware to skip vlan insertion is necessary +only when umc or qnq is enabled. Enabling this workaround in other scenarios +could cause controller to stall. + +Signed-off-by: Sarveshwar Bandi +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/emulex/benet/be_main.c | 14 ++++++++++---- + 1 file changed, 10 insertions(+), 4 deletions(-) + +--- a/drivers/net/ethernet/emulex/benet/be_main.c ++++ b/drivers/net/ethernet/emulex/benet/be_main.c +@@ -782,16 +782,22 @@ static struct sk_buff *be_insert_vlan_in + + if (vlan_tx_tag_present(skb)) + vlan_tag = be_get_tx_vlan_tag(adapter, skb); +- else if (qnq_async_evt_rcvd(adapter) && adapter->pvid) +- vlan_tag = adapter->pvid; ++ ++ if (qnq_async_evt_rcvd(adapter) && adapter->pvid) { ++ if (!vlan_tag) ++ vlan_tag = adapter->pvid; ++ /* f/w workaround to set skip_hw_vlan = 1, informs the F/W to ++ * skip VLAN insertion ++ */ ++ if (skip_hw_vlan) ++ *skip_hw_vlan = true; ++ } + + if (vlan_tag) { + skb = __vlan_put_tag(skb, htons(ETH_P_8021Q), vlan_tag); + if (unlikely(!skb)) + return skb; + skb->vlan_tci = 0; +- if (skip_hw_vlan) +- *skip_hw_vlan = true; + } + + /* Insert the outer VLAN, if any */ diff --git a/queue-3.10/dummy-fix-oops-when-loading-the-dummy-failed.patch b/queue-3.10/dummy-fix-oops-when-loading-the-dummy-failed.patch new file mode 100644 index 00000000000..4d6fb426fda --- /dev/null +++ b/queue-3.10/dummy-fix-oops-when-loading-the-dummy-failed.patch @@ -0,0 +1,87 @@ +From a3bd2b75636d9e8ce1105521a210039fca6433c2 Mon Sep 17 00:00:00 2001 +From: dingtianhong +Date: Thu, 11 Jul 2013 19:04:02 +0800 +Subject: dummy: fix oops when loading the dummy failed + +From: dingtianhong + +[ Upstream commit 2c8a01894a12665d8059fad8f0a293c98a264121 ] + +We rename the dummy in modprobe.conf like this: + +install dummy0 /sbin/modprobe -o dummy0 --ignore-install dummy +install dummy1 /sbin/modprobe -o dummy1 --ignore-install dummy + +We got oops when we run the command: + +modprobe dummy0 +modprobe dummy1 + +------------[ cut here ]------------ + +[ 3302.187584] BUG: unable to handle kernel NULL pointer dereference at 0000000000000008 +[ 3302.195411] IP: [] __rtnl_link_unregister+0x9a/0xd0 +[ 3302.201844] PGD 85c94a067 PUD 8517bd067 PMD 0 +[ 3302.206305] Oops: 0002 [#1] SMP +[ 3302.299737] task: ffff88105ccea300 ti: ffff880eba4a0000 task.ti: ffff880eba4a0000 +[ 3302.307186] RIP: 0010:[] [] __rtnl_link_unregister+0x9a/0xd0 +[ 3302.316044] RSP: 0018:ffff880eba4a1dd8 EFLAGS: 00010246 +[ 3302.321332] RAX: 0000000000000000 RBX: ffffffff81a9d738 RCX: 0000000000000002 +[ 3302.328436] RDX: 0000000000000000 RSI: ffffffffa04d602c RDI: ffff880eba4a1dd8 +[ 3302.335541] RBP: ffff880eba4a1e18 R08: dead000000200200 R09: dead000000100100 +[ 3302.342644] R10: 0000000000000080 R11: 0000000000000003 R12: ffffffff81a9d788 +[ 3302.349748] R13: ffffffffa04d7020 R14: ffffffff81a9d670 R15: ffff880eba4a1dd8 +[ 3302.364910] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 +[ 3302.370630] CR2: 0000000000000008 CR3: 000000085e15e000 CR4: 00000000000427e0 +[ 3302.377734] DR0: 0000000000000003 DR1: 00000000000000b0 DR2: 0000000000000001 +[ 3302.384838] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 +[ 3302.391940] Stack: +[ 3302.393944] ffff880eba4a1dd8 ffff880eba4a1dd8 ffff880eba4a1e18 ffffffffa04d70c0 +[ 3302.401350] 00000000ffffffef ffffffffa01a8000 0000000000000000 ffffffff816111c8 +[ 3302.408758] ffff880eba4a1e48 ffffffffa01a80be ffff880eba4a1e48 ffffffffa04d70c0 +[ 3302.416164] Call Trace: +[ 3302.418605] [] ? 0xffffffffa01a7fff +[ 3302.423727] [] dummy_init_module+0xbe/0x1000 [dummy0] +[ 3302.430405] [] ? 
0xffffffffa01a7fff +[ 3302.435535] [] do_one_initcall+0x152/0x1b0 +[ 3302.441263] [] do_init_module+0x7b/0x200 +[ 3302.446824] [] load_module+0x4e2/0x530 +[ 3302.452215] [] ? ddebug_dyndbg_boot_param_cb+0x60/0x60 +[ 3302.458979] [] SyS_init_module+0xd1/0x130 +[ 3302.464627] [] system_call_fastpath+0x16/0x1b +[ 3302.490090] RIP [] __rtnl_link_unregister+0x9a/0xd0 +[ 3302.496607] RSP +[ 3302.500084] CR2: 0000000000000008 +[ 3302.503466] ---[ end trace 8342d49cd49f78ed ]--- + +The reason is that when loading dummy, if __rtnl_link_register() return failed, +the init_module should return and avoid take the wrong path. + +Signed-off-by: Tan Xiaojun +Signed-off-by: Ding Tianhong +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/dummy.c | 4 ++++ + 1 file changed, 4 insertions(+) + +--- a/drivers/net/dummy.c ++++ b/drivers/net/dummy.c +@@ -185,6 +185,8 @@ static int __init dummy_init_module(void + + rtnl_lock(); + err = __rtnl_link_register(&dummy_link_ops); ++ if (err < 0) ++ goto out; + + for (i = 0; i < numdummies && !err; i++) { + err = dummy_init_one(); +@@ -192,6 +194,8 @@ static int __init dummy_init_module(void + } + if (err < 0) + __rtnl_link_unregister(&dummy_link_ops); ++ ++out: + rtnl_unlock(); + + return err; diff --git a/queue-3.10/gre-fix-a-regression-in-ioctl.patch b/queue-3.10/gre-fix-a-regression-in-ioctl.patch new file mode 100644 index 00000000000..5affd05364e --- /dev/null +++ b/queue-3.10/gre-fix-a-regression-in-ioctl.patch @@ -0,0 +1,54 @@ +From d9b54511307e46a8f144b20af88e9279966725f1 Mon Sep 17 00:00:00 2001 +From: Cong Wang +Date: Sat, 29 Jun 2013 12:02:59 +0800 +Subject: gre: fix a regression in ioctl + +From: Cong Wang + +[ Upstream commit 6c734fb8592f6768170e48e7102cb2f0a1bb9759 ] + +When testing GRE tunnel, I got: + + # ip tunnel show + get tunnel gre0 failed: Invalid argument + get tunnel gre1 failed: Invalid argument + +This is a regression introduced by commit c54419321455631079c7d +("GRE: Refactor GRE tunneling code.") because previously we +only check the parameters for SIOCADDTUNNEL and SIOCCHGTUNNEL, +after that commit, the check is moved for all commands. + +So, just check for SIOCADDTUNNEL and SIOCCHGTUNNEL. + +After this patch I got: + + # ip tunnel show + gre0: gre/ip remote any local any ttl inherit nopmtudisc + gre1: gre/ip remote 192.168.122.101 local 192.168.122.45 ttl inherit + +Signed-off-by: Cong Wang +Cc: Pravin B Shelar +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/ip_gre.c | 9 +++++---- + 1 file changed, 5 insertions(+), 4 deletions(-) + +--- a/net/ipv4/ip_gre.c ++++ b/net/ipv4/ip_gre.c +@@ -503,10 +503,11 @@ static int ipgre_tunnel_ioctl(struct net + + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + return -EFAULT; +- if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE || +- p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) || +- ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING))) { +- return -EINVAL; ++ if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) { ++ if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE || ++ p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) || ++ ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING))) ++ return -EINVAL; + } + p.i_flags = gre_flags_to_tnl_flags(p.i_flags); + p.o_flags = gre_flags_to_tnl_flags(p.o_flags); diff --git a/queue-3.10/gre-fix-mtu-sizing-check-for-gretap-tunnels.patch b/queue-3.10/gre-fix-mtu-sizing-check-for-gretap-tunnels.patch new file mode 100644 index 00000000000..fcec4779e1a --- /dev/null +++ b/queue-3.10/gre-fix-mtu-sizing-check-for-gretap-tunnels.patch @@ -0,0 +1,41 @@ +From 60731ca136b36cde13dd6b021711f031d70e061f Mon Sep 17 00:00:00 2001 +From: Alexander Duyck +Date: Thu, 11 Jul 2013 13:12:22 -0700 +Subject: gre: Fix MTU sizing check for gretap tunnels + +From: Alexander Duyck + +[ Upstream commit 8c91e162e058bb91b7766f26f4d5823a21941026 ] + +This change fixes an MTU sizing issue seen with gretap tunnels when non-gso +packets are sent from the interface. + +In my case I was able to reproduce the issue by simply sending a ping of +1421 bytes with the gretap interface created on a device with a standard +1500 mtu. + +This fix is based on the fact that the tunnel mtu is already adjusted by +dev->hard_header_len so it would make sense that any packets being compared +against that mtu should also be adjusted by hard_header_len and the tunnel +header instead of just the tunnel header. + +Signed-off-by: Alexander Duyck +Reported-by: Cong Wang +Acked-by: Eric Dumazet +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/ip_tunnel.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/net/ipv4/ip_tunnel.c ++++ b/net/ipv4/ip_tunnel.c +@@ -490,7 +490,7 @@ static int tnl_update_pmtu(struct net_de + struct rtable *rt, __be16 df) + { + struct ip_tunnel *tunnel = netdev_priv(dev); +- int pkt_size = skb->len - tunnel->hlen; ++ int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len; + int mtu; + + if (df) diff --git a/queue-3.10/hyperv-fix-the-netif_f_sg-flag-setting-in-netvsc.patch b/queue-3.10/hyperv-fix-the-netif_f_sg-flag-setting-in-netvsc.patch new file mode 100644 index 00000000000..80b2e365ad1 --- /dev/null +++ b/queue-3.10/hyperv-fix-the-netif_f_sg-flag-setting-in-netvsc.patch @@ -0,0 +1,35 @@ +From fe7d570e2db88a8b10c61122d17cb0effd04e3c0 Mon Sep 17 00:00:00 2001 +From: Haiyang Zhang +Date: Tue, 16 Jul 2013 23:01:20 -0700 +Subject: hyperv: Fix the NETIF_F_SG flag setting in netvsc + +From: Haiyang Zhang + +[ Upstream commit f45708209dc445bac0844f6ce86e315a2ffe8a29 ] + +SG mode is not currently supported by netvsc, so remove this flag for now. +Otherwise, it will be unconditionally enabled by commit ec5f0615642 + "Kill link between CSUM and SG features" +Previously, the SG feature is disabled because CSUM is not set here. + +Signed-off-by: Haiyang Zhang +Reviewed-by: K. Y. Srinivasan +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/hyperv/netvsc_drv.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/drivers/net/hyperv/netvsc_drv.c ++++ b/drivers/net/hyperv/netvsc_drv.c +@@ -431,8 +431,8 @@ static int netvsc_probe(struct hv_device + net->netdev_ops = &device_ops; + + /* TODO: Add GSO and Checksum offload */ +- net->hw_features = NETIF_F_SG; +- net->features = NETIF_F_SG | NETIF_F_HW_VLAN_CTAG_TX; ++ net->hw_features = 0; ++ net->features = NETIF_F_HW_VLAN_CTAG_TX; + + SET_ETHTOOL_OPS(net, ðtool_ops); + SET_NETDEV_DEV(net, &dev->device); diff --git a/queue-3.10/ifb-fix-oops-when-loading-the-ifb-failed.patch b/queue-3.10/ifb-fix-oops-when-loading-the-ifb-failed.patch new file mode 100644 index 00000000000..8183b00d497 --- /dev/null +++ b/queue-3.10/ifb-fix-oops-when-loading-the-ifb-failed.patch @@ -0,0 +1,39 @@ +From 44780fa991640ee8c5fc4f4c47d5033a5c98895d Mon Sep 17 00:00:00 2001 +From: dingtianhong +Date: Thu, 11 Jul 2013 19:04:06 +0800 +Subject: ifb: fix oops when loading the ifb failed + +From: dingtianhong + +[ Upstream commit f2966cd5691058b8674a20766525bedeaea9cbcf ] + +If __rtnl_link_register() return faild when loading the ifb, it will +take the wrong path and get oops, so fix it just like dummy. + +Signed-off-by: Ding Tianhong +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ifb.c | 4 ++++ + 1 file changed, 4 insertions(+) + +--- a/drivers/net/ifb.c ++++ b/drivers/net/ifb.c +@@ -291,6 +291,8 @@ static int __init ifb_init_module(void) + + rtnl_lock(); + err = __rtnl_link_register(&ifb_link_ops); ++ if (err < 0) ++ goto out; + + for (i = 0; i < numifbs && !err; i++) { + err = ifb_init_one(i); +@@ -298,6 +300,8 @@ static int __init ifb_init_module(void) + } + if (err) + __rtnl_link_unregister(&ifb_link_ops); ++ ++out: + rtnl_unlock(); + + return err; diff --git a/queue-3.10/ifb-fix-rcu_sched-self-detected-stalls.patch b/queue-3.10/ifb-fix-rcu_sched-self-detected-stalls.patch new file mode 100644 index 00000000000..a03d5fe317c --- /dev/null +++ b/queue-3.10/ifb-fix-rcu_sched-self-detected-stalls.patch @@ -0,0 +1,44 @@ +From b99eebace35b3d3ae6ddcc2af5659e3ab7a2921c Mon Sep 17 00:00:00 2001 +From: dingtianhong +Date: Wed, 10 Jul 2013 12:04:02 +0800 +Subject: ifb: fix rcu_sched self-detected stalls + +From: dingtianhong + +[ Upstream commit 440d57bc5ff55ec1efb3efc9cbe9420b4bbdfefa ] + +According to the commit 16b0dc29c1af9df341428f4c49ada4f626258082 +(dummy: fix rcu_sched self-detected stalls) + +Eric Dumazet fix the problem in dummy, but the ifb will occur the +same problem like the dummy modules. + +Trying to "modprobe ifb numifbs=30000" triggers : + +INFO: rcu_sched self-detected stall on CPU + +After this splat, RTNL is locked and reboot is needed. + +We must call cond_resched() to avoid this, even holding RTNL. + +Signed-off-by: Ding Tianhong +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ifb.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +--- a/drivers/net/ifb.c ++++ b/drivers/net/ifb.c +@@ -292,8 +292,10 @@ static int __init ifb_init_module(void) + rtnl_lock(); + err = __rtnl_link_register(&ifb_link_ops); + +- for (i = 0; i < numifbs && !err; i++) ++ for (i = 0; i < numifbs && !err; i++) { + err = ifb_init_one(i); ++ cond_resched(); ++ } + if (err) + __rtnl_link_unregister(&ifb_link_ops); + rtnl_unlock(); diff --git a/queue-3.10/ip_tunnels-use-skb-len-to-pmtu-check.patch b/queue-3.10/ip_tunnels-use-skb-len-to-pmtu-check.patch new file mode 100644 index 00000000000..d33d2cefd6a --- /dev/null +++ b/queue-3.10/ip_tunnels-use-skb-len-to-pmtu-check.patch @@ -0,0 +1,152 @@ +From e85dcba98ae899b9e6d26625a86750eb92c9fadc Mon Sep 17 00:00:00 2001 +From: Pravin B Shelar +Date: Tue, 2 Jul 2013 10:57:33 -0700 +Subject: ip_tunnels: Use skb-len to PMTU check. + +From: Pravin B Shelar + +[ Upstream commit 23a3647bc4f93bac3776c66dc2c7f7f68b3cd662 ] + +In path mtu check, ip header total length works for gre device +but not for gre-tap device. Use skb len which is consistent +for all tunneling types. This is old bug in gre. +This also fixes mtu calculation bug introduced by +commit c54419321455631079c7d (GRE: Refactor GRE tunneling code). + +Reported-by: Timo Teras +Signed-off-by: Pravin B Shelar +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/ip_tunnel.c | 97 ++++++++++++++++++++++++++++----------------------- + 1 file changed, 54 insertions(+), 43 deletions(-) + +--- a/net/ipv4/ip_tunnel.c ++++ b/net/ipv4/ip_tunnel.c +@@ -486,6 +486,53 @@ drop: + } + EXPORT_SYMBOL_GPL(ip_tunnel_rcv); + ++static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb, ++ struct rtable *rt, __be16 df) ++{ ++ struct ip_tunnel *tunnel = netdev_priv(dev); ++ int pkt_size = skb->len - tunnel->hlen; ++ int mtu; ++ ++ if (df) ++ mtu = dst_mtu(&rt->dst) - dev->hard_header_len ++ - sizeof(struct iphdr) - tunnel->hlen; ++ else ++ mtu = skb_dst(skb) ? 
dst_mtu(skb_dst(skb)) : dev->mtu; ++ ++ if (skb_dst(skb)) ++ skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); ++ ++ if (skb->protocol == htons(ETH_P_IP)) { ++ if (!skb_is_gso(skb) && ++ (df & htons(IP_DF)) && mtu < pkt_size) { ++ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); ++ return -E2BIG; ++ } ++ } ++#if IS_ENABLED(CONFIG_IPV6) ++ else if (skb->protocol == htons(ETH_P_IPV6)) { ++ struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb); ++ ++ if (rt6 && mtu < dst_mtu(skb_dst(skb)) && ++ mtu >= IPV6_MIN_MTU) { ++ if ((tunnel->parms.iph.daddr && ++ !ipv4_is_multicast(tunnel->parms.iph.daddr)) || ++ rt6->rt6i_dst.plen == 128) { ++ rt6->rt6i_flags |= RTF_MODIFIED; ++ dst_metric_set(skb_dst(skb), RTAX_MTU, mtu); ++ } ++ } ++ ++ if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU && ++ mtu < pkt_size) { ++ icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); ++ return -E2BIG; ++ } ++ } ++#endif ++ return 0; ++} ++ + void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, + const struct iphdr *tnl_params) + { +@@ -499,7 +546,6 @@ void ip_tunnel_xmit(struct sk_buff *skb, + struct net_device *tdev; /* Device to other host */ + unsigned int max_headroom; /* The extra header space needed */ + __be32 dst; +- int mtu; + + inner_iph = (const struct iphdr *)skb_inner_network_header(skb); + +@@ -579,50 +625,11 @@ void ip_tunnel_xmit(struct sk_buff *skb, + goto tx_error; + } + +- df = tnl_params->frag_off; +- +- if (df) +- mtu = dst_mtu(&rt->dst) - dev->hard_header_len +- - sizeof(struct iphdr); +- else +- mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu; +- +- if (skb_dst(skb)) +- skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); +- +- if (skb->protocol == htons(ETH_P_IP)) { +- df |= (inner_iph->frag_off&htons(IP_DF)); +- +- if (!skb_is_gso(skb) && +- (inner_iph->frag_off&htons(IP_DF)) && +- mtu < ntohs(inner_iph->tot_len)) { +- icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); +- ip_rt_put(rt); +- goto tx_error; +- } +- } +-#if IS_ENABLED(CONFIG_IPV6) +- else if (skb->protocol == htons(ETH_P_IPV6)) { +- struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb); +- +- if (rt6 && mtu < dst_mtu(skb_dst(skb)) && +- mtu >= IPV6_MIN_MTU) { +- if ((tunnel->parms.iph.daddr && +- !ipv4_is_multicast(tunnel->parms.iph.daddr)) || +- rt6->rt6i_dst.plen == 128) { +- rt6->rt6i_flags |= RTF_MODIFIED; +- dst_metric_set(skb_dst(skb), RTAX_MTU, mtu); +- } +- } + +- if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU && +- mtu < skb->len) { +- icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); +- ip_rt_put(rt); +- goto tx_error; +- } ++ if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off)) { ++ ip_rt_put(rt); ++ goto tx_error; + } +-#endif + + if (tunnel->err_count > 0) { + if (time_before(jiffies, +@@ -646,6 +653,10 @@ void ip_tunnel_xmit(struct sk_buff *skb, + ttl = ip4_dst_hoplimit(&rt->dst); + } + ++ df = tnl_params->frag_off; ++ if (skb->protocol == htons(ETH_P_IP)) ++ df |= (inner_iph->frag_off&htons(IP_DF)); ++ + max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr) + + rt->dst.header_len; + if (max_headroom > dev->needed_headroom) { diff --git a/queue-3.10/ipip-fix-a-regression-in-ioctl.patch b/queue-3.10/ipip-fix-a-regression-in-ioctl.patch new file mode 100644 index 00000000000..93ccf58bb42 --- /dev/null +++ b/queue-3.10/ipip-fix-a-regression-in-ioctl.patch @@ -0,0 +1,51 @@ +From 675b9402488074d7081811cb67055fb1e1f515b3 Mon Sep 17 00:00:00 2001 +From: Cong Wang +Date: Tue, 2 Jul 2013 14:49:34 +0800 +Subject: ipip: fix a regression in ioctl + +From: Cong Wang + 
+[ Upstream commit 3b7b514f44bff05d26a6499c4d4fac2a83938e6e ] + +This is a regression introduced by +commit fd58156e456d9f68fe0448 (IPIP: Use ip-tunneling code.) + +Similar to GRE tunnel, previously we only check the parameters +for SIOCADDTUNNEL and SIOCCHGTUNNEL, after that commit, the +check is moved for all commands. + +So, just check for SIOCADDTUNNEL and SIOCCHGTUNNEL. + +Also, the check for i_key, o_key etc. is suspicious too, +which did not exist before, reset them before passing +to ip_tunnel_ioctl(). + +Signed-off-by: Cong Wang +Cc: Pravin B Shelar +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/ipip.c | 12 +++++++----- + 1 file changed, 7 insertions(+), 5 deletions(-) + +--- a/net/ipv4/ipip.c ++++ b/net/ipv4/ipip.c +@@ -240,11 +240,13 @@ ipip_tunnel_ioctl(struct net_device *dev + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + return -EFAULT; + +- if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPIP || +- p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF))) +- return -EINVAL; +- if (p.i_key || p.o_key || p.i_flags || p.o_flags) +- return -EINVAL; ++ if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) { ++ if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPIP || ++ p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF))) ++ return -EINVAL; ++ } ++ ++ p.i_key = p.o_key = p.i_flags = p.o_flags = 0; + if (p.iph.ttl) + p.iph.frag_off |= htons(IP_DF); + diff --git a/queue-3.10/ipv4-set-transport-header-earlier.patch b/queue-3.10/ipv4-set-transport-header-earlier.patch new file mode 100644 index 00000000000..b198cc3a0a9 --- /dev/null +++ b/queue-3.10/ipv4-set-transport-header-earlier.patch @@ -0,0 +1,59 @@ +From 4ff552ad9b0463045a9211c5548288fa70649474 Mon Sep 17 00:00:00 2001 +From: Eric Dumazet +Date: Mon, 15 Jul 2013 20:03:19 -0700 +Subject: ipv4: set transport header earlier + +From: Eric Dumazet + +[ Upstream commit 21d1196a35f5686c4323e42a62fdb4b23b0ab4a3 ] + +commit 45f00f99d6e ("ipv4: tcp: clean up tcp_v4_early_demux()") added a +performance regression for non GRO traffic, basically disabling +IP early demux. + +IPv6 stack resets transport header in ip6_rcv() before calling +IP early demux in ip6_rcv_finish(), while IPv4 does this only in +ip_local_deliver_finish(), _after_ IP early demux. + +GRO traffic happened to enable IP early demux because transport header +is also set in inet_gro_receive() + +Instead of reverting the faulty commit, we can make IPv4/IPv6 behave the +same : transport_header should be set in ip_rcv() instead of +ip_local_deliver_finish() + +ip_local_deliver_finish() can also use skb_network_header_len() which is +faster than ip_hdrlen() + +Signed-off-by: Eric Dumazet +Cc: Neal Cardwell +Cc: Tom Herbert +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/ip_input.c | 7 +++---- + 1 file changed, 3 insertions(+), 4 deletions(-) + +--- a/net/ipv4/ip_input.c ++++ b/net/ipv4/ip_input.c +@@ -190,10 +190,7 @@ static int ip_local_deliver_finish(struc + { + struct net *net = dev_net(skb->dev); + +- __skb_pull(skb, ip_hdrlen(skb)); +- +- /* Point into the IP datagram, just past the header. 
*/ +- skb_reset_transport_header(skb); ++ __skb_pull(skb, skb_network_header_len(skb)); + + rcu_read_lock(); + { +@@ -437,6 +434,8 @@ int ip_rcv(struct sk_buff *skb, struct n + goto drop; + } + ++ skb->transport_header = skb->network_header + iph->ihl*4; ++ + /* Remove any debris in the socket control block */ + memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); + diff --git a/queue-3.10/ipv6-call-udp_push_pending_frames-when-uncorking-a-socket-with-af_inet-pending-data.patch b/queue-3.10/ipv6-call-udp_push_pending_frames-when-uncorking-a-socket-with-af_inet-pending-data.patch new file mode 100644 index 00000000000..76e3a0b70d7 --- /dev/null +++ b/queue-3.10/ipv6-call-udp_push_pending_frames-when-uncorking-a-socket-with-af_inet-pending-data.patch @@ -0,0 +1,122 @@ +From 0e3f585c132e7716b8b96c20c59b15a24ec2790e Mon Sep 17 00:00:00 2001 +From: Hannes Frederic Sowa +Date: Mon, 1 Jul 2013 20:21:30 +0200 +Subject: ipv6: call udp_push_pending_frames when uncorking a socket with AF_INET pending data + +From: Hannes Frederic Sowa + +[ Upstream commit 8822b64a0fa64a5dd1dfcf837c5b0be83f8c05d1 ] + +We accidentally call down to ip6_push_pending_frames when uncorking +pending AF_INET data on a ipv6 socket. This results in the following +splat (from Dave Jones): + +skbuff: skb_under_panic: text:ffffffff816765f6 len:48 put:40 head:ffff88013deb6df0 data:ffff88013deb6dec tail:0x2c end:0xc0 dev: +------------[ cut here ]------------ +kernel BUG at net/core/skbuff.c:126! +invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC +Modules linked in: dccp_ipv4 dccp 8021q garp bridge stp dlci mpoa snd_seq_dummy sctp fuse hidp tun bnep nfnetlink scsi_transport_iscsi rfcomm can_raw can_bcm af_802154 appletalk caif_socket can caif ipt_ULOG x25 rose af_key pppoe pppox ipx phonet irda llc2 ppp_generic slhc p8023 psnap p8022 llc crc_ccitt atm bluetooth ++netrom ax25 nfc rfkill rds af_rxrpc coretemp hwmon kvm_intel kvm crc32c_intel snd_hda_codec_realtek ghash_clmulni_intel microcode pcspkr snd_hda_codec_hdmi snd_hda_intel snd_hda_codec snd_hwdep usb_debug snd_seq snd_seq_device snd_pcm e1000e snd_page_alloc snd_timer ptp snd pps_core soundcore xfs libcrc32c +CPU: 2 PID: 8095 Comm: trinity-child2 Not tainted 3.10.0-rc7+ #37 +task: ffff8801f52c2520 ti: ffff8801e6430000 task.ti: ffff8801e6430000 +RIP: 0010:[] [] skb_panic+0x63/0x65 +RSP: 0018:ffff8801e6431de8 EFLAGS: 00010282 +RAX: 0000000000000086 RBX: ffff8802353d3cc0 RCX: 0000000000000006 +RDX: 0000000000003b90 RSI: ffff8801f52c2ca0 RDI: ffff8801f52c2520 +RBP: ffff8801e6431e08 R08: 0000000000000000 R09: 0000000000000000 +R10: 0000000000000001 R11: 0000000000000001 R12: ffff88022ea0c800 +R13: ffff88022ea0cdf8 R14: ffff8802353ecb40 R15: ffffffff81cc7800 +FS: 00007f5720a10740(0000) GS:ffff880244c00000(0000) knlGS:0000000000000000 +CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 +CR2: 0000000005862000 CR3: 000000022843c000 CR4: 00000000001407e0 +DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 +DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000600 +Stack: + ffff88013deb6dec 000000000000002c 00000000000000c0 ffffffff81a3f6e4 + ffff8801e6431e18 ffffffff8159a9aa ffff8801e6431e90 ffffffff816765f6 + ffffffff810b756b 0000000700000002 ffff8801e6431e40 0000fea9292aa8c0 +Call Trace: + [] skb_push+0x3a/0x40 + [] ip6_push_pending_frames+0x1f6/0x4d0 + [] ? mark_held_locks+0xbb/0x140 + [] udp_v6_push_pending_frames+0x2b9/0x3d0 + [] ? udplite_getfrag+0x20/0x20 + [] udp_lib_setsockopt+0x1aa/0x1f0 + [] ? 
fget_light+0x387/0x4f0 + [] udpv6_setsockopt+0x34/0x40 + [] sock_common_setsockopt+0x14/0x20 + [] SyS_setsockopt+0x71/0xd0 + [] tracesys+0xdd/0xe2 +Code: 00 00 48 89 44 24 10 8b 87 d8 00 00 00 48 89 44 24 08 48 8b 87 e8 00 00 00 48 c7 c7 c0 04 aa 81 48 89 04 24 31 c0 e8 e1 7e ff ff <0f> 0b 55 48 89 e5 0f 0b 55 48 89 e5 0f 0b 55 48 89 e5 0f 0b 55 +RIP [] skb_panic+0x63/0x65 + RSP + +This patch adds a check if the pending data is of address family AF_INET +and directly calls udp_push_ending_frames from udp_v6_push_pending_frames +if that is the case. + +This bug was found by Dave Jones with trinity. + +(Also move the initialization of fl6 below the AF_INET check, even if +not strictly necessary.) + +Signed-off-by: Hannes Frederic Sowa +Cc: Dave Jones +Cc: YOSHIFUJI Hideaki +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + include/net/udp.h | 1 + + net/ipv4/udp.c | 3 ++- + net/ipv6/udp.c | 7 ++++++- + 3 files changed, 9 insertions(+), 2 deletions(-) + +--- a/include/net/udp.h ++++ b/include/net/udp.h +@@ -181,6 +181,7 @@ extern int udp_get_port(struct sock *sk, + extern void udp_err(struct sk_buff *, u32); + extern int udp_sendmsg(struct kiocb *iocb, struct sock *sk, + struct msghdr *msg, size_t len); ++extern int udp_push_pending_frames(struct sock *sk); + extern void udp_flush_pending_frames(struct sock *sk); + extern int udp_rcv(struct sk_buff *skb); + extern int udp_ioctl(struct sock *sk, int cmd, unsigned long arg); +--- a/net/ipv4/udp.c ++++ b/net/ipv4/udp.c +@@ -799,7 +799,7 @@ send: + /* + * Push out all pending data as one UDP datagram. Socket is locked. + */ +-static int udp_push_pending_frames(struct sock *sk) ++int udp_push_pending_frames(struct sock *sk) + { + struct udp_sock *up = udp_sk(sk); + struct inet_sock *inet = inet_sk(sk); +@@ -818,6 +818,7 @@ out: + up->pending = 0; + return err; + } ++EXPORT_SYMBOL(udp_push_pending_frames); + + int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, + size_t len) +--- a/net/ipv6/udp.c ++++ b/net/ipv6/udp.c +@@ -955,11 +955,16 @@ static int udp_v6_push_pending_frames(st + struct udphdr *uh; + struct udp_sock *up = udp_sk(sk); + struct inet_sock *inet = inet_sk(sk); +- struct flowi6 *fl6 = &inet->cork.fl.u.ip6; ++ struct flowi6 *fl6; + int err = 0; + int is_udplite = IS_UDPLITE(sk); + __wsum csum = 0; + ++ if (up->pending == AF_INET) ++ return udp_push_pending_frames(sk); ++ ++ fl6 = &inet->cork.fl.u.ip6; ++ + /* Grab the skbuff where UDP header space exists. */ + if ((skb = skb_peek(&sk->sk_write_queue)) == NULL) + goto out; diff --git a/queue-3.10/ipv6-fix-route-selection-if-kernel-is-not-compiled-with-config_ipv6_router_pref.patch b/queue-3.10/ipv6-fix-route-selection-if-kernel-is-not-compiled-with-config_ipv6_router_pref.patch new file mode 100644 index 00000000000..2f8154a98e4 --- /dev/null +++ b/queue-3.10/ipv6-fix-route-selection-if-kernel-is-not-compiled-with-config_ipv6_router_pref.patch @@ -0,0 +1,189 @@ +From 1ea4568e699d6f1a231c14d5f084b4eb97298b7b Mon Sep 17 00:00:00 2001 +From: Hannes Frederic Sowa +Date: Thu, 11 Jul 2013 12:43:42 +0200 +Subject: ipv6: fix route selection if kernel is not compiled with CONFIG_IPV6_ROUTER_PREF + +From: Hannes Frederic Sowa + +[ Upstream commit afc154e978de1eb11c555bc8bcec1552f75ebc43 ] + +This is a follow-up patch to 3630d40067a21d4dfbadc6002bb469ce26ac5d52 +("ipv6: rt6_check_neigh should successfully verify neigh if no NUD +information are available"). + +Since the removal of rt->n in rt6_info we can end up with a dst == +NULL in rt6_check_neigh. 
In case the kernel is not compiled with +CONFIG_IPV6_ROUTER_PREF we should also select a route with unkown +NUD state but we must not avoid doing round robin selection on routes +with the same target. So introduce and pass down a boolean ``do_rr'' to +indicate when we should update rt->rr_ptr. As soon as no route is valid +we do backtracking and do a lookup on a higher level in the fib trie. + +v2: +a) Improved rt6_check_neigh logic (no need to create neighbour there) + and documented return values. + +v3: +a) Introduce enum rt6_nud_state to get rid of the magic numbers + (thanks to David Miller). +b) Update and shorten commit message a bit to actualy reflect + the source. + +Reported-by: Pierre Emeriaud +Cc: YOSHIFUJI Hideaki +Signed-off-by: Hannes Frederic Sowa +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv6/route.c | 63 ++++++++++++++++++++++++++++++++++--------------------- + 1 file changed, 40 insertions(+), 23 deletions(-) + +--- a/net/ipv6/route.c ++++ b/net/ipv6/route.c +@@ -65,6 +65,12 @@ + #include + #endif + ++enum rt6_nud_state { ++ RT6_NUD_FAIL_HARD = -2, ++ RT6_NUD_FAIL_SOFT = -1, ++ RT6_NUD_SUCCEED = 1 ++}; ++ + static struct rt6_info *ip6_rt_copy(struct rt6_info *ort, + const struct in6_addr *dest); + static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie); +@@ -527,28 +533,29 @@ static inline int rt6_check_dev(struct r + return 0; + } + +-static inline bool rt6_check_neigh(struct rt6_info *rt) ++static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt) + { + struct neighbour *neigh; +- bool ret = false; ++ enum rt6_nud_state ret = RT6_NUD_FAIL_HARD; + + if (rt->rt6i_flags & RTF_NONEXTHOP || + !(rt->rt6i_flags & RTF_GATEWAY)) +- return true; ++ return RT6_NUD_SUCCEED; + + rcu_read_lock_bh(); + neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway); + if (neigh) { + read_lock(&neigh->lock); + if (neigh->nud_state & NUD_VALID) +- ret = true; ++ ret = RT6_NUD_SUCCEED; + #ifdef CONFIG_IPV6_ROUTER_PREF + else if (!(neigh->nud_state & NUD_FAILED)) +- ret = true; ++ ret = RT6_NUD_SUCCEED; + #endif + read_unlock(&neigh->lock); +- } else if (IS_ENABLED(CONFIG_IPV6_ROUTER_PREF)) { +- ret = true; ++ } else { ++ ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ? 
++ RT6_NUD_SUCCEED : RT6_NUD_FAIL_SOFT; + } + rcu_read_unlock_bh(); + +@@ -562,43 +569,52 @@ static int rt6_score_route(struct rt6_in + + m = rt6_check_dev(rt, oif); + if (!m && (strict & RT6_LOOKUP_F_IFACE)) +- return -1; ++ return RT6_NUD_FAIL_HARD; + #ifdef CONFIG_IPV6_ROUTER_PREF + m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2; + #endif +- if (!rt6_check_neigh(rt) && (strict & RT6_LOOKUP_F_REACHABLE)) +- return -1; ++ if (strict & RT6_LOOKUP_F_REACHABLE) { ++ int n = rt6_check_neigh(rt); ++ if (n < 0) ++ return n; ++ } + return m; + } + + static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict, +- int *mpri, struct rt6_info *match) ++ int *mpri, struct rt6_info *match, ++ bool *do_rr) + { + int m; ++ bool match_do_rr = false; + + if (rt6_check_expired(rt)) + goto out; + + m = rt6_score_route(rt, oif, strict); +- if (m < 0) ++ if (m == RT6_NUD_FAIL_SOFT && !IS_ENABLED(CONFIG_IPV6_ROUTER_PREF)) { ++ match_do_rr = true; ++ m = 0; /* lowest valid score */ ++ } else if (m < 0) { + goto out; ++ } ++ ++ if (strict & RT6_LOOKUP_F_REACHABLE) ++ rt6_probe(rt); + + if (m > *mpri) { +- if (strict & RT6_LOOKUP_F_REACHABLE) +- rt6_probe(match); ++ *do_rr = match_do_rr; + *mpri = m; + match = rt; +- } else if (strict & RT6_LOOKUP_F_REACHABLE) { +- rt6_probe(rt); + } +- + out: + return match; + } + + static struct rt6_info *find_rr_leaf(struct fib6_node *fn, + struct rt6_info *rr_head, +- u32 metric, int oif, int strict) ++ u32 metric, int oif, int strict, ++ bool *do_rr) + { + struct rt6_info *rt, *match; + int mpri = -1; +@@ -606,10 +622,10 @@ static struct rt6_info *find_rr_leaf(str + match = NULL; + for (rt = rr_head; rt && rt->rt6i_metric == metric; + rt = rt->dst.rt6_next) +- match = find_match(rt, oif, strict, &mpri, match); ++ match = find_match(rt, oif, strict, &mpri, match, do_rr); + for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric; + rt = rt->dst.rt6_next) +- match = find_match(rt, oif, strict, &mpri, match); ++ match = find_match(rt, oif, strict, &mpri, match, do_rr); + + return match; + } +@@ -618,15 +634,16 @@ static struct rt6_info *rt6_select(struc + { + struct rt6_info *match, *rt0; + struct net *net; ++ bool do_rr = false; + + rt0 = fn->rr_ptr; + if (!rt0) + fn->rr_ptr = rt0 = fn->leaf; + +- match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict); ++ match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict, ++ &do_rr); + +- if (!match && +- (strict & RT6_LOOKUP_F_REACHABLE)) { ++ if (do_rr) { + struct rt6_info *next = rt0->dst.rt6_next; + + /* no entries matched; do round-robin */ diff --git a/queue-3.10/ipv6-in-case-of-link-failure-remove-route-directly-instead-of-letting-it-expire.patch b/queue-3.10/ipv6-in-case-of-link-failure-remove-route-directly-instead-of-letting-it-expire.patch new file mode 100644 index 00000000000..ccc64df53ce --- /dev/null +++ b/queue-3.10/ipv6-in-case-of-link-failure-remove-route-directly-instead-of-letting-it-expire.patch @@ -0,0 +1,97 @@ +From 3e86a493305637e79d72541f571ec4f852ef2024 Mon Sep 17 00:00:00 2001 +From: Hannes Frederic Sowa +Date: Wed, 10 Jul 2013 23:00:57 +0200 +Subject: ipv6: in case of link failure remove route directly instead of letting it expire + +From: Hannes Frederic Sowa + +[ Upstream commit 1eb4f758286884e7566627164bca4c4a16952a83 ] + +We could end up expiring a route which is part of an ecmp route set. 
Doing +so would invalidate the rt->rt6i_nsiblings calculations and could provoke +the following panic: + +[ 80.144667] ------------[ cut here ]------------ +[ 80.145172] kernel BUG at net/ipv6/ip6_fib.c:733! +[ 80.145172] invalid opcode: 0000 [#1] SMP +[ 80.145172] Modules linked in: 8021q nf_conntrack_netbios_ns nf_conntrack_broadcast ipt_MASQUERADE ip6table_mangle ip6t_REJECT nf_conntrack_ipv6 nf_defrag_ipv6 iptable_nat nf_nat_ipv4 nf_nat iptable_mangle nf_conntrack_ipv4 nf_defrag_ipv4 xt_conntrack nf_conntrack ebtable_filter ebtables ip6table_filter ip6_tables ++snd_hda_intel snd_hda_codec snd_hwdep snd_seq snd_seq_device snd_pcm snd_page_alloc snd_timer virtio_balloon snd soundcore i2c_piix4 i2c_core virtio_net virtio_blk +[ 80.145172] CPU: 1 PID: 786 Comm: ping6 Not tainted 3.10.0+ #118 +[ 80.145172] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 +[ 80.145172] task: ffff880117fa0000 ti: ffff880118770000 task.ti: ffff880118770000 +[ 80.145172] RIP: 0010:[] [] fib6_add+0x75d/0x830 +[ 80.145172] RSP: 0018:ffff880118771798 EFLAGS: 00010202 +[ 80.145172] RAX: 0000000000000000 RBX: 0000000000000000 RCX: ffff88011350e480 +[ 80.145172] RDX: ffff88011350e238 RSI: 0000000000000004 RDI: ffff88011350f738 +[ 80.145172] RBP: ffff880118771848 R08: ffff880117903280 R09: 0000000000000001 +[ 80.145172] R10: 0000000000000000 R11: 0000000000000000 R12: ffff88011350f680 +[ 80.145172] R13: ffff880117903280 R14: ffff880118771890 R15: ffff88011350ef90 +[ 80.145172] FS: 00007f02b5127740(0000) GS:ffff88011fd00000(0000) knlGS:0000000000000000 +[ 80.145172] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b +[ 80.145172] CR2: 00007f981322a000 CR3: 00000001181b1000 CR4: 00000000000006e0 +[ 80.145172] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 +[ 80.145172] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 +[ 80.145172] Stack: +[ 80.145172] 0000000000000001 ffff880100000000 ffff880100000000 ffff880117903280 +[ 80.145172] 0000000000000000 ffff880119a4cf00 0000000000000400 00000000000007fa +[ 80.145172] 0000000000000000 0000000000000000 0000000000000000 ffff88011350f680 +[ 80.145172] Call Trace: +[ 80.145172] [] ? rt6_bind_peer+0x4b/0x90 +[ 80.145172] [] __ip6_ins_rt+0x45/0x70 +[ 80.145172] [] ip6_ins_rt+0x35/0x40 +[ 80.145172] [] ip6_pol_route.isra.44+0x3a4/0x4b0 +[ 80.145172] [] ip6_pol_route_output+0x2a/0x30 +[ 80.145172] [] fib6_rule_action+0xd7/0x210 +[ 80.145172] [] ? ip6_pol_route_input+0x30/0x30 +[ 80.145172] [] fib_rules_lookup+0xc6/0x140 +[ 80.145172] [] fib6_rule_lookup+0x44/0x80 +[ 80.145172] [] ? ip6_pol_route_input+0x30/0x30 +[ 80.145172] [] ip6_route_output+0x73/0xb0 +[ 80.145172] [] ip6_dst_lookup_tail+0x2c3/0x2e0 +[ 80.145172] [] ? list_del+0x11/0x40 +[ 80.145172] [] ? remove_wait_queue+0x3c/0x50 +[ 80.145172] [] ip6_dst_lookup_flow+0x3d/0xa0 +[ 80.145172] [] rawv6_sendmsg+0x267/0xc20 +[ 80.145172] [] inet_sendmsg+0x63/0xb0 +[ 80.145172] [] ? selinux_socket_sendmsg+0x23/0x30 +[ 80.145172] [] sock_sendmsg+0xa6/0xd0 +[ 80.145172] [] SYSC_sendto+0x128/0x180 +[ 80.145172] [] ? update_curr+0xec/0x170 +[ 80.145172] [] ? kvm_clock_get_cycles+0x9/0x10 +[ 80.145172] [] ? 
__getnstimeofday+0x3e/0xd0 +[ 80.145172] [] SyS_sendto+0xe/0x10 +[ 80.145172] [] system_call_fastpath+0x16/0x1b +[ 80.145172] Code: fe ff ff 41 f6 45 2a 06 0f 85 ca fe ff ff 49 8b 7e 08 4c 89 ee e8 94 ef ff ff e9 b9 fe ff ff 48 8b 82 28 05 00 00 e9 01 ff ff ff <0f> 0b 49 8b 54 24 30 0d 00 00 40 00 89 83 14 01 00 00 48 89 53 +[ 80.145172] RIP [] fib6_add+0x75d/0x830 +[ 80.145172] RSP +[ 80.387413] ---[ end trace 02f20b7a8b81ed95 ]--- +[ 80.390154] Kernel panic - not syncing: Fatal exception in interrupt + +Signed-off-by: Hannes Frederic Sowa +Cc: Nicolas Dichtel +Cc: YOSHIFUJI Hideaki +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv6/route.c | 9 ++++++--- + 1 file changed, 6 insertions(+), 3 deletions(-) + +--- a/net/ipv6/route.c ++++ b/net/ipv6/route.c +@@ -1076,10 +1076,13 @@ static void ip6_link_failure(struct sk_b + + rt = (struct rt6_info *) skb_dst(skb); + if (rt) { +- if (rt->rt6i_flags & RTF_CACHE) +- rt6_update_expires(rt, 0); +- else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) ++ if (rt->rt6i_flags & RTF_CACHE) { ++ dst_hold(&rt->dst); ++ if (ip6_del_rt(rt)) ++ dst_free(&rt->dst); ++ } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) { + rt->rt6i_node->fn_sernum = -1; ++ } + } + } + diff --git a/queue-3.10/ipv6-ip6_append_data_mtu-did-not-care-about-pmtudisc-and-frag_size.patch b/queue-3.10/ipv6-ip6_append_data_mtu-did-not-care-about-pmtudisc-and-frag_size.patch new file mode 100644 index 00000000000..94c4e75f756 --- /dev/null +++ b/queue-3.10/ipv6-ip6_append_data_mtu-did-not-care-about-pmtudisc-and-frag_size.patch @@ -0,0 +1,135 @@ +From 1fcbda94eb3ababc95eff46548962ceb14de638e Mon Sep 17 00:00:00 2001 +From: Hannes Frederic Sowa +Date: Tue, 2 Jul 2013 08:04:05 +0200 +Subject: ipv6: ip6_append_data_mtu did not care about pmtudisc and frag_size + +From: Hannes Frederic Sowa + +[ Upstream commit 75a493e60ac4bbe2e977e7129d6d8cbb0dd236be ] + +If the socket had an IPV6_MTU value set, ip6_append_data_mtu lost track +of this when appending the second frame on a corked socket. This results +in the following splat: + +[37598.993962] ------------[ cut here ]------------ +[37598.994008] kernel BUG at net/core/skbuff.c:2064! 
+[37598.994008] invalid opcode: 0000 [#1] SMP +[37598.994008] Modules linked in: tcp_lp uvcvideo videobuf2_vmalloc videobuf2_memops videobuf2_core videodev media vfat fat usb_storage fuse ebtable_nat xt_CHECKSUM bridge stp llc ipt_MASQUERADE nf_conntrack_netbios_ns nf_conntrack_broadcast ip6table_mangle ip6t_REJECT nf_conntrack_ipv6 nf_defrag_ipv6 iptable_nat ++nf_nat_ipv4 nf_nat iptable_mangle nf_conntrack_ipv4 nf_defrag_ipv4 xt_conntrack nf_conntrack ebtable_filter ebtables ip6table_filter ip6_tables be2iscsi iscsi_boot_sysfs bnx2i cnic uio cxgb4i cxgb4 cxgb3i cxgb3 mdio libcxgbi ib_iser rdma_cm ib_addr iw_cm ib_cm ib_sa ib_mad ib_core iscsi_tcp libiscsi_tcp libiscsi ++scsi_transport_iscsi rfcomm bnep iTCO_wdt iTCO_vendor_support snd_hda_codec_conexant arc4 iwldvm mac80211 snd_hda_intel acpi_cpufreq mperf coretemp snd_hda_codec microcode cdc_wdm cdc_acm +[37598.994008] snd_hwdep cdc_ether snd_seq snd_seq_device usbnet mii joydev btusb snd_pcm bluetooth i2c_i801 e1000e lpc_ich mfd_core ptp iwlwifi pps_core snd_page_alloc mei cfg80211 snd_timer thinkpad_acpi snd tpm_tis soundcore rfkill tpm tpm_bios vhost_net tun macvtap macvlan kvm_intel kvm uinput binfmt_misc ++dm_crypt i915 i2c_algo_bit drm_kms_helper drm i2c_core wmi video +[37598.994008] CPU 0 +[37598.994008] Pid: 27320, comm: t2 Not tainted 3.9.6-200.fc18.x86_64 #1 LENOVO 27744PG/27744PG +[37598.994008] RIP: 0010:[] [] skb_copy_and_csum_bits+0x325/0x330 +[37598.994008] RSP: 0018:ffff88003670da18 EFLAGS: 00010202 +[37598.994008] RAX: ffff88018105c018 RBX: 0000000000000004 RCX: 00000000000006c0 +[37598.994008] RDX: ffff88018105a6c0 RSI: ffff88018105a000 RDI: ffff8801e1b0aa00 +[37598.994008] RBP: ffff88003670da78 R08: 0000000000000000 R09: ffff88018105c040 +[37598.994008] R10: ffff8801e1b0aa00 R11: 0000000000000000 R12: 000000000000fff8 +[37598.994008] R13: 00000000000004fc R14: 00000000ffff0504 R15: 0000000000000000 +[37598.994008] FS: 00007f28eea59740(0000) GS:ffff88023bc00000(0000) knlGS:0000000000000000 +[37598.994008] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b +[37598.994008] CR2: 0000003d935789e0 CR3: 00000000365cb000 CR4: 00000000000407f0 +[37598.994008] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 +[37598.994008] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 +[37598.994008] Process t2 (pid: 27320, threadinfo ffff88003670c000, task ffff88022c162ee0) +[37598.994008] Stack: +[37598.994008] ffff88022e098a00 ffff88020f973fc0 0000000000000008 00000000000004c8 +[37598.994008] ffff88020f973fc0 00000000000004c4 ffff88003670da78 ffff8801e1b0a200 +[37598.994008] 0000000000000018 00000000000004c8 ffff88020f973fc0 00000000000004c4 +[37598.994008] Call Trace: +[37598.994008] [] ip6_append_data+0xccf/0xfe0 +[37598.994008] [] ? ip_copy_metadata+0x1a0/0x1a0 +[37598.994008] [] ? _raw_spin_lock_bh+0x16/0x40 +[37598.994008] [] udpv6_sendmsg+0x1ed/0xc10 +[37598.994008] [] ? sock_has_perm+0x75/0x90 +[37598.994008] [] inet_sendmsg+0x63/0xb0 +[37598.994008] [] ? selinux_socket_sendmsg+0x23/0x30 +[37598.994008] [] sock_sendmsg+0xb0/0xe0 +[37598.994008] [] ? __switch_to+0x181/0x4a0 +[37598.994008] [] sys_sendto+0x12d/0x180 +[37598.994008] [] ? __audit_syscall_entry+0x94/0xf0 +[37598.994008] [] ? 
syscall_trace_enter+0x231/0x240 +[37598.994008] [] tracesys+0xdd/0xe2 +[37598.994008] Code: fe 07 00 00 48 c7 c7 04 28 a6 81 89 45 a0 4c 89 4d b8 44 89 5d a8 e8 1b ac b1 ff 44 8b 5d a8 4c 8b 4d b8 8b 45 a0 e9 cf fe ff ff <0f> 0b 66 0f 1f 84 00 00 00 00 00 66 66 66 66 90 55 48 89 e5 48 +[37598.994008] RIP [] skb_copy_and_csum_bits+0x325/0x330 +[37598.994008] RSP +[37599.007323] ---[ end trace d69f6a17f8ac8eee ]--- + +While there, also check if path mtu discovery is activated for this +socket. The logic was adapted from ip6_append_data when first writing +on the corked socket. + +This bug was introduced with commit +0c1833797a5a6ec23ea9261d979aa18078720b74 ("ipv6: fix incorrect ipsec +fragment"). + +v2: +a) Replace IPV6_PMTU_DISC_DO with IPV6_PMTUDISC_PROBE. +b) Don't pass ipv6_pinfo to ip6_append_data_mtu (suggestion by Gao + feng, thanks!). +c) Change mtu to unsigned int, else we get a warning about + non-matching types because of the min()-macro type-check. + +Acked-by: Gao feng +Cc: YOSHIFUJI Hideaki +Signed-off-by: Hannes Frederic Sowa +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv6/ip6_output.c | 16 ++++++++++------ + 1 file changed, 10 insertions(+), 6 deletions(-) + +--- a/net/ipv6/ip6_output.c ++++ b/net/ipv6/ip6_output.c +@@ -1098,11 +1098,12 @@ static inline struct ipv6_rt_hdr *ip6_rt + return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL; + } + +-static void ip6_append_data_mtu(int *mtu, ++static void ip6_append_data_mtu(unsigned int *mtu, + int *maxfraglen, + unsigned int fragheaderlen, + struct sk_buff *skb, +- struct rt6_info *rt) ++ struct rt6_info *rt, ++ bool pmtuprobe) + { + if (!(rt->dst.flags & DST_XFRM_TUNNEL)) { + if (skb == NULL) { +@@ -1114,7 +1115,9 @@ static void ip6_append_data_mtu(int *mtu + * this fragment is not first, the headers + * space is regarded as data space. + */ +- *mtu = dst_mtu(rt->dst.path); ++ *mtu = min(*mtu, pmtuprobe ? ++ rt->dst.dev->mtu : ++ dst_mtu(rt->dst.path)); + } + *maxfraglen = ((*mtu - fragheaderlen) & ~7) + + fragheaderlen - sizeof(struct frag_hdr); +@@ -1131,11 +1134,10 @@ int ip6_append_data(struct sock *sk, int + struct ipv6_pinfo *np = inet6_sk(sk); + struct inet_cork *cork; + struct sk_buff *skb, *skb_prev = NULL; +- unsigned int maxfraglen, fragheaderlen; ++ unsigned int maxfraglen, fragheaderlen, mtu; + int exthdrlen; + int dst_exthdrlen; + int hh_len; +- int mtu; + int copy; + int err; + int offset = 0; +@@ -1292,7 +1294,9 @@ alloc_new_skb: + /* update mtu and maxfraglen if necessary */ + if (skb == NULL || skb_prev == NULL) + ip6_append_data_mtu(&mtu, &maxfraglen, +- fragheaderlen, skb, rt); ++ fragheaderlen, skb, rt, ++ np->pmtudisc == ++ IPV6_PMTUDISC_PROBE); + + skb_prev = skb; + diff --git a/queue-3.10/ipv6-mcast-always-hold-idev-lock-before-mca_lock.patch b/queue-3.10/ipv6-mcast-always-hold-idev-lock-before-mca_lock.patch new file mode 100644 index 00000000000..9ec7e04c827 --- /dev/null +++ b/queue-3.10/ipv6-mcast-always-hold-idev-lock-before-mca_lock.patch @@ -0,0 +1,239 @@ +From 5be3a4ef6d4ada70eee9dddf402f09d5771f071b Mon Sep 17 00:00:00 2001 +From: Amerigo Wang +Date: Sat, 29 Jun 2013 21:30:49 +0800 +Subject: ipv6,mcast: always hold idev->lock before mca_lock + +From: Amerigo Wang + +[ Upstream commit 8965779d2c0e6ab246c82a405236b1fb2adae6b2, with + some bits from commit b7b1bfce0bb68bd8f6e62a28295922785cc63781 + ("ipv6: split duplicate address detection and router solicitation timer") + to get the __ipv6_get_lladdr() used by this patch. 
] + +dingtianhong reported the following deadlock detected by lockdep: + + ====================================================== + [ INFO: possible circular locking dependency detected ] + 3.4.24.05-0.1-default #1 Not tainted + ------------------------------------------------------- + ksoftirqd/0/3 is trying to acquire lock: + (&ndev->lock){+.+...}, at: [] ipv6_get_lladdr+0x74/0x120 + + but task is already holding lock: + (&mc->mca_lock){+.+...}, at: [] mld_send_report+0x40/0x150 + + which lock already depends on the new lock. + + the existing dependency chain (in reverse order) is: + + -> #1 (&mc->mca_lock){+.+...}: + [] validate_chain+0x637/0x730 + [] __lock_acquire+0x2f7/0x500 + [] lock_acquire+0x114/0x150 + [] rt_spin_lock+0x4a/0x60 + [] igmp6_group_added+0x3b/0x120 + [] ipv6_mc_up+0x38/0x60 + [] ipv6_find_idev+0x3d/0x80 + [] addrconf_notify+0x3d5/0x4b0 + [] notifier_call_chain+0x3f/0x80 + [] raw_notifier_call_chain+0x11/0x20 + [] call_netdevice_notifiers+0x32/0x60 + [] __dev_notify_flags+0x34/0x80 + [] dev_change_flags+0x40/0x70 + [] do_setlink+0x237/0x8a0 + [] rtnl_newlink+0x3ec/0x600 + [] rtnetlink_rcv_msg+0x160/0x310 + [] netlink_rcv_skb+0x89/0xb0 + [] rtnetlink_rcv+0x27/0x40 + [] netlink_unicast+0x140/0x180 + [] netlink_sendmsg+0x33e/0x380 + [] sock_sendmsg+0x112/0x130 + [] __sys_sendmsg+0x44e/0x460 + [] sys_sendmsg+0x44/0x70 + [] system_call_fastpath+0x16/0x1b + + -> #0 (&ndev->lock){+.+...}: + [] check_prev_add+0x3de/0x440 + [] validate_chain+0x637/0x730 + [] __lock_acquire+0x2f7/0x500 + [] lock_acquire+0x114/0x150 + [] rt_read_lock+0x42/0x60 + [] ipv6_get_lladdr+0x74/0x120 + [] mld_newpack+0xb6/0x160 + [] add_grhead+0xab/0xc0 + [] add_grec+0x3ab/0x460 + [] mld_send_report+0x5a/0x150 + [] igmp6_timer_handler+0x4e/0xb0 + [] call_timer_fn+0xca/0x1d0 + [] run_timer_softirq+0x1df/0x2e0 + [] handle_pending_softirqs+0xf7/0x1f0 + [] __do_softirq_common+0x7b/0xf0 + [] __thread_do_softirq+0x1af/0x210 + [] run_ksoftirqd+0xe1/0x1f0 + [] kthread+0xae/0xc0 + [] kernel_thread_helper+0x4/0x10 + +actually we can just hold idev->lock before taking pmc->mca_lock, +and avoid taking idev->lock again when iterating idev->addr_list, +since the upper callers of mld_newpack() already take +read_lock_bh(&idev->lock). + +Reported-by: dingtianhong +Cc: dingtianhong +Cc: Hideaki YOSHIFUJI +Cc: David S. Miller +Cc: Hannes Frederic Sowa +Tested-by: Ding Tianhong +Tested-by: Chen Weilong +Signed-off-by: Cong Wang +Acked-by: Hannes Frederic Sowa +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + include/net/addrconf.h | 3 +++ + net/ipv6/addrconf.c | 28 ++++++++++++++++++---------- + net/ipv6/mcast.c | 18 ++++++++++-------- + 3 files changed, 31 insertions(+), 18 deletions(-) + +--- a/include/net/addrconf.h ++++ b/include/net/addrconf.h +@@ -86,6 +86,9 @@ extern int ipv6_dev_get_saddr(struct n + const struct in6_addr *daddr, + unsigned int srcprefs, + struct in6_addr *saddr); ++extern int __ipv6_get_lladdr(struct inet6_dev *idev, ++ struct in6_addr *addr, ++ unsigned char banned_flags); + extern int ipv6_get_lladdr(struct net_device *dev, + struct in6_addr *addr, + unsigned char banned_flags); +--- a/net/ipv6/addrconf.c ++++ b/net/ipv6/addrconf.c +@@ -1448,6 +1448,23 @@ try_nextdev: + } + EXPORT_SYMBOL(ipv6_dev_get_saddr); + ++int __ipv6_get_lladdr(struct inet6_dev *idev, struct in6_addr *addr, ++ unsigned char banned_flags) ++{ ++ struct inet6_ifaddr *ifp; ++ int err = -EADDRNOTAVAIL; ++ ++ list_for_each_entry(ifp, &idev->addr_list, if_list) { ++ if (ifp->scope == IFA_LINK && ++ !(ifp->flags & banned_flags)) { ++ *addr = ifp->addr; ++ err = 0; ++ break; ++ } ++ } ++ return err; ++} ++ + int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr, + unsigned char banned_flags) + { +@@ -1457,17 +1474,8 @@ int ipv6_get_lladdr(struct net_device *d + rcu_read_lock(); + idev = __in6_dev_get(dev); + if (idev) { +- struct inet6_ifaddr *ifp; +- + read_lock_bh(&idev->lock); +- list_for_each_entry(ifp, &idev->addr_list, if_list) { +- if (ifp->scope == IFA_LINK && +- !(ifp->flags & banned_flags)) { +- *addr = ifp->addr; +- err = 0; +- break; +- } +- } ++ err = __ipv6_get_lladdr(idev, addr, banned_flags); + read_unlock_bh(&idev->lock); + } + rcu_read_unlock(); +--- a/net/ipv6/mcast.c ++++ b/net/ipv6/mcast.c +@@ -1343,8 +1343,9 @@ static void ip6_mc_hdr(struct sock *sk, + hdr->daddr = *daddr; + } + +-static struct sk_buff *mld_newpack(struct net_device *dev, int size) ++static struct sk_buff *mld_newpack(struct inet6_dev *idev, int size) + { ++ struct net_device *dev = idev->dev; + struct net *net = dev_net(dev); + struct sock *sk = net->ipv6.igmp_sk; + struct sk_buff *skb; +@@ -1369,7 +1370,7 @@ static struct sk_buff *mld_newpack(struc + + skb_reserve(skb, hlen); + +- if (ipv6_get_lladdr(dev, &addr_buf, IFA_F_TENTATIVE)) { ++ if (__ipv6_get_lladdr(idev, &addr_buf, IFA_F_TENTATIVE)) { + /* : + * use unspecified address as the source address + * when a valid link-local address is not available. 
+@@ -1465,7 +1466,7 @@ static struct sk_buff *add_grhead(struct + struct mld2_grec *pgr; + + if (!skb) +- skb = mld_newpack(dev, dev->mtu); ++ skb = mld_newpack(pmc->idev, dev->mtu); + if (!skb) + return NULL; + pgr = (struct mld2_grec *)skb_put(skb, sizeof(struct mld2_grec)); +@@ -1485,7 +1486,8 @@ static struct sk_buff *add_grhead(struct + static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc, + int type, int gdeleted, int sdeleted) + { +- struct net_device *dev = pmc->idev->dev; ++ struct inet6_dev *idev = pmc->idev; ++ struct net_device *dev = idev->dev; + struct mld2_report *pmr; + struct mld2_grec *pgr = NULL; + struct ip6_sf_list *psf, *psf_next, *psf_prev, **psf_list; +@@ -1514,7 +1516,7 @@ static struct sk_buff *add_grec(struct s + AVAILABLE(skb) < grec_size(pmc, type, gdeleted, sdeleted)) { + if (skb) + mld_sendpack(skb); +- skb = mld_newpack(dev, dev->mtu); ++ skb = mld_newpack(idev, dev->mtu); + } + } + first = 1; +@@ -1541,7 +1543,7 @@ static struct sk_buff *add_grec(struct s + pgr->grec_nsrcs = htons(scount); + if (skb) + mld_sendpack(skb); +- skb = mld_newpack(dev, dev->mtu); ++ skb = mld_newpack(idev, dev->mtu); + first = 1; + scount = 0; + } +@@ -1596,8 +1598,8 @@ static void mld_send_report(struct inet6 + struct sk_buff *skb = NULL; + int type; + ++ read_lock_bh(&idev->lock); + if (!pmc) { +- read_lock_bh(&idev->lock); + for (pmc=idev->mc_list; pmc; pmc=pmc->next) { + if (pmc->mca_flags & MAF_NOREPORT) + continue; +@@ -1609,7 +1611,6 @@ static void mld_send_report(struct inet6 + skb = add_grec(skb, pmc, type, 0, 0); + spin_unlock_bh(&pmc->mca_lock); + } +- read_unlock_bh(&idev->lock); + } else { + spin_lock_bh(&pmc->mca_lock); + if (pmc->mca_sfcount[MCAST_EXCLUDE]) +@@ -1619,6 +1620,7 @@ static void mld_send_report(struct inet6 + skb = add_grec(skb, pmc, type, 0, 0); + spin_unlock_bh(&pmc->mca_lock); + } ++ read_unlock_bh(&idev->lock); + if (skb) + mld_sendpack(skb); + } diff --git a/queue-3.10/ipv6-only-apply-anti-spoofing-checks-to-not-pointopoint-tunnels.patch b/queue-3.10/ipv6-only-apply-anti-spoofing-checks-to-not-pointopoint-tunnels.patch new file mode 100644 index 00000000000..7624a18e37f --- /dev/null +++ b/queue-3.10/ipv6-only-apply-anti-spoofing-checks-to-not-pointopoint-tunnels.patch @@ -0,0 +1,39 @@ +From 79339ba50702248d19a8825906ceb527d547444f Mon Sep 17 00:00:00 2001 +From: Hannes Frederic Sowa +Date: Thu, 27 Jun 2013 22:46:04 +0200 +Subject: ipv6: only apply anti-spoofing checks to not-pointopoint tunnels + +From: Hannes Frederic Sowa + +[ Upstream commit 5c29fb12e8fb8a8105ea048cb160fd79a85a52bb ] + +Because of commit 218774dc341f219bfcf940304a081b121a0e8099 ("ipv6: add +anti-spoofing checks for 6to4 and 6rd") the sit driver dropped packets +for 2002::/16 destinations and sources even when configured to work as a +tunnel with fixed endpoint. We may only apply the 6rd/6to4 anti-spoofing +checks if the device is not in pointopoint mode. + +This was an oversight from me in the above commit, sorry. Thanks to +Roman Mamedov for reporting this! + +Reported-by: Roman Mamedov +Cc: David Miller +Cc: YOSHIFUJI Hideaki +Signed-off-by: Hannes Frederic Sowa +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv6/sit.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/net/ipv6/sit.c ++++ b/net/ipv6/sit.c +@@ -589,7 +589,7 @@ static int ipip6_rcv(struct sk_buff *skb + tunnel->dev->stats.rx_errors++; + goto out; + } +- } else { ++ } else if (!(tunnel->dev->flags&IFF_POINTOPOINT)) { + if (is_spoofed_6rd(tunnel, iph->saddr, + &ipv6_hdr(skb)->saddr) || + is_spoofed_6rd(tunnel, iph->daddr, diff --git a/queue-3.10/ipv6-only-static-routes-qualify-for-equal-cost-multipathing.patch b/queue-3.10/ipv6-only-static-routes-qualify-for-equal-cost-multipathing.patch new file mode 100644 index 00000000000..f3b952cbbfc --- /dev/null +++ b/queue-3.10/ipv6-only-static-routes-qualify-for-equal-cost-multipathing.patch @@ -0,0 +1,83 @@ +From 8bd8eef9c03de3dc458d95069adaecc5960f9f66 Mon Sep 17 00:00:00 2001 +From: Hannes Frederic Sowa +Date: Fri, 12 Jul 2013 23:46:33 +0200 +Subject: ipv6: only static routes qualify for equal cost multipathing + +From: Hannes Frederic Sowa + +[ Upstream commit 307f2fb95e9b96b3577916e73d92e104f8f26494 ] + +Static routes in this case are non-expiring routes which did not get +configured by autoconf or by icmpv6 redirects. + +To make sure we actually get an ecmp route while searching for the first +one in this fib6_node's leafs, also make sure it matches the ecmp route +assumptions. + +v2: +a) Removed RTF_EXPIRE check in dst.from chain. The check of RTF_ADDRCONF + already ensures that this route, even if added again without + RTF_EXPIRES (in case of a RA announcement with infinite timeout), + does not cause the rt6i_nsiblings logic to go wrong if a later RA + updates the expiration time later. + +v3: +a) Allow RTF_EXPIRES routes to enter the ecmp route set. We have to do so, + because an pmtu event could update the RTF_EXPIRES flag and we would + not count this route, if another route joins this set. We now filter + only for RTF_GATEWAY|RTF_ADDRCONF|RTF_DYNAMIC, which are flags that + don't get changed after rt6_info construction. + +Cc: Nicolas Dichtel +Signed-off-by: Hannes Frederic Sowa +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv6/ip6_fib.c | 15 +++++++++++---- + 1 file changed, 11 insertions(+), 4 deletions(-) + +--- a/net/ipv6/ip6_fib.c ++++ b/net/ipv6/ip6_fib.c +@@ -632,6 +632,12 @@ insert_above: + return ln; + } + ++static inline bool rt6_qualify_for_ecmp(struct rt6_info *rt) ++{ ++ return (rt->rt6i_flags & (RTF_GATEWAY|RTF_ADDRCONF|RTF_DYNAMIC)) == ++ RTF_GATEWAY; ++} ++ + /* + * Insert routing information in a node. + */ +@@ -646,6 +652,7 @@ static int fib6_add_rt2node(struct fib6_ + int add = (!info->nlh || + (info->nlh->nlmsg_flags & NLM_F_CREATE)); + int found = 0; ++ bool rt_can_ecmp = rt6_qualify_for_ecmp(rt); + + ins = &fn->leaf; + +@@ -691,9 +698,8 @@ static int fib6_add_rt2node(struct fib6_ + * To avoid long list, we only had siblings if the + * route have a gateway. 
+ */ +- if (rt->rt6i_flags & RTF_GATEWAY && +- !(rt->rt6i_flags & RTF_EXPIRES) && +- !(iter->rt6i_flags & RTF_EXPIRES)) ++ if (rt_can_ecmp && ++ rt6_qualify_for_ecmp(iter)) + rt->rt6i_nsiblings++; + } + +@@ -715,7 +721,8 @@ static int fib6_add_rt2node(struct fib6_ + /* Find the first route that have the same metric */ + sibling = fn->leaf; + while (sibling) { +- if (sibling->rt6i_metric == rt->rt6i_metric) { ++ if (sibling->rt6i_metric == rt->rt6i_metric && ++ rt6_qualify_for_ecmp(sibling)) { + list_add_tail(&rt->rt6i_siblings, + &sibling->rt6i_siblings); + break; diff --git a/queue-3.10/ipv6-rt6_check_neigh-should-successfully-verify-neigh-if-no-nud-information-are-available.patch b/queue-3.10/ipv6-rt6_check_neigh-should-successfully-verify-neigh-if-no-nud-information-are-available.patch new file mode 100644 index 00000000000..7a8a6ab785e --- /dev/null +++ b/queue-3.10/ipv6-rt6_check_neigh-should-successfully-verify-neigh-if-no-nud-information-are-available.patch @@ -0,0 +1,58 @@ +From bd10a3abbed1d5542a0930dcdfc121973276275e Mon Sep 17 00:00:00 2001 +From: Hannes Frederic Sowa +Date: Wed, 3 Jul 2013 20:45:04 +0200 +Subject: ipv6: rt6_check_neigh should successfully verify neigh if no NUD information are available + +From: Hannes Frederic Sowa + +[ Upstream commit 3630d40067a21d4dfbadc6002bb469ce26ac5d52 ] + +After the removal of rt->n we do not create a neighbour entry at route +insertion time (rt6_bind_neighbour is gone). As long as no neighbour is +created because of "useful traffic" we skip this routing entry because +rt6_check_neigh cannot pick up a valid neighbour (neigh == NULL) and +thus returns false. + +This change was introduced by commit +887c95cc1da53f66a5890fdeab13414613010097 ("ipv6: Complete neighbour +entry removal from dst_entry.") + +To quote RFC4191: +"If the host has no information about the router's reachability, then +the host assumes the router is reachable." + +and also: +"A host MUST NOT probe a router's reachability in the absence of useful +traffic that the host would have sent to the router if it were reachable." + +So, just assume the router is reachable and let's rt6_probe do the +rest. We don't need to create a neighbour on route insertion time. + +If we don't compile with CONFIG_IPV6_ROUTER_PREF (RFC4191 support) +a neighbour is only valid if its nud_state is NUD_VALID. I did not find +any references that we should probe the router on route insertion time +via the other RFCs. So skip this route in that case. + +v2: +a) use IS_ENABLED instead of #ifdefs (thanks to Sergei Shtylyov) + +Reported-by: Pierre Emeriaud +Cc: YOSHIFUJI Hideaki +Signed-off-by: Hannes Frederic Sowa +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv6/route.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/net/ipv6/route.c ++++ b/net/ipv6/route.c +@@ -547,6 +547,8 @@ static inline bool rt6_check_neigh(struc + ret = true; + #endif + read_unlock(&neigh->lock); ++ } else if (IS_ENABLED(CONFIG_IPV6_ROUTER_PREF)) { ++ ret = true; + } + rcu_read_unlock_bh(); + diff --git a/queue-3.10/l2tp-add-missing-.owner-to-struct-pppox_proto.patch b/queue-3.10/l2tp-add-missing-.owner-to-struct-pppox_proto.patch new file mode 100644 index 00000000000..38d357afa82 --- /dev/null +++ b/queue-3.10/l2tp-add-missing-.owner-to-struct-pppox_proto.patch @@ -0,0 +1,31 @@ +From c6ad7374aa71d0201f266963d9b5e2cf254ad22b Mon Sep 17 00:00:00 2001 +From: Wei Yongjun +Date: Tue, 2 Jul 2013 09:02:07 +0800 +Subject: l2tp: add missing .owner to struct pppox_proto + +From: Wei Yongjun + +[ Upstream commit e1558a93b61962710733dc8c11a2bc765607f1cd ] + +Add missing .owner of struct pppox_proto. This prevents the +module from being removed from underneath its users. + +Signed-off-by: Wei Yongjun +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/l2tp/l2tp_ppp.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +--- a/net/l2tp/l2tp_ppp.c ++++ b/net/l2tp/l2tp_ppp.c +@@ -1793,7 +1793,8 @@ static const struct proto_ops pppol2tp_o + + static const struct pppox_proto pppol2tp_proto = { + .create = pppol2tp_create, +- .ioctl = pppol2tp_ioctl ++ .ioctl = pppol2tp_ioctl, ++ .owner = THIS_MODULE, + }; + + #ifdef CONFIG_L2TP_V3 diff --git a/queue-3.10/macvtap-correctly-linearize-skb-when-zerocopy-is-used.patch b/queue-3.10/macvtap-correctly-linearize-skb-when-zerocopy-is-used.patch new file mode 100644 index 00000000000..7d5ad0e118e --- /dev/null +++ b/queue-3.10/macvtap-correctly-linearize-skb-when-zerocopy-is-used.patch @@ -0,0 +1,56 @@ +From ebf6764da166478c0c059e5083b12f0f577decdc Mon Sep 17 00:00:00 2001 +From: Jason Wang +Date: Wed, 10 Jul 2013 13:43:28 +0800 +Subject: macvtap: correctly linearize skb when zerocopy is used + +From: Jason Wang + +[ Upstream commit 61d46bf979d5cd7c164709a80ad5676a35494aae ] + +Userspace may produce vectors greater than MAX_SKB_FRAGS. When we try to +linearize parts of the skb to let the rest of iov to be fit in +the frags, we need count copylen into linear when calling macvtap_alloc_skb() +instead of partly counting it into data_len. Since this breaks +zerocopy_sg_from_iovec() since its inner counter assumes nr_frags should +be zero at beginning. This cause nr_frags to be increased wrongly without +setting the correct frags. + +This bug were introduced from b92946e2919134ebe2a4083e4302236295ea2a73 +(macvtap: zerocopy: validate vectors before building skb). + +Cc: Michael S. Tsirkin +Signed-off-by: Jason Wang +Acked-by: Michael S. Tsirkin +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/macvtap.c | 8 ++++++-- + 1 file changed, 6 insertions(+), 2 deletions(-) + +--- a/drivers/net/macvtap.c ++++ b/drivers/net/macvtap.c +@@ -647,6 +647,7 @@ static ssize_t macvtap_get_user(struct m + int vnet_hdr_len = 0; + int copylen = 0; + bool zerocopy = false; ++ size_t linear; + + if (q->flags & IFF_VNET_HDR) { + vnet_hdr_len = q->vnet_hdr_sz; +@@ -701,11 +702,14 @@ static ssize_t macvtap_get_user(struct m + copylen = vnet_hdr.hdr_len; + if (!copylen) + copylen = GOODCOPY_LEN; +- } else ++ linear = copylen; ++ } else { + copylen = len; ++ linear = vnet_hdr.hdr_len; ++ } + + skb = macvtap_alloc_skb(&q->sk, NET_IP_ALIGN, copylen, +- vnet_hdr.hdr_len, noblock, &err); ++ linear, noblock, &err); + if (!skb) + goto err; + diff --git a/queue-3.10/macvtap-do-not-zerocopy-if-iov-needs-more-pages-than-max_skb_frags.patch b/queue-3.10/macvtap-do-not-zerocopy-if-iov-needs-more-pages-than-max_skb_frags.patch new file mode 100644 index 00000000000..83290c7ee5a --- /dev/null +++ b/queue-3.10/macvtap-do-not-zerocopy-if-iov-needs-more-pages-than-max_skb_frags.patch @@ -0,0 +1,123 @@ +From 8270a0a6bfec886971fdece9d4087d4f5e4f62b6 Mon Sep 17 00:00:00 2001 +From: Jason Wang +Date: Thu, 18 Jul 2013 10:55:16 +0800 +Subject: macvtap: do not zerocopy if iov needs more pages than MAX_SKB_FRAGS + +From: Jason Wang + +[ Upstream commit ece793fcfc417b3925844be88a6a6dc82ae8f7c6 ] + +We try to linearize part of the skb when the number of iov is greater than +MAX_SKB_FRAGS. This is not enough since each single vector may occupy more than +one pages, so zerocopy_sg_fromiovec() may still fail and may break the guest +network. + +Solve this problem by calculate the pages needed for iov before trying to do +zerocopy and switch to use copy instead of zerocopy if it needs more than +MAX_SKB_FRAGS. + +This is done through introducing a new helper to count the pages for iov, and +call uarg->callback() manually when switching from zerocopy to copy to notify +vhost. + +We can do further optimization on top. + +This bug were introduced from b92946e2919134ebe2a4083e4302236295ea2a73 +(macvtap: zerocopy: validate vectors before building skb). + +Cc: Michael S. Tsirkin +Signed-off-by: Jason Wang +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/macvtap.c | 62 +++++++++++++++++++++++++++++--------------------- + 1 file changed, 37 insertions(+), 25 deletions(-) + +--- a/drivers/net/macvtap.c ++++ b/drivers/net/macvtap.c +@@ -633,6 +633,28 @@ static int macvtap_skb_to_vnet_hdr(const + return 0; + } + ++static unsigned long iov_pages(const struct iovec *iv, int offset, ++ unsigned long nr_segs) ++{ ++ unsigned long seg, base; ++ int pages = 0, len, size; ++ ++ while (nr_segs && (offset >= iv->iov_len)) { ++ offset -= iv->iov_len; ++ ++iv; ++ --nr_segs; ++ } ++ ++ for (seg = 0; seg < nr_segs; seg++) { ++ base = (unsigned long)iv[seg].iov_base + offset; ++ len = iv[seg].iov_len - offset; ++ size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT; ++ pages += size; ++ offset = 0; ++ } ++ ++ return pages; ++} + + /* Get packet from user space buffer */ + static ssize_t macvtap_get_user(struct macvtap_queue *q, struct msghdr *m, +@@ -679,31 +701,15 @@ static ssize_t macvtap_get_user(struct m + if (unlikely(count > UIO_MAXIOV)) + goto err; + +- if (m && m->msg_control && sock_flag(&q->sk, SOCK_ZEROCOPY)) +- zerocopy = true; +- +- if (zerocopy) { +- /* Userspace may produce vectors with count greater than +- * MAX_SKB_FRAGS, so we need to linearize parts of the skb +- * to let the rest of data to be fit in the frags. +- */ +- if (count > MAX_SKB_FRAGS) { +- copylen = iov_length(iv, count - MAX_SKB_FRAGS); +- if (copylen < vnet_hdr_len) +- copylen = 0; +- else +- copylen -= vnet_hdr_len; +- } +- /* There are 256 bytes to be copied in skb, so there is enough +- * room for skb expand head in case it is used. +- * The rest buffer is mapped from userspace. +- */ +- if (copylen < vnet_hdr.hdr_len) +- copylen = vnet_hdr.hdr_len; +- if (!copylen) +- copylen = GOODCOPY_LEN; ++ if (m && m->msg_control && sock_flag(&q->sk, SOCK_ZEROCOPY)) { ++ copylen = vnet_hdr.hdr_len ? vnet_hdr.hdr_len : GOODCOPY_LEN; + linear = copylen; +- } else { ++ if (iov_pages(iv, vnet_hdr_len + copylen, count) ++ <= MAX_SKB_FRAGS) ++ zerocopy = true; ++ } ++ ++ if (!zerocopy) { + copylen = len; + linear = vnet_hdr.hdr_len; + } +@@ -715,9 +721,15 @@ static ssize_t macvtap_get_user(struct m + + if (zerocopy) + err = zerocopy_sg_from_iovec(skb, iv, vnet_hdr_len, count); +- else ++ else { + err = skb_copy_datagram_from_iovec(skb, 0, iv, vnet_hdr_len, + len); ++ if (!err && m && m->msg_control) { ++ struct ubuf_info *uarg = m->msg_control; ++ uarg->callback(uarg, false); ++ } ++ } ++ + if (err) + goto err_kfree; + diff --git a/queue-3.10/neighbour-fix-a-race-in-neigh_destroy.patch b/queue-3.10/neighbour-fix-a-race-in-neigh_destroy.patch new file mode 100644 index 00000000000..ab9c7983f1b --- /dev/null +++ b/queue-3.10/neighbour-fix-a-race-in-neigh_destroy.patch @@ -0,0 +1,78 @@ +From d605a92bd29513e01af93275527252e7423b2ac7 Mon Sep 17 00:00:00 2001 +From: Eric Dumazet +Date: Fri, 28 Jun 2013 02:37:42 -0700 +Subject: neighbour: fix a race in neigh_destroy() + +From: Eric Dumazet + +[ Upstream commit c9ab4d85de222f3390c67aedc9c18a50e767531e ] + +There is a race in neighbour code, because neigh_destroy() uses +skb_queue_purge(&neigh->arp_queue) without holding neighbour lock, +while other parts of the code assume neighbour rwlock is what +protects arp_queue + +Convert all skb_queue_purge() calls to the __skb_queue_purge() variant + +Use __skb_queue_head_init() instead of skb_queue_head_init() +to make clear we do not use arp_queue.lock + +And hold neigh->lock in neigh_destroy() to close the race. 
+ +Reported-by: Joe Jin +Signed-off-by: Eric Dumazet +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/core/neighbour.c | 12 +++++++----- + 1 file changed, 7 insertions(+), 5 deletions(-) + +--- a/net/core/neighbour.c ++++ b/net/core/neighbour.c +@@ -231,7 +231,7 @@ static void neigh_flush_dev(struct neigh + we must kill timers etc. and move + it to safe state. + */ +- skb_queue_purge(&n->arp_queue); ++ __skb_queue_purge(&n->arp_queue); + n->arp_queue_len_bytes = 0; + n->output = neigh_blackhole; + if (n->nud_state & NUD_VALID) +@@ -286,7 +286,7 @@ static struct neighbour *neigh_alloc(str + if (!n) + goto out_entries; + +- skb_queue_head_init(&n->arp_queue); ++ __skb_queue_head_init(&n->arp_queue); + rwlock_init(&n->lock); + seqlock_init(&n->ha_lock); + n->updated = n->used = now; +@@ -708,7 +708,9 @@ void neigh_destroy(struct neighbour *nei + if (neigh_del_timer(neigh)) + pr_warn("Impossible event\n"); + +- skb_queue_purge(&neigh->arp_queue); ++ write_lock_bh(&neigh->lock); ++ __skb_queue_purge(&neigh->arp_queue); ++ write_unlock_bh(&neigh->lock); + neigh->arp_queue_len_bytes = 0; + + if (dev->netdev_ops->ndo_neigh_destroy) +@@ -858,7 +860,7 @@ static void neigh_invalidate(struct neig + neigh->ops->error_report(neigh, skb); + write_lock(&neigh->lock); + } +- skb_queue_purge(&neigh->arp_queue); ++ __skb_queue_purge(&neigh->arp_queue); + neigh->arp_queue_len_bytes = 0; + } + +@@ -1210,7 +1212,7 @@ int neigh_update(struct neighbour *neigh + + write_lock_bh(&neigh->lock); + } +- skb_queue_purge(&neigh->arp_queue); ++ __skb_queue_purge(&neigh->arp_queue); + neigh->arp_queue_len_bytes = 0; + } + out: diff --git a/queue-3.10/net-cadence-macb-fix-bug-typo-in-extracting-gem_irq_read_clear-bit.patch b/queue-3.10/net-cadence-macb-fix-bug-typo-in-extracting-gem_irq_read_clear-bit.patch new file mode 100644 index 00000000000..729a3a42b01 --- /dev/null +++ b/queue-3.10/net-cadence-macb-fix-bug-typo-in-extracting-gem_irq_read_clear-bit.patch @@ -0,0 +1,28 @@ +From 35e568df646dc23bd2d00c8865c3118794d1835a Mon Sep 17 00:00:00 2001 +From: Jongsung Kim +Date: Tue, 9 Jul 2013 17:36:00 +0900 +Subject: net/cadence/macb: fix bug/typo in extracting gem_irq_read_clear bit + +From: Jongsung Kim + +[ Upstream commit 01276ed2424eb78c95461545410923d5da154d31 ] + +Signed-off-by: Jongsung Kim +Acked-by: Nicolas Ferre +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/cadence/macb.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/drivers/net/ethernet/cadence/macb.c ++++ b/drivers/net/ethernet/cadence/macb.c +@@ -1070,7 +1070,7 @@ static void macb_configure_dma(struct ma + static void macb_configure_caps(struct macb *bp) + { + if (macb_is_gem(bp)) { +- if (GEM_BF(IRQCOR, gem_readl(bp, DCFG1)) == 0) ++ if (GEM_BFEXT(IRQCOR, gem_readl(bp, DCFG1)) == 0) + bp->caps |= MACB_CAPS_ISR_CLEAR_ON_WRITE; + } + } diff --git a/queue-3.10/net-swap-ver-and-type-in-pppoe_hdr.patch b/queue-3.10/net-swap-ver-and-type-in-pppoe_hdr.patch new file mode 100644 index 00000000000..ae276a1bfbc --- /dev/null +++ b/queue-3.10/net-swap-ver-and-type-in-pppoe_hdr.patch @@ -0,0 +1,34 @@ +From 7da0d57c053a603f3cac04587ecdab2b3072d769 Mon Sep 17 00:00:00 2001 +From: Changli Gao +Date: Sat, 29 Jun 2013 00:15:51 +0800 +Subject: net: Swap ver and type in pppoe_hdr + +From: Changli Gao + +[ Upstream commit b1a5a34bd0b8767ea689e68f8ea513e9710b671e ] + +Ver and type in pppoe_hdr should be swapped as defined by RFC2516 +section-4. + +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + include/uapi/linux/if_pppox.h | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/include/uapi/linux/if_pppox.h ++++ b/include/uapi/linux/if_pppox.h +@@ -135,11 +135,11 @@ struct pppoe_tag { + + struct pppoe_hdr { + #if defined(__LITTLE_ENDIAN_BITFIELD) +- __u8 ver : 4; + __u8 type : 4; ++ __u8 ver : 4; + #elif defined(__BIG_ENDIAN_BITFIELD) +- __u8 type : 4; + __u8 ver : 4; ++ __u8 type : 4; + #else + #error "Please fix " + #endif diff --git a/queue-3.10/pkt_sched-sch_qfq-remove-a-source-of-high-packet-delay-jitter.patch b/queue-3.10/pkt_sched-sch_qfq-remove-a-source-of-high-packet-delay-jitter.patch new file mode 100644 index 00000000000..592a01dff2f --- /dev/null +++ b/queue-3.10/pkt_sched-sch_qfq-remove-a-source-of-high-packet-delay-jitter.patch @@ -0,0 +1,220 @@ +From 5f65eb80604e70df56b97008538069892bb81205 Mon Sep 17 00:00:00 2001 +From: Paolo Valente +Date: Tue, 16 Jul 2013 08:52:30 +0200 +Subject: pkt_sched: sch_qfq: remove a source of high packet delay/jitter + +From: Paolo Valente + +[ Upstream commit 87f40dd6ce7042caca0b3b557e8923127f51f902 ] + +QFQ+ inherits from QFQ a design choice that may cause a high packet +delay/jitter and a severe short-term unfairness. As QFQ, QFQ+ uses a +special quantity, the system virtual time, to track the service +provided by the ideal system it approximates. When a packet is +dequeued, this quantity must be incremented by the size of the packet, +divided by the sum of the weights of the aggregates waiting to be +served. Tracking this sum correctly is a non-trivial task, because, to +preserve tight service guarantees, the decrement of this sum must be +delayed in a special way [1]: this sum can be decremented only after +that its value would decrease also in the ideal system approximated by +QFQ+. For efficiency, QFQ+ keeps track only of the 'instantaneous' +weight sum, increased and decreased immediately as the weight of an +aggregate changes, and as an aggregate is created or destroyed (which, +in its turn, happens as a consequence of some class being +created/destroyed/changed). However, to avoid the problems caused to +service guarantees by these immediate decreases, QFQ+ increments the +system virtual time using the maximum value allowed for the weight +sum, 2^10, in place of the dynamic, instantaneous value. The +instantaneous value of the weight sum is used only to check whether a +request of weight increase or a class creation can be satisfied. + +Unfortunately, the problems caused by this choice are worse than the +temporary degradation of the service guarantees that may occur, when a +class is changed or destroyed, if the instantaneous value of the +weight sum was used to update the system virtual time. In fact, the +fraction of the link bandwidth guaranteed by QFQ+ to each aggregate is +equal to the ratio between the weight of the aggregate and the sum of +the weights of the competing aggregates. The packet delay guaranteed +to the aggregate is instead inversely proportional to the guaranteed +bandwidth. By using the maximum possible value, and not the actual +value of the weight sum, QFQ+ provides each aggregate with the worst +possible service guarantees, and not with service guarantees related +to the actual set of competing aggregates. To see the consequences of +this fact, consider the following simple example. 
+ +Suppose that only the following aggregates are backlogged, i.e., that +only the classes in the following aggregates have packets to transmit: +one aggregate with weight 10, say A, and ten aggregates with weight 1, +say B1, B2, ..., B10. In particular, suppose that these aggregates are +always backlogged. Given the weight distribution, the smoothest and +fairest service order would be: +A B1 A B2 A B3 A B4 A B5 A B6 A B7 A B8 A B9 A B10 A B1 A B2 ... + +QFQ+ would provide exactly this optimal service if it used the actual +value for the weight sum instead of the maximum possible value, i.e., +11 instead of 2^10. In contrast, since QFQ+ uses the latter value, it +serves aggregates as follows (easy to prove and to reproduce +experimentally): +A B1 B2 B3 B4 B5 B6 B7 B8 B9 B10 A A A A A A A A A A B1 B2 ... B10 A A ... + +By replacing 10 with N in the above example, and by increasing N, one +can increase at will the maximum packet delay and the jitter +experienced by the classes in aggregate A. + +This patch addresses this issue by just using the above +'instantaneous' value of the weight sum, instead of the maximum +possible value, when updating the system virtual time. After the +instantaneous weight sum is decreased, QFQ+ may deviate from the ideal +service for a time interval in the order of the time to serve one +maximum-size packet for each backlogged class. The worst-case extent +of the deviation exhibited by QFQ+ during this time interval [1] is +basically the same as of the deviation described above (but, without +this patch, QFQ+ suffers from such a deviation all the time). Finally, +this patch modifies the comment to the function qfq_slot_insert, to +make it coherent with the fact that the weight sum used by QFQ+ can +now be lower than the maximum possible value. + +[1] P. Valente, "Extending WF2Q+ to support a dynamic traffic mix", +Proceedings of AAA-IDEA'05, June 2005. + +Signed-off-by: Paolo Valente +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/sched/sch_qfq.c | 85 ++++++++++++++++++++++++++++++++++------------------ + 1 file changed, 56 insertions(+), 29 deletions(-) + +--- a/net/sched/sch_qfq.c ++++ b/net/sched/sch_qfq.c +@@ -113,7 +113,6 @@ + + #define FRAC_BITS 30 /* fixed point arithmetic */ + #define ONE_FP (1UL << FRAC_BITS) +-#define IWSUM (ONE_FP/QFQ_MAX_WSUM) + + #define QFQ_MTU_SHIFT 16 /* to support TSO/GSO */ + #define QFQ_MIN_LMAX 512 /* see qfq_slot_insert */ +@@ -189,6 +188,7 @@ struct qfq_sched { + struct qfq_aggregate *in_serv_agg; /* Aggregate being served. */ + u32 num_active_agg; /* Num. of active aggregates */ + u32 wsum; /* weight sum */ ++ u32 iwsum; /* inverse weight sum */ + + unsigned long bitmaps[QFQ_MAX_STATE]; /* Group bitmaps. */ + struct qfq_group groups[QFQ_MAX_INDEX + 1]; /* The groups. */ +@@ -314,6 +314,7 @@ static void qfq_update_agg(struct qfq_sc + + q->wsum += + (int) agg->class_weight * (new_num_classes - agg->num_classes); ++ q->iwsum = ONE_FP / q->wsum; + + agg->num_classes = new_num_classes; + } +@@ -340,6 +341,10 @@ static void qfq_destroy_agg(struct qfq_s + { + if (!hlist_unhashed(&agg->nonfull_next)) + hlist_del_init(&agg->nonfull_next); ++ q->wsum -= agg->class_weight; ++ if (q->wsum != 0) ++ q->iwsum = ONE_FP / q->wsum; ++ + if (q->in_serv_agg == agg) + q->in_serv_agg = qfq_choose_next_agg(q); + kfree(agg); +@@ -827,38 +832,60 @@ static void qfq_make_eligible(struct qfq + } + } + +- + /* +- * The index of the slot in which the aggregate is to be inserted must +- * not be higher than QFQ_MAX_SLOTS-2. 
There is a '-2' and not a '-1' +- * because the start time of the group may be moved backward by one +- * slot after the aggregate has been inserted, and this would cause +- * non-empty slots to be right-shifted by one position. ++ * The index of the slot in which the input aggregate agg is to be ++ * inserted must not be higher than QFQ_MAX_SLOTS-2. There is a '-2' ++ * and not a '-1' because the start time of the group may be moved ++ * backward by one slot after the aggregate has been inserted, and ++ * this would cause non-empty slots to be right-shifted by one ++ * position. ++ * ++ * QFQ+ fully satisfies this bound to the slot index if the parameters ++ * of the classes are not changed dynamically, and if QFQ+ never ++ * happens to postpone the service of agg unjustly, i.e., it never ++ * happens that the aggregate becomes backlogged and eligible, or just ++ * eligible, while an aggregate with a higher approximated finish time ++ * is being served. In particular, in this case QFQ+ guarantees that ++ * the timestamps of agg are low enough that the slot index is never ++ * higher than 2. Unfortunately, QFQ+ cannot provide the same ++ * guarantee if it happens to unjustly postpone the service of agg, or ++ * if the parameters of some class are changed. ++ * ++ * As for the first event, i.e., an out-of-order service, the ++ * upper bound to the slot index guaranteed by QFQ+ grows to ++ * 2 + ++ * QFQ_MAX_AGG_CLASSES * ((1<budget -= len; + +- q->V += (u64)len * IWSUM; ++ q->V += (u64)len * q->iwsum; + pr_debug("qfq dequeue: len %u F %lld now %lld\n", + len, (unsigned long long) in_serv_agg->F, + (unsigned long long) q->V); diff --git a/queue-3.10/series b/queue-3.10/series index 3ff1243b6b5..708bcf6aac0 100644 --- a/queue-3.10/series +++ b/queue-3.10/series @@ -1,2 +1,42 @@ writeback-fix-periodic-writeback-after-fs-mount.patch sparc32-vm_area_struct-access-for-old-sun-sparcs.patch +ipv6-only-apply-anti-spoofing-checks-to-not-pointopoint-tunnels.patch +neighbour-fix-a-race-in-neigh_destroy.patch +x25-fix-broken-locking-in-ioctl-error-paths.patch +net-swap-ver-and-type-in-pppoe_hdr.patch +gre-fix-a-regression-in-ioctl.patch +vti-remove-duplicated-code-to-fix-a-memory-leak.patch +ipv6-mcast-always-hold-idev-lock-before-mca_lock.patch +ip_tunnels-use-skb-len-to-pmtu-check.patch +l2tp-add-missing-.owner-to-struct-pppox_proto.patch +ipip-fix-a-regression-in-ioctl.patch +ipv6-call-udp_push_pending_frames-when-uncorking-a-socket-with-af_inet-pending-data.patch +ipv6-ip6_append_data_mtu-did-not-care-about-pmtudisc-and-frag_size.patch +ipv6-rt6_check_neigh-should-successfully-verify-neigh-if-no-nud-information-are-available.patch +sfc-fix-memory-leak-when-discarding-scattered-packets.patch +net-cadence-macb-fix-bug-typo-in-extracting-gem_irq_read_clear-bit.patch +virtio-support-unlocked-queue-poll.patch +virtio_net-fix-race-in-rx-vq-processing.patch +vhost-net-fix-use-after-free-in-vhost_net_flush.patch +sunvnet-vnet_port_remove-must-call-unregister_netdev.patch +ifb-fix-rcu_sched-self-detected-stalls.patch +tuntap-correctly-linearize-skb-when-zerocopy-is-used.patch +macvtap-correctly-linearize-skb-when-zerocopy-is-used.patch +ipv6-in-case-of-link-failure-remove-route-directly-instead-of-letting-it-expire.patch +9p-fix-off-by-one-causing-access-violations-and-memory-corruption.patch +alx-fix-lockdep-annotation.patch +ipv6-fix-route-selection-if-kernel-is-not-compiled-with-config_ipv6_router_pref.patch +dummy-fix-oops-when-loading-the-dummy-failed.patch 
+ifb-fix-oops-when-loading-the-ifb-failed.patch +gre-fix-mtu-sizing-check-for-gretap-tunnels.patch +ipv6-only-static-routes-qualify-for-equal-cost-multipathing.patch +atl1e-fix-dma-mapping-warnings.patch +atl1e-unmap-partially-mapped-skb-on-dma-error-and-free-skb.patch +ipv4-set-transport-header-earlier.patch +be2net-fix-to-avoid-hardware-workaround-when-not-needed.patch +hyperv-fix-the-netif_f_sg-flag-setting-in-netvsc.patch +pkt_sched-sch_qfq-remove-a-source-of-high-packet-delay-jitter.patch +tuntap-do-not-zerocopy-if-iov-needs-more-pages-than-max_skb_frags.patch +macvtap-do-not-zerocopy-if-iov-needs-more-pages-than-max_skb_frags.patch +vlan-mask-vlan-prio-bits.patch +vlan-fix-a-race-in-egress-prio-management.patch diff --git a/queue-3.10/sfc-fix-memory-leak-when-discarding-scattered-packets.patch b/queue-3.10/sfc-fix-memory-leak-when-discarding-scattered-packets.patch new file mode 100644 index 00000000000..5e354f7d93d --- /dev/null +++ b/queue-3.10/sfc-fix-memory-leak-when-discarding-scattered-packets.patch @@ -0,0 +1,86 @@ +From 8db99edc36ca323408ba5c5bcb8952b01be50225 Mon Sep 17 00:00:00 2001 +From: Ben Hutchings +Date: Thu, 4 Jul 2013 23:48:46 +0100 +Subject: sfc: Fix memory leak when discarding scattered packets + +From: Ben Hutchings + +[ Upstream commit 734d4e159b283a4ae4d007b7e7a91d84398ccb92 ] + +Commit 2768935a4660 ('sfc: reuse pages to avoid DMA mapping/unmapping +costs') did not fully take account of DMA scattering which was +introduced immediately before. If a received packet is invalid and +must be discarded, we only drop a reference to the first buffer's +page, but we need to drop a reference for each buffer the packet +used. + +I think this bug was missed partly because efx_recycle_rx_buffers() +was not renamed and so no longer does what its name says. It does not +change the state of buffers, but only prepares the underlying pages +for recycling. Rename it accordingly. + +Signed-off-by: Ben Hutchings +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/sfc/rx.c | 27 ++++++++++++++++++++------- + 1 file changed, 20 insertions(+), 7 deletions(-) + +--- a/drivers/net/ethernet/sfc/rx.c ++++ b/drivers/net/ethernet/sfc/rx.c +@@ -282,9 +282,9 @@ static void efx_fini_rx_buffer(struct ef + } + + /* Recycle the pages that are used by buffers that have just been received. 
*/ +-static void efx_recycle_rx_buffers(struct efx_channel *channel, +- struct efx_rx_buffer *rx_buf, +- unsigned int n_frags) ++static void efx_recycle_rx_pages(struct efx_channel *channel, ++ struct efx_rx_buffer *rx_buf, ++ unsigned int n_frags) + { + struct efx_rx_queue *rx_queue = efx_channel_get_rx_queue(channel); + +@@ -294,6 +294,20 @@ static void efx_recycle_rx_buffers(struc + } while (--n_frags); + } + ++static void efx_discard_rx_packet(struct efx_channel *channel, ++ struct efx_rx_buffer *rx_buf, ++ unsigned int n_frags) ++{ ++ struct efx_rx_queue *rx_queue = efx_channel_get_rx_queue(channel); ++ ++ efx_recycle_rx_pages(channel, rx_buf, n_frags); ++ ++ do { ++ efx_free_rx_buffer(rx_buf); ++ rx_buf = efx_rx_buf_next(rx_queue, rx_buf); ++ } while (--n_frags); ++} ++ + /** + * efx_fast_push_rx_descriptors - push new RX descriptors quickly + * @rx_queue: RX descriptor queue +@@ -533,8 +547,7 @@ void efx_rx_packet(struct efx_rx_queue * + */ + if (unlikely(rx_buf->flags & EFX_RX_PKT_DISCARD)) { + efx_rx_flush_packet(channel); +- put_page(rx_buf->page); +- efx_recycle_rx_buffers(channel, rx_buf, n_frags); ++ efx_discard_rx_packet(channel, rx_buf, n_frags); + return; + } + +@@ -570,9 +583,9 @@ void efx_rx_packet(struct efx_rx_queue * + efx_sync_rx_buffer(efx, rx_buf, rx_buf->len); + } + +- /* All fragments have been DMA-synced, so recycle buffers and pages. */ ++ /* All fragments have been DMA-synced, so recycle pages. */ + rx_buf = efx_rx_buffer(rx_queue, index); +- efx_recycle_rx_buffers(channel, rx_buf, n_frags); ++ efx_recycle_rx_pages(channel, rx_buf, n_frags); + + /* Pipeline receives so that we give time for packet headers to be + * prefetched into cache. diff --git a/queue-3.10/sunvnet-vnet_port_remove-must-call-unregister_netdev.patch b/queue-3.10/sunvnet-vnet_port_remove-must-call-unregister_netdev.patch new file mode 100644 index 00000000000..39d12d62163 --- /dev/null +++ b/queue-3.10/sunvnet-vnet_port_remove-must-call-unregister_netdev.patch @@ -0,0 +1,30 @@ +From b1036ae16395f14a4e50b96bf09cc36d4bb5c802 Mon Sep 17 00:00:00 2001 +From: Dave Kleikamp +Date: Mon, 1 Jul 2013 16:49:22 -0500 +Subject: sunvnet: vnet_port_remove must call unregister_netdev + +From: Dave Kleikamp + +[ Upstream commit aabb9875d02559ab9b928cd6f259a5cc4c21a589 ] + +The missing call to unregister_netdev() leaves the interface active +after the driver is unloaded by rmmod. + +Signed-off-by: Dave Kleikamp +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/sun/sunvnet.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/drivers/net/ethernet/sun/sunvnet.c ++++ b/drivers/net/ethernet/sun/sunvnet.c +@@ -1239,6 +1239,8 @@ static int vnet_port_remove(struct vio_d + dev_set_drvdata(&vdev->dev, NULL); + + kfree(port); ++ ++ unregister_netdev(vp->dev); + } + return 0; + } diff --git a/queue-3.10/tuntap-correctly-linearize-skb-when-zerocopy-is-used.patch b/queue-3.10/tuntap-correctly-linearize-skb-when-zerocopy-is-used.patch new file mode 100644 index 00000000000..966394e6d06 --- /dev/null +++ b/queue-3.10/tuntap-correctly-linearize-skb-when-zerocopy-is-used.patch @@ -0,0 +1,56 @@ +From 4782f7d41346ac49c6aa58ee9da6a7ff896cbe4c Mon Sep 17 00:00:00 2001 +From: Jason Wang +Date: Wed, 10 Jul 2013 13:43:27 +0800 +Subject: tuntap: correctly linearize skb when zerocopy is used + +From: Jason Wang + +[ Upstream commit 3dd5c3308e8b671e8e8882ba972f51cefbe9fd0d ] + +Userspace may produce vectors greater than MAX_SKB_FRAGS. 
When we try to +linearize parts of the skb to let the rest of iov to be fit in +the frags, we need count copylen into linear when calling tun_alloc_skb() +instead of partly counting it into data_len. Since this breaks +zerocopy_sg_from_iovec() since its inner counter assumes nr_frags should +be zero at beginning. This cause nr_frags to be increased wrongly without +setting the correct frags. + +This bug were introduced from 0690899b4d4501b3505be069b9a687e68ccbe15b +(tun: experimental zero copy tx support) + +Cc: Michael S. Tsirkin +Signed-off-by: Jason Wang +Acked-by: Michael S. Tsirkin +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/tun.c | 9 ++++++--- + 1 file changed, 6 insertions(+), 3 deletions(-) + +--- a/drivers/net/tun.c ++++ b/drivers/net/tun.c +@@ -1044,7 +1044,7 @@ static ssize_t tun_get_user(struct tun_s + { + struct tun_pi pi = { 0, cpu_to_be16(ETH_P_IP) }; + struct sk_buff *skb; +- size_t len = total_len, align = NET_SKB_PAD; ++ size_t len = total_len, align = NET_SKB_PAD, linear; + struct virtio_net_hdr gso = { 0 }; + int offset = 0; + int copylen; +@@ -1108,10 +1108,13 @@ static ssize_t tun_get_user(struct tun_s + copylen = gso.hdr_len; + if (!copylen) + copylen = GOODCOPY_LEN; +- } else ++ linear = copylen; ++ } else { + copylen = len; ++ linear = gso.hdr_len; ++ } + +- skb = tun_alloc_skb(tfile, align, copylen, gso.hdr_len, noblock); ++ skb = tun_alloc_skb(tfile, align, copylen, linear, noblock); + if (IS_ERR(skb)) { + if (PTR_ERR(skb) != -EAGAIN) + tun->dev->stats.rx_dropped++; diff --git a/queue-3.10/tuntap-do-not-zerocopy-if-iov-needs-more-pages-than-max_skb_frags.patch b/queue-3.10/tuntap-do-not-zerocopy-if-iov-needs-more-pages-than-max_skb_frags.patch new file mode 100644 index 00000000000..69f05122a90 --- /dev/null +++ b/queue-3.10/tuntap-do-not-zerocopy-if-iov-needs-more-pages-than-max_skb_frags.patch @@ -0,0 +1,124 @@ +From 9055660d71ce3255b6e2f3ce0050ce722ac4e594 Mon Sep 17 00:00:00 2001 +From: Jason Wang +Date: Thu, 18 Jul 2013 10:55:15 +0800 +Subject: tuntap: do not zerocopy if iov needs more pages than MAX_SKB_FRAGS + +From: Jason Wang + +[ Upstream commit 885291761dba2bfe04df4c0f7bb75e4c920ab82e ] + +We try to linearize part of the skb when the number of iov is greater than +MAX_SKB_FRAGS. This is not enough since each single vector may occupy more than +one pages, so zerocopy_sg_fromiovec() may still fail and may break the guest +network. + +Solve this problem by calculate the pages needed for iov before trying to do +zerocopy and switch to use copy instead of zerocopy if it needs more than +MAX_SKB_FRAGS. + +This is done through introducing a new helper to count the pages for iov, and +call uarg->callback() manually when switching from zerocopy to copy to notify +vhost. + +We can do further optimization on top. + +The bug were introduced from commit 0690899b4d4501b3505be069b9a687e68ccbe15b +(tun: experimental zero copy tx support) + +Cc: Michael S. Tsirkin +Signed-off-by: Jason Wang +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/tun.c | 62 +++++++++++++++++++++++++++++++++--------------------- + 1 file changed, 38 insertions(+), 24 deletions(-) + +--- a/drivers/net/tun.c ++++ b/drivers/net/tun.c +@@ -1037,6 +1037,29 @@ static int zerocopy_sg_from_iovec(struct + return 0; + } + ++static unsigned long iov_pages(const struct iovec *iv, int offset, ++ unsigned long nr_segs) ++{ ++ unsigned long seg, base; ++ int pages = 0, len, size; ++ ++ while (nr_segs && (offset >= iv->iov_len)) { ++ offset -= iv->iov_len; ++ ++iv; ++ --nr_segs; ++ } ++ ++ for (seg = 0; seg < nr_segs; seg++) { ++ base = (unsigned long)iv[seg].iov_base + offset; ++ len = iv[seg].iov_len - offset; ++ size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT; ++ pages += size; ++ offset = 0; ++ } ++ ++ return pages; ++} ++ + /* Get packet from user space buffer */ + static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, + void *msg_control, const struct iovec *iv, +@@ -1084,32 +1107,18 @@ static ssize_t tun_get_user(struct tun_s + return -EINVAL; + } + +- if (msg_control) +- zerocopy = true; +- +- if (zerocopy) { +- /* Userspace may produce vectors with count greater than +- * MAX_SKB_FRAGS, so we need to linearize parts of the skb +- * to let the rest of data to be fit in the frags. +- */ +- if (count > MAX_SKB_FRAGS) { +- copylen = iov_length(iv, count - MAX_SKB_FRAGS); +- if (copylen < offset) +- copylen = 0; +- else +- copylen -= offset; +- } else +- copylen = 0; +- /* There are 256 bytes to be copied in skb, so there is enough +- * room for skb expand head in case it is used. ++ if (msg_control) { ++ /* There are 256 bytes to be copied in skb, so there is ++ * enough room for skb expand head in case it is used. + * The rest of the buffer is mapped from userspace. + */ +- if (copylen < gso.hdr_len) +- copylen = gso.hdr_len; +- if (!copylen) +- copylen = GOODCOPY_LEN; ++ copylen = gso.hdr_len ? gso.hdr_len : GOODCOPY_LEN; + linear = copylen; +- } else { ++ if (iov_pages(iv, offset + copylen, count) <= MAX_SKB_FRAGS) ++ zerocopy = true; ++ } ++ ++ if (!zerocopy) { + copylen = len; + linear = gso.hdr_len; + } +@@ -1123,8 +1132,13 @@ static ssize_t tun_get_user(struct tun_s + + if (zerocopy) + err = zerocopy_sg_from_iovec(skb, iv, offset, count); +- else ++ else { + err = skb_copy_datagram_from_iovec(skb, 0, iv, offset, len); ++ if (!err && msg_control) { ++ struct ubuf_info *uarg = msg_control; ++ uarg->callback(uarg, false); ++ } ++ } + + if (err) { + tun->dev->stats.rx_dropped++; diff --git a/queue-3.10/vhost-net-fix-use-after-free-in-vhost_net_flush.patch b/queue-3.10/vhost-net-fix-use-after-free-in-vhost_net_flush.patch new file mode 100644 index 00000000000..c70bdead4d2 --- /dev/null +++ b/queue-3.10/vhost-net-fix-use-after-free-in-vhost_net_flush.patch @@ -0,0 +1,59 @@ +From d0347c6cbf229fe352006a5463eb2d0cb2150afb Mon Sep 17 00:00:00 2001 +From: "Michael S. Tsirkin" +Date: Tue, 25 Jun 2013 17:29:46 +0300 +Subject: vhost-net: fix use-after-free in vhost_net_flush + +From: "Michael S. Tsirkin" + +[ Upstream commit c38e39c378f46f00ce922dd40a91043a9925c28d ] + +vhost_net_ubuf_put_and_wait has a confusing name: +it will actually also free it's argument. +Thus since commit 1280c27f8e29acf4af2da914e80ec27c3dbd5c01 + "vhost-net: flush outstanding DMAs on memory change" +vhost_net_flush tries to use the argument after passing it +to vhost_net_ubuf_put_and_wait, this results +in use after free. 
+To fix, don't free the argument in vhost_net_ubuf_put_and_wait, +add an new API for callers that want to free ubufs. + +Acked-by: Asias He +Acked-by: Jason Wang +Signed-off-by: Michael S. Tsirkin +Signed-off-by: Greg Kroah-Hartman +--- + drivers/vhost/net.c | 9 +++++++-- + 1 file changed, 7 insertions(+), 2 deletions(-) + +--- a/drivers/vhost/net.c ++++ b/drivers/vhost/net.c +@@ -150,6 +150,11 @@ static void vhost_net_ubuf_put_and_wait( + { + kref_put(&ubufs->kref, vhost_net_zerocopy_done_signal); + wait_event(ubufs->wait, !atomic_read(&ubufs->kref.refcount)); ++} ++ ++static void vhost_net_ubuf_put_wait_and_free(struct vhost_net_ubuf_ref *ubufs) ++{ ++ vhost_net_ubuf_put_and_wait(ubufs); + kfree(ubufs); + } + +@@ -948,7 +953,7 @@ static long vhost_net_set_backend(struct + mutex_unlock(&vq->mutex); + + if (oldubufs) { +- vhost_net_ubuf_put_and_wait(oldubufs); ++ vhost_net_ubuf_put_wait_and_free(oldubufs); + mutex_lock(&vq->mutex); + vhost_zerocopy_signal_used(n, vq); + mutex_unlock(&vq->mutex); +@@ -966,7 +971,7 @@ err_used: + rcu_assign_pointer(vq->private_data, oldsock); + vhost_net_enable_vq(n, vq); + if (ubufs) +- vhost_net_ubuf_put_and_wait(ubufs); ++ vhost_net_ubuf_put_wait_and_free(ubufs); + err_ubufs: + fput(sock->file); + err_vq: diff --git a/queue-3.10/virtio-support-unlocked-queue-poll.patch b/queue-3.10/virtio-support-unlocked-queue-poll.patch new file mode 100644 index 00000000000..4e16d11dbef --- /dev/null +++ b/queue-3.10/virtio-support-unlocked-queue-poll.patch @@ -0,0 +1,120 @@ +From 3af0cf8b6b161daea120a84ad3d525a121670947 Mon Sep 17 00:00:00 2001 +From: "Michael S. Tsirkin" +Date: Tue, 9 Jul 2013 13:19:18 +0300 +Subject: virtio: support unlocked queue poll + +From: "Michael S. Tsirkin" + +[ Upstream commit cc229884d3f77ec3b1240e467e0236c3e0647c0c ] + +This adds a way to check ring empty state after enable_cb outside any +locks. Will be used by virtio_net. + +Note: there's room for more optimization: caller is likely to have a +memory barrier already, which means we might be able to get rid of a +barrier here. Deferring this optimization until we do some +benchmarking. + +Signed-off-by: Michael S. Tsirkin +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/virtio/virtio_ring.c | 56 +++++++++++++++++++++++++++++++++---------- + include/linux/virtio.h | 4 +++ + 2 files changed, 48 insertions(+), 12 deletions(-) + +--- a/drivers/virtio/virtio_ring.c ++++ b/drivers/virtio/virtio_ring.c +@@ -607,19 +607,21 @@ void virtqueue_disable_cb(struct virtque + EXPORT_SYMBOL_GPL(virtqueue_disable_cb); + + /** +- * virtqueue_enable_cb - restart callbacks after disable_cb. ++ * virtqueue_enable_cb_prepare - restart callbacks after disable_cb + * @vq: the struct virtqueue we're talking about. + * +- * This re-enables callbacks; it returns "false" if there are pending +- * buffers in the queue, to detect a possible race between the driver +- * checking for more work, and enabling callbacks. ++ * This re-enables callbacks; it returns current queue state ++ * in an opaque unsigned value. This value should be later tested by ++ * virtqueue_poll, to detect a possible race between the driver checking for ++ * more work, and enabling callbacks. + * + * Caller must ensure we don't call this with other virtqueue + * operations at the same time (except where noted). 
+ */ +-bool virtqueue_enable_cb(struct virtqueue *_vq) ++unsigned virtqueue_enable_cb_prepare(struct virtqueue *_vq) + { + struct vring_virtqueue *vq = to_vvq(_vq); ++ u16 last_used_idx; + + START_USE(vq); + +@@ -629,15 +631,45 @@ bool virtqueue_enable_cb(struct virtqueu + * either clear the flags bit or point the event index at the next + * entry. Always do both to keep code simple. */ + vq->vring.avail->flags &= ~VRING_AVAIL_F_NO_INTERRUPT; +- vring_used_event(&vq->vring) = vq->last_used_idx; ++ vring_used_event(&vq->vring) = last_used_idx = vq->last_used_idx; ++ END_USE(vq); ++ return last_used_idx; ++} ++EXPORT_SYMBOL_GPL(virtqueue_enable_cb_prepare); ++ ++/** ++ * virtqueue_poll - query pending used buffers ++ * @vq: the struct virtqueue we're talking about. ++ * @last_used_idx: virtqueue state (from call to virtqueue_enable_cb_prepare). ++ * ++ * Returns "true" if there are pending used buffers in the queue. ++ * ++ * This does not need to be serialized. ++ */ ++bool virtqueue_poll(struct virtqueue *_vq, unsigned last_used_idx) ++{ ++ struct vring_virtqueue *vq = to_vvq(_vq); ++ + virtio_mb(vq->weak_barriers); +- if (unlikely(more_used(vq))) { +- END_USE(vq); +- return false; +- } ++ return (u16)last_used_idx != vq->vring.used->idx; ++} ++EXPORT_SYMBOL_GPL(virtqueue_poll); + +- END_USE(vq); +- return true; ++/** ++ * virtqueue_enable_cb - restart callbacks after disable_cb. ++ * @vq: the struct virtqueue we're talking about. ++ * ++ * This re-enables callbacks; it returns "false" if there are pending ++ * buffers in the queue, to detect a possible race between the driver ++ * checking for more work, and enabling callbacks. ++ * ++ * Caller must ensure we don't call this with other virtqueue ++ * operations at the same time (except where noted). ++ */ ++bool virtqueue_enable_cb(struct virtqueue *_vq) ++{ ++ unsigned last_used_idx = virtqueue_enable_cb_prepare(_vq); ++ return !virtqueue_poll(_vq, last_used_idx); + } + EXPORT_SYMBOL_GPL(virtqueue_enable_cb); + +--- a/include/linux/virtio.h ++++ b/include/linux/virtio.h +@@ -70,6 +70,10 @@ void virtqueue_disable_cb(struct virtque + + bool virtqueue_enable_cb(struct virtqueue *vq); + ++unsigned virtqueue_enable_cb_prepare(struct virtqueue *vq); ++ ++bool virtqueue_poll(struct virtqueue *vq, unsigned); ++ + bool virtqueue_enable_cb_delayed(struct virtqueue *vq); + + void *virtqueue_detach_unused_buf(struct virtqueue *vq); diff --git a/queue-3.10/virtio_net-fix-race-in-rx-vq-processing.patch b/queue-3.10/virtio_net-fix-race-in-rx-vq-processing.patch new file mode 100644 index 00000000000..0546e800dd6 --- /dev/null +++ b/queue-3.10/virtio_net-fix-race-in-rx-vq-processing.patch @@ -0,0 +1,59 @@ +From e6a032bca44cd54a168939ee66be707c9b679bec Mon Sep 17 00:00:00 2001 +From: "Michael S. Tsirkin" +Date: Tue, 9 Jul 2013 08:13:04 +0300 +Subject: virtio_net: fix race in RX VQ processing + +From: "Michael S. Tsirkin" + +[ Upstream commit cbdadbbf0c790f79350a8f36029208944c5487d0 ] + +virtio net called virtqueue_enable_cq on RX path after napi_complete, so +with NAPI_STATE_SCHED clear - outside the implicit napi lock. +This violates the requirement to synchronize virtqueue_enable_cq wrt +virtqueue_add_buf. In particular, used event can move backwards, +causing us to lose interrupts. +In a debug build, this can trigger panic within START_USE. + +Jason Wang reports that he can trigger the races artificially, +by adding udelay() in virtqueue_enable_cb() after virtio_mb(). 
+ +However, we must call napi_complete to clear NAPI_STATE_SCHED before +polling the virtqueue for used buffers, otherwise napi_schedule_prep in +a callback will fail, causing us to lose RX events. + +To fix, call virtqueue_enable_cb_prepare with NAPI_STATE_SCHED +set (under napi lock), later call virtqueue_poll with +NAPI_STATE_SCHED clear (outside the lock). + +Reported-by: Jason Wang +Tested-by: Jason Wang +Acked-by: Jason Wang +Signed-off-by: Michael S. Tsirkin +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/virtio_net.c | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +--- a/drivers/net/virtio_net.c ++++ b/drivers/net/virtio_net.c +@@ -602,7 +602,7 @@ static int virtnet_poll(struct napi_stru + container_of(napi, struct receive_queue, napi); + struct virtnet_info *vi = rq->vq->vdev->priv; + void *buf; +- unsigned int len, received = 0; ++ unsigned int r, len, received = 0; + + again: + while (received < budget && +@@ -619,8 +619,9 @@ again: + + /* Out of packets? */ + if (received < budget) { ++ r = virtqueue_enable_cb_prepare(rq->vq); + napi_complete(napi); +- if (unlikely(!virtqueue_enable_cb(rq->vq)) && ++ if (unlikely(virtqueue_poll(rq->vq, r)) && + napi_schedule_prep(napi)) { + virtqueue_disable_cb(rq->vq); + __napi_schedule(napi); diff --git a/queue-3.10/vlan-fix-a-race-in-egress-prio-management.patch b/queue-3.10/vlan-fix-a-race-in-egress-prio-management.patch new file mode 100644 index 00000000000..19f79199f33 --- /dev/null +++ b/queue-3.10/vlan-fix-a-race-in-egress-prio-management.patch @@ -0,0 +1,47 @@ +From d766645d1d1f64631ef50df36c47c37bded82051 Mon Sep 17 00:00:00 2001 +From: Eric Dumazet +Date: Thu, 18 Jul 2013 09:35:10 -0700 +Subject: vlan: fix a race in egress prio management + +From: Eric Dumazet + +[ Upstream commit 3e3aac497513c669e1c62c71e1d552ea85c1d974 ] + +egress_priority_map[] hash table updates are protected by rtnl, +and we never remove elements until device is dismantled. + +We have to make sure that before inserting an new element in hash table, +all its fields are committed to memory or else another cpu could +find corrupt values and crash. + +Signed-off-by: Eric Dumazet +Cc: Patrick McHardy +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/8021q/vlan_dev.c | 7 +++++++ + 1 file changed, 7 insertions(+) + +--- a/net/8021q/vlan_dev.c ++++ b/net/8021q/vlan_dev.c +@@ -73,6 +73,8 @@ vlan_dev_get_egress_qos_mask(struct net_ + { + struct vlan_priority_tci_mapping *mp; + ++ smp_rmb(); /* coupled with smp_wmb() in vlan_dev_set_egress_priority() */ ++ + mp = vlan_dev_priv(dev)->egress_priority_map[(skb->priority & 0xF)]; + while (mp) { + if (mp->priority == skb->priority) { +@@ -249,6 +251,11 @@ int vlan_dev_set_egress_priority(const s + np->next = mp; + np->priority = skb_prio; + np->vlan_qos = vlan_qos; ++ /* Before inserting this element in hash table, make sure all its fields ++ * are committed to memory. 
++ * coupled with smp_rmb() in vlan_dev_get_egress_qos_mask() ++ */ ++ smp_wmb(); + vlan->egress_priority_map[skb_prio & 0xF] = np; + if (vlan_qos) + vlan->nr_egress_mappings++; diff --git a/queue-3.10/vlan-mask-vlan-prio-bits.patch b/queue-3.10/vlan-mask-vlan-prio-bits.patch new file mode 100644 index 00000000000..9af81e4f603 --- /dev/null +++ b/queue-3.10/vlan-mask-vlan-prio-bits.patch @@ -0,0 +1,93 @@ +From d001214123790aea1c3e77dd0b92136f0443a93a Mon Sep 17 00:00:00 2001 +From: Eric Dumazet +Date: Thu, 18 Jul 2013 07:19:26 -0700 +Subject: vlan: mask vlan prio bits + +From: Eric Dumazet + +[ Upstream commit d4b812dea4a236f729526facf97df1a9d18e191c ] + +In commit 48cc32d38a52d0b68f91a171a8d00531edc6a46e +("vlan: don't deliver frames for unknown vlans to protocols") +Florian made sure we set pkt_type to PACKET_OTHERHOST +if the vlan id is set and we could find a vlan device for this +particular id. + +But we also have a problem if prio bits are set. + +Steinar reported an issue on a router receiving IPv6 frames with a +vlan tag of 4000 (id 0, prio 2), and tunneled into a sit device, +because skb->vlan_tci is set. + +Forwarded frame is completely corrupted : We can see (8100:4000) +being inserted in the middle of IPv6 source address : + +16:48:00.780413 IP6 2001:16d8:8100:4000:ee1c:0:9d9:bc87 > +9f94:4d95:2001:67c:29f4::: ICMP6, unknown icmp6 type (0), length 64 + 0x0000: 0000 0029 8000 c7c3 7103 0001 a0ae e651 + 0x0010: 0000 0000 ccce 0b00 0000 0000 1011 1213 + 0x0020: 1415 1617 1819 1a1b 1c1d 1e1f 2021 2223 + 0x0030: 2425 2627 2829 2a2b 2c2d 2e2f 3031 3233 + +It seems we are not really ready to properly cope with this right now. + +We can probably do better in future kernels : +vlan_get_ingress_priority() should be a netdev property instead of +a per vlan_dev one. + +For stable kernels, lets clear vlan_tci to fix the bugs. + +Reported-by: Steinar H. Gunderson +Signed-off-by: Eric Dumazet +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/if_vlan.h | 3 +-- + net/8021q/vlan_core.c | 2 +- + net/core/dev.c | 11 +++++++++-- + 3 files changed, 11 insertions(+), 5 deletions(-) + +--- a/include/linux/if_vlan.h ++++ b/include/linux/if_vlan.h +@@ -79,9 +79,8 @@ static inline int is_vlan_dev(struct net + } + + #define vlan_tx_tag_present(__skb) ((__skb)->vlan_tci & VLAN_TAG_PRESENT) +-#define vlan_tx_nonzero_tag_present(__skb) \ +- (vlan_tx_tag_present(__skb) && ((__skb)->vlan_tci & VLAN_VID_MASK)) + #define vlan_tx_tag_get(__skb) ((__skb)->vlan_tci & ~VLAN_TAG_PRESENT) ++#define vlan_tx_tag_get_id(__skb) ((__skb)->vlan_tci & VLAN_VID_MASK) + + #if defined(CONFIG_VLAN_8021Q) || defined(CONFIG_VLAN_8021Q_MODULE) + +--- a/net/8021q/vlan_core.c ++++ b/net/8021q/vlan_core.c +@@ -9,7 +9,7 @@ bool vlan_do_receive(struct sk_buff **sk + { + struct sk_buff *skb = *skbp; + __be16 vlan_proto = skb->vlan_proto; +- u16 vlan_id = skb->vlan_tci & VLAN_VID_MASK; ++ u16 vlan_id = vlan_tx_tag_get_id(skb); + struct net_device *vlan_dev; + struct vlan_pcpu_stats *rx_stats; + +--- a/net/core/dev.c ++++ b/net/core/dev.c +@@ -3513,8 +3513,15 @@ ncls: + } + } + +- if (vlan_tx_nonzero_tag_present(skb)) +- skb->pkt_type = PACKET_OTHERHOST; ++ if (unlikely(vlan_tx_tag_present(skb))) { ++ if (vlan_tx_tag_get_id(skb)) ++ skb->pkt_type = PACKET_OTHERHOST; ++ /* Note: we might in the future use prio bits ++ * and set skb->priority like in vlan_do_receive() ++ * For the time being, just ignore Priority Code Point ++ */ ++ skb->vlan_tci = 0; ++ } + + /* deliver only exact match when indicated */ + null_or_dev = deliver_exact ? skb->dev : NULL; diff --git a/queue-3.10/vti-remove-duplicated-code-to-fix-a-memory-leak.patch b/queue-3.10/vti-remove-duplicated-code-to-fix-a-memory-leak.patch new file mode 100644 index 00000000000..37ea14bdb11 --- /dev/null +++ b/queue-3.10/vti-remove-duplicated-code-to-fix-a-memory-leak.patch @@ -0,0 +1,47 @@ +From 9df2226e2e019b405e6320599a6c07ef1e4be799 Mon Sep 17 00:00:00 2001 +From: Cong Wang +Date: Sat, 29 Jun 2013 13:00:57 +0800 +Subject: vti: remove duplicated code to fix a memory leak + +From: Cong Wang + +[ Upstream commit ab6c7a0a43c2eaafa57583822b619b22637b49c7 ] + +vti module allocates dev->tstats twice: in vti_fb_tunnel_init() +and in vti_tunnel_init(), this lead to a memory leak of +dev->tstats. + +Just remove the duplicated operations in vti_fb_tunnel_init(). + +(candidate for -stable) + +Signed-off-by: Cong Wang +Cc: Stephen Hemminger +Cc: Saurabh Mohan +Acked-by: Stephen Hemminger +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/ip_vti.c | 7 ------- + 1 file changed, 7 deletions(-) + +--- a/net/ipv4/ip_vti.c ++++ b/net/ipv4/ip_vti.c +@@ -606,17 +606,10 @@ static int __net_init vti_fb_tunnel_init + struct iphdr *iph = &tunnel->parms.iph; + struct vti_net *ipn = net_generic(dev_net(dev), vti_net_id); + +- tunnel->dev = dev; +- strcpy(tunnel->parms.name, dev->name); +- + iph->version = 4; + iph->protocol = IPPROTO_IPIP; + iph->ihl = 5; + +- dev->tstats = alloc_percpu(struct pcpu_tstats); +- if (!dev->tstats) +- return -ENOMEM; +- + dev_hold(dev); + rcu_assign_pointer(ipn->tunnels_wc[0], tunnel); + return 0; diff --git a/queue-3.10/x25-fix-broken-locking-in-ioctl-error-paths.patch b/queue-3.10/x25-fix-broken-locking-in-ioctl-error-paths.patch new file mode 100644 index 00000000000..e22ba9a3e42 --- /dev/null +++ b/queue-3.10/x25-fix-broken-locking-in-ioctl-error-paths.patch @@ -0,0 +1,64 @@ +From ebae8ce31e1b43d3bcf62d5e906cc9ece42428ab Mon Sep 17 00:00:00 2001 +From: Dave Jones +Date: Fri, 28 Jun 2013 12:13:52 -0400 +Subject: x25: Fix broken locking in ioctl error paths. + +From: Dave Jones + +[ Upstream commit 4ccb93ce7439b63c31bc7597bfffd13567fa483d ] + +Two of the x25 ioctl cases have error paths that break out of the function without +unlocking the socket, leading to this warning: + +================================================ +[ BUG: lock held when returning to user space! ] +3.10.0-rc7+ #36 Not tainted +------------------------------------------------ +trinity-child2/31407 is leaving the kernel with locks still held! +1 lock held by trinity-child2/31407: + #0: (sk_lock-AF_X25){+.+.+.}, at: [] x25_ioctl+0x8a/0x740 [x25] + +Signed-off-by: Dave Jones +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/x25/af_x25.c | 15 ++++++++------- + 1 file changed, 8 insertions(+), 7 deletions(-) + +--- a/net/x25/af_x25.c ++++ b/net/x25/af_x25.c +@@ -1583,11 +1583,11 @@ out_cud_release: + case SIOCX25CALLACCPTAPPRV: { + rc = -EINVAL; + lock_sock(sk); +- if (sk->sk_state != TCP_CLOSE) +- break; +- clear_bit(X25_ACCPT_APPRV_FLAG, &x25->flags); ++ if (sk->sk_state == TCP_CLOSE) { ++ clear_bit(X25_ACCPT_APPRV_FLAG, &x25->flags); ++ rc = 0; ++ } + release_sock(sk); +- rc = 0; + break; + } + +@@ -1595,14 +1595,15 @@ out_cud_release: + rc = -EINVAL; + lock_sock(sk); + if (sk->sk_state != TCP_ESTABLISHED) +- break; ++ goto out_sendcallaccpt_release; + /* must call accptapprv above */ + if (test_bit(X25_ACCPT_APPRV_FLAG, &x25->flags)) +- break; ++ goto out_sendcallaccpt_release; + x25_write_internal(sk, X25_CALL_ACCEPTED); + x25->state = X25_STATE_3; +- release_sock(sk); + rc = 0; ++out_sendcallaccpt_release: ++ release_sock(sk); + break; + } +
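
For reference, the second of the two fixed ioctl cases as it reads with this change applied, reconstructed from the hunk above (the SIOCX25SENDCALLACCPT case label itself lies outside the hunk's context lines, but follows from the out_sendcallaccpt_release name); every exit now funnels through the single release_sock():

	case SIOCX25SENDCALLACCPT: {
		rc = -EINVAL;
		lock_sock(sk);
		if (sk->sk_state != TCP_ESTABLISHED)
			goto out_sendcallaccpt_release;
		/* must call accptapprv above */
		if (test_bit(X25_ACCPT_APPRV_FLAG, &x25->flags))
			goto out_sendcallaccpt_release;
		x25_write_internal(sk, X25_CALL_ACCEPTED);
		x25->state = X25_STATE_3;
		rc = 0;
out_sendcallaccpt_release:
		release_sock(sk);
		break;
	}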