From: Greg Kroah-Hartman Date: Thu, 10 Sep 2009 22:11:24 +0000 (-0700) Subject: start .30 queue up X-Git-Tag: v2.6.27.34~6 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=3139eb1afecb156dbadd0bb19f6b28c521692196;p=thirdparty%2Fkernel%2Fstable-queue.git start .30 queue up --- diff --git a/queue-2.6.30/dccp-missing-destroy-of-percpu-counter-variable-while-unload-module.patch b/queue-2.6.30/dccp-missing-destroy-of-percpu-counter-variable-while-unload-module.patch new file mode 100644 index 00000000000..e6c0f84ae58 --- /dev/null +++ b/queue-2.6.30/dccp-missing-destroy-of-percpu-counter-variable-while-unload-module.patch @@ -0,0 +1,57 @@ +From e526b3b07e9d1ab5726a7fb73da788bdbbf73c31 Mon Sep 17 00:00:00 2001 +From: Wei Yongjun +Date: Tue, 4 Aug 2009 21:44:39 +0000 +Subject: dccp: missing destroy of percpu counter variable while unload module + +From: Wei Yongjun + +[ Upstream commit 476181cb05c6a3aea3ef42309388e255c934a06f ] + +percpu counter dccp_orphan_count is init in dccp_init() by +percpu_counter_init() while dccp module is loaded, but the +destroy of it is missing while dccp module is unloaded. We +can get the kernel WARNING about this. Reproduct by the +following commands: + + $ modprobe dccp + $ rmmod dccp + $ modprobe dccp + +WARNING: at lib/list_debug.c:26 __list_add+0x27/0x5c() +Hardware name: VMware Virtual Platform +list_add corruption. next->prev should be prev (c080c0c4), but was (null). (next +=ca7188cc). +Modules linked in: dccp(+) nfsd lockd nfs_acl auth_rpcgss exportfs sunrpc +Pid: 1956, comm: modprobe Not tainted 2.6.31-rc5 #55 +Call Trace: + [] warn_slowpath_common+0x6a/0x81 + [] ? __list_add+0x27/0x5c + [] warn_slowpath_fmt+0x29/0x2c + [] __list_add+0x27/0x5c + [] __percpu_counter_init+0x4d/0x5d + [] dccp_init+0x19/0x2ed [dccp] + [] do_one_initcall+0x4f/0x111 + [] ? dccp_init+0x0/0x2ed [dccp] + [] ? notifier_call_chain+0x26/0x48 + [] ? 
__blocking_notifier_call_chain+0x45/0x51 + [] sys_init_module+0xac/0x1bd + [] sysenter_do_call+0x12/0x22 + +Signed-off-by: Wei Yongjun +Acked-by: Eric Dumazet +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/dccp/proto.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/net/dccp/proto.c ++++ b/net/dccp/proto.c +@@ -1159,6 +1159,7 @@ static void __exit dccp_fini(void) + kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep); + dccp_ackvec_exit(); + dccp_sysctl_exit(); ++ percpu_counter_destroy(&dccp_orphan_count); + } + + module_init(dccp_init); diff --git a/queue-2.6.30/e100-fix-interaction-with-swiotlb-on-x86.patch b/queue-2.6.30/e100-fix-interaction-with-swiotlb-on-x86.patch new file mode 100644 index 00000000000..7efc4550bf0 --- /dev/null +++ b/queue-2.6.30/e100-fix-interaction-with-swiotlb-on-x86.patch @@ -0,0 +1,42 @@ +From 7768af80d97e6ab0ee11c7bf8b2b3b7ec7611581 Mon Sep 17 00:00:00 2001 +From: Krzysztof Hałasa +Date: Sun, 23 Aug 2009 19:02:13 -0700 +Subject: E100: fix interaction with swiotlb on X86. + +From: Krzysztof Hałasa + +[ Upstream commit 6ff9c2e7fa8ca63a575792534b63c5092099c286 ] + +E100 places its RX packet descriptors inside skb->data and uses them +with bidirectional streaming DMA mapping. Data in descriptors is +accessed simultaneously by the chip (writing status and size when +a packet is received) and CPU (reading to check if the packet was +received). This isn't a valid usage of PCI DMA API, which requires use +of the coherent (consistent) memory for such purpose. Unfortunately e100 +chips working in "simplified" RX mode have to store received data +directly after the descriptor. Fixing the driver to conform to the API +would require using unsupported "flexible" RX mode or receiving data +into a coherent memory and using CPU to copy it to network buffers. 
+ +This patch, while not yet making the driver conform to the PCI DMA API, +allows it to work correctly on X86 with swiotlb (while not breaking +other architectures). + +Signed-off-by: Krzysztof Hałasa +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/e100.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/drivers/net/e100.c ++++ b/drivers/net/e100.c +@@ -1764,7 +1764,7 @@ static int e100_rx_indicate(struct nic * + nic->ru_running = RU_SUSPENDED; + pci_dma_sync_single_for_device(nic->pdev, rx->dma_addr, + sizeof(struct rfd), +- PCI_DMA_BIDIRECTIONAL); ++ PCI_DMA_FROMDEVICE); + return -ENODATA; + } + diff --git a/queue-2.6.30/gre-fix-mtu-calculation-for-bound-gre-tunnels.patch b/queue-2.6.30/gre-fix-mtu-calculation-for-bound-gre-tunnels.patch new file mode 100644 index 00000000000..3be8b78158a --- /dev/null +++ b/queue-2.6.30/gre-fix-mtu-calculation-for-bound-gre-tunnels.patch @@ -0,0 +1,32 @@ +From 40c8613fe3e601d957d820c3fa4b759fc285939f Mon Sep 17 00:00:00 2001 +From: Tom Goff +Date: Fri, 14 Aug 2009 16:33:56 -0700 +Subject: gre: Fix MTU calculation for bound GRE tunnels + +From: Tom Goff + +[ Upstream commit 8cdb045632e5ee22854538619ac6f150eb0a4894 ] + +The GRE header length should be subtracted when the tunnel MTU is +calculated. This just corrects for the associativity change +introduced by commit 42aa916265d740d66ac1f17290366e9494c884c2 +("gre: Move MTU setting out of ipgre_tunnel_bind_dev"). + +Signed-off-by: Tom Goff +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/ip_gre.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/net/ipv4/ip_gre.c ++++ b/net/ipv4/ip_gre.c +@@ -952,7 +952,7 @@ static int ipgre_tunnel_bind_dev(struct + addend += 4; + } + dev->needed_headroom = addend + hlen; +- mtu -= dev->hard_header_len - addend; ++ mtu -= dev->hard_header_len + addend; + + if (mtu < 68) + mtu = 68; diff --git a/queue-2.6.30/net-net_assign_generic-fix.patch b/queue-2.6.30/net-net_assign_generic-fix.patch new file mode 100644 index 00000000000..01d34a5cf6a --- /dev/null +++ b/queue-2.6.30/net-net_assign_generic-fix.patch @@ -0,0 +1,31 @@ +From a76437ff1a8cd2d155baeda4e548c81e0315b541 Mon Sep 17 00:00:00 2001 +From: Eric Dumazet +Date: Tue, 28 Jul 2009 02:36:15 +0000 +Subject: net: net_assign_generic() fix + +From: Eric Dumazet + +[ Upstream commit 144586301f6af5ae5943a002f030d8c626fa4fdd ] + +memcpy() should take into account size of pointers, +not only number of pointers to copy. + +Signed-off-by: Eric Dumazet +Acked-by: Pavel Emelyanov +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/core/net_namespace.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/net/core/net_namespace.c ++++ b/net/core/net_namespace.c +@@ -498,7 +498,7 @@ int net_assign_generic(struct net *net, + */ + + ng->len = id; +- memcpy(&ng->ptr, &old_ng->ptr, old_ng->len); ++ memcpy(&ng->ptr, &old_ng->ptr, old_ng->len * sizeof(void*)); + + rcu_assign_pointer(net->gen, ng); + call_rcu(&old_ng->rcu, net_generic_release); diff --git a/queue-2.6.30/ppp-fix-lost-fragments-in-ppp_mp_explode.patch b/queue-2.6.30/ppp-fix-lost-fragments-in-ppp_mp_explode.patch new file mode 100644 index 00000000000..5a151199a51 --- /dev/null +++ b/queue-2.6.30/ppp-fix-lost-fragments-in-ppp_mp_explode.patch @@ -0,0 +1,105 @@ +From c5b89d56629098cfdd9ae024a434fd781564c50e Mon Sep 17 00:00:00 2001 +From: Ben McKeegan +Date: Tue, 28 Jul 2009 07:43:57 +0000 +Subject: ppp: fix lost fragments in ppp_mp_explode() (resubmit) + +From: Ben McKeegan + +[ Upstream commit a53a8b56827cc429c6d9f861ad558beeb5f6103f ] + +This patch fixes the corner cases where the sum of MTU of the free +channels (adjusted for fragmentation overheads) is less than the MTU +of PPP link. There are at least 3 situations where this case might +arise: + +- some of the channels are busy + +- the multilink session is running in a degraded state (i.e. with less +than its full complement of active channels) + +- by design, where multilink protocol is being used to artificially +increase the effective link MTU of a single link. + +Without this patch, at most 1 fragment is ever sent per free channel +for a given PPP frame and any remaining part of the PPP frame that +does not fit into those fragments is silently discarded. + +This patch restores the original behaviour which was broken by commit +9c705260feea6ae329bc6b6d5f6d2ef0227eda0a 'ppp:ppp_mp_explode() +redesign'. 
Once all 'free' channels have been given a fragment, an +additional fragment is queued to each available channel in turn, as many +times as necessary, until the entire PPP frame has been consumed. + +Signed-off-by: Ben McKeegan +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ppp_generic.c | 34 ++++++++++++++++++---------------- + 1 file changed, 18 insertions(+), 16 deletions(-) + +--- a/drivers/net/ppp_generic.c ++++ b/drivers/net/ppp_generic.c +@@ -1383,7 +1383,7 @@ static int ppp_mp_explode(struct ppp *pp + + /* create a fragment for each channel */ + bits = B; +- while (nfree > 0 && len > 0) { ++ while (len > 0) { + list = list->next; + if (list == &ppp->channels) { + i = 0; +@@ -1430,29 +1430,31 @@ static int ppp_mp_explode(struct ppp *pp + *otherwise divide it according to the speed + *of the channel we are going to transmit on + */ +- if (pch->speed == 0) { +- flen = totlen/nfree ; +- if (nbigger > 0) { +- flen++; +- nbigger--; +- } +- } else { +- flen = (((totfree - nzero)*(totlen + hdrlen*totfree)) / +- ((totspeed*totfree)/pch->speed)) - hdrlen; +- if (nbigger > 0) { +- flen += ((totfree - nzero)*pch->speed)/totspeed; +- nbigger -= ((totfree - nzero)*pch->speed)/ ++ if (nfree > 0) { ++ if (pch->speed == 0) { ++ flen = totlen/nfree ; ++ if (nbigger > 0) { ++ flen++; ++ nbigger--; ++ } ++ } else { ++ flen = (((totfree - nzero)*(totlen + hdrlen*totfree)) / ++ ((totspeed*totfree)/pch->speed)) - hdrlen; ++ if (nbigger > 0) { ++ flen += ((totfree - nzero)*pch->speed)/totspeed; ++ nbigger -= ((totfree - nzero)*pch->speed)/ + totspeed; ++ } + } ++ nfree--; + } +- nfree--; + + /* + *check if we are on the last channel or + *we exceded the lenght of the data to + *fragment + */ +- if ((nfree == 0) || (flen > len)) ++ if ((nfree <= 0) || (flen > len)) + flen = len; + /* + *it is not worth to tx on slow channels: +@@ -1466,7 +1468,7 @@ static int ppp_mp_explode(struct ppp *pp + continue; + } + +- mtu = pch->chan->mtu + 2 - hdrlen; 
++ mtu = pch->chan->mtu - hdrlen; + if (mtu < 4) + mtu = 4; + if (flen > mtu) diff --git a/queue-2.6.30/pppol2tp-calls-unregister_pernet_gen_device-at-unload-time.patch b/queue-2.6.30/pppol2tp-calls-unregister_pernet_gen_device-at-unload-time.patch new file mode 100644 index 00000000000..a07abfc8c2e --- /dev/null +++ b/queue-2.6.30/pppol2tp-calls-unregister_pernet_gen_device-at-unload-time.patch @@ -0,0 +1,30 @@ +From 2ca97fbae7af87206fdbdc8112651f9fe963099d Mon Sep 17 00:00:00 2001 +From: Eric Dumazet +Date: Tue, 28 Jul 2009 03:47:39 +0000 +Subject: pppol2tp: calls unregister_pernet_gen_device() at unload time + +From: Eric Dumazet + +[ Upstream commit 446e72f30eca76d6f9a1a54adf84d2c6ba2831f8 ] + +Failure to call unregister_pernet_gen_device() can exhaust memory +if module is loaded/unloaded many times. + +Signed-off-by: Eric Dumazet +Acked-by: Cyrill Gorcunov +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/pppol2tp.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/drivers/net/pppol2tp.c ++++ b/drivers/net/pppol2tp.c +@@ -2682,6 +2682,7 @@ out_unregister_pppol2tp_proto: + static void __exit pppol2tp_exit(void) + { + unregister_pppox_proto(PX_PROTO_OL2TP); ++ unregister_pernet_gen_device(pppol2tp_net_id, &pppol2tp_net_ops); + proto_unregister(&pppol2tp_sk_proto); + } + diff --git a/queue-2.6.30/series b/queue-2.6.30/series new file mode 100644 index 00000000000..6481d94f00d --- /dev/null +++ b/queue-2.6.30/series @@ -0,0 +1,10 @@ +dccp-missing-destroy-of-percpu-counter-variable-while-unload-module.patch +e100-fix-interaction-with-swiotlb-on-x86.patch +gre-fix-mtu-calculation-for-bound-gre-tunnels.patch +ppp-fix-lost-fragments-in-ppp_mp_explode.patch +pppol2tp-calls-unregister_pernet_gen_device-at-unload-time.patch +net-net_assign_generic-fix.patch +sparc64-kill-spurious-nmi-watchdog-triggers-by-increasing-limit-to-30-seconds.patch +sparc64-validate-linear-d-tlb-misses.patch +sparc64-fix-bootup-with-mcount-in-some-configs.patch 
+sparc-sys32.s-incorrect-compat-layer-splice-system-call.patch diff --git a/queue-2.6.30/sparc-sys32.s-incorrect-compat-layer-splice-system-call.patch b/queue-2.6.30/sparc-sys32.s-incorrect-compat-layer-splice-system-call.patch new file mode 100644 index 00000000000..4e3615a8c53 --- /dev/null +++ b/queue-2.6.30/sparc-sys32.s-incorrect-compat-layer-splice-system-call.patch @@ -0,0 +1,40 @@ +From f7ef3cabdd8459b9d76348f020e7cf9db04b8666 Mon Sep 17 00:00:00 2001 +From: Mathieu Desnoyers +Date: Tue, 18 Aug 2009 20:16:55 -0700 +Subject: sparc: sys32.S incorrect compat-layer splice() system call + +From: Mathieu Desnoyers + +[ Upstream commit e2c6cbd9ace61039d3de39e717195e38f1492aee ] + +I think arch/sparc/kernel/sys32.S has an incorrect splice definition: + +SIGN2(sys32_splice, sys_splice, %o0, %o1) + +The splice() prototype looks like : + + long splice(int fd_in, loff_t *off_in, int fd_out, + loff_t *off_out, size_t len, unsigned int flags); + +So I think we should have : + +SIGN2(sys32_splice, sys_splice, %o0, %o2) + +Signed-off-by: Mathieu Desnoyers +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + arch/sparc/kernel/sys32.S | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/arch/sparc/kernel/sys32.S ++++ b/arch/sparc/kernel/sys32.S +@@ -134,7 +134,7 @@ SIGN1(sys32_getpeername, sys_getpeername + SIGN1(sys32_getsockname, sys_getsockname, %o0) + SIGN2(sys32_ioprio_get, sys_ioprio_get, %o0, %o1) + SIGN3(sys32_ioprio_set, sys_ioprio_set, %o0, %o1, %o2) +-SIGN2(sys32_splice, sys_splice, %o0, %o1) ++SIGN2(sys32_splice, sys_splice, %o0, %o2) + SIGN2(sys32_sync_file_range, compat_sync_file_range, %o0, %o5) + SIGN2(sys32_tee, sys_tee, %o0, %o1) + SIGN1(sys32_vmsplice, compat_sys_vmsplice, %o0) diff --git a/queue-2.6.30/sparc64-fix-bootup-with-mcount-in-some-configs.patch b/queue-2.6.30/sparc64-fix-bootup-with-mcount-in-some-configs.patch new file mode 100644 index 00000000000..4a9d54e462b --- /dev/null +++ b/queue-2.6.30/sparc64-fix-bootup-with-mcount-in-some-configs.patch @@ -0,0 +1,85 @@ +From 1979da7c6cb0aa247f3d2baa03b429cc90f723e1 Mon Sep 17 00:00:00 2001 +From: David S. Miller +Date: Fri, 4 Sep 2009 03:38:54 -0700 +Subject: sparc64: Fix bootup with mcount in some configs. + +From: David S. Miller + +[ Upstream commit bd4352cadfacb9084c97c853b025fac010266c26 ] + +Functions invoked early when booting up a cpu can't use +tracing because mcount requires a valid 'current_thread_info()' +and TLB mappings to be setup. + +The code path of sun4v_register_mondo_queues --> register_one_mondo +is one such case. sun4v_register_mondo_queues already has the +necessary 'notrace' annotation, but register_one_mondo does not. + +Normally register_one_mondo is inlined so the bug doesn't trigger, +but with some config/compiler combinations, it won't be so we +must properly mark it notrace. + +While we're here, add 'notrace' annotations to prom_printf and +prom_halt so that early error handling won't have the same problem. + +Reported-by: Alexander Beregalov +Reported-by: Leif Sawyer +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + arch/sparc/kernel/irq_64.c | 2 +- + arch/sparc/prom/misc_64.c | 2 +- + arch/sparc/prom/printf.c | 7 +++---- + 3 files changed, 5 insertions(+), 6 deletions(-) + +--- a/arch/sparc/kernel/irq_64.c ++++ b/arch/sparc/kernel/irq_64.c +@@ -902,7 +902,7 @@ void notrace init_irqwork_curcpu(void) + * Therefore you cannot make any OBP calls, not even prom_printf, + * from these two routines. + */ +-static void __cpuinit register_one_mondo(unsigned long paddr, unsigned long type, unsigned long qmask) ++static void __cpuinit notrace register_one_mondo(unsigned long paddr, unsigned long type, unsigned long qmask) + { + unsigned long num_entries = (qmask + 1) / 64; + unsigned long status; +--- a/arch/sparc/prom/misc_64.c ++++ b/arch/sparc/prom/misc_64.c +@@ -88,7 +88,7 @@ void prom_cmdline(void) + /* Drop into the prom, but completely terminate the program. + * No chance of continuing. + */ +-void prom_halt(void) ++void notrace prom_halt(void) + { + #ifdef CONFIG_SUN_LDOMS + if (ldom_domaining_enabled) +--- a/arch/sparc/prom/printf.c ++++ b/arch/sparc/prom/printf.c +@@ -14,14 +14,14 @@ + */ + + #include ++#include + + #include + #include + + static char ppbuf[1024]; + +-void +-prom_write(const char *buf, unsigned int n) ++void notrace prom_write(const char *buf, unsigned int n) + { + char ch; + +@@ -33,8 +33,7 @@ prom_write(const char *buf, unsigned int + } + } + +-void +-prom_printf(const char *fmt, ...) ++void notrace prom_printf(const char *fmt, ...) 
+ { + va_list args; + int i; diff --git a/queue-2.6.30/sparc64-kill-spurious-nmi-watchdog-triggers-by-increasing-limit-to-30-seconds.patch b/queue-2.6.30/sparc64-kill-spurious-nmi-watchdog-triggers-by-increasing-limit-to-30-seconds.patch new file mode 100644 index 00000000000..a10b357b1e2 --- /dev/null +++ b/queue-2.6.30/sparc64-kill-spurious-nmi-watchdog-triggers-by-increasing-limit-to-30-seconds.patch @@ -0,0 +1,75 @@ +From 3a3326a902700916268d671eb8ac154de059c91a Mon Sep 17 00:00:00 2001 +From: David S. Miller +Date: Thu, 3 Sep 2009 02:35:20 -0700 +Subject: sparc64: Kill spurious NMI watchdog triggers by increasing limit to 30 seconds. + +From: David S. Miller + +[ Upstream commit e6617c6ec28a17cf2f90262b835ec05b9b861400 ] + +This is a compromise and a temporary workaround for bootup NMI +watchdog triggers some people see with qla2xxx devices present. + +This happens when, for example: + +CPU 0 is in the driver init and looping submitting mailbox commands to +load the firmware, then waiting for completion. + +CPU 1 is receiving the device interrupts. CPU 1 is where the NMI +watchdog triggers. + +CPU 0 is submitting mailbox commands fast enough that by the time CPU +1 returns from the device interrupt handler, a new one is pending. +This sequence runs for more than 5 seconds. + +The problematic case is CPU 1's timer interrupt running when the +barrage of device interrupts begin. Then we have: + + timer interrupt + return for softirq checking + pending, thus enable interrupts + + qla2xxx interrupt + return + qla2xxx interrupt + return + ... 5+ seconds pass + final qla2xxx interrupt for fw load + return + + run timer softirq + return + +At some point in the multi-second qla2xxx interrupt storm we trigger +the NMI watchdog on CPU 1 from the NMI interrupt handler. + +The timer softirq, once we get back to running it, is smart enough to +run the timer work enough times to make up for the missed timer +interrupts. 
+ +However, the NMI watchdogs (both x86 and sparc) use the timer +interrupt count to notice the cpu is wedged. But in the above +scenario we'll receive only one such timer interrupt even if we last +all the way back to running the timer softirq. + +The default watchdog trigger point is only 5 seconds, which is pretty +low (the softwatchdog triggers at 60 seconds). So increase it to 30 +seconds for now. + +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + arch/sparc/kernel/nmi.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/arch/sparc/kernel/nmi.c ++++ b/arch/sparc/kernel/nmi.c +@@ -103,7 +103,7 @@ notrace __kprobes void perfctr_irq(int i + } + if (!touched && __get_cpu_var(last_irq_sum) == sum) { + local_inc(&__get_cpu_var(alert_counter)); +- if (local_read(&__get_cpu_var(alert_counter)) == 5 * nmi_hz) ++ if (local_read(&__get_cpu_var(alert_counter)) == 30 * nmi_hz) + die_nmi("BUG: NMI Watchdog detected LOCKUP", + regs, panic_on_timeout); + } else { diff --git a/queue-2.6.30/sparc64-validate-linear-d-tlb-misses.patch b/queue-2.6.30/sparc64-validate-linear-d-tlb-misses.patch new file mode 100644 index 00000000000..1b9d9cb952d --- /dev/null +++ b/queue-2.6.30/sparc64-validate-linear-d-tlb-misses.patch @@ -0,0 +1,240 @@ +From 29b0ff908a19b5d98e5a5d080a5b9f6d56ccdd5a Mon Sep 17 00:00:00 2001 +From: David S. Miller +Date: Tue, 25 Aug 2009 16:47:46 -0700 +Subject: sparc64: Validate linear D-TLB misses. + +From: David S. Miller + +[ Upstream commit d8ed1d43e17898761c7221014a15a4c7501d2ff3 ] + +When page alloc debugging is not enabled, we essentially accept any +virtual address for linear kernel TLB misses. But with kgdb, kernel +address probing, and other facilities we can try to access arbitrary +crap. + +So, make sure the address we miss on will translate to physical memory +that actually exists. + +In order to make this work we have to embed the valid address bitmap +into the kernel image. 
And in order to make that less expensive we +make an adjustment, in that the max physical memory address is +decreased to "1 << 41", even on the chips that support a 42-bit +physical address space. We can do this because bit 41 indicates +"I/O space" and thus covers non-memory ranges. + +The result of this is that: + +1) kpte_linear_bitmap shrinks from 2K to 1K in size + +2) we need 64K more for the valid address bitmap + +We can't let the valid address bitmap be dynamically allocated +once we start using it to validate TLB misses, otherwise we have +crazy issues to deal with wrt. recursive TLB misses and such. + +If we're in a TLB miss it could be the deepest trap level that's legal +inside of the cpu. So if we TLB miss referencing the bitmap, the cpu +will be out of trap levels and enter RED state. + +To guard against out-of-range accesses to the bitmap, we have to check +to make sure no bits in the physical address above bit 40 are set. We +could export and use last_valid_pfn for this check, but that's just an +unnecessary extra memory reference. + +On the plus side of all this, since we load all of these translations +into the special 4MB mapping TSB, and we check the TSB first for TLB +misses, there should be absolutely no real cost for these new checks +in the TLB miss path. + +Reported-by: heyongli@gmail.com +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + arch/sparc/include/asm/pgtable_64.h | 12 +++++++--- + arch/sparc/kernel/ktlb.S | 42 +++++++++++++++++++++++++++++++---- + arch/sparc/mm/init_64.c | 43 ++++++++++++++++++++---------------- + arch/sparc/mm/init_64.h | 7 ++++- + 4 files changed, 76 insertions(+), 28 deletions(-) + +--- a/arch/sparc/include/asm/pgtable_64.h ++++ b/arch/sparc/include/asm/pgtable_64.h +@@ -726,11 +726,17 @@ extern unsigned long pte_file(pte_t); + extern pte_t pgoff_to_pte(unsigned long); + #define PTE_FILE_MAX_BITS (64UL - PAGE_SHIFT - 1UL) + +-extern unsigned long *sparc64_valid_addr_bitmap; ++extern unsigned long sparc64_valid_addr_bitmap[]; + + /* Needs to be defined here and not in linux/mm.h, as it is arch dependent */ +-#define kern_addr_valid(addr) \ +- (test_bit(__pa((unsigned long)(addr))>>22, sparc64_valid_addr_bitmap)) ++static inline bool kern_addr_valid(unsigned long addr) ++{ ++ unsigned long paddr = __pa(addr); ++ ++ if ((paddr >> 41UL) != 0UL) ++ return false; ++ return test_bit(paddr >> 22, sparc64_valid_addr_bitmap); ++} + + extern int page_in_phys_avail(unsigned long paddr); + +--- a/arch/sparc/kernel/ktlb.S ++++ b/arch/sparc/kernel/ktlb.S +@@ -151,12 +151,46 @@ kvmap_dtlb_4v: + * Must preserve %g1 and %g6 (TAG). + */ + kvmap_dtlb_tsb4m_miss: +- sethi %hi(kpte_linear_bitmap), %g2 +- or %g2, %lo(kpte_linear_bitmap), %g2 ++ /* Clear the PAGE_OFFSET top virtual bits, shift ++ * down to get PFN, and make sure PFN is in range. ++ */ ++ sllx %g4, 21, %g5 + +- /* Clear the PAGE_OFFSET top virtual bits, then shift +- * down to get a 256MB physical address index. ++ /* Check to see if we know about valid memory at the 4MB ++ * chunk this physical address will reside within. + */ ++ srlx %g5, 21 + 41, %g2 ++ brnz,pn %g2, kvmap_dtlb_longpath ++ nop ++ ++ /* This unconditional branch and delay-slot nop gets patched ++ * by the sethi sequence once the bitmap is properly setup. 
++ */ ++ .globl valid_addr_bitmap_insn ++valid_addr_bitmap_insn: ++ ba,pt %xcc, 2f ++ nop ++ .subsection 2 ++ .globl valid_addr_bitmap_patch ++valid_addr_bitmap_patch: ++ sethi %hi(sparc64_valid_addr_bitmap), %g7 ++ or %g7, %lo(sparc64_valid_addr_bitmap), %g7 ++ .previous ++ ++ srlx %g5, 21 + 22, %g2 ++ srlx %g2, 6, %g5 ++ and %g2, 63, %g2 ++ sllx %g5, 3, %g5 ++ ldx [%g7 + %g5], %g5 ++ mov 1, %g7 ++ sllx %g7, %g2, %g7 ++ andcc %g5, %g7, %g0 ++ be,pn %xcc, kvmap_dtlb_longpath ++ ++2: sethi %hi(kpte_linear_bitmap), %g2 ++ or %g2, %lo(kpte_linear_bitmap), %g2 ++ ++ /* Get the 256MB physical address index. */ + sllx %g4, 21, %g5 + mov 1, %g7 + srlx %g5, 21 + 28, %g5 +--- a/arch/sparc/mm/init_64.c ++++ b/arch/sparc/mm/init_64.c +@@ -145,7 +145,8 @@ static void __init read_obp_memory(const + cmp_p64, NULL); + } + +-unsigned long *sparc64_valid_addr_bitmap __read_mostly; ++unsigned long sparc64_valid_addr_bitmap[VALID_ADDR_BITMAP_BYTES / ++ sizeof(unsigned long)]; + EXPORT_SYMBOL(sparc64_valid_addr_bitmap); + + /* Kernel physical address base and size in bytes. */ +@@ -1876,7 +1877,7 @@ static int pavail_rescan_ents __initdata + * memory list again, and make sure it provides at least as much + * memory as 'pavail' does. 
+ */ +-static void __init setup_valid_addr_bitmap_from_pavail(void) ++static void __init setup_valid_addr_bitmap_from_pavail(unsigned long *bitmap) + { + int i; + +@@ -1899,8 +1900,7 @@ static void __init setup_valid_addr_bitm + + if (new_start <= old_start && + new_end >= (old_start + PAGE_SIZE)) { +- set_bit(old_start >> 22, +- sparc64_valid_addr_bitmap); ++ set_bit(old_start >> 22, bitmap); + goto do_next_page; + } + } +@@ -1921,20 +1921,21 @@ static void __init setup_valid_addr_bitm + } + } + ++static void __init patch_tlb_miss_handler_bitmap(void) ++{ ++ extern unsigned int valid_addr_bitmap_insn[]; ++ extern unsigned int valid_addr_bitmap_patch[]; ++ ++ valid_addr_bitmap_insn[1] = valid_addr_bitmap_patch[1]; ++ mb(); ++ valid_addr_bitmap_insn[0] = valid_addr_bitmap_patch[0]; ++ flushi(&valid_addr_bitmap_insn[0]); ++} ++ + void __init mem_init(void) + { + unsigned long codepages, datapages, initpages; + unsigned long addr, last; +- int i; +- +- i = last_valid_pfn >> ((22 - PAGE_SHIFT) + 6); +- i += 1; +- sparc64_valid_addr_bitmap = (unsigned long *) alloc_bootmem(i << 3); +- if (sparc64_valid_addr_bitmap == NULL) { +- prom_printf("mem_init: Cannot alloc valid_addr_bitmap.\n"); +- prom_halt(); +- } +- memset(sparc64_valid_addr_bitmap, 0, i << 3); + + addr = PAGE_OFFSET + kern_base; + last = PAGE_ALIGN(kern_size) + addr; +@@ -1943,15 +1944,19 @@ void __init mem_init(void) + addr += PAGE_SIZE; + } + +- setup_valid_addr_bitmap_from_pavail(); ++ setup_valid_addr_bitmap_from_pavail(sparc64_valid_addr_bitmap); ++ patch_tlb_miss_handler_bitmap(); + + high_memory = __va(last_valid_pfn << PAGE_SHIFT); + + #ifdef CONFIG_NEED_MULTIPLE_NODES +- for_each_online_node(i) { +- if (NODE_DATA(i)->node_spanned_pages != 0) { +- totalram_pages += +- free_all_bootmem_node(NODE_DATA(i)); ++ { ++ int i; ++ for_each_online_node(i) { ++ if (NODE_DATA(i)->node_spanned_pages != 0) { ++ totalram_pages += ++ free_all_bootmem_node(NODE_DATA(i)); ++ } + } + } + #else +--- 
a/arch/sparc/mm/init_64.h ++++ b/arch/sparc/mm/init_64.h +@@ -5,10 +5,13 @@ + * marked non-static so that assembler code can get at them. + */ + +-#define MAX_PHYS_ADDRESS (1UL << 42UL) +-#define KPTE_BITMAP_CHUNK_SZ (256UL * 1024UL * 1024UL) ++#define MAX_PHYS_ADDRESS (1UL << 41UL) ++#define KPTE_BITMAP_CHUNK_SZ (256UL * 1024UL * 1024UL) + #define KPTE_BITMAP_BYTES \ + ((MAX_PHYS_ADDRESS / KPTE_BITMAP_CHUNK_SZ) / 8) ++#define VALID_ADDR_BITMAP_CHUNK_SZ (4UL * 1024UL * 1024UL) ++#define VALID_ADDR_BITMAP_BYTES \ ++ ((MAX_PHYS_ADDRESS / VALID_ADDR_BITMAP_CHUNK_SZ) / 8) + + extern unsigned long kern_linear_pte_xor[2]; + extern unsigned long kpte_linear_bitmap[KPTE_BITMAP_BYTES / sizeof(unsigned long)];