From 64f24c1cb9d0e5d42b5aa204ddb8f87ef1b57229 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 3 May 2013 10:42:24 -0700 Subject: [PATCH] 3.4-stable patches added patches: fix-initialization-of-cmci-cmcp-interrupts.patch fs-fscache-stats.c-fix-memory-leak.patch mm-allow-arch-code-to-control-the-user-page-table-ceiling.patch pci-acpi-don-t-query-osc-support-with-all-possible-controls.patch pci-pm-fix-fallback-to-pci_d0-in-pci_platform_power_transition.patch sysfs-fix-use-after-free-in-case-of-concurrent-read-write-and-readdir.patch wrong-asm-register-contraints-in-the-futex-implementation.patch wrong-asm-register-contraints-in-the-kvm-implementation.patch --- ...itialization-of-cmci-cmcp-interrupts.patch | 128 ++++++++ .../fs-fscache-stats.c-fix-memory-leak.patch | 41 +++ ...-control-the-user-page-table-ceiling.patch | 89 ++++++ ...c-support-with-all-possible-controls.patch | 67 ++++ ..._d0-in-pci_platform_power_transition.patch | 56 ++++ queue-3.4/series | 8 + ...of-concurrent-read-write-and-readdir.patch | 76 +++++ ...ntraints-in-the-futex-implementation.patch | 292 ++++++++++++++++++ ...contraints-in-the-kvm-implementation.patch | 95 ++++++ 9 files changed, 852 insertions(+) create mode 100644 queue-3.4/fix-initialization-of-cmci-cmcp-interrupts.patch create mode 100644 queue-3.4/fs-fscache-stats.c-fix-memory-leak.patch create mode 100644 queue-3.4/mm-allow-arch-code-to-control-the-user-page-table-ceiling.patch create mode 100644 queue-3.4/pci-acpi-don-t-query-osc-support-with-all-possible-controls.patch create mode 100644 queue-3.4/pci-pm-fix-fallback-to-pci_d0-in-pci_platform_power_transition.patch create mode 100644 queue-3.4/sysfs-fix-use-after-free-in-case-of-concurrent-read-write-and-readdir.patch create mode 100644 queue-3.4/wrong-asm-register-contraints-in-the-futex-implementation.patch create mode 100644 queue-3.4/wrong-asm-register-contraints-in-the-kvm-implementation.patch diff --git 
a/queue-3.4/fix-initialization-of-cmci-cmcp-interrupts.patch b/queue-3.4/fix-initialization-of-cmci-cmcp-interrupts.patch new file mode 100644 index 00000000000..f58330741be --- /dev/null +++ b/queue-3.4/fix-initialization-of-cmci-cmcp-interrupts.patch @@ -0,0 +1,128 @@ +From d303e9e98fce56cdb3c6f2ac92f626fc2bd51c77 Mon Sep 17 00:00:00 2001 +From: Tony Luck +Date: Wed, 20 Mar 2013 10:30:15 -0700 +Subject: Fix initialization of CMCI/CMCP interrupts + +From: Tony Luck + +commit d303e9e98fce56cdb3c6f2ac92f626fc2bd51c77 upstream. + +Back in 2010 during a revamp of the irq code some initializations +were moved from ia64_mca_init() to ia64_mca_late_init() in + + commit c75f2aa13f5b268aba369b5dc566088b5194377c + Cannot use register_percpu_irq() from ia64_mca_init() + +But this was hideously wrong. First of all these initializations +are now done far too late. Specifically after all the other cpus +have been brought up and initialized their own CMC vectors from +smp_callin(). Also ia64_mca_late_init() may be called from any cpu +so the line: + ia64_mca_cmc_vector_setup(); /* Setup vector on BSP */ +is generally not executed on the BSP, and so the CMC vector isn't +setup at all on that processor. + +Make use of the arch_early_irq_init() hook to get this code executed +at just the right moment: not too early, not too late. 
+ +Reported-by: Fred Hartnett +Tested-by: Fred Hartnett +Signed-off-by: Tony Luck +Signed-off-by: Greg Kroah-Hartman + +--- + arch/ia64/include/asm/mca.h | 1 + + arch/ia64/kernel/irq.c | 8 ++++++++ + arch/ia64/kernel/mca.c | 37 ++++++++++++++++++++++++------------- + 3 files changed, 33 insertions(+), 13 deletions(-) + +--- a/arch/ia64/include/asm/mca.h ++++ b/arch/ia64/include/asm/mca.h +@@ -143,6 +143,7 @@ extern unsigned long __per_cpu_mca[NR_CP + extern int cpe_vector; + extern int ia64_cpe_irq; + extern void ia64_mca_init(void); ++extern void ia64_mca_irq_init(void); + extern void ia64_mca_cpu_init(void *); + extern void ia64_os_mca_dispatch(void); + extern void ia64_os_mca_dispatch_end(void); +--- a/arch/ia64/kernel/irq.c ++++ b/arch/ia64/kernel/irq.c +@@ -23,6 +23,8 @@ + #include + #include + ++#include ++ + /* + * 'what should we do if we get a hw irq event on an illegal vector'. + * each architecture has to answer this themselves. +@@ -83,6 +85,12 @@ bool is_affinity_mask_valid(const struct + + #endif /* CONFIG_SMP */ + ++int __init arch_early_irq_init(void) ++{ ++ ia64_mca_irq_init(); ++ return 0; ++} ++ + #ifdef CONFIG_HOTPLUG_CPU + unsigned int vectors_in_migration[NR_IRQS]; + +--- a/arch/ia64/kernel/mca.c ++++ b/arch/ia64/kernel/mca.c +@@ -2074,22 +2074,16 @@ ia64_mca_init(void) + printk(KERN_INFO "MCA related initialization done\n"); + } + ++ + /* +- * ia64_mca_late_init +- * +- * Opportunity to setup things that require initialization later +- * than ia64_mca_init. Setup a timer to poll for CPEs if the +- * platform doesn't support an interrupt driven mechanism. +- * +- * Inputs : None +- * Outputs : Status ++ * These pieces cannot be done in ia64_mca_init() because it is called before ++ * early_irq_init() which would wipe out our percpu irq registrations. 
But we ++ * cannot leave them until ia64_mca_late_init() because by then all the other ++ * processors have been brought online and have set their own CMC vectors to ++ * point at a non-existant action. Called from arch_early_irq_init(). + */ +-static int __init +-ia64_mca_late_init(void) ++void __init ia64_mca_irq_init(void) + { +- if (!mca_init) +- return 0; +- + /* + * Configure the CMCI/P vector and handler. Interrupts for CMC are + * per-processor, so AP CMC interrupts are setup in smp_callin() (smpboot.c). +@@ -2108,6 +2102,23 @@ ia64_mca_late_init(void) + /* Setup the CPEI/P handler */ + register_percpu_irq(IA64_CPEP_VECTOR, &mca_cpep_irqaction); + #endif ++} ++ ++/* ++ * ia64_mca_late_init ++ * ++ * Opportunity to setup things that require initialization later ++ * than ia64_mca_init. Setup a timer to poll for CPEs if the ++ * platform doesn't support an interrupt driven mechanism. ++ * ++ * Inputs : None ++ * Outputs : Status ++ */ ++static int __init ++ia64_mca_late_init(void) ++{ ++ if (!mca_init) ++ return 0; + + register_hotcpu_notifier(&mca_cpu_notifier); + diff --git a/queue-3.4/fs-fscache-stats.c-fix-memory-leak.patch b/queue-3.4/fs-fscache-stats.c-fix-memory-leak.patch new file mode 100644 index 00000000000..fbc4bf252df --- /dev/null +++ b/queue-3.4/fs-fscache-stats.c-fix-memory-leak.patch @@ -0,0 +1,41 @@ +From ec686c9239b4d472052a271c505d04dae84214cc Mon Sep 17 00:00:00 2001 +From: Anurup m +Date: Mon, 29 Apr 2013 15:05:52 -0700 +Subject: fs/fscache/stats.c: fix memory leak + +From: Anurup m + +commit ec686c9239b4d472052a271c505d04dae84214cc upstream. + +There is a kernel memory leak observed when the proc file +/proc/fs/fscache/stats is read. + +The reason is that in fscache_stats_open, single_open is called and the +respective release function is not called during release. Hence fix +with correct release function - single_release(). 
+ +Addresses https://bugzilla.kernel.org/show_bug.cgi?id=57101 + +Signed-off-by: Anurup m +Cc: shyju pv +Cc: Sanil kumar +Cc: Nataraj m +Cc: Li Zefan +Cc: David Howells +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + fs/fscache/stats.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/fscache/stats.c ++++ b/fs/fscache/stats.c +@@ -276,5 +276,5 @@ const struct file_operations fscache_sta + .open = fscache_stats_open, + .read = seq_read, + .llseek = seq_lseek, +- .release = seq_release, ++ .release = single_release, + }; diff --git a/queue-3.4/mm-allow-arch-code-to-control-the-user-page-table-ceiling.patch b/queue-3.4/mm-allow-arch-code-to-control-the-user-page-table-ceiling.patch new file mode 100644 index 00000000000..a5397fb196a --- /dev/null +++ b/queue-3.4/mm-allow-arch-code-to-control-the-user-page-table-ceiling.patch @@ -0,0 +1,89 @@ +From 6ee8630e02be6dd89926ca0fbc21af68b23dc087 Mon Sep 17 00:00:00 2001 +From: Hugh Dickins +Date: Mon, 29 Apr 2013 15:07:44 -0700 +Subject: mm: allow arch code to control the user page table ceiling + +From: Hugh Dickins + +commit 6ee8630e02be6dd89926ca0fbc21af68b23dc087 upstream. + +On architectures where a pgd entry may be shared between user and kernel +(e.g. ARM+LPAE), freeing page tables needs a ceiling other than 0. +This patch introduces a generic USER_PGTABLES_CEILING that arch code can +override. It is the responsibility of the arch code setting the ceiling +to ensure the complete freeing of the page tables (usually in +pgd_free()). 
+ +[catalin.marinas@arm.com: commit log; shift_arg_pages(), asm-generic/pgtables.h changes] +Signed-off-by: Hugh Dickins +Signed-off-by: Catalin Marinas +Cc: Russell King +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + fs/exec.c | 4 ++-- + include/asm-generic/pgtable.h | 10 ++++++++++ + mm/mmap.c | 4 ++-- + 3 files changed, 14 insertions(+), 4 deletions(-) + +--- a/fs/exec.c ++++ b/fs/exec.c +@@ -627,7 +627,7 @@ static int shift_arg_pages(struct vm_are + * when the old and new regions overlap clear from new_end. + */ + free_pgd_range(&tlb, new_end, old_end, new_end, +- vma->vm_next ? vma->vm_next->vm_start : 0); ++ vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING); + } else { + /* + * otherwise, clean from old_start; this is done to not touch +@@ -636,7 +636,7 @@ static int shift_arg_pages(struct vm_are + * for the others its just a little faster. + */ + free_pgd_range(&tlb, old_start, old_end, new_end, +- vma->vm_next ? vma->vm_next->vm_start : 0); ++ vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING); + } + tlb_finish_mmu(&tlb, new_end, old_end); + +--- a/include/asm-generic/pgtable.h ++++ b/include/asm-generic/pgtable.h +@@ -7,6 +7,16 @@ + #include + #include + ++/* ++ * On almost all architectures and configurations, 0 can be used as the ++ * upper ceiling to free_pgtables(): on many architectures it has the same ++ * effect as using TASK_SIZE. However, there is one configuration which ++ * must impose a more careful limit, to avoid freeing kernel pgtables. 
++ */ ++#ifndef USER_PGTABLES_CEILING ++#define USER_PGTABLES_CEILING 0UL ++#endif ++ + #ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS + extern int ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, +--- a/mm/mmap.c ++++ b/mm/mmap.c +@@ -1920,7 +1920,7 @@ static void unmap_region(struct mm_struc + unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL); + vm_unacct_memory(nr_accounted); + free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS, +- next ? next->vm_start : 0); ++ next ? next->vm_start : USER_PGTABLES_CEILING); + tlb_finish_mmu(&tlb, start, end); + } + +@@ -2308,7 +2308,7 @@ void exit_mmap(struct mm_struct *mm) + unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); + vm_unacct_memory(nr_accounted); + +- free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0); ++ free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING); + tlb_finish_mmu(&tlb, 0, -1); + + /* diff --git a/queue-3.4/pci-acpi-don-t-query-osc-support-with-all-possible-controls.patch b/queue-3.4/pci-acpi-don-t-query-osc-support-with-all-possible-controls.patch new file mode 100644 index 00000000000..d9938fd6823 --- /dev/null +++ b/queue-3.4/pci-acpi-don-t-query-osc-support-with-all-possible-controls.patch @@ -0,0 +1,67 @@ +From 545d6e189a41c94c11f55045a771118eccc9d9eb Mon Sep 17 00:00:00 2001 +From: Yinghai Lu +Date: Thu, 28 Mar 2013 04:28:58 +0000 +Subject: PCI / ACPI: Don't query OSC support with all possible controls + +From: Yinghai Lu + +commit 545d6e189a41c94c11f55045a771118eccc9d9eb upstream. + +Found problem on system that firmware that could handle pci aer. +Firmware get error reporting after pci injecting error, before os boots. +But after os boots, firmware can not get report anymore, even pci=noaer +is passed. + +Root cause: BIOS _OSC has problem with query bit checking. +It turns out that BIOS vendor is copying example code from ACPI Spec. +In ACPI Spec 5.0, page 290: + + If (Not(And(CDW1,1))) // Query flag clear? 
+ { // Disable GPEs for features granted native control. + If (And(CTRL,0x01)) // Hot plug control granted? + { + Store(0,HPCE) // clear the hot plug SCI enable bit + Store(1,HPCS) // clear the hot plug SCI status bit + } + ... + } + +When Query flag is set, And(CDW1,1) will be 1, Not(1) will return 0xfffffffe. +So it will get into code path that should be for control set only. +BIOS acpi code should be changed to "If (LEqual(And(CDW1,1), 0)))" + +Current kernel code is using _OSC query to notify firmware about support +from OS and then use _OSC to set control bits. +During query support, current code is using all possible controls. +So will execute code that should be only for control set stage. + +That will have problem when pci=noaer or aer firmware_first is used. +As firmware have that control set for os aer already in query support stage, +but later will not os aer handling. + +We should avoid passing all possible controls, just use osc_control_set +instead. +That should workaround BIOS bugs with affected systems on the field +as more bios vendors are copying sample code from ACPI spec. + +Signed-off-by: Yinghai Lu +Signed-off-by: Rafael J. Wysocki +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/acpi/pci_root.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/drivers/acpi/pci_root.c ++++ b/drivers/acpi/pci_root.c +@@ -247,8 +247,8 @@ static acpi_status acpi_pci_query_osc(st + *control &= OSC_PCI_CONTROL_MASKS; + capbuf[OSC_CONTROL_TYPE] = *control | root->osc_control_set; + } else { +- /* Run _OSC query for all possible controls. */ +- capbuf[OSC_CONTROL_TYPE] = OSC_PCI_CONTROL_MASKS; ++ /* Run _OSC query only with existing controls. 
*/ ++ capbuf[OSC_CONTROL_TYPE] = root->osc_control_set; + } + + status = acpi_pci_run_osc(root->device->handle, capbuf, &result); diff --git a/queue-3.4/pci-pm-fix-fallback-to-pci_d0-in-pci_platform_power_transition.patch b/queue-3.4/pci-pm-fix-fallback-to-pci_d0-in-pci_platform_power_transition.patch new file mode 100644 index 00000000000..d16931bdab8 --- /dev/null +++ b/queue-3.4/pci-pm-fix-fallback-to-pci_d0-in-pci_platform_power_transition.patch @@ -0,0 +1,56 @@ +From 769ba7212f2059ca9fe0c73371e3d415c8c1c529 Mon Sep 17 00:00:00 2001 +From: "Rafael J. Wysocki" +Date: Fri, 12 Apr 2013 13:58:17 +0000 +Subject: PCI/PM: Fix fallback to PCI_D0 in pci_platform_power_transition() + +From: "Rafael J. Wysocki" + +commit 769ba7212f2059ca9fe0c73371e3d415c8c1c529 upstream. + +Commit b51306c (PCI: Set device power state to PCI_D0 for device +without native PM support) modified pci_platform_power_transition() +by adding code causing dev->current_state for devices that don't +support native PCI PM but are power-manageable by the platform to be +changed to PCI_D0 regardless of the value returned by the preceding +platform_pci_set_power_state(). In particular, that also is done +if the platform_pci_set_power_state() has been successful, which +causes the correct power state of the device set by +pci_update_current_state() in that case to be overwritten by PCI_D0. + +Fix that mistake by making the fallback to PCI_D0 only happen if +the platform_pci_set_power_state() has returned an error. + +[bhelgaas: folded in Yinghai's simplification, added URL & stable info] +Reference: http://lkml.kernel.org/r/27806FC4E5928A408B78E88BBC67A2306F466BBA@ORSMSX101.amr.corp.intel.com +Reported-by: Chris J. Benenati +Signed-off-by: Rafael J. 
Wysocki +Signed-off-by: Bjorn Helgaas +Acked-by: Yinghai Lu +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/pci/pci.c | 12 ++++-------- + 1 file changed, 4 insertions(+), 8 deletions(-) + +--- a/drivers/pci/pci.c ++++ b/drivers/pci/pci.c +@@ -673,15 +673,11 @@ static int pci_platform_power_transition + error = platform_pci_set_power_state(dev, state); + if (!error) + pci_update_current_state(dev, state); +- /* Fall back to PCI_D0 if native PM is not supported */ +- if (!dev->pm_cap) +- dev->current_state = PCI_D0; +- } else { ++ } else + error = -ENODEV; +- /* Fall back to PCI_D0 if native PM is not supported */ +- if (!dev->pm_cap) +- dev->current_state = PCI_D0; +- } ++ ++ if (error && !dev->pm_cap) /* Fall back to PCI_D0 */ ++ dev->current_state = PCI_D0; + + return error; + } diff --git a/queue-3.4/series b/queue-3.4/series index c606788e72b..1c385ec82aa 100644 --- a/queue-3.4/series +++ b/queue-3.4/series @@ -24,3 +24,11 @@ tracing-fix-off-by-one-on-allocating-stat-pages.patch tracing-check-return-value-of-tracing_init_dentry.patch tracing-reset-ftrace_graph_filter_enabled-if-count-is-zero.patch i2c-xiic-must-always-write-16-bit-words-to-tx_fifo.patch +sysfs-fix-use-after-free-in-case-of-concurrent-read-write-and-readdir.patch +fix-initialization-of-cmci-cmcp-interrupts.patch +pci-acpi-don-t-query-osc-support-with-all-possible-controls.patch +pci-pm-fix-fallback-to-pci_d0-in-pci_platform_power_transition.patch +wrong-asm-register-contraints-in-the-futex-implementation.patch +wrong-asm-register-contraints-in-the-kvm-implementation.patch +fs-fscache-stats.c-fix-memory-leak.patch +mm-allow-arch-code-to-control-the-user-page-table-ceiling.patch diff --git a/queue-3.4/sysfs-fix-use-after-free-in-case-of-concurrent-read-write-and-readdir.patch b/queue-3.4/sysfs-fix-use-after-free-in-case-of-concurrent-read-write-and-readdir.patch new file mode 100644 index 00000000000..448d506d6d0 --- /dev/null +++ 
b/queue-3.4/sysfs-fix-use-after-free-in-case-of-concurrent-read-write-and-readdir.patch @@ -0,0 +1,76 @@ +From f7db5e7660b122142410dcf36ba903c73d473250 Mon Sep 17 00:00:00 2001 +From: Ming Lei +Date: Tue, 2 Apr 2013 10:12:26 +0800 +Subject: sysfs: fix use after free in case of concurrent read/write and readdir + +From: Ming Lei + +commit f7db5e7660b122142410dcf36ba903c73d473250 upstream. + +The inode->i_mutex isn't held when updating filp->f_pos +in read()/write(), so the filp->f_pos might be read as +0 or 1 in readdir() when there is concurrent read()/write() +on this same file, then may cause use after free in readdir(). + +The bug can be reproduced with Li Zefan's test code on the +link: + + https://patchwork.kernel.org/patch/2160771/ + +This patch fixes the use after free under this situation. + +Reported-by: Li Zefan +Signed-off-by: Ming Lei +Signed-off-by: Greg Kroah-Hartman + +--- + fs/sysfs/dir.c | 15 +++++++++++---- + 1 file changed, 11 insertions(+), 4 deletions(-) + +--- a/fs/sysfs/dir.c ++++ b/fs/sysfs/dir.c +@@ -994,6 +994,7 @@ static int sysfs_readdir(struct file * f + enum kobj_ns_type type; + const void *ns; + ino_t ino; ++ loff_t off; + + type = sysfs_ns_type(parent_sd); + ns = sysfs_info(dentry->d_sb)->ns[type]; +@@ -1016,6 +1017,7 @@ static int sysfs_readdir(struct file * f + return 0; + } + mutex_lock(&sysfs_mutex); ++ off = filp->f_pos; + for (pos = sysfs_dir_pos(ns, parent_sd, filp->f_pos, pos); + pos; + pos = sysfs_dir_next_pos(ns, parent_sd, filp->f_pos, pos)) { +@@ -1027,19 +1029,24 @@ static int sysfs_readdir(struct file * f + len = strlen(name); + ino = pos->s_ino; + type = dt_type(pos); +- filp->f_pos = pos->s_hash; ++ off = filp->f_pos = pos->s_hash; + filp->private_data = sysfs_get(pos); + + mutex_unlock(&sysfs_mutex); +- ret = filldir(dirent, name, len, filp->f_pos, ino, type); ++ ret = filldir(dirent, name, len, off, ino, type); + mutex_lock(&sysfs_mutex); + if (ret < 0) + break; + } + mutex_unlock(&sysfs_mutex); +- if ((filp->f_pos 
> 1) && !pos) { /* EOF */ +- filp->f_pos = INT_MAX; ++ ++ /* don't reference last entry if its refcount is dropped */ ++ if (!pos) { + filp->private_data = NULL; ++ ++ /* EOF and not changed as 0 or 1 in read/write path */ ++ if (off == filp->f_pos && off > 1) ++ filp->f_pos = INT_MAX; + } + return 0; + } diff --git a/queue-3.4/wrong-asm-register-contraints-in-the-futex-implementation.patch b/queue-3.4/wrong-asm-register-contraints-in-the-futex-implementation.patch new file mode 100644 index 00000000000..291e7a81bd1 --- /dev/null +++ b/queue-3.4/wrong-asm-register-contraints-in-the-futex-implementation.patch @@ -0,0 +1,292 @@ +From 136f39ddc53db3bcee2befbe323a56d4fbf06da8 Mon Sep 17 00:00:00 2001 +From: Stephan Schreiber +Date: Tue, 19 Mar 2013 15:22:27 -0700 +Subject: Wrong asm register contraints in the futex implementation + +From: Stephan Schreiber + +commit 136f39ddc53db3bcee2befbe323a56d4fbf06da8 upstream. + +The Linux Kernel contains some inline assembly source code which has +wrong asm register constraints in arch/ia64/include/asm/futex.h. + +I observed this on Kernel 3.2.23 but it is also true on the most +recent Kernel 3.9-rc1. + +File arch/ia64/include/asm/futex.h: + +static inline int +futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, + u32 oldval, u32 newval) +{ + if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) + return -EFAULT; + + { + register unsigned long r8 __asm ("r8"); + unsigned long prev; + __asm__ __volatile__( + " mf;; \n" + " mov %0=r0 \n" + " mov ar.ccv=%4;; \n" + "[1:] cmpxchg4.acq %1=[%2],%3,ar.ccv \n" + " .xdata4 \"__ex_table\", 1b-., 2f-. 
\n" + "[2:]" + : "=r" (r8), "=r" (prev) + : "r" (uaddr), "r" (newval), + "rO" ((long) (unsigned) oldval) + : "memory"); + *uval = prev; + return r8; + } +} + +The list of output registers is + : "=r" (r8), "=r" (prev) +The constraint "=r" means that the GCC has to maintain that these vars +are in registers and contain valid info when the program flow leaves +the assembly block (output registers). +But "=r" also means that GCC can put them in registers that are used +as input registers. Input registers are uaddr, newval, oldval on the +example. +The second assembly instruction + " mov %0=r0 \n" +is the first one which writes to a register; it sets %0 to 0. %0 means +the first register operand; it is r8 here. (The r0 is read-only and +always 0 on the Itanium; it can be used if an immediate zero value is +needed.) +This instruction might overwrite one of the other registers which are +still needed. +Whether it really happens depends on how GCC decides what registers it +uses and how it optimizes the code. + +The objdump utility can give us disassembly. +The futex_atomic_cmpxchg_inatomic() function is inline, so we have to +look for a module that uses the funtion. This is the +cmpxchg_futex_value_locked() function in +kernel/futex.c: + +static int cmpxchg_futex_value_locked(u32 *curval, u32 __user *uaddr, + u32 uval, u32 newval) +{ + int ret; + + pagefault_disable(); + ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval); + pagefault_enable(); + + return ret; +} + +Now the disassembly. 
At first from the Kernel package 3.2.23 which has +been compiled with GCC 4.4, remeber this Kernel seemed to work: +objdump -d linux-3.2.23/debian/build/build_ia64_none_mckinley/kernel/futex.o + +0000000000000230 : + 230: 0b 18 80 1b 18 21 [MMI] adds r3=3168,r13;; + 236: 80 40 0d 00 42 00 adds r8=40,r3 + 23c: 00 00 04 00 nop.i 0x0;; + 240: 0b 50 00 10 10 10 [MMI] ld4 r10=[r8];; + 246: 90 08 28 00 42 00 adds r9=1,r10 + 24c: 00 00 04 00 nop.i 0x0;; + 250: 09 00 00 00 01 00 [MMI] nop.m 0x0 + 256: 00 48 20 20 23 00 st4 [r8]=r9 + 25c: 00 00 04 00 nop.i 0x0;; + 260: 08 10 80 06 00 21 [MMI] adds r2=32,r3 + 266: 00 00 00 02 00 00 nop.m 0x0 + 26c: 02 08 f1 52 extr.u r16=r33,0,61 + 270: 05 40 88 00 08 e0 [MLX] addp4 r8=r34,r0 + 276: ff ff 0f 00 00 e0 movl r15=0xfffffffbfff;; + 27c: f1 f7 ff 65 + 280: 09 70 00 04 18 10 [MMI] ld8 r14=[r2] + 286: 00 00 00 02 00 c0 nop.m 0x0 + 28c: f0 80 1c d0 cmp.ltu p6,p7=r15,r16;; + 290: 08 40 fc 1d 09 3b [MMI] cmp.eq p8,p9=-1,r14 + 296: 00 00 00 02 00 40 nop.m 0x0 + 29c: e1 08 2d d0 cmp.ltu p10,p11=r14,r33 + 2a0: 56 01 10 00 40 10 [BBB] (p10) br.cond.spnt.few 2e0 + + 2a6: 02 08 00 80 21 03 (p08) br.cond.dpnt.few 2b0 + + 2ac: 40 00 00 41 (p06) br.cond.spnt.few 2e0 + + 2b0: 0a 00 00 00 22 00 [MMI] mf;; + 2b6: 80 00 00 00 42 00 mov r8=r0 + 2bc: 00 00 04 00 nop.i 0x0 + 2c0: 0b 00 20 40 2a 04 [MMI] mov.m ar.ccv=r8;; + 2c6: 10 1a 85 22 20 00 cmpxchg4.acq r33=[r33],r35,ar.ccv + 2cc: 00 00 04 00 nop.i 0x0;; + 2d0: 10 00 84 40 90 11 [MIB] st4 [r32]=r33 + 2d6: 00 00 00 02 00 00 nop.i 0x0 + 2dc: 20 00 00 40 br.few 2f0 + + 2e0: 09 40 c8 f9 ff 27 [MMI] mov r8=-14 + 2e6: 00 00 00 02 00 00 nop.m 0x0 + 2ec: 00 00 04 00 nop.i 0x0;; + 2f0: 0b 58 20 1a 19 21 [MMI] adds r11=3208,r13;; + 2f6: 20 01 2c 20 20 00 ld4 r18=[r11] + 2fc: 00 00 04 00 nop.i 0x0;; + 300: 0b 88 fc 25 3f 23 [MMI] adds r17=-1,r18;; + 306: 00 88 2c 20 23 00 st4 [r11]=r17 + 30c: 00 00 04 00 nop.i 0x0;; + 310: 11 00 00 00 01 00 [MIB] nop.m 0x0 + 316: 00 00 00 02 00 80 nop.i 0x0 + 31c: 08 00 84 
00 br.ret.sptk.many b0;; + +The lines + 2b0: 0a 00 00 00 22 00 [MMI] mf;; + 2b6: 80 00 00 00 42 00 mov r8=r0 + 2bc: 00 00 04 00 nop.i 0x0 + 2c0: 0b 00 20 40 2a 04 [MMI] mov.m ar.ccv=r8;; + 2c6: 10 1a 85 22 20 00 cmpxchg4.acq r33=[r33],r35,ar.ccv + 2cc: 00 00 04 00 nop.i 0x0;; +are the instructions of the assembly block. +The line + 2b6: 80 00 00 00 42 00 mov r8=r0 +sets the r8 register to 0 and after that + 2c0: 0b 00 20 40 2a 04 [MMI] mov.m ar.ccv=r8;; +prepares the 'oldvalue' for the cmpxchg but it takes it from r8. This +is wrong. +What happened here is what I explained above: An input register is +overwritten which is still needed. +The register operand constraints in futex.h are wrong. + +(The problem doesn't occur when the Kernel is compiled with GCC 4.6.) + +The attached patch fixes the register operand constraints in futex.h. +The code after patching of it: + +static inline int +futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, + u32 oldval, u32 newval) +{ + if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) + return -EFAULT; + + { + register unsigned long r8 __asm ("r8") = 0; + unsigned long prev; + __asm__ __volatile__( + " mf;; \n" + " mov ar.ccv=%4;; \n" + "[1:] cmpxchg4.acq %1=[%2],%3,ar.ccv \n" + " .xdata4 \"__ex_table\", 1b-., 2f-. \n" + "[2:]" + : "+r" (r8), "=&r" (prev) + : "r" (uaddr), "r" (newval), + "rO" ((long) (unsigned) oldval) + : "memory"); + *uval = prev; + return r8; + } +} + +I also initialized the 'r8' var with the C programming language. +The _asm qualifier on the definition of the 'r8' var forces GCC to use +the r8 processor register for it. +I don't believe that we should use inline assembly for zeroing out a +local variable. +The constraint is +"+r" (r8) +what means that it is both an input register and an output register. +Note that the page fault handler will modify the r8 register which +will be the return value of the function. 
+The real fix is +"=&r" (prev) +The & means that GCC must not use any of the input registers to place +this output register in. + +Patched the Kernel 3.2.23 and compiled it with GCC4.4: + +0000000000000230 : + 230: 0b 18 80 1b 18 21 [MMI] adds r3=3168,r13;; + 236: 80 40 0d 00 42 00 adds r8=40,r3 + 23c: 00 00 04 00 nop.i 0x0;; + 240: 0b 50 00 10 10 10 [MMI] ld4 r10=[r8];; + 246: 90 08 28 00 42 00 adds r9=1,r10 + 24c: 00 00 04 00 nop.i 0x0;; + 250: 09 00 00 00 01 00 [MMI] nop.m 0x0 + 256: 00 48 20 20 23 00 st4 [r8]=r9 + 25c: 00 00 04 00 nop.i 0x0;; + 260: 08 10 80 06 00 21 [MMI] adds r2=32,r3 + 266: 20 12 01 10 40 00 addp4 r34=r34,r0 + 26c: 02 08 f1 52 extr.u r16=r33,0,61 + 270: 05 40 00 00 00 e1 [MLX] mov r8=r0 + 276: ff ff 0f 00 00 e0 movl r15=0xfffffffbfff;; + 27c: f1 f7 ff 65 + 280: 09 70 00 04 18 10 [MMI] ld8 r14=[r2] + 286: 00 00 00 02 00 c0 nop.m 0x0 + 28c: f0 80 1c d0 cmp.ltu p6,p7=r15,r16;; + 290: 08 40 fc 1d 09 3b [MMI] cmp.eq p8,p9=-1,r14 + 296: 00 00 00 02 00 40 nop.m 0x0 + 29c: e1 08 2d d0 cmp.ltu p10,p11=r14,r33 + 2a0: 56 01 10 00 40 10 [BBB] (p10) br.cond.spnt.few 2e0 + + 2a6: 02 08 00 80 21 03 (p08) br.cond.dpnt.few 2b0 + + 2ac: 40 00 00 41 (p06) br.cond.spnt.few 2e0 + + 2b0: 0b 00 00 00 22 00 [MMI] mf;; + 2b6: 00 10 81 54 08 00 mov.m ar.ccv=r34 + 2bc: 00 00 04 00 nop.i 0x0;; + 2c0: 09 58 8c 42 11 10 [MMI] cmpxchg4.acq r11=[r33],r35,ar.ccv + 2c6: 00 00 00 02 00 00 nop.m 0x0 + 2cc: 00 00 04 00 nop.i 0x0;; + 2d0: 10 00 2c 40 90 11 [MIB] st4 [r32]=r11 + 2d6: 00 00 00 02 00 00 nop.i 0x0 + 2dc: 20 00 00 40 br.few 2f0 + + 2e0: 09 40 c8 f9 ff 27 [MMI] mov r8=-14 + 2e6: 00 00 00 02 00 00 nop.m 0x0 + 2ec: 00 00 04 00 nop.i 0x0;; + 2f0: 0b 88 20 1a 19 21 [MMI] adds r17=3208,r13;; + 2f6: 30 01 44 20 20 00 ld4 r19=[r17] + 2fc: 00 00 04 00 nop.i 0x0;; + 300: 0b 90 fc 27 3f 23 [MMI] adds r18=-1,r19;; + 306: 00 90 44 20 23 00 st4 [r17]=r18 + 30c: 00 00 04 00 nop.i 0x0;; + 310: 11 00 00 00 01 00 [MIB] nop.m 0x0 + 316: 00 00 00 02 00 80 nop.i 0x0 + 31c: 08 00 84 00 
br.ret.sptk.many b0;; + +Much better. +There is a + 270: 05 40 00 00 00 e1 [MLX] mov r8=r0 +which was generated by C code r8 = 0. Below + 2b6: 00 10 81 54 08 00 mov.m ar.ccv=r34 +what means that oldval is no longer overwritten. + +This is Debian bug#702641 +(http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=702641). + +The patch is applicable on Kernel 3.9-rc1, 3.2.23 and many other versions. + +Signed-off-by: Stephan Schreiber +Signed-off-by: Tony Luck +Signed-off-by: Greg Kroah-Hartman + +--- + arch/ia64/include/asm/futex.h | 5 ++--- + 1 file changed, 2 insertions(+), 3 deletions(-) + +--- a/arch/ia64/include/asm/futex.h ++++ b/arch/ia64/include/asm/futex.h +@@ -106,16 +106,15 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, + return -EFAULT; + + { +- register unsigned long r8 __asm ("r8"); ++ register unsigned long r8 __asm ("r8") = 0; + unsigned long prev; + __asm__ __volatile__( + " mf;; \n" +- " mov %0=r0 \n" + " mov ar.ccv=%4;; \n" + "[1:] cmpxchg4.acq %1=[%2],%3,ar.ccv \n" + " .xdata4 \"__ex_table\", 1b-., 2f-. \n" + "[2:]" +- : "=r" (r8), "=r" (prev) ++ : "+r" (r8), "=&r" (prev) + : "r" (uaddr), "r" (newval), + "rO" ((long) (unsigned) oldval) + : "memory"); diff --git a/queue-3.4/wrong-asm-register-contraints-in-the-kvm-implementation.patch b/queue-3.4/wrong-asm-register-contraints-in-the-kvm-implementation.patch new file mode 100644 index 00000000000..99c754b842c --- /dev/null +++ b/queue-3.4/wrong-asm-register-contraints-in-the-kvm-implementation.patch @@ -0,0 +1,95 @@ +From de53e9caa4c6149ef4a78c2f83d7f5b655848767 Mon Sep 17 00:00:00 2001 +From: Stephan Schreiber +Date: Tue, 19 Mar 2013 15:27:12 -0700 +Subject: Wrong asm register contraints in the kvm implementation + +From: Stephan Schreiber + +commit de53e9caa4c6149ef4a78c2f83d7f5b655848767 upstream. + +The Linux Kernel contains some inline assembly source code which has +wrong asm register constraints in arch/ia64/kvm/vtlb.c. 
+ +I observed this on Kernel 3.2.35 but it is also true on the most +recent Kernel 3.9-rc1. + +File arch/ia64/kvm/vtlb.c: + +u64 guest_vhpt_lookup(u64 iha, u64 *pte) +{ + u64 ret; + struct thash_data *data; + + data = __vtr_lookup(current_vcpu, iha, D_TLB); + if (data != NULL) + thash_vhpt_insert(current_vcpu, data->page_flags, + data->itir, iha, D_TLB); + + asm volatile ( + "rsm psr.ic|psr.i;;" + "srlz.d;;" + "ld8.s r9=[%1];;" + "tnat.nz p6,p7=r9;;" + "(p6) mov %0=1;" + "(p6) mov r9=r0;" + "(p7) extr.u r9=r9,0,53;;" + "(p7) mov %0=r0;" + "(p7) st8 [%2]=r9;;" + "ssm psr.ic;;" + "srlz.d;;" + "ssm psr.i;;" + "srlz.d;;" + : "=r"(ret) : "r"(iha), "r"(pte):"memory"); + + return ret; +} + +The list of output registers is + : "=r"(ret) : "r"(iha), "r"(pte):"memory"); +The constraint "=r" means that the GCC has to maintain that these vars +are in registers and contain valid info when the program flow leaves +the assembly block (output registers). +But "=r" also means that GCC can put them in registers that are used +as input registers. Input registers are iha, pte on the example. +If the predicate p7 is true, the 8th assembly instruction + "(p7) mov %0=r0;" +is the first one which writes to a register which is maintained by the +register constraints; it sets %0. %0 means the first register operand; +it is ret here. +This instruction might overwrite the %2 register (pte) which is needed +by the next instruction: + "(p7) st8 [%2]=r9;;" +Whether it really happens depends on how GCC decides what registers it +uses and how it optimizes the code. + +The attached patch fixes the register operand constraints in +arch/ia64/kvm/vtlb.c. +The register constraints should be + : "=&r"(ret) : "r"(iha), "r"(pte):"memory"); +The & means that GCC must not use any of the input registers to place +this output register in. + +This is Debian bug#702639 +(http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=702639). + +The patch is applicable on Kernel 3.9-rc1, 3.2.35 and many other versions. 
+ +Signed-off-by: Stephan Schreiber +Signed-off-by: Tony Luck +Signed-off-by: Greg Kroah-Hartman + +--- + arch/ia64/kvm/vtlb.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/arch/ia64/kvm/vtlb.c ++++ b/arch/ia64/kvm/vtlb.c +@@ -256,7 +256,7 @@ u64 guest_vhpt_lookup(u64 iha, u64 *pte) + "srlz.d;;" + "ssm psr.i;;" + "srlz.d;;" +- : "=r"(ret) : "r"(iha), "r"(pte):"memory"); ++ : "=&r"(ret) : "r"(iha), "r"(pte) : "memory"); + + return ret; + } -- 2.47.3