--- /dev/null
+From d303e9e98fce56cdb3c6f2ac92f626fc2bd51c77 Mon Sep 17 00:00:00 2001
+From: Tony Luck <tony.luck@intel.com>
+Date: Wed, 20 Mar 2013 10:30:15 -0700
+Subject: Fix initialization of CMCI/CMCP interrupts
+
+From: Tony Luck <tony.luck@intel.com>
+
+commit d303e9e98fce56cdb3c6f2ac92f626fc2bd51c77 upstream.
+
+Back in 2010, during a revamp of the irq code, some initializations
+were moved from ia64_mca_init() to ia64_mca_late_init() in
+
+ commit c75f2aa13f5b268aba369b5dc566088b5194377c
+ Cannot use register_percpu_irq() from ia64_mca_init()
+
+But this was hideously wrong. First of all, these initializations
+are now done far too late: specifically, after all the other cpus
+have been brought up and have initialized their own CMC vectors from
+smp_callin(). Also, ia64_mca_late_init() may be called on any cpu,
+so the line:
+ ia64_mca_cmc_vector_setup(); /* Setup vector on BSP */
+is generally not executed on the BSP, and so the CMC vector isn't
+set up at all on that processor.
+
+Make use of the arch_early_irq_init() hook to get this code executed
+at just the right moment: not too early, not too late.
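+
+Roughly, the intended ordering looks like this (a sketch; the exact
+call sites are approximate and not part of this patch):
+
+	start_kernel()
+	    setup_arch()                  /* ia64_mca_init(): too early for
+	                                     register_percpu_irq() */
+	    early_irq_init()
+	        arch_early_irq_init()
+	            ia64_mca_irq_init()   /* new home: runs on the BSP before
+	                                     any AP has started */
+	    ...
+	    kernel_init() -> smp_init()   /* APs run smp_callin() and pick up
+	                                     the CMC vector set up above */
+	device_initcall: ia64_mca_late_init()  /* too late: APs already up */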
+
+Reported-by: Fred Hartnett <fred.hartnett@hp.com>
+Tested-by: Fred Hartnett <fred.hartnett@hp.com>
+Signed-off-by: Tony Luck <tony.luck@intel.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/ia64/include/asm/mca.h | 1 +
+ arch/ia64/kernel/irq.c | 8 ++++++++
+ arch/ia64/kernel/mca.c | 37 ++++++++++++++++++++++++-------------
+ 3 files changed, 33 insertions(+), 13 deletions(-)
+
+--- a/arch/ia64/include/asm/mca.h
++++ b/arch/ia64/include/asm/mca.h
+@@ -143,6 +143,7 @@ extern unsigned long __per_cpu_mca[NR_CP
+ extern int cpe_vector;
+ extern int ia64_cpe_irq;
+ extern void ia64_mca_init(void);
++extern void ia64_mca_irq_init(void);
+ extern void ia64_mca_cpu_init(void *);
+ extern void ia64_os_mca_dispatch(void);
+ extern void ia64_os_mca_dispatch_end(void);
+--- a/arch/ia64/kernel/irq.c
++++ b/arch/ia64/kernel/irq.c
+@@ -23,6 +23,8 @@
+ #include <linux/interrupt.h>
+ #include <linux/kernel_stat.h>
+
++#include <asm/mca.h>
++
+ /*
+ * 'what should we do if we get a hw irq event on an illegal vector'.
+ * each architecture has to answer this themselves.
+@@ -83,6 +85,12 @@ bool is_affinity_mask_valid(const struct
+
+ #endif /* CONFIG_SMP */
+
++int __init arch_early_irq_init(void)
++{
++ ia64_mca_irq_init();
++ return 0;
++}
++
+ #ifdef CONFIG_HOTPLUG_CPU
+ unsigned int vectors_in_migration[NR_IRQS];
+
+--- a/arch/ia64/kernel/mca.c
++++ b/arch/ia64/kernel/mca.c
+@@ -2074,22 +2074,16 @@ ia64_mca_init(void)
+ printk(KERN_INFO "MCA related initialization done\n");
+ }
+
++
+ /*
+- * ia64_mca_late_init
+- *
+- * Opportunity to setup things that require initialization later
+- * than ia64_mca_init. Setup a timer to poll for CPEs if the
+- * platform doesn't support an interrupt driven mechanism.
+- *
+- * Inputs : None
+- * Outputs : Status
++ * These pieces cannot be done in ia64_mca_init() because it is called before
++ * early_irq_init() which would wipe out our percpu irq registrations. But we
++ * cannot leave them until ia64_mca_late_init() because by then all the other
++ * processors have been brought online and have set their own CMC vectors to
++ * point at a non-existant action. Called from arch_early_irq_init().
+ */
+-static int __init
+-ia64_mca_late_init(void)
++void __init ia64_mca_irq_init(void)
+ {
+- if (!mca_init)
+- return 0;
+-
+ /*
+ * Configure the CMCI/P vector and handler. Interrupts for CMC are
+ * per-processor, so AP CMC interrupts are setup in smp_callin() (smpboot.c).
+@@ -2108,6 +2102,23 @@ ia64_mca_late_init(void)
+ /* Setup the CPEI/P handler */
+ register_percpu_irq(IA64_CPEP_VECTOR, &mca_cpep_irqaction);
+ #endif
++}
++
++/*
++ * ia64_mca_late_init
++ *
++ * Opportunity to setup things that require initialization later
++ * than ia64_mca_init. Setup a timer to poll for CPEs if the
++ * platform doesn't support an interrupt driven mechanism.
++ *
++ * Inputs : None
++ * Outputs : Status
++ */
++static int __init
++ia64_mca_late_init(void)
++{
++ if (!mca_init)
++ return 0;
+
+ register_hotcpu_notifier(&mca_cpu_notifier);
+
--- /dev/null
+From ec686c9239b4d472052a271c505d04dae84214cc Mon Sep 17 00:00:00 2001
+From: Anurup m <anurup.m@huawei.com>
+Date: Mon, 29 Apr 2013 15:05:52 -0700
+Subject: fs/fscache/stats.c: fix memory leak
+
+From: Anurup m <anurup.m@huawei.com>
+
+commit ec686c9239b4d472052a271c505d04dae84214cc upstream.
+
+There is a kernel memory leak observed when the proc file
+/proc/fs/fscache/stats is read.
+
+The reason is that fscache_stats_open() calls single_open(), but the
+matching release function is not called on release. Hence, fix it by
+using the correct release function, single_release().
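+
+For reference, the usual seq_file pairing looks roughly like this (a
+generic sketch, not the actual fscache code; the example_* names are
+made up): single_open() allocates a one-off seq_operations that only
+single_release() frees, so pairing it with plain seq_release() leaks
+that allocation on every open/close of the file.
+
+	static int example_open(struct inode *inode, struct file *file)
+	{
+		/* allocates the seq_file plus a private seq_operations */
+		return single_open(file, example_show, NULL);
+	}
+
+	static const struct file_operations example_fops = {
+		.open		= example_open,
+		.read		= seq_read,
+		.llseek		= seq_lseek,
+		.release	= single_release,	/* not seq_release */
+	};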
+
+Addresses https://bugzilla.kernel.org/show_bug.cgi?id=57101
+
+Signed-off-by: Anurup m <anurup.m@huawei.com>
+Cc: shyju pv <shyju.pv@huawei.com>
+Cc: Sanil kumar <sanil.kumar@huawei.com>
+Cc: Nataraj m <nataraj.m@huawei.com>
+Cc: Li Zefan <lizefan@huawei.com>
+Cc: David Howells <dhowells@redhat.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/fscache/stats.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/fscache/stats.c
++++ b/fs/fscache/stats.c
+@@ -276,5 +276,5 @@ const struct file_operations fscache_sta
+ .open = fscache_stats_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+- .release = seq_release,
++ .release = single_release,
+ };
--- /dev/null
+From 6ee8630e02be6dd89926ca0fbc21af68b23dc087 Mon Sep 17 00:00:00 2001
+From: Hugh Dickins <hughd@google.com>
+Date: Mon, 29 Apr 2013 15:07:44 -0700
+Subject: mm: allow arch code to control the user page table ceiling
+
+From: Hugh Dickins <hughd@google.com>
+
+commit 6ee8630e02be6dd89926ca0fbc21af68b23dc087 upstream.
+
+On architectures where a pgd entry may be shared between user and kernel
+(e.g. ARM+LPAE), freeing page tables needs a ceiling other than 0.
+This patch introduces a generic USER_PGTABLES_CEILING that arch code can
+override. It is the responsibility of the arch code setting the ceiling
+to ensure the complete freeing of the page tables (usually in
+pgd_free()).
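+
+As an illustration only (not part of this patch), an affected
+architecture would typically override the default in its own pgtable
+header, before the generic fallback below is seen, e.g.:
+
+	/* hypothetical arch override: stop freeing page tables at the
+	 * end of the user address range instead of at address 0 */
+	#define USER_PGTABLES_CEILING	TASK_SIZE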
+
+[catalin.marinas@arm.com: commit log; shift_arg_pages(), asm-generic/pgtables.h changes]
+Signed-off-by: Hugh Dickins <hughd@google.com>
+Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
+Cc: Russell King <linux@arm.linux.org.uk>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/exec.c | 4 ++--
+ include/asm-generic/pgtable.h | 10 ++++++++++
+ mm/mmap.c | 4 ++--
+ 3 files changed, 14 insertions(+), 4 deletions(-)
+
+--- a/fs/exec.c
++++ b/fs/exec.c
+@@ -627,7 +627,7 @@ static int shift_arg_pages(struct vm_are
+ * when the old and new regions overlap clear from new_end.
+ */
+ free_pgd_range(&tlb, new_end, old_end, new_end,
+- vma->vm_next ? vma->vm_next->vm_start : 0);
++ vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING);
+ } else {
+ /*
+ * otherwise, clean from old_start; this is done to not touch
+@@ -636,7 +636,7 @@ static int shift_arg_pages(struct vm_are
+ * for the others its just a little faster.
+ */
+ free_pgd_range(&tlb, old_start, old_end, new_end,
+- vma->vm_next ? vma->vm_next->vm_start : 0);
++ vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING);
+ }
+ tlb_finish_mmu(&tlb, new_end, old_end);
+
+--- a/include/asm-generic/pgtable.h
++++ b/include/asm-generic/pgtable.h
+@@ -7,6 +7,16 @@
+ #include <linux/mm_types.h>
+ #include <linux/bug.h>
+
++/*
++ * On almost all architectures and configurations, 0 can be used as the
++ * upper ceiling to free_pgtables(): on many architectures it has the same
++ * effect as using TASK_SIZE. However, there is one configuration which
++ * must impose a more careful limit, to avoid freeing kernel pgtables.
++ */
++#ifndef USER_PGTABLES_CEILING
++#define USER_PGTABLES_CEILING 0UL
++#endif
++
+ #ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
+ extern int ptep_set_access_flags(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep,
+--- a/mm/mmap.c
++++ b/mm/mmap.c
+@@ -1920,7 +1920,7 @@ static void unmap_region(struct mm_struc
+ unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL);
+ vm_unacct_memory(nr_accounted);
+ free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
+- next ? next->vm_start : 0);
++ next ? next->vm_start : USER_PGTABLES_CEILING);
+ tlb_finish_mmu(&tlb, start, end);
+ }
+
+@@ -2308,7 +2308,7 @@ void exit_mmap(struct mm_struct *mm)
+ unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
+ vm_unacct_memory(nr_accounted);
+
+- free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0);
++ free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);
+ tlb_finish_mmu(&tlb, 0, -1);
+
+ /*
--- /dev/null
+From 545d6e189a41c94c11f55045a771118eccc9d9eb Mon Sep 17 00:00:00 2001
+From: Yinghai Lu <yinghai@kernel.org>
+Date: Thu, 28 Mar 2013 04:28:58 +0000
+Subject: PCI / ACPI: Don't query OSC support with all possible controls
+
+From: Yinghai Lu <yinghai@kernel.org>
+
+commit 545d6e189a41c94c11f55045a771118eccc9d9eb upstream.
+
+A problem was found on a system whose firmware can handle PCI AER.
+Before the OS boots, the firmware receives error reports when PCI
+errors are injected.  But after the OS boots, the firmware does not
+get any reports anymore, even when pci=noaer is passed.
+
+Root cause: the BIOS _OSC method has a problem with its query-bit check.
+It turns out that the BIOS vendor copied the example code from the ACPI
+spec.  In ACPI Spec 5.0, page 290:
+
+ If (Not(And(CDW1,1))) // Query flag clear?
+ { // Disable GPEs for features granted native control.
+ If (And(CTRL,0x01)) // Hot plug control granted?
+ {
+ Store(0,HPCE) // clear the hot plug SCI enable bit
+ Store(1,HPCS) // clear the hot plug SCI status bit
+ }
+ ...
+ }
+
+When the Query flag is set, And(CDW1,1) will be 1 and Not(1) will return
+0xfffffffe, which is non-zero and therefore treated as true.  So the
+method takes the code path that should be for the control-set case only.
+The BIOS ACPI code should be changed to "If (LEqual(And(CDW1,1), 0))".
+
+The current kernel code uses an _OSC query to notify the firmware about
+the features the OS supports, and then uses _OSC again to request the
+control bits.  During the support query, the current code passes all
+possible controls, so the buggy BIOS executes code that should run only
+in the control-set stage.
+
+That causes problems when pci=noaer or AER firmware_first is used: the
+firmware sees the OS AER control apparently being granted already in the
+query stage and disables its own error handling, but the OS never
+actually takes over AER handling later.
+
+We should avoid passing all possible controls and just use
+osc_control_set instead.
+That should work around this BIOS bug on affected systems in the field,
+as more BIOS vendors are copying the sample code from the ACPI spec.
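+
+For context, the _OSC argument buffer is three DWORDs; with this patch
+the query path looks roughly like this (a simplified sketch of
+acpi_pci_query_osc(), not a verbatim quote; 'support' holds the feature
+bits the OS supports):
+
+	u32 capbuf[3];
+
+	capbuf[OSC_QUERY_TYPE] = OSC_QUERY_ENABLE;	/* this is a query */
+	capbuf[OSC_SUPPORT_TYPE] = support;
+	/* only the controls we already hold, not OSC_PCI_CONTROL_MASKS */
+	capbuf[OSC_CONTROL_TYPE] = root->osc_control_set;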
+
+Signed-off-by: Yinghai Lu <yinghai@kernel.org>
+Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/acpi/pci_root.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/drivers/acpi/pci_root.c
++++ b/drivers/acpi/pci_root.c
+@@ -247,8 +247,8 @@ static acpi_status acpi_pci_query_osc(st
+ *control &= OSC_PCI_CONTROL_MASKS;
+ capbuf[OSC_CONTROL_TYPE] = *control | root->osc_control_set;
+ } else {
+- /* Run _OSC query for all possible controls. */
+- capbuf[OSC_CONTROL_TYPE] = OSC_PCI_CONTROL_MASKS;
++ /* Run _OSC query only with existing controls. */
++ capbuf[OSC_CONTROL_TYPE] = root->osc_control_set;
+ }
+
+ status = acpi_pci_run_osc(root->device->handle, capbuf, &result);
--- /dev/null
+From 769ba7212f2059ca9fe0c73371e3d415c8c1c529 Mon Sep 17 00:00:00 2001
+From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
+Date: Fri, 12 Apr 2013 13:58:17 +0000
+Subject: PCI/PM: Fix fallback to PCI_D0 in pci_platform_power_transition()
+
+From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
+
+commit 769ba7212f2059ca9fe0c73371e3d415c8c1c529 upstream.
+
+Commit b51306c (PCI: Set device power state to PCI_D0 for device
+without native PM support) modified pci_platform_power_transition()
+by adding code causing dev->current_state for devices that don't
+support native PCI PM but are power-manageable by the platform to be
+changed to PCI_D0 regardless of the value returned by the preceding
+platform_pci_set_power_state().  In particular, that is also done
+when platform_pci_set_power_state() has been successful, in which case
+the correct power state of the device, just set by
+pci_update_current_state(), is overwritten with PCI_D0.
+
+Fix that mistake by making the fallback to PCI_D0 happen only if
+platform_pci_set_power_state() has returned an error.
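+
+With the patch applied the resulting logic is roughly (a simplified
+sketch of pci_platform_power_transition(), surrounding code assumed
+from context):
+
+	if (platform_pci_power_manageable(dev)) {
+		error = platform_pci_set_power_state(dev, state);
+		if (!error)
+			pci_update_current_state(dev, state);
+	} else
+		error = -ENODEV;
+
+	if (error && !dev->pm_cap)	/* fall back to PCI_D0 */
+		dev->current_state = PCI_D0;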
+
+[bhelgaas: folded in Yinghai's simplification, added URL & stable info]
+Reference: http://lkml.kernel.org/r/27806FC4E5928A408B78E88BBC67A2306F466BBA@ORSMSX101.amr.corp.intel.com
+Reported-by: Chris J. Benenati <chris.j.benenati@intel.com>
+Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
+Acked-by: Yinghai Lu <yinghai@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/pci/pci.c | 12 ++++--------
+ 1 file changed, 4 insertions(+), 8 deletions(-)
+
+--- a/drivers/pci/pci.c
++++ b/drivers/pci/pci.c
+@@ -673,15 +673,11 @@ static int pci_platform_power_transition
+ error = platform_pci_set_power_state(dev, state);
+ if (!error)
+ pci_update_current_state(dev, state);
+- /* Fall back to PCI_D0 if native PM is not supported */
+- if (!dev->pm_cap)
+- dev->current_state = PCI_D0;
+- } else {
++ } else
+ error = -ENODEV;
+- /* Fall back to PCI_D0 if native PM is not supported */
+- if (!dev->pm_cap)
+- dev->current_state = PCI_D0;
+- }
++
++ if (error && !dev->pm_cap) /* Fall back to PCI_D0 */
++ dev->current_state = PCI_D0;
+
+ return error;
+ }
tracing-check-return-value-of-tracing_init_dentry.patch
tracing-reset-ftrace_graph_filter_enabled-if-count-is-zero.patch
i2c-xiic-must-always-write-16-bit-words-to-tx_fifo.patch
+sysfs-fix-use-after-free-in-case-of-concurrent-read-write-and-readdir.patch
+fix-initialization-of-cmci-cmcp-interrupts.patch
+pci-acpi-don-t-query-osc-support-with-all-possible-controls.patch
+pci-pm-fix-fallback-to-pci_d0-in-pci_platform_power_transition.patch
+wrong-asm-register-contraints-in-the-futex-implementation.patch
+wrong-asm-register-contraints-in-the-kvm-implementation.patch
+fs-fscache-stats.c-fix-memory-leak.patch
+mm-allow-arch-code-to-control-the-user-page-table-ceiling.patch
--- /dev/null
+From f7db5e7660b122142410dcf36ba903c73d473250 Mon Sep 17 00:00:00 2001
+From: Ming Lei <ming.lei@canonical.com>
+Date: Tue, 2 Apr 2013 10:12:26 +0800
+Subject: sysfs: fix use after free in case of concurrent read/write and readdir
+
+From: Ming Lei <ming.lei@canonical.com>
+
+commit f7db5e7660b122142410dcf36ba903c73d473250 upstream.
+
+The inode->i_mutex isn't held when filp->f_pos is updated
+in read()/write(), so filp->f_pos might be read as
+0 or 1 in readdir() when there is a concurrent read()/write()
+on this same file, which may then cause a use after free in readdir().
+
+The bug can be reproduced with Li Zefan's test code at the
+link below:
+
+ https://patchwork.kernel.org/patch/2160771/
+
+This patch fixes the use after free in this situation.
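+
+Condensed, the patched sysfs_readdir() does the following (a sketch of
+the diff below, with unrelated lines elided):
+
+	off = filp->f_pos;			/* snapshot before the loop */
+	...
+		off = filp->f_pos = pos->s_hash;	/* advance both */
+		...
+		ret = filldir(dirent, name, len, off, ino, type);
+	...
+	if (!pos) {				/* EOF */
+		filp->private_data = NULL;	/* always drop the cached entry */
+		/* only mark EOF if f_pos wasn't raced back to 0 or 1 */
+		if (off == filp->f_pos && off > 1)
+			filp->f_pos = INT_MAX;
+	}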
+
+Reported-by: Li Zefan <lizefan@huawei.com>
+Signed-off-by: Ming Lei <ming.lei@canonical.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/sysfs/dir.c | 15 +++++++++++----
+ 1 file changed, 11 insertions(+), 4 deletions(-)
+
+--- a/fs/sysfs/dir.c
++++ b/fs/sysfs/dir.c
+@@ -994,6 +994,7 @@ static int sysfs_readdir(struct file * f
+ enum kobj_ns_type type;
+ const void *ns;
+ ino_t ino;
++ loff_t off;
+
+ type = sysfs_ns_type(parent_sd);
+ ns = sysfs_info(dentry->d_sb)->ns[type];
+@@ -1016,6 +1017,7 @@ static int sysfs_readdir(struct file * f
+ return 0;
+ }
+ mutex_lock(&sysfs_mutex);
++ off = filp->f_pos;
+ for (pos = sysfs_dir_pos(ns, parent_sd, filp->f_pos, pos);
+ pos;
+ pos = sysfs_dir_next_pos(ns, parent_sd, filp->f_pos, pos)) {
+@@ -1027,19 +1029,24 @@ static int sysfs_readdir(struct file * f
+ len = strlen(name);
+ ino = pos->s_ino;
+ type = dt_type(pos);
+- filp->f_pos = pos->s_hash;
++ off = filp->f_pos = pos->s_hash;
+ filp->private_data = sysfs_get(pos);
+
+ mutex_unlock(&sysfs_mutex);
+- ret = filldir(dirent, name, len, filp->f_pos, ino, type);
++ ret = filldir(dirent, name, len, off, ino, type);
+ mutex_lock(&sysfs_mutex);
+ if (ret < 0)
+ break;
+ }
+ mutex_unlock(&sysfs_mutex);
+- if ((filp->f_pos > 1) && !pos) { /* EOF */
+- filp->f_pos = INT_MAX;
++
++ /* don't reference last entry if its refcount is dropped */
++ if (!pos) {
+ filp->private_data = NULL;
++
++ /* EOF and not changed as 0 or 1 in read/write path */
++ if (off == filp->f_pos && off > 1)
++ filp->f_pos = INT_MAX;
+ }
+ return 0;
+ }
--- /dev/null
+From 136f39ddc53db3bcee2befbe323a56d4fbf06da8 Mon Sep 17 00:00:00 2001
+From: Stephan Schreiber <info@fs-driver.org>
+Date: Tue, 19 Mar 2013 15:22:27 -0700
+Subject: Wrong asm register contraints in the futex implementation
+
+From: Stephan Schreiber <info@fs-driver.org>
+
+commit 136f39ddc53db3bcee2befbe323a56d4fbf06da8 upstream.
+
+The Linux Kernel contains some inline assembly source code which has
+wrong asm register constraints in arch/ia64/include/asm/futex.h.
+
+I observed this on Kernel 3.2.23 but it is also true on the most
+recent Kernel 3.9-rc1.
+
+File arch/ia64/include/asm/futex.h:
+
+static inline int
+futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
+ u32 oldval, u32 newval)
+{
+ if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
+ return -EFAULT;
+
+ {
+ register unsigned long r8 __asm ("r8");
+ unsigned long prev;
+ __asm__ __volatile__(
+ " mf;; \n"
+ " mov %0=r0 \n"
+ " mov ar.ccv=%4;; \n"
+ "[1:] cmpxchg4.acq %1=[%2],%3,ar.ccv \n"
+ " .xdata4 \"__ex_table\", 1b-., 2f-. \n"
+ "[2:]"
+ : "=r" (r8), "=r" (prev)
+ : "r" (uaddr), "r" (newval),
+ "rO" ((long) (unsigned) oldval)
+ : "memory");
+ *uval = prev;
+ return r8;
+ }
+}
+
+The list of output registers is
+ : "=r" (r8), "=r" (prev)
+The constraint "=r" means that GCC has to keep these variables in
+registers and that they contain valid data when the program flow leaves
+the assembly block (output registers).
+But "=r" also means that GCC may put them in registers that are used
+as input registers. The input registers are uaddr, newval and oldval in
+this example.
+The second assembly instruction
+ " mov %0=r0 \n"
+is the first one which writes to a register; it sets %0 to 0. %0 means
+the first register operand; it is r8 here. (The r0 is read-only and
+always 0 on the Itanium; it can be used if an immediate zero value is
+needed.)
+This instruction might overwrite one of the other registers which are
+still needed.
+Whether it really happens depends on how GCC decides what registers it
+uses and how it optimizes the code.
+
+The objdump utility can give us the disassembly.
+The futex_atomic_cmpxchg_inatomic() function is inline, so we have to
+look for a compilation unit that uses the function. This is the
+cmpxchg_futex_value_locked() function in
+kernel/futex.c:
+
+static int cmpxchg_futex_value_locked(u32 *curval, u32 __user *uaddr,
+ u32 uval, u32 newval)
+{
+ int ret;
+
+ pagefault_disable();
+ ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval);
+ pagefault_enable();
+
+ return ret;
+}
+
+Now the disassembly. First from the kernel package 3.2.23, which has
+been compiled with GCC 4.4; remember, this kernel seemed to work:
+objdump -d linux-3.2.23/debian/build/build_ia64_none_mckinley/kernel/futex.o
+
+0000000000000230 <cmpxchg_futex_value_locked>:
+ 230: 0b 18 80 1b 18 21 [MMI] adds r3=3168,r13;;
+ 236: 80 40 0d 00 42 00 adds r8=40,r3
+ 23c: 00 00 04 00 nop.i 0x0;;
+ 240: 0b 50 00 10 10 10 [MMI] ld4 r10=[r8];;
+ 246: 90 08 28 00 42 00 adds r9=1,r10
+ 24c: 00 00 04 00 nop.i 0x0;;
+ 250: 09 00 00 00 01 00 [MMI] nop.m 0x0
+ 256: 00 48 20 20 23 00 st4 [r8]=r9
+ 25c: 00 00 04 00 nop.i 0x0;;
+ 260: 08 10 80 06 00 21 [MMI] adds r2=32,r3
+ 266: 00 00 00 02 00 00 nop.m 0x0
+ 26c: 02 08 f1 52 extr.u r16=r33,0,61
+ 270: 05 40 88 00 08 e0 [MLX] addp4 r8=r34,r0
+ 276: ff ff 0f 00 00 e0 movl r15=0xfffffffbfff;;
+ 27c: f1 f7 ff 65
+ 280: 09 70 00 04 18 10 [MMI] ld8 r14=[r2]
+ 286: 00 00 00 02 00 c0 nop.m 0x0
+ 28c: f0 80 1c d0 cmp.ltu p6,p7=r15,r16;;
+ 290: 08 40 fc 1d 09 3b [MMI] cmp.eq p8,p9=-1,r14
+ 296: 00 00 00 02 00 40 nop.m 0x0
+ 29c: e1 08 2d d0 cmp.ltu p10,p11=r14,r33
+ 2a0: 56 01 10 00 40 10 [BBB] (p10) br.cond.spnt.few 2e0
+<cmpxchg_futex_value_locked+0xb0>
+ 2a6: 02 08 00 80 21 03 (p08) br.cond.dpnt.few 2b0
+<cmpxchg_futex_value_locked+0x80>
+ 2ac: 40 00 00 41 (p06) br.cond.spnt.few 2e0
+<cmpxchg_futex_value_locked+0xb0>
+ 2b0: 0a 00 00 00 22 00 [MMI] mf;;
+ 2b6: 80 00 00 00 42 00 mov r8=r0
+ 2bc: 00 00 04 00 nop.i 0x0
+ 2c0: 0b 00 20 40 2a 04 [MMI] mov.m ar.ccv=r8;;
+ 2c6: 10 1a 85 22 20 00 cmpxchg4.acq r33=[r33],r35,ar.ccv
+ 2cc: 00 00 04 00 nop.i 0x0;;
+ 2d0: 10 00 84 40 90 11 [MIB] st4 [r32]=r33
+ 2d6: 00 00 00 02 00 00 nop.i 0x0
+ 2dc: 20 00 00 40 br.few 2f0
+<cmpxchg_futex_value_locked+0xc0>
+ 2e0: 09 40 c8 f9 ff 27 [MMI] mov r8=-14
+ 2e6: 00 00 00 02 00 00 nop.m 0x0
+ 2ec: 00 00 04 00 nop.i 0x0;;
+ 2f0: 0b 58 20 1a 19 21 [MMI] adds r11=3208,r13;;
+ 2f6: 20 01 2c 20 20 00 ld4 r18=[r11]
+ 2fc: 00 00 04 00 nop.i 0x0;;
+ 300: 0b 88 fc 25 3f 23 [MMI] adds r17=-1,r18;;
+ 306: 00 88 2c 20 23 00 st4 [r11]=r17
+ 30c: 00 00 04 00 nop.i 0x0;;
+ 310: 11 00 00 00 01 00 [MIB] nop.m 0x0
+ 316: 00 00 00 02 00 80 nop.i 0x0
+ 31c: 08 00 84 00 br.ret.sptk.many b0;;
+
+The lines
+ 2b0: 0a 00 00 00 22 00 [MMI] mf;;
+ 2b6: 80 00 00 00 42 00 mov r8=r0
+ 2bc: 00 00 04 00 nop.i 0x0
+ 2c0: 0b 00 20 40 2a 04 [MMI] mov.m ar.ccv=r8;;
+ 2c6: 10 1a 85 22 20 00 cmpxchg4.acq r33=[r33],r35,ar.ccv
+ 2cc: 00 00 04 00 nop.i 0x0;;
+are the instructions of the assembly block.
+The line
+ 2b6: 80 00 00 00 42 00 mov r8=r0
+sets the r8 register to 0 and after that
+ 2c0: 0b 00 20 40 2a 04 [MMI] mov.m ar.ccv=r8;;
+prepares the oldval for the cmpxchg, but it takes it from r8. This
+is wrong.
+What happened here is what I explained above: an input register that is
+still needed gets overwritten.
+The register operand constraints in futex.h are wrong.
+
+(The problem doesn't occur when the Kernel is compiled with GCC 4.6.)
+
+The attached patch fixes the register operand constraints in futex.h.
+The code after the patch:
+
+static inline int
+futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
+ u32 oldval, u32 newval)
+{
+ if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
+ return -EFAULT;
+
+ {
+ register unsigned long r8 __asm ("r8") = 0;
+ unsigned long prev;
+ __asm__ __volatile__(
+ " mf;; \n"
+ " mov ar.ccv=%4;; \n"
+ "[1:] cmpxchg4.acq %1=[%2],%3,ar.ccv \n"
+ " .xdata4 \"__ex_table\", 1b-., 2f-. \n"
+ "[2:]"
+ : "+r" (r8), "=&r" (prev)
+ : "r" (uaddr), "r" (newval),
+ "rO" ((long) (unsigned) oldval)
+ : "memory");
+ *uval = prev;
+ return r8;
+ }
+}
+
+I also initialized the 'r8' variable in C.
+The __asm qualifier on the definition of the 'r8' variable forces GCC to
+use the r8 processor register for it.
+I don't believe that we should use inline assembly for zeroing out a
+local variable.
+The constraint is
+"+r" (r8)
+which means that it is both an input register and an output register.
+Note that the page fault handler will modify the r8 register, which
+will be the return value of the function.
+The real fix is
+"=&r" (prev)
+The & (earlyclobber) means that GCC must not place this output in any of
+the registers used as inputs.
+
+I patched the kernel 3.2.23 and compiled it with GCC 4.4:
+
+0000000000000230 <cmpxchg_futex_value_locked>:
+ 230: 0b 18 80 1b 18 21 [MMI] adds r3=3168,r13;;
+ 236: 80 40 0d 00 42 00 adds r8=40,r3
+ 23c: 00 00 04 00 nop.i 0x0;;
+ 240: 0b 50 00 10 10 10 [MMI] ld4 r10=[r8];;
+ 246: 90 08 28 00 42 00 adds r9=1,r10
+ 24c: 00 00 04 00 nop.i 0x0;;
+ 250: 09 00 00 00 01 00 [MMI] nop.m 0x0
+ 256: 00 48 20 20 23 00 st4 [r8]=r9
+ 25c: 00 00 04 00 nop.i 0x0;;
+ 260: 08 10 80 06 00 21 [MMI] adds r2=32,r3
+ 266: 20 12 01 10 40 00 addp4 r34=r34,r0
+ 26c: 02 08 f1 52 extr.u r16=r33,0,61
+ 270: 05 40 00 00 00 e1 [MLX] mov r8=r0
+ 276: ff ff 0f 00 00 e0 movl r15=0xfffffffbfff;;
+ 27c: f1 f7 ff 65
+ 280: 09 70 00 04 18 10 [MMI] ld8 r14=[r2]
+ 286: 00 00 00 02 00 c0 nop.m 0x0
+ 28c: f0 80 1c d0 cmp.ltu p6,p7=r15,r16;;
+ 290: 08 40 fc 1d 09 3b [MMI] cmp.eq p8,p9=-1,r14
+ 296: 00 00 00 02 00 40 nop.m 0x0
+ 29c: e1 08 2d d0 cmp.ltu p10,p11=r14,r33
+ 2a0: 56 01 10 00 40 10 [BBB] (p10) br.cond.spnt.few 2e0
+<cmpxchg_futex_value_locked+0xb0>
+ 2a6: 02 08 00 80 21 03 (p08) br.cond.dpnt.few 2b0
+<cmpxchg_futex_value_locked+0x80>
+ 2ac: 40 00 00 41 (p06) br.cond.spnt.few 2e0
+<cmpxchg_futex_value_locked+0xb0>
+ 2b0: 0b 00 00 00 22 00 [MMI] mf;;
+ 2b6: 00 10 81 54 08 00 mov.m ar.ccv=r34
+ 2bc: 00 00 04 00 nop.i 0x0;;
+ 2c0: 09 58 8c 42 11 10 [MMI] cmpxchg4.acq r11=[r33],r35,ar.ccv
+ 2c6: 00 00 00 02 00 00 nop.m 0x0
+ 2cc: 00 00 04 00 nop.i 0x0;;
+ 2d0: 10 00 2c 40 90 11 [MIB] st4 [r32]=r11
+ 2d6: 00 00 00 02 00 00 nop.i 0x0
+ 2dc: 20 00 00 40 br.few 2f0
+<cmpxchg_futex_value_locked+0xc0>
+ 2e0: 09 40 c8 f9 ff 27 [MMI] mov r8=-14
+ 2e6: 00 00 00 02 00 00 nop.m 0x0
+ 2ec: 00 00 04 00 nop.i 0x0;;
+ 2f0: 0b 88 20 1a 19 21 [MMI] adds r17=3208,r13;;
+ 2f6: 30 01 44 20 20 00 ld4 r19=[r17]
+ 2fc: 00 00 04 00 nop.i 0x0;;
+ 300: 0b 90 fc 27 3f 23 [MMI] adds r18=-1,r19;;
+ 306: 00 90 44 20 23 00 st4 [r17]=r18
+ 30c: 00 00 04 00 nop.i 0x0;;
+ 310: 11 00 00 00 01 00 [MIB] nop.m 0x0
+ 316: 00 00 00 02 00 80 nop.i 0x0
+ 31c: 08 00 84 00 br.ret.sptk.many b0;;
+
+Much better.
+There is a
+ 270: 05 40 00 00 00 e1 [MLX] mov r8=r0
+which was generated by C code r8 = 0. Below
+ 2b6: 00 10 81 54 08 00 mov.m ar.ccv=r34
+which shows that oldval is no longer overwritten.
+
+This is Debian bug#702641
+(http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=702641).
+
+The patch is applicable on Kernel 3.9-rc1, 3.2.23 and many other versions.
+
+Signed-off-by: Stephan Schreiber <info@fs-driver.org>
+Signed-off-by: Tony Luck <tony.luck@intel.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/ia64/include/asm/futex.h | 5 ++---
+ 1 file changed, 2 insertions(+), 3 deletions(-)
+
+--- a/arch/ia64/include/asm/futex.h
++++ b/arch/ia64/include/asm/futex.h
+@@ -106,16 +106,15 @@ futex_atomic_cmpxchg_inatomic(u32 *uval,
+ return -EFAULT;
+
+ {
+- register unsigned long r8 __asm ("r8");
++ register unsigned long r8 __asm ("r8") = 0;
+ unsigned long prev;
+ __asm__ __volatile__(
+ " mf;; \n"
+- " mov %0=r0 \n"
+ " mov ar.ccv=%4;; \n"
+ "[1:] cmpxchg4.acq %1=[%2],%3,ar.ccv \n"
+ " .xdata4 \"__ex_table\", 1b-., 2f-. \n"
+ "[2:]"
+- : "=r" (r8), "=r" (prev)
++ : "+r" (r8), "=&r" (prev)
+ : "r" (uaddr), "r" (newval),
+ "rO" ((long) (unsigned) oldval)
+ : "memory");
--- /dev/null
+From de53e9caa4c6149ef4a78c2f83d7f5b655848767 Mon Sep 17 00:00:00 2001
+From: Stephan Schreiber <info@fs-driver.org>
+Date: Tue, 19 Mar 2013 15:27:12 -0700
+Subject: Wrong asm register contraints in the kvm implementation
+
+From: Stephan Schreiber <info@fs-driver.org>
+
+commit de53e9caa4c6149ef4a78c2f83d7f5b655848767 upstream.
+
+The Linux Kernel contains some inline assembly source code which has
+wrong asm register constraints in arch/ia64/kvm/vtlb.c.
+
+I observed this on Kernel 3.2.35 but it is also true on the most
+recent Kernel 3.9-rc1.
+
+File arch/ia64/kvm/vtlb.c:
+
+u64 guest_vhpt_lookup(u64 iha, u64 *pte)
+{
+ u64 ret;
+ struct thash_data *data;
+
+ data = __vtr_lookup(current_vcpu, iha, D_TLB);
+ if (data != NULL)
+ thash_vhpt_insert(current_vcpu, data->page_flags,
+ data->itir, iha, D_TLB);
+
+ asm volatile (
+ "rsm psr.ic|psr.i;;"
+ "srlz.d;;"
+ "ld8.s r9=[%1];;"
+ "tnat.nz p6,p7=r9;;"
+ "(p6) mov %0=1;"
+ "(p6) mov r9=r0;"
+ "(p7) extr.u r9=r9,0,53;;"
+ "(p7) mov %0=r0;"
+ "(p7) st8 [%2]=r9;;"
+ "ssm psr.ic;;"
+ "srlz.d;;"
+ "ssm psr.i;;"
+ "srlz.d;;"
+ : "=r"(ret) : "r"(iha), "r"(pte):"memory");
+
+ return ret;
+}
+
+The list of output registers is
+ : "=r"(ret) : "r"(iha), "r"(pte):"memory");
+The constraint "=r" means that GCC has to keep these variables in
+registers and that they contain valid data when the program flow leaves
+the assembly block (output registers).
+But "=r" also means that GCC may put them in registers that are used
+as input registers. The input registers are iha and pte in this example.
+If the predicate p7 is true, the 8th assembly instruction
+ "(p7) mov %0=r0;"
+is the first one which writes to a register which is maintained by the
+register constraints; it sets %0. %0 means the first register operand;
+it is ret here.
+This instruction might overwrite the %2 register (pte) which is needed
+by the next instruction:
+ "(p7) st8 [%2]=r9;;"
+Whether it really happens depends on how GCC decides what registers it
+uses and how it optimizes the code.
+
+The attached patch fixes the register operand constraints in
+arch/ia64/kvm/vtlb.c.
+The register constraints should be
+ : "=&r"(ret) : "r"(iha), "r"(pte):"memory");
+The & (earlyclobber) means that GCC must not place this output in any of
+the registers used as inputs.
+
+This is Debian bug#702639
+(http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=702639).
+
+The patch is applicable on Kernel 3.9-rc1, 3.2.35 and many other versions.
+
+Signed-off-by: Stephan Schreiber <info@fs-driver.org>
+Signed-off-by: Tony Luck <tony.luck@intel.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/ia64/kvm/vtlb.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/arch/ia64/kvm/vtlb.c
++++ b/arch/ia64/kvm/vtlb.c
+@@ -256,7 +256,7 @@ u64 guest_vhpt_lookup(u64 iha, u64 *pte)
+ "srlz.d;;"
+ "ssm psr.i;;"
+ "srlz.d;;"
+- : "=r"(ret) : "r"(iha), "r"(pte):"memory");
++ : "=&r"(ret) : "r"(iha), "r"(pte) : "memory");
+
+ return ret;
+ }