From: Greg Kroah-Hartman
Date: Sat, 27 Apr 2013 01:03:06 +0000 (-0700)
Subject: 3.4-stable patches
X-Git-Tag: v3.0.76~8
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=8fb963ad8fb6d32ca8e28930e970251a6e15aad2;p=thirdparty%2Fkernel%2Fstable-queue.git

3.4-stable patches

added patches:
        sparc64-fix-race-in-tlb-batch-processing.patch
---
diff --git a/queue-3.4/series b/queue-3.4/series
index 396027508e8..13f62c41371 100644
--- a/queue-3.4/series
+++ b/queue-3.4/series
@@ -1,3 +1,4 @@
 aio-fix-possible-invalid-memory-access-when-debug-is-enabled.patch
 tty-do-not-update-atime-mtime-on-read-write.patch
 tty-fix-atime-mtime-regression.patch
+sparc64-fix-race-in-tlb-batch-processing.patch
diff --git a/queue-3.4/sparc64-fix-race-in-tlb-batch-processing.patch b/queue-3.4/sparc64-fix-race-in-tlb-batch-processing.patch
new file mode 100644
index 00000000000..9984d6aa2bb
--- /dev/null
+++ b/queue-3.4/sparc64-fix-race-in-tlb-batch-processing.patch
@@ -0,0 +1,613 @@
+From 25d57a421ac329813283fad8831237db894d4b3e Mon Sep 17 00:00:00 2001
+From: "David S. Miller"
+Date: Fri, 19 Apr 2013 17:26:26 -0400
+Subject: sparc64: Fix race in TLB batch processing.
+
+From: "David S. Miller"
+
+[ Commits f36391d2790d04993f48da6a45810033a2cdf847 and
+  f0af97070acbad5d6a361f485828223a4faaa0ee upstream. ]
+
+As reported by Dave Kleikamp, when we emit cross calls to do batched
+TLB flush processing we have a race because we do not synchronize on
+the sibling cpus completing the cross call.
+
+So meanwhile the TLB batch can be reset (tb->tlb_nr set to zero, etc.)
+and either flushes are missed or flushes will flush the wrong
+addresses.
+
+Fix this by using generic infrastructure to synchonize on the
+completion of the cross call.
+
+This first required getting the flush_tlb_pending() call out from
+switch_to() which operates with locks held and interrupts disabled.
+The problem is that smp_call_function_many() cannot be invoked with
+IRQs disabled and this is explicitly checked for with WARN_ON_ONCE().
+
+We get the batch processing outside of locked IRQ disabled sections by
+using some ideas from the powerpc port. Namely, we only batch inside
+of arch_{enter,leave}_lazy_mmu_mode() calls. If we're not in such a
+region, we flush TLBs synchronously.
+
+1) Get rid of xcall_flush_tlb_pending and per-cpu type
+   implementations.
+
+2) Do TLB batch cross calls instead via:
+
+        smp_call_function_many()
+                tlb_pending_func()
+                        __flush_tlb_pending()
+
+3) Batch only in lazy mmu sequences:
+
+        a) Add 'active' member to struct tlb_batch
+        b) Define __HAVE_ARCH_ENTER_LAZY_MMU_MODE
+        c) Set 'active' in arch_enter_lazy_mmu_mode()
+        d) Run batch and clear 'active' in arch_leave_lazy_mmu_mode()
+        e) Check 'active' in tlb_batch_add_one() and do a synchronous
+           flush if it's clear.
+
+4) Add infrastructure for synchronous TLB page flushes.
+
+        a) Implement __flush_tlb_page and per-cpu variants, patch
+           as needed.
+        b) Likewise for xcall_flush_tlb_page.
+        c) Implement smp_flush_tlb_page() to invoke the cross-call.
+        d) Wire up global_flush_tlb_page() to the right routine based
+           upon CONFIG_SMP
+
+5) It turns out that singleton batches are very common, 2 out of every
+   3 batch flushes have only a single entry in them.
+
+   The batch flush waiting is very expensive, both because of the poll
+   on sibling cpu completeion, as well as because passing the tlb batch
+   pointer to the sibling cpus invokes a shared memory dereference.
+
+   Therefore, in flush_tlb_pending(), if there is only one entry in
+   the batch perform a completely asynchronous global_flush_tlb_page()
+   instead.
+
+Reported-by: Dave Kleikamp
+Signed-off-by: David S. Miller
+Acked-by: Dave Kleikamp
+Signed-off-by: Greg Kroah-Hartman
+---
+ arch/sparc/include/asm/pgtable_64.h   |    1 
+ arch/sparc/include/asm/switch_to_64.h |    3 
+ arch/sparc/include/asm/tlbflush_64.h  |   37 ++++++++--
+ arch/sparc/kernel/smp_64.c            |   41 ++++++++++-
+ arch/sparc/mm/tlb.c                   |   39 ++++++++++-
+ arch/sparc/mm/tsb.c                   |   57 ++++++++++++----
+ arch/sparc/mm/ultra.S                 |  119 +++++++++++++++++++++++++++-------
+ 7 files changed, 242 insertions(+), 55 deletions(-)
+
+--- a/arch/sparc/include/asm/pgtable_64.h
++++ b/arch/sparc/include/asm/pgtable_64.h
+@@ -780,6 +780,7 @@ static inline int io_remap_pfn_range(str
+ 	return remap_pfn_range(vma, from, phys_base >> PAGE_SHIFT, size, prot);
+ }
+
++#include
+ #include
+
+ /* We provide our own get_unmapped_area to cope with VA holes and
+--- a/arch/sparc/include/asm/switch_to_64.h
++++ b/arch/sparc/include/asm/switch_to_64.h
+@@ -18,8 +18,7 @@ do { \
+  * and 2 stores in this critical code path. -DaveM
+  */
+ #define switch_to(prev, next, last) \
+-do {	flush_tlb_pending(); \
+-	save_and_clear_fpu(); \
++do {	save_and_clear_fpu(); \
+ 	/* If you are tempted to conditionalize the following */ \
+ 	/* so that ASI is only written if it changes, think again. */ \
+ 	__asm__ __volatile__("wr %%g0, %0, %%asi" \
+--- a/arch/sparc/include/asm/tlbflush_64.h
++++ b/arch/sparc/include/asm/tlbflush_64.h
+@@ -11,24 +11,40 @@
+ struct tlb_batch {
+ 	struct mm_struct *mm;
+ 	unsigned long tlb_nr;
++	unsigned long active;
+ 	unsigned long vaddrs[TLB_BATCH_NR];
+ };
+
+ extern void flush_tsb_kernel_range(unsigned long start, unsigned long end);
+ extern void flush_tsb_user(struct tlb_batch *tb);
++extern void flush_tsb_user_page(struct mm_struct *mm, unsigned long vaddr);
+
+ /* TLB flush operations. */
+
+-extern void flush_tlb_pending(void);
++static inline void flush_tlb_mm(struct mm_struct *mm)
++{
++}
++
++static inline void flush_tlb_page(struct vm_area_struct *vma,
++				  unsigned long vmaddr)
++{
++}
++
++static inline void flush_tlb_range(struct vm_area_struct *vma,
++				   unsigned long start, unsigned long end)
++{
++}
++
++#define __HAVE_ARCH_ENTER_LAZY_MMU_MODE
+
+-#define flush_tlb_range(vma,start,end) \
+-	do { (void)(start); flush_tlb_pending(); } while (0)
+-#define flush_tlb_page(vma,addr) flush_tlb_pending()
+-#define flush_tlb_mm(mm) flush_tlb_pending()
++extern void flush_tlb_pending(void);
++extern void arch_enter_lazy_mmu_mode(void);
++extern void arch_leave_lazy_mmu_mode(void);
++#define arch_flush_lazy_mmu_mode() do {} while (0)
+
+ /* Local cpu only. */
+ extern void __flush_tlb_all(void);
+-
++extern void __flush_tlb_page(unsigned long context, unsigned long vaddr);
+ extern void __flush_tlb_kernel_range(unsigned long start, unsigned long end);
+
+ #ifndef CONFIG_SMP
+@@ -38,15 +54,24 @@ do { flush_tsb_kernel_range(start,end);
+ 	__flush_tlb_kernel_range(start,end); \
+ } while (0)
+
++static inline void global_flush_tlb_page(struct mm_struct *mm, unsigned long vaddr)
++{
++	__flush_tlb_page(CTX_HWBITS(mm->context), vaddr);
++}
++
+ #else /* CONFIG_SMP */
+
+ extern void smp_flush_tlb_kernel_range(unsigned long start, unsigned long end);
++extern void smp_flush_tlb_page(struct mm_struct *mm, unsigned long vaddr);
+
+ #define flush_tlb_kernel_range(start, end) \
+ do {	flush_tsb_kernel_range(start,end); \
+ 	smp_flush_tlb_kernel_range(start, end); \
+ } while (0)
+
++#define global_flush_tlb_page(mm, vaddr) \
++	smp_flush_tlb_page(mm, vaddr)
++
+ #endif /* ! CONFIG_SMP */
+
+ #endif /* _SPARC64_TLBFLUSH_H */
+--- a/arch/sparc/kernel/smp_64.c
++++ b/arch/sparc/kernel/smp_64.c
+@@ -856,7 +856,7 @@ void smp_tsb_sync(struct mm_struct *mm)
+ }
+
+ extern unsigned long xcall_flush_tlb_mm;
+-extern unsigned long xcall_flush_tlb_pending;
++extern unsigned long xcall_flush_tlb_page;
+ extern unsigned long xcall_flush_tlb_kernel_range;
+ extern unsigned long xcall_fetch_glob_regs;
+ extern unsigned long xcall_receive_signal;
+@@ -1070,22 +1070,55 @@ local_flush_and_out:
+ 	put_cpu();
+ }
+
++struct tlb_pending_info {
++	unsigned long ctx;
++	unsigned long nr;
++	unsigned long *vaddrs;
++};
++
++static void tlb_pending_func(void *info)
++{
++	struct tlb_pending_info *t = info;
++
++	__flush_tlb_pending(t->ctx, t->nr, t->vaddrs);
++}
++
+ void smp_flush_tlb_pending(struct mm_struct *mm, unsigned long nr, unsigned long *vaddrs)
+ {
+ 	u32 ctx = CTX_HWBITS(mm->context);
++	struct tlb_pending_info info;
+ 	int cpu = get_cpu();
+
++	info.ctx = ctx;
++	info.nr = nr;
++	info.vaddrs = vaddrs;
++
+ 	if (mm == current->mm && atomic_read(&mm->mm_users) == 1)
+ 		cpumask_copy(mm_cpumask(mm), cpumask_of(cpu));
+ 	else
+-		smp_cross_call_masked(&xcall_flush_tlb_pending,
+-				      ctx, nr, (unsigned long) vaddrs,
+-				      mm_cpumask(mm));
++		smp_call_function_many(mm_cpumask(mm), tlb_pending_func,
++				       &info, 1);
+
+ 	__flush_tlb_pending(ctx, nr, vaddrs);
+
+ 	put_cpu();
+ }
++
++void smp_flush_tlb_page(struct mm_struct *mm, unsigned long vaddr)
++{
++	unsigned long context = CTX_HWBITS(mm->context);
++	int cpu = get_cpu();
++
++	if (mm == current->mm && atomic_read(&mm->mm_users) == 1)
++		cpumask_copy(mm_cpumask(mm), cpumask_of(cpu));
++	else
++		smp_cross_call_masked(&xcall_flush_tlb_page,
++				      context, vaddr, 0,
++				      mm_cpumask(mm));
++	__flush_tlb_page(context, vaddr);
++
++	put_cpu();
++}
+
+ void smp_flush_tlb_kernel_range(unsigned long start, unsigned long end)
+ {
+--- a/arch/sparc/mm/tlb.c
++++ b/arch/sparc/mm/tlb.c
+@@ -24,11 +24,17 @@ static DEFINE_PER_CPU(struct tlb_batch,
+ void flush_tlb_pending(void)
+ {
+ 	struct tlb_batch *tb = &get_cpu_var(tlb_batch);
++	struct mm_struct *mm = tb->mm;
+
+-	if (tb->tlb_nr) {
+-		flush_tsb_user(tb);
++	if (!tb->tlb_nr)
++		goto out;
+
+-		if (CTX_VALID(tb->mm->context)) {
++	flush_tsb_user(tb);
++
++	if (CTX_VALID(mm->context)) {
++		if (tb->tlb_nr == 1) {
++			global_flush_tlb_page(mm, tb->vaddrs[0]);
++		} else {
+ #ifdef CONFIG_SMP
+ 			smp_flush_tlb_pending(tb->mm, tb->tlb_nr,
+ 					      &tb->vaddrs[0]);
+@@ -37,12 +43,30 @@ void flush_tlb_pending(void)
+ 					tb->tlb_nr, &tb->vaddrs[0]);
+ #endif
+ 		}
+-		tb->tlb_nr = 0;
+ 	}
+
++	tb->tlb_nr = 0;
++
++out:
+ 	put_cpu_var(tlb_batch);
+ }
+
++void arch_enter_lazy_mmu_mode(void)
++{
++	struct tlb_batch *tb = &__get_cpu_var(tlb_batch);
++
++	tb->active = 1;
++}
++
++void arch_leave_lazy_mmu_mode(void)
++{
++	struct tlb_batch *tb = &__get_cpu_var(tlb_batch);
++
++	if (tb->tlb_nr)
++		flush_tlb_pending();
++	tb->active = 0;
++}
++
+ void tlb_batch_add(struct mm_struct *mm, unsigned long vaddr,
+ 		   pte_t *ptep, pte_t orig, int fullmm)
+ {
+@@ -90,6 +114,12 @@ no_cache_flush:
+ 		nr = 0;
+ 	}
+
++	if (!tb->active) {
++		global_flush_tlb_page(mm, vaddr);
++		flush_tsb_user_page(mm, vaddr);
++		goto out;
++	}
++
+ 	if (nr == 0)
+ 		tb->mm = mm;
+
+@@ -98,5 +128,6 @@ no_cache_flush:
+ 	if (nr >= TLB_BATCH_NR)
+ 		flush_tlb_pending();
+
++out:
+ 	put_cpu_var(tlb_batch);
+ }
+--- a/arch/sparc/mm/tsb.c
++++ b/arch/sparc/mm/tsb.c
+@@ -7,11 +7,10 @@
+ #include
+ #include
+ #include
+-#include
+-#include
+-#include
+ #include
++#include
+ #include
++#include
+ #include
+
+ extern struct tsb swapper_tsb[KERNEL_TSB_NENTRIES];
+@@ -46,23 +45,27 @@ void flush_tsb_kernel_range(unsigned lon
+ 	}
+ }
+
+-static void __flush_tsb_one(struct tlb_batch *tb, unsigned long hash_shift,
+-			    unsigned long tsb, unsigned long nentries)
++static void __flush_tsb_one_entry(unsigned long tsb, unsigned long v,
++				  unsigned long hash_shift,
++				  unsigned long nentries)
+ {
+-	unsigned long i;
++	unsigned long tag, ent, hash;
+
+-	for (i = 0; i < tb->tlb_nr; i++) {
+-		unsigned long v = tb->vaddrs[i];
+-		unsigned long tag, ent, hash;
++	v &= ~0x1UL;
++	hash = tsb_hash(v, hash_shift, nentries);
++	ent = tsb + (hash * sizeof(struct tsb));
++	tag = (v >> 22UL);
+
+-		v &= ~0x1UL;
++	tsb_flush(ent, tag);
++}
+
+-		hash = tsb_hash(v, hash_shift, nentries);
+-		ent = tsb + (hash * sizeof(struct tsb));
+-		tag = (v >> 22UL);
++static void __flush_tsb_one(struct tlb_batch *tb, unsigned long hash_shift,
++			    unsigned long tsb, unsigned long nentries)
++{
++	unsigned long i;
+
+-		tsb_flush(ent, tag);
+-	}
++	for (i = 0; i < tb->tlb_nr; i++)
++		__flush_tsb_one_entry(tsb, tb->vaddrs[i], hash_shift, nentries);
+ }
+
+ void flush_tsb_user(struct tlb_batch *tb)
+@@ -88,6 +91,30 @@ void flush_tsb_user(struct tlb_batch *tb
+ 	}
+ #endif
+ 	spin_unlock_irqrestore(&mm->context.lock, flags);
++}
++
++void flush_tsb_user_page(struct mm_struct *mm, unsigned long vaddr)
++{
++	unsigned long nentries, base, flags;
++
++	spin_lock_irqsave(&mm->context.lock, flags);
++
++	base = (unsigned long) mm->context.tsb_block[MM_TSB_BASE].tsb;
++	nentries = mm->context.tsb_block[MM_TSB_BASE].tsb_nentries;
++	if (tlb_type == cheetah_plus || tlb_type == hypervisor)
++		base = __pa(base);
++	__flush_tsb_one_entry(base, vaddr, PAGE_SHIFT, nentries);
++
++#if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE)
++	if (mm->context.tsb_block[MM_TSB_HUGE].tsb) {
++		base = (unsigned long) mm->context.tsb_block[MM_TSB_HUGE].tsb;
++		nentries = mm->context.tsb_block[MM_TSB_HUGE].tsb_nentries;
++		if (tlb_type == cheetah_plus || tlb_type == hypervisor)
++			base = __pa(base);
++		__flush_tsb_one_entry(base, vaddr, HPAGE_SHIFT, nentries);
++	}
++#endif
++	spin_unlock_irqrestore(&mm->context.lock, flags);
+ }
+
+ #if defined(CONFIG_SPARC64_PAGE_SIZE_8KB)
+--- a/arch/sparc/mm/ultra.S
++++ b/arch/sparc/mm/ultra.S
+@@ -53,6 +53,33 @@ __flush_tlb_mm: /* 18 insns */
+ 	nop
+
+ 	.align	32
++	.globl	__flush_tlb_page
++__flush_tlb_page:	/* 22 insns */
++	/* %o0 = context, %o1 = vaddr */
++	rdpr	%pstate, %g7
++	andn	%g7, PSTATE_IE, %g2
++	wrpr	%g2, %pstate
++	mov	SECONDARY_CONTEXT, %o4
++	ldxa	[%o4] ASI_DMMU, %g2
++	stxa	%o0, [%o4] ASI_DMMU
++	andcc	%o1, 1, %g0
++	andn	%o1, 1, %o3
++	be,pn	%icc, 1f
++	 or	%o3, 0x10, %o3
++	stxa	%g0, [%o3] ASI_IMMU_DEMAP
++1:	stxa	%g0, [%o3] ASI_DMMU_DEMAP
++	membar	#Sync
++	stxa	%g2, [%o4] ASI_DMMU
++	sethi	%hi(KERNBASE), %o4
++	flush	%o4
++	retl
++	 wrpr	%g7, 0x0, %pstate
++	nop
++	nop
++	nop
++	nop
++
++	.align	32
+ 	.globl	__flush_tlb_pending
+ __flush_tlb_pending:	/* 26 insns */
+ 	/* %o0 = context, %o1 = nr, %o2 = vaddrs[] */
+@@ -203,6 +230,31 @@ __cheetah_flush_tlb_mm: /* 19 insns */
+ 	retl
+ 	 wrpr	%g7, 0x0, %pstate
+
++__cheetah_flush_tlb_page:	/* 22 insns */
++	/* %o0 = context, %o1 = vaddr */
++	rdpr	%pstate, %g7
++	andn	%g7, PSTATE_IE, %g2
++	wrpr	%g2, 0x0, %pstate
++	wrpr	%g0, 1, %tl
++	mov	PRIMARY_CONTEXT, %o4
++	ldxa	[%o4] ASI_DMMU, %g2
++	srlx	%g2, CTX_PGSZ1_NUC_SHIFT, %o3
++	sllx	%o3, CTX_PGSZ1_NUC_SHIFT, %o3
++	or	%o0, %o3, %o0	/* Preserve nucleus page size fields */
++	stxa	%o0, [%o4] ASI_DMMU
++	andcc	%o1, 1, %g0
++	be,pn	%icc, 1f
++	 andn	%o1, 1, %o3
++	stxa	%g0, [%o3] ASI_IMMU_DEMAP
++1:	stxa	%g0, [%o3] ASI_DMMU_DEMAP
++	membar	#Sync
++	stxa	%g2, [%o4] ASI_DMMU
++	sethi	%hi(KERNBASE), %o4
++	flush	%o4
++	wrpr	%g0, 0, %tl
++	retl
++	 wrpr	%g7, 0x0, %pstate
++
+ __cheetah_flush_tlb_pending:	/* 27 insns */
+ 	/* %o0 = context, %o1 = nr, %o2 = vaddrs[] */
+ 	rdpr	%pstate, %g7
+@@ -269,6 +321,20 @@ __hypervisor_flush_tlb_mm: /* 10 insns *
+ 	retl
+ 	 nop
+
++__hypervisor_flush_tlb_page: /* 11 insns */
++	/* %o0 = context, %o1 = vaddr */
++	mov	%o0, %g2
++	mov	%o1, %o0	/* ARG0: vaddr + IMMU-bit */
++	mov	%g2, %o1	/* ARG1: mmu context */
++	mov	HV_MMU_ALL, %o2	/* ARG2: flags */
++	srlx	%o0, PAGE_SHIFT, %o0
++	sllx	%o0, PAGE_SHIFT, %o0
++	ta	HV_MMU_UNMAP_ADDR_TRAP
++	brnz,pn	%o0, __hypervisor_tlb_tl0_error
++	 mov	HV_MMU_UNMAP_ADDR_TRAP, %o1
++	retl
++	 nop
++
+ __hypervisor_flush_tlb_pending: /* 16 insns */
+ 	/* %o0 = context, %o1 = nr, %o2 = vaddrs[] */
+ 	sllx	%o1, 3, %g1
+@@ -339,6 +405,13 @@ cheetah_patch_cachetlbops:
+ 	call	tlb_patch_one
+ 	 mov	19, %o2
+
++	sethi	%hi(__flush_tlb_page), %o0
++	or	%o0, %lo(__flush_tlb_page), %o0
++	sethi	%hi(__cheetah_flush_tlb_page), %o1
++	or	%o1, %lo(__cheetah_flush_tlb_page), %o1
++	call	tlb_patch_one
++	 mov	22, %o2
++
+ 	sethi	%hi(__flush_tlb_pending), %o0
+ 	or	%o0, %lo(__flush_tlb_pending), %o0
+ 	sethi	%hi(__cheetah_flush_tlb_pending), %o1
+@@ -397,10 +470,9 @@ xcall_flush_tlb_mm: /* 21 insns */
+ 	nop
+ 	nop
+
+-	.globl	xcall_flush_tlb_pending
+-xcall_flush_tlb_pending:	/* 21 insns */
+-	/* %g5=context, %g1=nr, %g7=vaddrs[] */
+-	sllx	%g1, 3, %g1
++	.globl	xcall_flush_tlb_page
++xcall_flush_tlb_page:	/* 17 insns */
++	/* %g5=context, %g1=vaddr */
+ 	mov	PRIMARY_CONTEXT, %g4
+ 	ldxa	[%g4] ASI_DMMU, %g2
+ 	srlx	%g2, CTX_PGSZ1_NUC_SHIFT, %g4
+@@ -408,20 +480,16 @@ xcall_flush_tlb_pending: /* 21 insns */
+ 	or	%g5, %g4, %g5
+ 	mov	PRIMARY_CONTEXT, %g4
+ 	stxa	%g5, [%g4] ASI_DMMU
+-1:	sub	%g1, (1 << 3), %g1
+-	ldx	[%g7 + %g1], %g5
+-	andcc	%g5, 0x1, %g0
++	andcc	%g1, 0x1, %g0
+ 	be,pn	%icc, 2f
+-
+-	 andn	%g5, 0x1, %g5
++	 andn	%g1, 0x1, %g5
+ 	stxa	%g0, [%g5] ASI_IMMU_DEMAP
+ 2:	stxa	%g0, [%g5] ASI_DMMU_DEMAP
+ 	membar	#Sync
+-	brnz,pt	%g1, 1b
+-	 nop
+ 	stxa	%g2, [%g4] ASI_DMMU
+ 	retry
+ 	nop
++	nop
+
+ 	.globl	xcall_flush_tlb_kernel_range
+ xcall_flush_tlb_kernel_range:	/* 25 insns */
+@@ -596,15 +664,13 @@ __hypervisor_xcall_flush_tlb_mm: /* 21 i
+ 	membar	#Sync
+ 	retry
+
+-	.globl	__hypervisor_xcall_flush_tlb_pending
+-__hypervisor_xcall_flush_tlb_pending: /* 21 insns */
+-	/* %g5=ctx, %g1=nr, %g7=vaddrs[], %g2,%g3,%g4,g6=scratch */
+-	sllx	%g1, 3, %g1
++	.globl	__hypervisor_xcall_flush_tlb_page
++__hypervisor_xcall_flush_tlb_page: /* 17 insns */
++	/* %g5=ctx, %g1=vaddr */
+ 	mov	%o0, %g2
+ 	mov	%o1, %g3
+ 	mov	%o2, %g4
+-1:	sub	%g1, (1 << 3), %g1
+-	ldx	[%g7 + %g1], %o0	/* ARG0: virtual address */
++	mov	%g1, %o0		/* ARG0: virtual address */
+ 	mov	%g5, %o1		/* ARG1: mmu context */
+ 	mov	HV_MMU_ALL, %o2		/* ARG2: flags */
+ 	srlx	%o0, PAGE_SHIFT, %o0
+@@ -613,8 +679,6 @@ __hypervisor_xcall_flush_tlb_pending: /*
+ 	mov	HV_MMU_UNMAP_ADDR_TRAP, %g6
+ 	brnz,a,pn	%o0, __hypervisor_tlb_xcall_error
+ 	 mov	%o0, %g5
+-	brnz,pt	%g1, 1b
+-	 nop
+ 	mov	%g2, %o0
+ 	mov	%g3, %o1
+ 	mov	%g4, %o2
+@@ -697,6 +761,13 @@ hypervisor_patch_cachetlbops:
+ 	call	tlb_patch_one
+ 	 mov	10, %o2
+
++	sethi	%hi(__flush_tlb_page), %o0
++	or	%o0, %lo(__flush_tlb_page), %o0
++	sethi	%hi(__hypervisor_flush_tlb_page), %o1
++	or	%o1, %lo(__hypervisor_flush_tlb_page), %o1
++	call	tlb_patch_one
++	 mov	11, %o2
++
+ 	sethi	%hi(__flush_tlb_pending), %o0
+ 	or	%o0, %lo(__flush_tlb_pending), %o0
+ 	sethi	%hi(__hypervisor_flush_tlb_pending), %o1
+@@ -728,12 +799,12 @@ hypervisor_patch_cachetlbops:
+ 	call	tlb_patch_one
+ 	 mov	21, %o2
+
+-	sethi	%hi(xcall_flush_tlb_pending), %o0
+-	or	%o0, %lo(xcall_flush_tlb_pending), %o0
+-	sethi	%hi(__hypervisor_xcall_flush_tlb_pending), %o1
+-	or	%o1, %lo(__hypervisor_xcall_flush_tlb_pending), %o1
++	sethi	%hi(xcall_flush_tlb_page), %o0
++	or	%o0, %lo(xcall_flush_tlb_page), %o0
++	sethi	%hi(__hypervisor_xcall_flush_tlb_page), %o1
++	or	%o1, %lo(__hypervisor_xcall_flush_tlb_page), %o1
+ 	call	tlb_patch_one
+-	 mov	21, %o2
++	 mov	17, %o2
+
+ 	sethi	%hi(xcall_flush_tlb_kernel_range), %o0
+ 	or	%o0, %lo(xcall_flush_tlb_kernel_range), %o0
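
[ Editorial sketch, not part of the patch above: the tlb.c hunks implement the
  policy described in points 3) and 5) of the commit message -- addresses are
  only queued while the cpu is between arch_enter_lazy_mmu_mode() and
  arch_leave_lazy_mmu_mode(), a cpu outside such a section flushes
  synchronously, and a batch that ends up with a single entry is flushed with
  the cheaper single-page call.  The standalone C program below illustrates
  only that control flow; the struct layout, the TLB_BATCH_NR value and the
  printf stand-ins for the real flush primitives are simplified assumptions,
  not the kernel implementation. ]

#include <stdio.h>
#include <stdbool.h>

#define TLB_BATCH_NR 8                     /* illustrative size only */

struct tlb_batch {
        bool active;                       /* inside a lazy-MMU section? */
        unsigned long nr;                  /* number of queued addresses */
        unsigned long vaddrs[TLB_BATCH_NR];
};

static struct tlb_batch batch;             /* per-cpu in the real code */

/* Stand-ins for the real flush primitives. */
static void global_flush_tlb_page(unsigned long vaddr)
{
        printf("single-page flush of %#lx\n", vaddr);
}

static void smp_flush_tlb_pending(unsigned long nr, unsigned long *vaddrs)
{
        printf("cross-call flush of %lu queued pages\n", nr);
}

static void flush_tlb_pending(void)
{
        if (!batch.nr)
                return;
        if (batch.nr == 1)                 /* singleton batches are common */
                global_flush_tlb_page(batch.vaddrs[0]);
        else
                smp_flush_tlb_pending(batch.nr, batch.vaddrs);
        batch.nr = 0;
}

static void arch_enter_lazy_mmu_mode(void)
{
        batch.active = true;
}

static void arch_leave_lazy_mmu_mode(void)
{
        if (batch.nr)
                flush_tlb_pending();
        batch.active = false;
}

static void tlb_batch_add(unsigned long vaddr)
{
        if (!batch.active) {               /* not batching: flush right away */
                global_flush_tlb_page(vaddr);
                return;
        }
        batch.vaddrs[batch.nr++] = vaddr;
        if (batch.nr >= TLB_BATCH_NR)      /* full batch: drain it now */
                flush_tlb_pending();
}

int main(void)
{
        tlb_batch_add(0x1000UL);           /* outside lazy mode: synchronous */

        arch_enter_lazy_mmu_mode();
        tlb_batch_add(0x2000UL);
        tlb_batch_add(0x3000UL);
        arch_leave_lazy_mmu_mode();        /* queued pages flushed together */
        return 0;
}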