3.8-stable patches
author     Greg Kroah-Hartman <gregkh@linuxfoundation.org>
           Sat, 27 Apr 2013 01:03:10 +0000 (18:03 -0700)
committer  Greg Kroah-Hartman <gregkh@linuxfoundation.org>
           Sat, 27 Apr 2013 01:03:10 +0000 (18:03 -0700)
added patches:
sparc64-fix-race-in-tlb-batch-processing.patch

queue-3.8/series
queue-3.8/sparc64-fix-race-in-tlb-batch-processing.patch [new file with mode: 0644]

diff --git a/queue-3.8/series b/queue-3.8/series
index 396027508e8bc061843921a6c4dbc76c8d88553a..13f62c413711613d08e041a992d2334e07166b54 100644
--- a/queue-3.8/series
+++ b/queue-3.8/series
@@ -1,3 +1,4 @@
 aio-fix-possible-invalid-memory-access-when-debug-is-enabled.patch
 tty-do-not-update-atime-mtime-on-read-write.patch
 tty-fix-atime-mtime-regression.patch
+sparc64-fix-race-in-tlb-batch-processing.patch
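
The patch added below batches user TLB flushes only inside arch_{enter,leave}_lazy_mmu_mode() sections and otherwise flushes each page synchronously; singleton batches also take the synchronous path (see points 3 and 5 of its changelog). As a reading aid, here is a minimal standalone C sketch of that decision logic. The struct and function names mirror the patch, but the simplified bodies, the printf stand-ins for the real flush primitives, and the TLB_BATCH_NR value are assumptions of the sketch, not kernel code.

/* Illustrative sketch only -- compiles as a normal userspace program. */
#include <stdio.h>

#define TLB_BATCH_NR 8                    /* sketch value, not the kernel's */

struct tlb_batch {
        unsigned long tlb_nr;             /* number of queued addresses      */
        unsigned long active;             /* inside a lazy-MMU region?       */
        unsigned long vaddrs[TLB_BATCH_NR];
};

static struct tlb_batch batch;            /* per-cpu in the real code        */

/* Stand-in for the synchronous per-page flush (cross call + local flush). */
static void global_flush_tlb_page(unsigned long vaddr)
{
        printf("synchronous flush of %#lx\n", vaddr);
}

static void flush_tlb_pending(struct tlb_batch *tb)
{
        if (!tb->tlb_nr)
                return;
        if (tb->tlb_nr == 1) {
                /* Changelog point 5: a singleton batch skips the expensive
                 * wait on sibling cpus and flushes the one page directly. */
                global_flush_tlb_page(tb->vaddrs[0]);
        } else {
                printf("cross call for %lu queued addresses\n", tb->tlb_nr);
        }
        tb->tlb_nr = 0;
}

static void arch_enter_lazy_mmu_mode(void)
{
        batch.active = 1;                 /* batching allowed from here on   */
}

static void arch_leave_lazy_mmu_mode(void)
{
        flush_tlb_pending(&batch);        /* run whatever was queued         */
        batch.active = 0;
}

static void tlb_batch_add_one(unsigned long vaddr)
{
        if (!batch.active) {
                /* Changelog point 3e: outside a lazy-MMU region, flush
                 * synchronously instead of queueing. */
                global_flush_tlb_page(vaddr);
                return;
        }
        batch.vaddrs[batch.tlb_nr++] = vaddr;
        if (batch.tlb_nr >= TLB_BATCH_NR)
                flush_tlb_pending(&batch);
}

int main(void)
{
        tlb_batch_add_one(0x1000);        /* outside lazy mode: immediate    */
        arch_enter_lazy_mmu_mode();
        tlb_batch_add_one(0x2000);        /* queued                          */
        tlb_batch_add_one(0x3000);        /* queued                          */
        arch_leave_lazy_mmu_mode();       /* flushed here as one batch       */
        return 0;
}
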
diff --git a/queue-3.8/sparc64-fix-race-in-tlb-batch-processing.patch b/queue-3.8/sparc64-fix-race-in-tlb-batch-processing.patch
new file mode 100644
index 0000000..0eabe0b
--- /dev/null
+++ b/queue-3.8/sparc64-fix-race-in-tlb-batch-processing.patch
@@ -0,0 +1,614 @@
+From 84d414c702d79553f420aa9f342bc71ba3f37b8e Mon Sep 17 00:00:00 2001
+From: "David S. Miller" <davem@davemloft.net>
+Date: Fri, 19 Apr 2013 17:26:26 -0400
+Subject: sparc64: Fix race in TLB batch processing.
+
+From: "David S. Miller" <davem@davemloft.net>
+
+[ Commits f36391d2790d04993f48da6a45810033a2cdf847 and
+  f0af97070acbad5d6a361f485828223a4faaa0ee upstream. ]
+
+As reported by Dave Kleikamp, when we emit cross calls to do batched
+TLB flush processing we have a race because we do not synchronize on
+the sibling cpus completing the cross call.
+
+So meanwhile the TLB batch can be reset (tb->tlb_nr set to zero, etc.)
+and either flushes are missed or flushes will flush the wrong
+addresses.
+
+Fix this by using generic infrastructure to synchronize on the
+completion of the cross call.
+
+This first required getting the flush_tlb_pending() call out from
+switch_to() which operates with locks held and interrupts disabled.
+The problem is that smp_call_function_many() cannot be invoked with
+IRQs disabled and this is explicitly checked for with WARN_ON_ONCE().
+
+We get the batch processing outside of locked IRQ disabled sections by
+using some ideas from the powerpc port. Namely, we only batch inside
+of arch_{enter,leave}_lazy_mmu_mode() calls.  If we're not in such a
+region, we flush TLBs synchronously.
+
+1) Get rid of xcall_flush_tlb_pending and per-cpu type
+   implementations.
+
+2) Do TLB batch cross calls instead via:
+
+       smp_call_function_many()
+               tlb_pending_func()
+                       __flush_tlb_pending()
+
+3) Batch only in lazy mmu sequences:
+
+       a) Add 'active' member to struct tlb_batch
+       b) Define __HAVE_ARCH_ENTER_LAZY_MMU_MODE
+       c) Set 'active' in arch_enter_lazy_mmu_mode()
+       d) Run batch and clear 'active' in arch_leave_lazy_mmu_mode()
+       e) Check 'active' in tlb_batch_add_one() and do a synchronous
+           flush if it's clear.
+
+4) Add infrastructure for synchronous TLB page flushes.
+
+       a) Implement __flush_tlb_page and per-cpu variants, patch
+          as needed.
+       b) Likewise for xcall_flush_tlb_page.
+       c) Implement smp_flush_tlb_page() to invoke the cross-call.
+       d) Wire up global_flush_tlb_page() to the right routine based
+           upon CONFIG_SMP
+
+5) It turns out that singleton batches are very common, 2 out of every
+   3 batch flushes have only a single entry in them.
+
+   The batch flush waiting is very expensive, both because of the poll
+   on sibling cpu completion, as well as because passing the tlb batch
+   pointer to the sibling cpus invokes a shared memory dereference.
+
+   Therefore, in flush_tlb_pending(), if there is only one entry in
+   the batch perform a completely asynchronous global_flush_tlb_page()
+   instead.
+
+Reported-by: Dave Kleikamp <dave.kleikamp@oracle.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Acked-by: Dave Kleikamp <dave.kleikamp@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/sparc/include/asm/pgtable_64.h   |    1 
+ arch/sparc/include/asm/switch_to_64.h |    3 
+ arch/sparc/include/asm/tlbflush_64.h  |   37 ++++++++--
+ arch/sparc/kernel/smp_64.c            |   41 ++++++++++-
+ arch/sparc/mm/tlb.c                   |   39 ++++++++++-
+ arch/sparc/mm/tsb.c                   |   57 ++++++++++++----
+ arch/sparc/mm/ultra.S                 |  119 +++++++++++++++++++++++++++-------
+ 7 files changed, 242 insertions(+), 55 deletions(-)
+
+--- a/arch/sparc/include/asm/pgtable_64.h
++++ b/arch/sparc/include/asm/pgtable_64.h
+@@ -915,6 +915,7 @@ static inline int io_remap_pfn_range(str
+       return remap_pfn_range(vma, from, phys_base >> PAGE_SHIFT, size, prot);
+ }
++#include <asm/tlbflush.h>
+ #include <asm-generic/pgtable.h>
+ /* We provide our own get_unmapped_area to cope with VA holes and
+--- a/arch/sparc/include/asm/switch_to_64.h
++++ b/arch/sparc/include/asm/switch_to_64.h
+@@ -18,8 +18,7 @@ do {                                         \
+        * and 2 stores in this critical code path.  -DaveM
+        */
+ #define switch_to(prev, next, last)                                   \
+-do {  flush_tlb_pending();                                            \
+-      save_and_clear_fpu();                                           \
++do {  save_and_clear_fpu();                                           \
+       /* If you are tempted to conditionalize the following */        \
+       /* so that ASI is only written if it changes, think again. */   \
+       __asm__ __volatile__("wr %%g0, %0, %%asi"                       \
+--- a/arch/sparc/include/asm/tlbflush_64.h
++++ b/arch/sparc/include/asm/tlbflush_64.h
+@@ -11,24 +11,40 @@
+ struct tlb_batch {
+       struct mm_struct *mm;
+       unsigned long tlb_nr;
++      unsigned long active;
+       unsigned long vaddrs[TLB_BATCH_NR];
+ };
+ extern void flush_tsb_kernel_range(unsigned long start, unsigned long end);
+ extern void flush_tsb_user(struct tlb_batch *tb);
++extern void flush_tsb_user_page(struct mm_struct *mm, unsigned long vaddr);
+ /* TLB flush operations. */
+-extern void flush_tlb_pending(void);
++static inline void flush_tlb_mm(struct mm_struct *mm)
++{
++}
++
++static inline void flush_tlb_page(struct vm_area_struct *vma,
++                                unsigned long vmaddr)
++{
++}
++
++static inline void flush_tlb_range(struct vm_area_struct *vma,
++                                 unsigned long start, unsigned long end)
++{
++}
++
++#define __HAVE_ARCH_ENTER_LAZY_MMU_MODE
+-#define flush_tlb_range(vma,start,end)        \
+-      do { (void)(start); flush_tlb_pending(); } while (0)
+-#define flush_tlb_page(vma,addr)      flush_tlb_pending()
+-#define flush_tlb_mm(mm)              flush_tlb_pending()
++extern void flush_tlb_pending(void);
++extern void arch_enter_lazy_mmu_mode(void);
++extern void arch_leave_lazy_mmu_mode(void);
++#define arch_flush_lazy_mmu_mode()      do {} while (0)
+ /* Local cpu only.  */
+ extern void __flush_tlb_all(void);
+-
++extern void __flush_tlb_page(unsigned long context, unsigned long vaddr);
+ extern void __flush_tlb_kernel_range(unsigned long start, unsigned long end);
+ #ifndef CONFIG_SMP
+@@ -38,15 +54,24 @@ do {       flush_tsb_kernel_range(start,end);
+       __flush_tlb_kernel_range(start,end); \
+ } while (0)
++static inline void global_flush_tlb_page(struct mm_struct *mm, unsigned long vaddr)
++{
++      __flush_tlb_page(CTX_HWBITS(mm->context), vaddr);
++}
++
+ #else /* CONFIG_SMP */
+ extern void smp_flush_tlb_kernel_range(unsigned long start, unsigned long end);
++extern void smp_flush_tlb_page(struct mm_struct *mm, unsigned long vaddr);
+ #define flush_tlb_kernel_range(start, end) \
+ do {  flush_tsb_kernel_range(start,end); \
+       smp_flush_tlb_kernel_range(start, end); \
+ } while (0)
++#define global_flush_tlb_page(mm, vaddr) \
++      smp_flush_tlb_page(mm, vaddr)
++
+ #endif /* ! CONFIG_SMP */
+ #endif /* _SPARC64_TLBFLUSH_H */
+--- a/arch/sparc/kernel/smp_64.c
++++ b/arch/sparc/kernel/smp_64.c
+@@ -849,7 +849,7 @@ void smp_tsb_sync(struct mm_struct *mm)
+ }
+ extern unsigned long xcall_flush_tlb_mm;
+-extern unsigned long xcall_flush_tlb_pending;
++extern unsigned long xcall_flush_tlb_page;
+ extern unsigned long xcall_flush_tlb_kernel_range;
+ extern unsigned long xcall_fetch_glob_regs;
+ extern unsigned long xcall_fetch_glob_pmu;
+@@ -1074,22 +1074,55 @@ local_flush_and_out:
+       put_cpu();
+ }
++struct tlb_pending_info {
++      unsigned long ctx;
++      unsigned long nr;
++      unsigned long *vaddrs;
++};
++
++static void tlb_pending_func(void *info)
++{
++      struct tlb_pending_info *t = info;
++
++      __flush_tlb_pending(t->ctx, t->nr, t->vaddrs);
++}
++
+ void smp_flush_tlb_pending(struct mm_struct *mm, unsigned long nr, unsigned long *vaddrs)
+ {
+       u32 ctx = CTX_HWBITS(mm->context);
++      struct tlb_pending_info info;
+       int cpu = get_cpu();
++      info.ctx = ctx;
++      info.nr = nr;
++      info.vaddrs = vaddrs;
++
+       if (mm == current->mm && atomic_read(&mm->mm_users) == 1)
+               cpumask_copy(mm_cpumask(mm), cpumask_of(cpu));
+       else
+-              smp_cross_call_masked(&xcall_flush_tlb_pending,
+-                                    ctx, nr, (unsigned long) vaddrs,
+-                                    mm_cpumask(mm));
++              smp_call_function_many(mm_cpumask(mm), tlb_pending_func,
++                                     &info, 1);
+       __flush_tlb_pending(ctx, nr, vaddrs);
+       put_cpu();
+ }
++
++void smp_flush_tlb_page(struct mm_struct *mm, unsigned long vaddr)
++{
++      unsigned long context = CTX_HWBITS(mm->context);
++      int cpu = get_cpu();
++
++      if (mm == current->mm && atomic_read(&mm->mm_users) == 1)
++              cpumask_copy(mm_cpumask(mm), cpumask_of(cpu));
++      else
++              smp_cross_call_masked(&xcall_flush_tlb_page,
++                                    context, vaddr, 0,
++                                    mm_cpumask(mm));
++      __flush_tlb_page(context, vaddr);
++
++      put_cpu();
++}
+ void smp_flush_tlb_kernel_range(unsigned long start, unsigned long end)
+ {
+--- a/arch/sparc/mm/tlb.c
++++ b/arch/sparc/mm/tlb.c
+@@ -24,11 +24,17 @@ static DEFINE_PER_CPU(struct tlb_batch,
+ void flush_tlb_pending(void)
+ {
+       struct tlb_batch *tb = &get_cpu_var(tlb_batch);
++      struct mm_struct *mm = tb->mm;
+-      if (tb->tlb_nr) {
+-              flush_tsb_user(tb);
++      if (!tb->tlb_nr)
++              goto out;
+-              if (CTX_VALID(tb->mm->context)) {
++      flush_tsb_user(tb);
++
++      if (CTX_VALID(mm->context)) {
++              if (tb->tlb_nr == 1) {
++                      global_flush_tlb_page(mm, tb->vaddrs[0]);
++              } else {
+ #ifdef CONFIG_SMP
+                       smp_flush_tlb_pending(tb->mm, tb->tlb_nr,
+                                             &tb->vaddrs[0]);
+@@ -37,12 +43,30 @@ void flush_tlb_pending(void)
+                                           tb->tlb_nr, &tb->vaddrs[0]);
+ #endif
+               }
+-              tb->tlb_nr = 0;
+       }
++      tb->tlb_nr = 0;
++
++out:
+       put_cpu_var(tlb_batch);
+ }
++void arch_enter_lazy_mmu_mode(void)
++{
++      struct tlb_batch *tb = &__get_cpu_var(tlb_batch);
++
++      tb->active = 1;
++}
++
++void arch_leave_lazy_mmu_mode(void)
++{
++      struct tlb_batch *tb = &__get_cpu_var(tlb_batch);
++
++      if (tb->tlb_nr)
++              flush_tlb_pending();
++      tb->active = 0;
++}
++
+ static void tlb_batch_add_one(struct mm_struct *mm, unsigned long vaddr,
+                             bool exec)
+ {
+@@ -60,6 +84,12 @@ static void tlb_batch_add_one(struct mm_
+               nr = 0;
+       }
++      if (!tb->active) {
++              global_flush_tlb_page(mm, vaddr);
++              flush_tsb_user_page(mm, vaddr);
++              goto out;
++      }
++
+       if (nr == 0)
+               tb->mm = mm;
+@@ -68,6 +98,7 @@ static void tlb_batch_add_one(struct mm_
+       if (nr >= TLB_BATCH_NR)
+               flush_tlb_pending();
++out:
+       put_cpu_var(tlb_batch);
+ }
+--- a/arch/sparc/mm/tsb.c
++++ b/arch/sparc/mm/tsb.c
+@@ -7,11 +7,10 @@
+ #include <linux/preempt.h>
+ #include <linux/slab.h>
+ #include <asm/page.h>
+-#include <asm/tlbflush.h>
+-#include <asm/tlb.h>
+-#include <asm/mmu_context.h>
+ #include <asm/pgtable.h>
++#include <asm/mmu_context.h>
+ #include <asm/tsb.h>
++#include <asm/tlb.h>
+ #include <asm/oplib.h>
+ extern struct tsb swapper_tsb[KERNEL_TSB_NENTRIES];
+@@ -46,23 +45,27 @@ void flush_tsb_kernel_range(unsigned lon
+       }
+ }
+-static void __flush_tsb_one(struct tlb_batch *tb, unsigned long hash_shift,
+-                          unsigned long tsb, unsigned long nentries)
++static void __flush_tsb_one_entry(unsigned long tsb, unsigned long v,
++                                unsigned long hash_shift,
++                                unsigned long nentries)
+ {
+-      unsigned long i;
++      unsigned long tag, ent, hash;
+-      for (i = 0; i < tb->tlb_nr; i++) {
+-              unsigned long v = tb->vaddrs[i];
+-              unsigned long tag, ent, hash;
++      v &= ~0x1UL;
++      hash = tsb_hash(v, hash_shift, nentries);
++      ent = tsb + (hash * sizeof(struct tsb));
++      tag = (v >> 22UL);
+-              v &= ~0x1UL;
++      tsb_flush(ent, tag);
++}
+-              hash = tsb_hash(v, hash_shift, nentries);
+-              ent = tsb + (hash * sizeof(struct tsb));
+-              tag = (v >> 22UL);
++static void __flush_tsb_one(struct tlb_batch *tb, unsigned long hash_shift,
++                          unsigned long tsb, unsigned long nentries)
++{
++      unsigned long i;
+-              tsb_flush(ent, tag);
+-      }
++      for (i = 0; i < tb->tlb_nr; i++)
++              __flush_tsb_one_entry(tsb, tb->vaddrs[i], hash_shift, nentries);
+ }
+ void flush_tsb_user(struct tlb_batch *tb)
+@@ -88,6 +91,30 @@ void flush_tsb_user(struct tlb_batch *tb
+       }
+ #endif
+       spin_unlock_irqrestore(&mm->context.lock, flags);
++}
++
++void flush_tsb_user_page(struct mm_struct *mm, unsigned long vaddr)
++{
++      unsigned long nentries, base, flags;
++
++      spin_lock_irqsave(&mm->context.lock, flags);
++
++      base = (unsigned long) mm->context.tsb_block[MM_TSB_BASE].tsb;
++      nentries = mm->context.tsb_block[MM_TSB_BASE].tsb_nentries;
++      if (tlb_type == cheetah_plus || tlb_type == hypervisor)
++              base = __pa(base);
++      __flush_tsb_one_entry(base, vaddr, PAGE_SHIFT, nentries);
++
++#if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE)
++      if (mm->context.tsb_block[MM_TSB_HUGE].tsb) {
++              base = (unsigned long) mm->context.tsb_block[MM_TSB_HUGE].tsb;
++              nentries = mm->context.tsb_block[MM_TSB_HUGE].tsb_nentries;
++              if (tlb_type == cheetah_plus || tlb_type == hypervisor)
++                      base = __pa(base);
++              __flush_tsb_one_entry(base, vaddr, HPAGE_SHIFT, nentries);
++      }
++#endif
++      spin_unlock_irqrestore(&mm->context.lock, flags);
+ }
+ #define HV_PGSZ_IDX_BASE      HV_PGSZ_IDX_8K
+--- a/arch/sparc/mm/ultra.S
++++ b/arch/sparc/mm/ultra.S
+@@ -53,6 +53,33 @@ __flush_tlb_mm:             /* 18 insns */
+       nop
+       .align          32
++      .globl          __flush_tlb_page
++__flush_tlb_page:     /* 22 insns */
++      /* %o0 = context, %o1 = vaddr */
++      rdpr            %pstate, %g7
++      andn            %g7, PSTATE_IE, %g2
++      wrpr            %g2, %pstate
++      mov             SECONDARY_CONTEXT, %o4
++      ldxa            [%o4] ASI_DMMU, %g2
++      stxa            %o0, [%o4] ASI_DMMU
++      andcc           %o1, 1, %g0
++      andn            %o1, 1, %o3
++      be,pn           %icc, 1f
++       or             %o3, 0x10, %o3
++      stxa            %g0, [%o3] ASI_IMMU_DEMAP
++1:    stxa            %g0, [%o3] ASI_DMMU_DEMAP
++      membar          #Sync
++      stxa            %g2, [%o4] ASI_DMMU
++      sethi           %hi(KERNBASE), %o4
++      flush           %o4
++      retl
++       wrpr           %g7, 0x0, %pstate
++      nop
++      nop
++      nop
++      nop
++
++      .align          32
+       .globl          __flush_tlb_pending
+ __flush_tlb_pending:  /* 26 insns */
+       /* %o0 = context, %o1 = nr, %o2 = vaddrs[] */
+@@ -203,6 +230,31 @@ __cheetah_flush_tlb_mm: /* 19 insns */
+       retl
+        wrpr           %g7, 0x0, %pstate
++__cheetah_flush_tlb_page:     /* 22 insns */
++      /* %o0 = context, %o1 = vaddr */
++      rdpr            %pstate, %g7
++      andn            %g7, PSTATE_IE, %g2
++      wrpr            %g2, 0x0, %pstate
++      wrpr            %g0, 1, %tl
++      mov             PRIMARY_CONTEXT, %o4
++      ldxa            [%o4] ASI_DMMU, %g2
++      srlx            %g2, CTX_PGSZ1_NUC_SHIFT, %o3
++      sllx            %o3, CTX_PGSZ1_NUC_SHIFT, %o3
++      or              %o0, %o3, %o0   /* Preserve nucleus page size fields */
++      stxa            %o0, [%o4] ASI_DMMU
++      andcc           %o1, 1, %g0
++      be,pn           %icc, 1f
++       andn           %o1, 1, %o3
++      stxa            %g0, [%o3] ASI_IMMU_DEMAP
++1:    stxa            %g0, [%o3] ASI_DMMU_DEMAP
++      membar          #Sync
++      stxa            %g2, [%o4] ASI_DMMU
++      sethi           %hi(KERNBASE), %o4
++      flush           %o4
++      wrpr            %g0, 0, %tl
++      retl
++       wrpr           %g7, 0x0, %pstate
++
+ __cheetah_flush_tlb_pending:  /* 27 insns */
+       /* %o0 = context, %o1 = nr, %o2 = vaddrs[] */
+       rdpr            %pstate, %g7
+@@ -269,6 +321,20 @@ __hypervisor_flush_tlb_mm: /* 10 insns *
+       retl
+        nop
++__hypervisor_flush_tlb_page: /* 11 insns */
++      /* %o0 = context, %o1 = vaddr */
++      mov             %o0, %g2
++      mov             %o1, %o0              /* ARG0: vaddr + IMMU-bit */
++      mov             %g2, %o1              /* ARG1: mmu context */
++      mov             HV_MMU_ALL, %o2       /* ARG2: flags */
++      srlx            %o0, PAGE_SHIFT, %o0
++      sllx            %o0, PAGE_SHIFT, %o0
++      ta              HV_MMU_UNMAP_ADDR_TRAP
++      brnz,pn         %o0, __hypervisor_tlb_tl0_error
++       mov            HV_MMU_UNMAP_ADDR_TRAP, %o1
++      retl
++       nop
++
+ __hypervisor_flush_tlb_pending: /* 16 insns */
+       /* %o0 = context, %o1 = nr, %o2 = vaddrs[] */
+       sllx            %o1, 3, %g1
+@@ -339,6 +405,13 @@ cheetah_patch_cachetlbops:
+       call            tlb_patch_one
+        mov            19, %o2
++      sethi           %hi(__flush_tlb_page), %o0
++      or              %o0, %lo(__flush_tlb_page), %o0
++      sethi           %hi(__cheetah_flush_tlb_page), %o1
++      or              %o1, %lo(__cheetah_flush_tlb_page), %o1
++      call            tlb_patch_one
++       mov            22, %o2
++
+       sethi           %hi(__flush_tlb_pending), %o0
+       or              %o0, %lo(__flush_tlb_pending), %o0
+       sethi           %hi(__cheetah_flush_tlb_pending), %o1
+@@ -397,10 +470,9 @@ xcall_flush_tlb_mm:       /* 21 insns */
+       nop
+       nop
+-      .globl          xcall_flush_tlb_pending
+-xcall_flush_tlb_pending:      /* 21 insns */
+-      /* %g5=context, %g1=nr, %g7=vaddrs[] */
+-      sllx            %g1, 3, %g1
++      .globl          xcall_flush_tlb_page
++xcall_flush_tlb_page: /* 17 insns */
++      /* %g5=context, %g1=vaddr */
+       mov             PRIMARY_CONTEXT, %g4
+       ldxa            [%g4] ASI_DMMU, %g2
+       srlx            %g2, CTX_PGSZ1_NUC_SHIFT, %g4
+@@ -408,20 +480,16 @@ xcall_flush_tlb_pending: /* 21 insns */
+       or              %g5, %g4, %g5
+       mov             PRIMARY_CONTEXT, %g4
+       stxa            %g5, [%g4] ASI_DMMU
+-1:    sub             %g1, (1 << 3), %g1
+-      ldx             [%g7 + %g1], %g5
+-      andcc           %g5, 0x1, %g0
++      andcc           %g1, 0x1, %g0
+       be,pn           %icc, 2f
+-
+-       andn           %g5, 0x1, %g5
++       andn           %g1, 0x1, %g5
+       stxa            %g0, [%g5] ASI_IMMU_DEMAP
+ 2:    stxa            %g0, [%g5] ASI_DMMU_DEMAP
+       membar          #Sync
+-      brnz,pt         %g1, 1b
+-       nop
+       stxa            %g2, [%g4] ASI_DMMU
+       retry
+       nop
++      nop
+       .globl          xcall_flush_tlb_kernel_range
+ xcall_flush_tlb_kernel_range: /* 25 insns */
+@@ -656,15 +724,13 @@ __hypervisor_xcall_flush_tlb_mm: /* 21 i
+       membar          #Sync
+       retry
+-      .globl          __hypervisor_xcall_flush_tlb_pending
+-__hypervisor_xcall_flush_tlb_pending: /* 21 insns */
+-      /* %g5=ctx, %g1=nr, %g7=vaddrs[], %g2,%g3,%g4,g6=scratch */
+-      sllx            %g1, 3, %g1
++      .globl          __hypervisor_xcall_flush_tlb_page
++__hypervisor_xcall_flush_tlb_page: /* 17 insns */
++      /* %g5=ctx, %g1=vaddr */
+       mov             %o0, %g2
+       mov             %o1, %g3
+       mov             %o2, %g4
+-1:    sub             %g1, (1 << 3), %g1
+-      ldx             [%g7 + %g1], %o0        /* ARG0: virtual address */
++      mov             %g1, %o0                /* ARG0: virtual address */
+       mov             %g5, %o1                /* ARG1: mmu context */
+       mov             HV_MMU_ALL, %o2         /* ARG2: flags */
+       srlx            %o0, PAGE_SHIFT, %o0
+@@ -673,8 +739,6 @@ __hypervisor_xcall_flush_tlb_pending: /*
+       mov             HV_MMU_UNMAP_ADDR_TRAP, %g6
+       brnz,a,pn       %o0, __hypervisor_tlb_xcall_error
+        mov            %o0, %g5
+-      brnz,pt         %g1, 1b
+-       nop
+       mov             %g2, %o0
+       mov             %g3, %o1
+       mov             %g4, %o2
+@@ -757,6 +821,13 @@ hypervisor_patch_cachetlbops:
+       call            tlb_patch_one
+        mov            10, %o2
++      sethi           %hi(__flush_tlb_page), %o0
++      or              %o0, %lo(__flush_tlb_page), %o0
++      sethi           %hi(__hypervisor_flush_tlb_page), %o1
++      or              %o1, %lo(__hypervisor_flush_tlb_page), %o1
++      call            tlb_patch_one
++       mov            11, %o2
++
+       sethi           %hi(__flush_tlb_pending), %o0
+       or              %o0, %lo(__flush_tlb_pending), %o0
+       sethi           %hi(__hypervisor_flush_tlb_pending), %o1
+@@ -788,12 +859,12 @@ hypervisor_patch_cachetlbops:
+       call            tlb_patch_one
+        mov            21, %o2
+-      sethi           %hi(xcall_flush_tlb_pending), %o0
+-      or              %o0, %lo(xcall_flush_tlb_pending), %o0
+-      sethi           %hi(__hypervisor_xcall_flush_tlb_pending), %o1
+-      or              %o1, %lo(__hypervisor_xcall_flush_tlb_pending), %o1
++      sethi           %hi(xcall_flush_tlb_page), %o0
++      or              %o0, %lo(xcall_flush_tlb_page), %o0
++      sethi           %hi(__hypervisor_xcall_flush_tlb_page), %o1
++      or              %o1, %lo(__hypervisor_xcall_flush_tlb_page), %o1
+       call            tlb_patch_one
+-       mov            21, %o2
++       mov            17, %o2
+       sethi           %hi(xcall_flush_tlb_kernel_range), %o0
+       or              %o0, %lo(xcall_flush_tlb_kernel_range), %o0