4.9-stable patches
author Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Mon, 6 May 2019 06:50:55 +0000 (08:50 +0200)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Mon, 6 May 2019 06:50:55 +0000 (08:50 +0200)
added patches:
perf-x86-amd-update-generic-hardware-cache-events-for-family-17h.patch

queue-4.9/perf-x86-amd-update-generic-hardware-cache-events-for-family-17h.patch [new file with mode: 0644]
queue-4.9/series

diff --git a/queue-4.9/perf-x86-amd-update-generic-hardware-cache-events-for-family-17h.patch b/queue-4.9/perf-x86-amd-update-generic-hardware-cache-events-for-family-17h.patch
new file mode 100644
index 0000000..284b814
--- /dev/null
+++ b/queue-4.9/perf-x86-amd-update-generic-hardware-cache-events-for-family-17h.patch
@@ -0,0 +1,276 @@
+From 0e3b74e26280f2cf8753717a950b97d424da6046 Mon Sep 17 00:00:00 2001
+From: Kim Phillips <kim.phillips@amd.com>
+Date: Thu, 2 May 2019 15:29:47 +0000
+Subject: perf/x86/amd: Update generic hardware cache events for Family 17h
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Kim Phillips <kim.phillips@amd.com>
+
+commit 0e3b74e26280f2cf8753717a950b97d424da6046 upstream.
+
+Add a new amd_hw_cache_event_ids_f17h table of generic hardware
+cache event assignments for AMD Families 17h and above, since a lot
+has changed.  Specifically:
+
+L1 Data Cache
+
+The data cache access counter remains the same on Family 17h.
+
+For DC misses, PMCx041's definition changes with Family 17h,
+so instead we use the L2 cache accesses from L1 data cache
+misses counter (PMCx060,umask=0xc8).
+
+For DC hardware prefetch events, Family 17h breaks compatibility
+for PMCx067 "Data Prefetcher", so instead, we use PMCx05a "Hardware
+Prefetch DC Fills."
+
+L1 Instruction Cache
+
+PMCs 0x80 and 0x81 (32-byte IC fetches and misses) are backward
+compatible on Family 17h.
+
+For prefetches, we remove the erroneous PMCx04B assignment which
+counts how many software data cache prefetch load instructions were
+dispatched.
+
+LL - Last Level Cache
+
+We remove the PMC 7D, 7E, and 7F assignments, as they do not exist
+on Family 17h, where the last level cache is L3.  L3 counters can
+be accessed using the existing AMD Uncore driver.
+
+Data TLB
+
+On Intel machines, data TLB accesses ("dTLB-loads") are assigned
+to counters that count load/store instructions retired.  This
+is inconsistent with instruction TLB accesses, where Intel
+implementations report iTLB misses that hit in the STLB.
+
+Ideally, dTLB-loads would count higher level dTLB misses that hit
+in lower level TLBs, and dTLB-load-misses would report those
+that also missed in those lower-level TLBs, therefore causing
+a page table walk.  That would be consistent with instruction
+TLB operation, remove the redundancy between dTLB-loads and
+L1-dcache-loads, and prevent perf from producing artificially
+low percentage ratios, i.e. the "0.01%" below:
+
+        42,550,869      L1-dcache-loads
+        41,591,860      dTLB-loads
+             4,802      dTLB-load-misses          #    0.01% of all dTLB cache hits
+         7,283,682      L1-dcache-stores
+         7,912,392      dTLB-stores
+               310      dTLB-store-misses
+
+On AMD Families prior to 17h, the "Data Cache Accesses" counter is
+used, which is slightly better than load/store instructions retired,
+but still counts in terms of individual load/store operations
+instead of TLB operations.
+
+So, for AMD Families 17h and higher, this patch assigns "dTLB-loads"
+to a counter for L1 dTLB misses that hit in the L2 dTLB, and
+"dTLB-load-misses" to a counter for L1 dTLB misses that caused
+L2 dTLB misses and therefore also caused page table walks.  This
+results in a much more accurate view of data TLB performance:
+
+        60,961,781      L1-dcache-loads
+             4,601      dTLB-loads
+               963      dTLB-load-misses          #   20.93% of all dTLB cache hits
+
+Note that for all AMD families, data loads and stores are combined
+in a single accesses counter, so no 'L1-dcache-stores' are reported
+separately, and stores are counted with loads in 'L1-dcache-loads'.
+
+Also note that the "% of all dTLB cache hits" string is misleading
+because (a) although TLBs can be considered caches for page tables,
+"dTLB cache" here is easily misread as the data cache, especially
+since the figures are similar (at least on Intel), and (b) not all
+of those loads (strictly speaking, accesses) actually "hit" at that
+hardware level.  "% of all dTLB accesses" would be clearer and more
+accurate.
+
+Instruction TLB
+
+On Intel machines, 'iTLB-loads' measure iTLB misses that hit in the
+STLB, and 'iTLB-load-misses' measure iTLB misses that also missed in
+the STLB and completed a page table walk.
+
+For AMD Family 17h and above, for 'iTLB-loads' we replace the
+erroneous instruction cache fetches counter with PMCx084
+"L1 ITLB Miss, L2 ITLB Hit".
+
+For 'iTLB-load-misses' we still use PMCx085 "L1 ITLB Miss,
+L2 ITLB Miss", but set a 0xff umask because without it the event
+does not get counted.
+
+Branch Predictor (BPU)
+
+PMCs 0xc2 and 0xc3 continue to be valid across all AMD Families.
+
+Node Level Events
+
+Family 17h does not have a PMCx0e9 counter, and corresponding counters
+have not been made available publicly, so for now, we mark them as
+unsupported for Families 17h and above.
+
+Reference:
+
+  "Open-Source Register Reference For AMD Family 17h Processors Models 00h-2Fh"
+  Released 7/17/2018, Publication #56255, Revision 3.03:
+  https://www.amd.com/system/files/TechDocs/56255_OSRR.pdf
+
+[ mingo: tidied up the line breaks. ]
+Signed-off-by: Kim Phillips <kim.phillips@amd.com>
+Cc: <stable@vger.kernel.org> # v4.9+
+Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
+Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: H. Peter Anvin <hpa@zytor.com>
+Cc: Janakarajan Natarajan <Janakarajan.Natarajan@amd.com>
+Cc: Jiri Olsa <jolsa@redhat.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Martin Liška <mliska@suse.cz>
+Cc: Namhyung Kim <namhyung@kernel.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Pu Wen <puwen@hygon.cn>
+Cc: Stephane Eranian <eranian@google.com>
+Cc: Suravee Suthikulpanit <Suravee.Suthikulpanit@amd.com>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: Thomas Lendacky <Thomas.Lendacky@amd.com>
+Cc: Vince Weaver <vincent.weaver@maine.edu>
+Cc: linux-kernel@vger.kernel.org
+Cc: linux-perf-users@vger.kernel.org
+Fixes: e40ed1542dd7 ("perf/x86: Add perf support for AMD family-17h processors")
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/events/amd/core.c |  111 +++++++++++++++++++++++++++++++++++++++++++--
+ 1 file changed, 108 insertions(+), 3 deletions(-)
+
+--- a/arch/x86/events/amd/core.c
++++ b/arch/x86/events/amd/core.c
+@@ -112,6 +112,110 @@ static __initconst const u64 amd_hw_cach
+  },
+ };
++static __initconst const u64 amd_hw_cache_event_ids_f17h
++                              [PERF_COUNT_HW_CACHE_MAX]
++                              [PERF_COUNT_HW_CACHE_OP_MAX]
++                              [PERF_COUNT_HW_CACHE_RESULT_MAX] = {
++[C(L1D)] = {
++      [C(OP_READ)] = {
++              [C(RESULT_ACCESS)] = 0x0040, /* Data Cache Accesses */
++              [C(RESULT_MISS)]   = 0xc860, /* L2$ access from DC Miss */
++      },
++      [C(OP_WRITE)] = {
++              [C(RESULT_ACCESS)] = 0,
++              [C(RESULT_MISS)]   = 0,
++      },
++      [C(OP_PREFETCH)] = {
++              [C(RESULT_ACCESS)] = 0xff5a, /* h/w prefetch DC Fills */
++              [C(RESULT_MISS)]   = 0,
++      },
++},
++[C(L1I)] = {
++      [C(OP_READ)] = {
++              [C(RESULT_ACCESS)] = 0x0080, /* Instruction cache fetches  */
++              [C(RESULT_MISS)]   = 0x0081, /* Instruction cache misses   */
++      },
++      [C(OP_WRITE)] = {
++              [C(RESULT_ACCESS)] = -1,
++              [C(RESULT_MISS)]   = -1,
++      },
++      [C(OP_PREFETCH)] = {
++              [C(RESULT_ACCESS)] = 0,
++              [C(RESULT_MISS)]   = 0,
++      },
++},
++[C(LL)] = {
++      [C(OP_READ)] = {
++              [C(RESULT_ACCESS)] = 0,
++              [C(RESULT_MISS)]   = 0,
++      },
++      [C(OP_WRITE)] = {
++              [C(RESULT_ACCESS)] = 0,
++              [C(RESULT_MISS)]   = 0,
++      },
++      [C(OP_PREFETCH)] = {
++              [C(RESULT_ACCESS)] = 0,
++              [C(RESULT_MISS)]   = 0,
++      },
++},
++[C(DTLB)] = {
++      [C(OP_READ)] = {
++              [C(RESULT_ACCESS)] = 0xff45, /* All L2 DTLB accesses */
++              [C(RESULT_MISS)]   = 0xf045, /* L2 DTLB misses (PT walks) */
++      },
++      [C(OP_WRITE)] = {
++              [C(RESULT_ACCESS)] = 0,
++              [C(RESULT_MISS)]   = 0,
++      },
++      [C(OP_PREFETCH)] = {
++              [C(RESULT_ACCESS)] = 0,
++              [C(RESULT_MISS)]   = 0,
++      },
++},
++[C(ITLB)] = {
++      [C(OP_READ)] = {
++              [C(RESULT_ACCESS)] = 0x0084, /* L1 ITLB misses, L2 ITLB hits */
++              [C(RESULT_MISS)]   = 0xff85, /* L1 ITLB misses, L2 misses */
++      },
++      [C(OP_WRITE)] = {
++              [C(RESULT_ACCESS)] = -1,
++              [C(RESULT_MISS)]   = -1,
++      },
++      [C(OP_PREFETCH)] = {
++              [C(RESULT_ACCESS)] = -1,
++              [C(RESULT_MISS)]   = -1,
++      },
++},
++[C(BPU)] = {
++      [C(OP_READ)] = {
++              [C(RESULT_ACCESS)] = 0x00c2, /* Retired Branch Instr.      */
++              [C(RESULT_MISS)]   = 0x00c3, /* Retired Mispredicted BI    */
++      },
++      [C(OP_WRITE)] = {
++              [C(RESULT_ACCESS)] = -1,
++              [C(RESULT_MISS)]   = -1,
++      },
++      [C(OP_PREFETCH)] = {
++              [C(RESULT_ACCESS)] = -1,
++              [C(RESULT_MISS)]   = -1,
++      },
++},
++[C(NODE)] = {
++      [C(OP_READ)] = {
++              [C(RESULT_ACCESS)] = 0,
++              [C(RESULT_MISS)]   = 0,
++      },
++      [C(OP_WRITE)] = {
++              [C(RESULT_ACCESS)] = -1,
++              [C(RESULT_MISS)]   = -1,
++      },
++      [C(OP_PREFETCH)] = {
++              [C(RESULT_ACCESS)] = -1,
++              [C(RESULT_MISS)]   = -1,
++      },
++},
++};
++
+ /*
+  * AMD Performance Monitor K7 and later, up to and including Family 16h:
+  */
+@@ -731,9 +835,10 @@ __init int amd_pmu_init(void)
+               x86_pmu.amd_nb_constraints = 0;
+       }
+-      /* Events are common for all AMDs */
+-      memcpy(hw_cache_event_ids, amd_hw_cache_event_ids,
+-             sizeof(hw_cache_event_ids));
++      if (boot_cpu_data.x86 >= 0x17)
++              memcpy(hw_cache_event_ids, amd_hw_cache_event_ids_f17h, sizeof(hw_cache_event_ids));
++      else
++              memcpy(hw_cache_event_ids, amd_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+       return 0;
+ }
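
For reference (not part of the patch itself): in these hw_cache_event_ids
tables the low byte of each value is the PERFCTL event select and the next
byte is the unit mask, which is why the changelog's "PMCx060,umask=0xc8"
shows up as 0xc860 in the table above.  A small stand-alone C program
illustrating that breakdown for the Family 17h entries:

    #include <stdio.h>

    /* Composite IDs copied from amd_hw_cache_event_ids_f17h above:
     * low byte = PERFCTL event select, next byte = unit mask. */
    static const unsigned int f17h_ids[] = {
            0xc860, /* L1D read miss:  L2 cache accesses from DC misses  */
            0xff5a, /* L1D prefetch:   hardware prefetch DC fills        */
            0xff45, /* dTLB read:      all L2 DTLB accesses              */
            0xf045, /* dTLB read miss: L2 DTLB misses (page table walks) */
            0x0084, /* iTLB read:      L1 ITLB miss, L2 ITLB hit         */
            0xff85, /* iTLB read miss: L1 ITLB miss, L2 ITLB miss        */
    };

    int main(void)
    {
            unsigned int i;

            for (i = 0; i < sizeof(f17h_ids) / sizeof(f17h_ids[0]); i++)
                    printf("0x%04x -> event 0x%02x, umask 0x%02x\n",
                           f17h_ids[i], f17h_ids[i] & 0xff, f17h_ids[i] >> 8);
            return 0;
    }
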
diff --git a/queue-4.9/series b/queue-4.9/series
index cbf095ab1935a9fea23297d8f10dc27b1bb3d6aa..c3a3cb3ca5048371d9eab842b7f68c3cc2ef538a 100644
--- a/queue-4.9/series
+++ b/queue-4.9/series
@@ -51,3 +51,4 @@ sh-fix-multiple-function-definition-build-errors.patch
 xsysace-fix-error-handling-in-ace_setup.patch
 arm-orion-don-t-use-using-64-bit-dma-masks.patch
 arm-iop-don-t-use-using-64-bit-dma-masks.patch
+perf-x86-amd-update-generic-hardware-cache-events-for-family-17h.patch
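
On a Family 17h machine with this patch applied, the remapped generic events
can be checked with the same kind of command that produced the changelog
output, e.g. "perf stat -e L1-dcache-loads,dTLB-loads,dTLB-load-misses <workload>".
The sketch below (not part of this queue entry; the page-touching loop is
just an arbitrary workload) requests the same generic DTLB cache events
directly through perf_event_open(2); the config encoding is the standard
perf ABI, and the kernel maps it to the Family 17h PMC assignments added
by the patch above:

    #include <linux/perf_event.h>
    #include <sys/syscall.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include <string.h>
    #include <stdio.h>
    #include <stdint.h>

    /* perf ABI: config = cache_id | (op << 8) | (result << 16) */
    static int open_cache_event(uint64_t cache_id, uint64_t op, uint64_t result)
    {
            struct perf_event_attr attr;

            memset(&attr, 0, sizeof(attr));
            attr.type = PERF_TYPE_HW_CACHE;
            attr.size = sizeof(attr);
            attr.config = cache_id | (op << 8) | (result << 16);
            attr.disabled = 1;
            attr.exclude_kernel = 1;

            /* Count for the calling thread, on any CPU. */
            return syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
    }

    static volatile char buf[64 << 20];

    int main(void)
    {
            int loads  = open_cache_event(PERF_COUNT_HW_CACHE_DTLB,
                                          PERF_COUNT_HW_CACHE_OP_READ,
                                          PERF_COUNT_HW_CACHE_RESULT_ACCESS);
            int misses = open_cache_event(PERF_COUNT_HW_CACHE_DTLB,
                                          PERF_COUNT_HW_CACHE_OP_READ,
                                          PERF_COUNT_HW_CACHE_RESULT_MISS);
            uint64_t nloads = 0, nmisses = 0;
            size_t i;

            if (loads < 0 || misses < 0) {
                    perror("perf_event_open");
                    return 1;
            }

            ioctl(loads, PERF_EVENT_IOC_ENABLE, 0);
            ioctl(misses, PERF_EVENT_IOC_ENABLE, 0);

            /* Touch one byte per page to generate some data TLB traffic. */
            for (i = 0; i < sizeof(buf); i += 4096)
                    buf[i]++;

            ioctl(loads, PERF_EVENT_IOC_DISABLE, 0);
            ioctl(misses, PERF_EVENT_IOC_DISABLE, 0);

            if (read(loads, &nloads, sizeof(nloads)) < 0 ||
                read(misses, &nmisses, sizeof(nmisses)) < 0)
                    perror("read");

            printf("dTLB-loads: %llu  dTLB-load-misses: %llu\n",
                   (unsigned long long)nloads, (unsigned long long)nmisses);
            return 0;
    }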