1 From 7d7d1bf1d1dabe435ef50efb051724b8664749cb Mon Sep 17 00:00:00 2001
2 From: Arnaldo Carvalho de Melo <acme@redhat.com>
3 Date: Mon, 11 Jul 2016 12:36:41 -0300
4 Subject: perf bench: Copy kernel files needed to build mem{cpy,set} x86_64 benchmarks
5
6 From: Arnaldo Carvalho de Melo <acme@redhat.com>
7
8 commit 7d7d1bf1d1dabe435ef50efb051724b8664749cb upstream.
9
10 We can't access kernel files directly from tools/, so we copy the required
11 bits and make sure that we detect when the original files in the
12 kernel get modified.
13
14 Cc: Adrian Hunter <adrian.hunter@intel.com>
15 Cc: David Ahern <dsahern@gmail.com>
16 Cc: Jiri Olsa <jolsa@kernel.org>
17 Cc: Namhyung Kim <namhyung@kernel.org>
18 Cc: Wang Nan <wangnan0@huawei.com>
19 Link: http://lkml.kernel.org/n/tip-z7e76274ch5j4nugv048qacb@git.kernel.org
20 Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
21 Signed-off-by: Daniel Díaz <daniel.diaz@linaro.org>
22 Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
23
24 ---
25 tools/arch/x86/include/asm/cpufeatures.h | 336 +++++++++++++++++++++++++
26 tools/arch/x86/include/asm/disabled-features.h | 65 ++++
27 tools/arch/x86/include/asm/required-features.h | 106 +++++++
28 tools/arch/x86/lib/memcpy_64.S | 179 +++++++++++++
29 tools/arch/x86/lib/memset_64.S | 138 ++++++++++
30 tools/include/asm/alternative-asm.h | 9
31 tools/perf/MANIFEST | 8
32 tools/perf/Makefile.perf | 15 +
33 tools/perf/bench/mem-memcpy-x86-64-asm.S | 2
34 tools/perf/bench/mem-memset-x86-64-asm.S | 2
35 tools/perf/util/include/asm/alternative-asm.h | 9
36 11 files changed, 856 insertions(+), 13 deletions(-)
37
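The Makefile.perf hunk further down implements the "detect when the kernel copy drifts" part of the changelog: each copied file is diffed against its kernel original and a warning is printed on mismatch. A minimal shell sketch of that same check, written as a loop rather than the one-test-per-file recipe the patch actually adds (paths are relative to tools/perf, as in the hunk; the loop form and warning text are illustrative only):

    # Warn when a tools/ copy no longer matches the kernel original.
    # Run from tools/perf; ../<file> is the copy, ../../<file> is the kernel source.
    for f in arch/x86/include/asm/cpufeatures.h \
             arch/x86/include/asm/disabled-features.h \
             arch/x86/include/asm/required-features.h \
             arch/x86/lib/memcpy_64.S \
             arch/x86/lib/memset_64.S; do
            if test -f ../../$f && ! diff -B ../$f ../../$f >/dev/null; then
                    echo "Warning: tools/$f differs from kernel" >&2
            fi
    done
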
38 --- /dev/null
39 +++ b/tools/arch/x86/include/asm/cpufeatures.h
40 @@ -0,0 +1,336 @@
41 +#ifndef _ASM_X86_CPUFEATURES_H
42 +#define _ASM_X86_CPUFEATURES_H
43 +
44 +#ifndef _ASM_X86_REQUIRED_FEATURES_H
45 +#include <asm/required-features.h>
46 +#endif
47 +
48 +#ifndef _ASM_X86_DISABLED_FEATURES_H
49 +#include <asm/disabled-features.h>
50 +#endif
51 +
52 +/*
53 + * Defines x86 CPU feature bits
54 + */
55 +#define NCAPINTS 19 /* N 32-bit words worth of info */
56 +#define NBUGINTS 1 /* N 32-bit bug flags */
57 +
58 +/*
59 + * Note: If the comment begins with a quoted string, that string is used
60 + * in /proc/cpuinfo instead of the macro name. If the string is "",
61 + * this feature bit is not displayed in /proc/cpuinfo at all.
62 + */
63 +
64 +/* Intel-defined CPU features, CPUID level 0x00000001 (edx), word 0 */
65 +#define X86_FEATURE_FPU ( 0*32+ 0) /* Onboard FPU */
66 +#define X86_FEATURE_VME ( 0*32+ 1) /* Virtual Mode Extensions */
67 +#define X86_FEATURE_DE ( 0*32+ 2) /* Debugging Extensions */
68 +#define X86_FEATURE_PSE ( 0*32+ 3) /* Page Size Extensions */
69 +#define X86_FEATURE_TSC ( 0*32+ 4) /* Time Stamp Counter */
70 +#define X86_FEATURE_MSR ( 0*32+ 5) /* Model-Specific Registers */
71 +#define X86_FEATURE_PAE ( 0*32+ 6) /* Physical Address Extensions */
72 +#define X86_FEATURE_MCE ( 0*32+ 7) /* Machine Check Exception */
73 +#define X86_FEATURE_CX8 ( 0*32+ 8) /* CMPXCHG8 instruction */
74 +#define X86_FEATURE_APIC ( 0*32+ 9) /* Onboard APIC */
75 +#define X86_FEATURE_SEP ( 0*32+11) /* SYSENTER/SYSEXIT */
76 +#define X86_FEATURE_MTRR ( 0*32+12) /* Memory Type Range Registers */
77 +#define X86_FEATURE_PGE ( 0*32+13) /* Page Global Enable */
78 +#define X86_FEATURE_MCA ( 0*32+14) /* Machine Check Architecture */
79 +#define X86_FEATURE_CMOV ( 0*32+15) /* CMOV instructions */
80 + /* (plus FCMOVcc, FCOMI with FPU) */
81 +#define X86_FEATURE_PAT ( 0*32+16) /* Page Attribute Table */
82 +#define X86_FEATURE_PSE36 ( 0*32+17) /* 36-bit PSEs */
83 +#define X86_FEATURE_PN ( 0*32+18) /* Processor serial number */
84 +#define X86_FEATURE_CLFLUSH ( 0*32+19) /* CLFLUSH instruction */
85 +#define X86_FEATURE_DS ( 0*32+21) /* "dts" Debug Store */
86 +#define X86_FEATURE_ACPI ( 0*32+22) /* ACPI via MSR */
87 +#define X86_FEATURE_MMX ( 0*32+23) /* Multimedia Extensions */
88 +#define X86_FEATURE_FXSR ( 0*32+24) /* FXSAVE/FXRSTOR, CR4.OSFXSR */
89 +#define X86_FEATURE_XMM ( 0*32+25) /* "sse" */
90 +#define X86_FEATURE_XMM2 ( 0*32+26) /* "sse2" */
91 +#define X86_FEATURE_SELFSNOOP ( 0*32+27) /* "ss" CPU self snoop */
92 +#define X86_FEATURE_HT ( 0*32+28) /* Hyper-Threading */
93 +#define X86_FEATURE_ACC ( 0*32+29) /* "tm" Automatic clock control */
94 +#define X86_FEATURE_IA64 ( 0*32+30) /* IA-64 processor */
95 +#define X86_FEATURE_PBE ( 0*32+31) /* Pending Break Enable */
96 +
97 +/* AMD-defined CPU features, CPUID level 0x80000001, word 1 */
98 +/* Don't duplicate feature flags which are redundant with Intel! */
99 +#define X86_FEATURE_SYSCALL ( 1*32+11) /* SYSCALL/SYSRET */
100 +#define X86_FEATURE_MP ( 1*32+19) /* MP Capable. */
101 +#define X86_FEATURE_NX ( 1*32+20) /* Execute Disable */
102 +#define X86_FEATURE_MMXEXT ( 1*32+22) /* AMD MMX extensions */
103 +#define X86_FEATURE_FXSR_OPT ( 1*32+25) /* FXSAVE/FXRSTOR optimizations */
104 +#define X86_FEATURE_GBPAGES ( 1*32+26) /* "pdpe1gb" GB pages */
105 +#define X86_FEATURE_RDTSCP ( 1*32+27) /* RDTSCP */
106 +#define X86_FEATURE_LM ( 1*32+29) /* Long Mode (x86-64) */
107 +#define X86_FEATURE_3DNOWEXT ( 1*32+30) /* AMD 3DNow! extensions */
108 +#define X86_FEATURE_3DNOW ( 1*32+31) /* 3DNow! */
109 +
110 +/* Transmeta-defined CPU features, CPUID level 0x80860001, word 2 */
111 +#define X86_FEATURE_RECOVERY ( 2*32+ 0) /* CPU in recovery mode */
112 +#define X86_FEATURE_LONGRUN ( 2*32+ 1) /* Longrun power control */
113 +#define X86_FEATURE_LRTI ( 2*32+ 3) /* LongRun table interface */
114 +
115 +/* Other features, Linux-defined mapping, word 3 */
116 +/* This range is used for feature bits which conflict or are synthesized */
117 +#define X86_FEATURE_CXMMX ( 3*32+ 0) /* Cyrix MMX extensions */
118 +#define X86_FEATURE_K6_MTRR ( 3*32+ 1) /* AMD K6 nonstandard MTRRs */
119 +#define X86_FEATURE_CYRIX_ARR ( 3*32+ 2) /* Cyrix ARRs (= MTRRs) */
120 +#define X86_FEATURE_CENTAUR_MCR ( 3*32+ 3) /* Centaur MCRs (= MTRRs) */
121 +/* cpu types for specific tunings: */
122 +#define X86_FEATURE_K8 ( 3*32+ 4) /* "" Opteron, Athlon64 */
123 +#define X86_FEATURE_K7 ( 3*32+ 5) /* "" Athlon */
124 +#define X86_FEATURE_P3 ( 3*32+ 6) /* "" P3 */
125 +#define X86_FEATURE_P4 ( 3*32+ 7) /* "" P4 */
126 +#define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* TSC ticks at a constant rate */
127 +#define X86_FEATURE_UP ( 3*32+ 9) /* smp kernel running on up */
128 +/* free, was #define X86_FEATURE_FXSAVE_LEAK ( 3*32+10) * "" FXSAVE leaks FOP/FIP/FOP */
129 +#define X86_FEATURE_ARCH_PERFMON ( 3*32+11) /* Intel Architectural PerfMon */
130 +#define X86_FEATURE_PEBS ( 3*32+12) /* Precise-Event Based Sampling */
131 +#define X86_FEATURE_BTS ( 3*32+13) /* Branch Trace Store */
132 +#define X86_FEATURE_SYSCALL32 ( 3*32+14) /* "" syscall in ia32 userspace */
133 +#define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in ia32 userspace */
134 +#define X86_FEATURE_REP_GOOD ( 3*32+16) /* rep microcode works well */
135 +#define X86_FEATURE_MFENCE_RDTSC ( 3*32+17) /* "" Mfence synchronizes RDTSC */
136 +#define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" Lfence synchronizes RDTSC */
137 +/* free, was #define X86_FEATURE_11AP ( 3*32+19) * "" Bad local APIC aka 11AP */
138 +#define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */
139 +#define X86_FEATURE_ALWAYS ( 3*32+21) /* "" Always-present feature */
140 +#define X86_FEATURE_XTOPOLOGY ( 3*32+22) /* cpu topology enum extensions */
141 +#define X86_FEATURE_TSC_RELIABLE ( 3*32+23) /* TSC is known to be reliable */
142 +#define X86_FEATURE_NONSTOP_TSC ( 3*32+24) /* TSC does not stop in C states */
143 +/* free, was #define X86_FEATURE_CLFLUSH_MONITOR ( 3*32+25) * "" clflush reqd with monitor */
144 +#define X86_FEATURE_EXTD_APICID ( 3*32+26) /* has extended APICID (8 bits) */
145 +#define X86_FEATURE_AMD_DCM ( 3*32+27) /* multi-node processor */
146 +#define X86_FEATURE_APERFMPERF ( 3*32+28) /* APERFMPERF */
147 +#define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */
148 +
149 +/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
150 +#define X86_FEATURE_XMM3 ( 4*32+ 0) /* "pni" SSE-3 */
151 +#define X86_FEATURE_PCLMULQDQ ( 4*32+ 1) /* PCLMULQDQ instruction */
152 +#define X86_FEATURE_DTES64 ( 4*32+ 2) /* 64-bit Debug Store */
153 +#define X86_FEATURE_MWAIT ( 4*32+ 3) /* "monitor" Monitor/Mwait support */
154 +#define X86_FEATURE_DSCPL ( 4*32+ 4) /* "ds_cpl" CPL Qual. Debug Store */
155 +#define X86_FEATURE_VMX ( 4*32+ 5) /* Hardware virtualization */
156 +#define X86_FEATURE_SMX ( 4*32+ 6) /* Safer mode */
157 +#define X86_FEATURE_EST ( 4*32+ 7) /* Enhanced SpeedStep */
158 +#define X86_FEATURE_TM2 ( 4*32+ 8) /* Thermal Monitor 2 */
159 +#define X86_FEATURE_SSSE3 ( 4*32+ 9) /* Supplemental SSE-3 */
160 +#define X86_FEATURE_CID ( 4*32+10) /* Context ID */
161 +#define X86_FEATURE_SDBG ( 4*32+11) /* Silicon Debug */
162 +#define X86_FEATURE_FMA ( 4*32+12) /* Fused multiply-add */
163 +#define X86_FEATURE_CX16 ( 4*32+13) /* CMPXCHG16B */
164 +#define X86_FEATURE_XTPR ( 4*32+14) /* Send Task Priority Messages */
165 +#define X86_FEATURE_PDCM ( 4*32+15) /* Performance Capabilities */
166 +#define X86_FEATURE_PCID ( 4*32+17) /* Process Context Identifiers */
167 +#define X86_FEATURE_DCA ( 4*32+18) /* Direct Cache Access */
168 +#define X86_FEATURE_XMM4_1 ( 4*32+19) /* "sse4_1" SSE-4.1 */
169 +#define X86_FEATURE_XMM4_2 ( 4*32+20) /* "sse4_2" SSE-4.2 */
170 +#define X86_FEATURE_X2APIC ( 4*32+21) /* x2APIC */
171 +#define X86_FEATURE_MOVBE ( 4*32+22) /* MOVBE instruction */
172 +#define X86_FEATURE_POPCNT ( 4*32+23) /* POPCNT instruction */
173 +#define X86_FEATURE_TSC_DEADLINE_TIMER ( 4*32+24) /* Tsc deadline timer */
174 +#define X86_FEATURE_AES ( 4*32+25) /* AES instructions */
175 +#define X86_FEATURE_XSAVE ( 4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */
176 +#define X86_FEATURE_OSXSAVE ( 4*32+27) /* "" XSAVE enabled in the OS */
177 +#define X86_FEATURE_AVX ( 4*32+28) /* Advanced Vector Extensions */
178 +#define X86_FEATURE_F16C ( 4*32+29) /* 16-bit fp conversions */
179 +#define X86_FEATURE_RDRAND ( 4*32+30) /* The RDRAND instruction */
180 +#define X86_FEATURE_HYPERVISOR ( 4*32+31) /* Running on a hypervisor */
181 +
182 +/* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */
183 +#define X86_FEATURE_XSTORE ( 5*32+ 2) /* "rng" RNG present (xstore) */
184 +#define X86_FEATURE_XSTORE_EN ( 5*32+ 3) /* "rng_en" RNG enabled */
185 +#define X86_FEATURE_XCRYPT ( 5*32+ 6) /* "ace" on-CPU crypto (xcrypt) */
186 +#define X86_FEATURE_XCRYPT_EN ( 5*32+ 7) /* "ace_en" on-CPU crypto enabled */
187 +#define X86_FEATURE_ACE2 ( 5*32+ 8) /* Advanced Cryptography Engine v2 */
188 +#define X86_FEATURE_ACE2_EN ( 5*32+ 9) /* ACE v2 enabled */
189 +#define X86_FEATURE_PHE ( 5*32+10) /* PadLock Hash Engine */
190 +#define X86_FEATURE_PHE_EN ( 5*32+11) /* PHE enabled */
191 +#define X86_FEATURE_PMM ( 5*32+12) /* PadLock Montgomery Multiplier */
192 +#define X86_FEATURE_PMM_EN ( 5*32+13) /* PMM enabled */
193 +
194 +/* More extended AMD flags: CPUID level 0x80000001, ecx, word 6 */
195 +#define X86_FEATURE_LAHF_LM ( 6*32+ 0) /* LAHF/SAHF in long mode */
196 +#define X86_FEATURE_CMP_LEGACY ( 6*32+ 1) /* If yes HyperThreading not valid */
197 +#define X86_FEATURE_SVM ( 6*32+ 2) /* Secure virtual machine */
198 +#define X86_FEATURE_EXTAPIC ( 6*32+ 3) /* Extended APIC space */
199 +#define X86_FEATURE_CR8_LEGACY ( 6*32+ 4) /* CR8 in 32-bit mode */
200 +#define X86_FEATURE_ABM ( 6*32+ 5) /* Advanced bit manipulation */
201 +#define X86_FEATURE_SSE4A ( 6*32+ 6) /* SSE-4A */
202 +#define X86_FEATURE_MISALIGNSSE ( 6*32+ 7) /* Misaligned SSE mode */
203 +#define X86_FEATURE_3DNOWPREFETCH ( 6*32+ 8) /* 3DNow prefetch instructions */
204 +#define X86_FEATURE_OSVW ( 6*32+ 9) /* OS Visible Workaround */
205 +#define X86_FEATURE_IBS ( 6*32+10) /* Instruction Based Sampling */
206 +#define X86_FEATURE_XOP ( 6*32+11) /* extended AVX instructions */
207 +#define X86_FEATURE_SKINIT ( 6*32+12) /* SKINIT/STGI instructions */
208 +#define X86_FEATURE_WDT ( 6*32+13) /* Watchdog timer */
209 +#define X86_FEATURE_LWP ( 6*32+15) /* Light Weight Profiling */
210 +#define X86_FEATURE_FMA4 ( 6*32+16) /* 4 operands MAC instructions */
211 +#define X86_FEATURE_TCE ( 6*32+17) /* translation cache extension */
212 +#define X86_FEATURE_NODEID_MSR ( 6*32+19) /* NodeId MSR */
213 +#define X86_FEATURE_TBM ( 6*32+21) /* trailing bit manipulations */
214 +#define X86_FEATURE_TOPOEXT ( 6*32+22) /* topology extensions CPUID leafs */
215 +#define X86_FEATURE_PERFCTR_CORE ( 6*32+23) /* core performance counter extensions */
216 +#define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* NB performance counter extensions */
217 +#define X86_FEATURE_BPEXT (6*32+26) /* data breakpoint extension */
218 +#define X86_FEATURE_PERFCTR_L2 ( 6*32+28) /* L2 performance counter extensions */
219 +#define X86_FEATURE_MWAITX ( 6*32+29) /* MWAIT extension (MONITORX/MWAITX) */
220 +
221 +/*
222 + * Auxiliary flags: Linux defined - For features scattered in various
223 + * CPUID levels like 0x6, 0xA etc, word 7.
224 + *
225 + * Reuse free bits when adding new feature flags!
226 + */
227 +
228 +#define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */
229 +#define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */
230 +#define X86_FEATURE_INVPCID_SINGLE ( 7*32+ 4) /* Effectively INVPCID && CR4.PCIDE=1 */
231 +
232 +#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */
233 +#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
234 +
235 +#define X86_FEATURE_RETPOLINE ( 7*32+12) /* "" Generic Retpoline mitigation for Spectre variant 2 */
236 +#define X86_FEATURE_RETPOLINE_AMD ( 7*32+13) /* "" AMD Retpoline mitigation for Spectre variant 2 */
237 +
238 +#define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */
239 +#define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* "" Fill RSB on context switches */
240 +
241 +#define X86_FEATURE_MSR_SPEC_CTRL ( 7*32+16) /* "" MSR SPEC_CTRL is implemented */
242 +#define X86_FEATURE_SSBD ( 7*32+17) /* Speculative Store Bypass Disable */
243 +
244 +/* Because the ALTERNATIVE scheme is for members of the X86_FEATURE club... */
245 +#define X86_FEATURE_KAISER ( 7*32+31) /* CONFIG_PAGE_TABLE_ISOLATION w/o nokaiser */
246 +
247 +#define X86_FEATURE_USE_IBPB ( 7*32+21) /* "" Indirect Branch Prediction Barrier enabled*/
248 +#define X86_FEATURE_USE_IBRS_FW ( 7*32+22) /* "" Use IBRS during runtime firmware calls */
249 +#define X86_FEATURE_SPEC_STORE_BYPASS_DISABLE ( 7*32+23) /* "" Disable Speculative Store Bypass. */
250 +#define X86_FEATURE_LS_CFG_SSBD ( 7*32+24) /* "" AMD SSBD implementation */
251 +
252 +#define X86_FEATURE_IBRS ( 7*32+25) /* Indirect Branch Restricted Speculation */
253 +#define X86_FEATURE_IBPB ( 7*32+26) /* Indirect Branch Prediction Barrier */
254 +#define X86_FEATURE_STIBP ( 7*32+27) /* Single Thread Indirect Branch Predictors */
255 +#define X86_FEATURE_ZEN ( 7*32+28) /* "" CPU is AMD family 0x17 (Zen) */
256 +#define X86_FEATURE_L1TF_PTEINV ( 7*32+29) /* "" L1TF workaround PTE inversion */
257 +
258 +/* Virtualization flags: Linux defined, word 8 */
259 +#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */
260 +#define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */
261 +#define X86_FEATURE_FLEXPRIORITY ( 8*32+ 2) /* Intel FlexPriority */
262 +#define X86_FEATURE_EPT ( 8*32+ 3) /* Intel Extended Page Table */
263 +#define X86_FEATURE_VPID ( 8*32+ 4) /* Intel Virtual Processor ID */
264 +
265 +#define X86_FEATURE_VMMCALL ( 8*32+15) /* Prefer vmmcall to vmcall */
266 +#define X86_FEATURE_XENPV ( 8*32+16) /* "" Xen paravirtual guest */
267 +
268 +
269 +/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */
270 +#define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/
271 +#define X86_FEATURE_TSC_ADJUST ( 9*32+ 1) /* TSC adjustment MSR 0x3b */
272 +#define X86_FEATURE_BMI1 ( 9*32+ 3) /* 1st group bit manipulation extensions */
273 +#define X86_FEATURE_HLE ( 9*32+ 4) /* Hardware Lock Elision */
274 +#define X86_FEATURE_AVX2 ( 9*32+ 5) /* AVX2 instructions */
275 +#define X86_FEATURE_SMEP ( 9*32+ 7) /* Supervisor Mode Execution Protection */
276 +#define X86_FEATURE_BMI2 ( 9*32+ 8) /* 2nd group bit manipulation extensions */
277 +#define X86_FEATURE_ERMS ( 9*32+ 9) /* Enhanced REP MOVSB/STOSB */
278 +#define X86_FEATURE_INVPCID ( 9*32+10) /* Invalidate Processor Context ID */
279 +#define X86_FEATURE_RTM ( 9*32+11) /* Restricted Transactional Memory */
280 +#define X86_FEATURE_CQM ( 9*32+12) /* Cache QoS Monitoring */
281 +#define X86_FEATURE_MPX ( 9*32+14) /* Memory Protection Extension */
282 +#define X86_FEATURE_AVX512F ( 9*32+16) /* AVX-512 Foundation */
283 +#define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */
284 +#define X86_FEATURE_ADX ( 9*32+19) /* The ADCX and ADOX instructions */
285 +#define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */
286 +#define X86_FEATURE_PCOMMIT ( 9*32+22) /* PCOMMIT instruction */
287 +#define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */
288 +#define X86_FEATURE_CLWB ( 9*32+24) /* CLWB instruction */
289 +#define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */
290 +#define X86_FEATURE_AVX512ER ( 9*32+27) /* AVX-512 Exponential and Reciprocal */
291 +#define X86_FEATURE_AVX512CD ( 9*32+28) /* AVX-512 Conflict Detection */
292 +#define X86_FEATURE_SHA_NI ( 9*32+29) /* SHA1/SHA256 Instruction Extensions */
293 +
294 +/* Extended state features, CPUID level 0x0000000d:1 (eax), word 10 */
295 +#define X86_FEATURE_XSAVEOPT (10*32+ 0) /* XSAVEOPT */
296 +#define X86_FEATURE_XSAVEC (10*32+ 1) /* XSAVEC */
297 +#define X86_FEATURE_XGETBV1 (10*32+ 2) /* XGETBV with ECX = 1 */
298 +#define X86_FEATURE_XSAVES (10*32+ 3) /* XSAVES/XRSTORS */
299 +
300 +/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:0 (edx), word 11 */
301 +#define X86_FEATURE_CQM_LLC (11*32+ 1) /* LLC QoS if 1 */
302 +
303 +/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (edx), word 12 */
304 +#define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring if 1 */
305 +
306 +/* AMD-defined CPU features, CPUID level 0x80000008 (ebx), word 13 */
307 +#define X86_FEATURE_CLZERO (13*32+0) /* CLZERO instruction */
308 +#define X86_FEATURE_AMD_IBPB (13*32+12) /* Indirect Branch Prediction Barrier */
309 +#define X86_FEATURE_AMD_IBRS (13*32+14) /* Indirect Branch Restricted Speculation */
310 +#define X86_FEATURE_AMD_STIBP (13*32+15) /* Single Thread Indirect Branch Predictors */
311 +#define X86_FEATURE_VIRT_SSBD (13*32+25) /* Virtualized Speculative Store Bypass Disable */
312 +
313 +/* Thermal and Power Management Leaf, CPUID level 0x00000006 (eax), word 14 */
314 +#define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */
315 +#define X86_FEATURE_IDA (14*32+ 1) /* Intel Dynamic Acceleration */
316 +#define X86_FEATURE_ARAT (14*32+ 2) /* Always Running APIC Timer */
317 +#define X86_FEATURE_PLN (14*32+ 4) /* Intel Power Limit Notification */
318 +#define X86_FEATURE_PTS (14*32+ 6) /* Intel Package Thermal Status */
319 +#define X86_FEATURE_HWP (14*32+ 7) /* Intel Hardware P-states */
320 +#define X86_FEATURE_HWP_NOTIFY (14*32+ 8) /* HWP Notification */
321 +#define X86_FEATURE_HWP_ACT_WINDOW (14*32+ 9) /* HWP Activity Window */
322 +#define X86_FEATURE_HWP_EPP (14*32+10) /* HWP Energy Perf. Preference */
323 +#define X86_FEATURE_HWP_PKG_REQ (14*32+11) /* HWP Package Level Request */
324 +
325 +/* AMD SVM Feature Identification, CPUID level 0x8000000a (edx), word 15 */
326 +#define X86_FEATURE_NPT (15*32+ 0) /* Nested Page Table support */
327 +#define X86_FEATURE_LBRV (15*32+ 1) /* LBR Virtualization support */
328 +#define X86_FEATURE_SVML (15*32+ 2) /* "svm_lock" SVM locking MSR */
329 +#define X86_FEATURE_NRIPS (15*32+ 3) /* "nrip_save" SVM next_rip save */
330 +#define X86_FEATURE_TSCRATEMSR (15*32+ 4) /* "tsc_scale" TSC scaling support */
331 +#define X86_FEATURE_VMCBCLEAN (15*32+ 5) /* "vmcb_clean" VMCB clean bits support */
332 +#define X86_FEATURE_FLUSHBYASID (15*32+ 6) /* flush-by-ASID support */
333 +#define X86_FEATURE_DECODEASSISTS (15*32+ 7) /* Decode Assists support */
334 +#define X86_FEATURE_PAUSEFILTER (15*32+10) /* filtered pause intercept */
335 +#define X86_FEATURE_PFTHRESHOLD (15*32+12) /* pause filter threshold */
336 +
337 +/* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx), word 16 */
338 +#define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */
339 +#define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */
340 +
341 +/* AMD-defined CPU features, CPUID level 0x80000007 (ebx), word 17 */
342 +#define X86_FEATURE_OVERFLOW_RECOV (17*32+0) /* MCA overflow recovery support */
343 +#define X86_FEATURE_SUCCOR (17*32+1) /* Uncorrectable error containment and recovery */
344 +#define X86_FEATURE_SMCA (17*32+3) /* Scalable MCA */
345 +
346 +
347 +/* Intel-defined CPU features, CPUID level 0x00000007:0 (EDX), word 18 */
348 +#define X86_FEATURE_AVX512_4VNNIW (18*32+ 2) /* AVX-512 Neural Network Instructions */
349 +#define X86_FEATURE_AVX512_4FMAPS (18*32+ 3) /* AVX-512 Multiply Accumulation Single precision */
350 +#define X86_FEATURE_SPEC_CTRL (18*32+26) /* "" Speculation Control (IBRS + IBPB) */
351 +#define X86_FEATURE_INTEL_STIBP (18*32+27) /* "" Single Thread Indirect Branch Predictors */
352 +#define X86_FEATURE_FLUSH_L1D (18*32+28) /* Flush L1D cache */
353 +#define X86_FEATURE_ARCH_CAPABILITIES (18*32+29) /* IA32_ARCH_CAPABILITIES MSR (Intel) */
354 +#define X86_FEATURE_SPEC_CTRL_SSBD (18*32+31) /* "" Speculative Store Bypass Disable */
355 +
356 +/*
357 + * BUG word(s)
358 + */
359 +#define X86_BUG(x) (NCAPINTS*32 + (x))
360 +
361 +#define X86_BUG_F00F X86_BUG(0) /* Intel F00F */
362 +#define X86_BUG_FDIV X86_BUG(1) /* FPU FDIV */
363 +#define X86_BUG_COMA X86_BUG(2) /* Cyrix 6x86 coma */
364 +#define X86_BUG_AMD_TLB_MMATCH X86_BUG(3) /* "tlb_mmatch" AMD Erratum 383 */
365 +#define X86_BUG_AMD_APIC_C1E X86_BUG(4) /* "apic_c1e" AMD Erratum 400 */
366 +#define X86_BUG_11AP X86_BUG(5) /* Bad local APIC aka 11AP */
367 +#define X86_BUG_FXSAVE_LEAK X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */
368 +#define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */
369 +#define X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* SYSRET doesn't fix up SS attrs */
370 +#define X86_BUG_CPU_MELTDOWN X86_BUG(14) /* CPU is affected by meltdown attack and needs kernel page table isolation */
371 +#define X86_BUG_SPECTRE_V1 X86_BUG(15) /* CPU is affected by Spectre variant 1 attack with conditional branches */
372 +#define X86_BUG_SPECTRE_V2 X86_BUG(16) /* CPU is affected by Spectre variant 2 attack with indirect branches */
373 +#define X86_BUG_SPEC_STORE_BYPASS X86_BUG(17) /* CPU is affected by speculative store bypass attack */
374 +#define X86_BUG_L1TF X86_BUG(18) /* CPU is affected by L1 Terminal Fault */
375 +
376 +#endif /* _ASM_X86_CPUFEATURES_H */
377 --- /dev/null
378 +++ b/tools/arch/x86/include/asm/disabled-features.h
379 @@ -0,0 +1,65 @@
380 +#ifndef _ASM_X86_DISABLED_FEATURES_H
381 +#define _ASM_X86_DISABLED_FEATURES_H
382 +
383 +/* These features, although they might be available in a CPU
384 + * will not be used because the compile options to support
385 + * them are not present.
386 + *
387 + * This code allows them to be checked and disabled at
388 + * compile time without an explicit #ifdef. Use
389 + * cpu_feature_enabled().
390 + */
391 +
392 +#ifdef CONFIG_X86_INTEL_MPX
393 +# define DISABLE_MPX 0
394 +#else
395 +# define DISABLE_MPX (1<<(X86_FEATURE_MPX & 31))
396 +#endif
397 +
398 +#ifdef CONFIG_X86_64
399 +# define DISABLE_VME (1<<(X86_FEATURE_VME & 31))
400 +# define DISABLE_K6_MTRR (1<<(X86_FEATURE_K6_MTRR & 31))
401 +# define DISABLE_CYRIX_ARR (1<<(X86_FEATURE_CYRIX_ARR & 31))
402 +# define DISABLE_CENTAUR_MCR (1<<(X86_FEATURE_CENTAUR_MCR & 31))
403 +# define DISABLE_PCID 0
404 +#else
405 +# define DISABLE_VME 0
406 +# define DISABLE_K6_MTRR 0
407 +# define DISABLE_CYRIX_ARR 0
408 +# define DISABLE_CENTAUR_MCR 0
409 +# define DISABLE_PCID (1<<(X86_FEATURE_PCID & 31))
410 +#endif /* CONFIG_X86_64 */
411 +
412 +#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
413 +# define DISABLE_PKU 0
414 +# define DISABLE_OSPKE 0
415 +#else
416 +# define DISABLE_PKU (1<<(X86_FEATURE_PKU & 31))
417 +# define DISABLE_OSPKE (1<<(X86_FEATURE_OSPKE & 31))
418 +#endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */
419 +
420 +/*
421 + * Make sure to add features to the correct mask
422 + */
423 +#define DISABLED_MASK0 (DISABLE_VME)
424 +#define DISABLED_MASK1 0
425 +#define DISABLED_MASK2 0
426 +#define DISABLED_MASK3 (DISABLE_CYRIX_ARR|DISABLE_CENTAUR_MCR|DISABLE_K6_MTRR)
427 +#define DISABLED_MASK4 (DISABLE_PCID)
428 +#define DISABLED_MASK5 0
429 +#define DISABLED_MASK6 0
430 +#define DISABLED_MASK7 0
431 +#define DISABLED_MASK8 0
432 +#define DISABLED_MASK9 (DISABLE_MPX)
433 +#define DISABLED_MASK10 0
434 +#define DISABLED_MASK11 0
435 +#define DISABLED_MASK12 0
436 +#define DISABLED_MASK13 0
437 +#define DISABLED_MASK14 0
438 +#define DISABLED_MASK15 0
439 +#define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE)
440 +#define DISABLED_MASK17 0
441 +#define DISABLED_MASK18 0
442 +#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 19)
443 +
444 +#endif /* _ASM_X86_DISABLED_FEATURES_H */
445 --- /dev/null
446 +++ b/tools/arch/x86/include/asm/required-features.h
447 @@ -0,0 +1,106 @@
448 +#ifndef _ASM_X86_REQUIRED_FEATURES_H
449 +#define _ASM_X86_REQUIRED_FEATURES_H
450 +
451 +/* Define minimum CPUID feature set for kernel These bits are checked
452 + really early to actually display a visible error message before the
453 + kernel dies. Make sure to assign features to the proper mask!
454 +
455 + Some requirements that are not in CPUID yet are also in the
456 + CONFIG_X86_MINIMUM_CPU_FAMILY which is checked too.
457 +
458 + The real information is in arch/x86/Kconfig.cpu, this just converts
459 + the CONFIGs into a bitmask */
460 +
461 +#ifndef CONFIG_MATH_EMULATION
462 +# define NEED_FPU (1<<(X86_FEATURE_FPU & 31))
463 +#else
464 +# define NEED_FPU 0
465 +#endif
466 +
467 +#if defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64)
468 +# define NEED_PAE (1<<(X86_FEATURE_PAE & 31))
469 +#else
470 +# define NEED_PAE 0
471 +#endif
472 +
473 +#ifdef CONFIG_X86_CMPXCHG64
474 +# define NEED_CX8 (1<<(X86_FEATURE_CX8 & 31))
475 +#else
476 +# define NEED_CX8 0
477 +#endif
478 +
479 +#if defined(CONFIG_X86_CMOV) || defined(CONFIG_X86_64)
480 +# define NEED_CMOV (1<<(X86_FEATURE_CMOV & 31))
481 +#else
482 +# define NEED_CMOV 0
483 +#endif
484 +
485 +#ifdef CONFIG_X86_USE_3DNOW
486 +# define NEED_3DNOW (1<<(X86_FEATURE_3DNOW & 31))
487 +#else
488 +# define NEED_3DNOW 0
489 +#endif
490 +
491 +#if defined(CONFIG_X86_P6_NOP) || defined(CONFIG_X86_64)
492 +# define NEED_NOPL (1<<(X86_FEATURE_NOPL & 31))
493 +#else
494 +# define NEED_NOPL 0
495 +#endif
496 +
497 +#ifdef CONFIG_MATOM
498 +# define NEED_MOVBE (1<<(X86_FEATURE_MOVBE & 31))
499 +#else
500 +# define NEED_MOVBE 0
501 +#endif
502 +
503 +#ifdef CONFIG_X86_64
504 +#ifdef CONFIG_PARAVIRT
505 +/* Paravirtualized systems may not have PSE or PGE available */
506 +#define NEED_PSE 0
507 +#define NEED_PGE 0
508 +#else
509 +#define NEED_PSE (1<<(X86_FEATURE_PSE) & 31)
510 +#define NEED_PGE (1<<(X86_FEATURE_PGE) & 31)
511 +#endif
512 +#define NEED_MSR (1<<(X86_FEATURE_MSR & 31))
513 +#define NEED_FXSR (1<<(X86_FEATURE_FXSR & 31))
514 +#define NEED_XMM (1<<(X86_FEATURE_XMM & 31))
515 +#define NEED_XMM2 (1<<(X86_FEATURE_XMM2 & 31))
516 +#define NEED_LM (1<<(X86_FEATURE_LM & 31))
517 +#else
518 +#define NEED_PSE 0
519 +#define NEED_MSR 0
520 +#define NEED_PGE 0
521 +#define NEED_FXSR 0
522 +#define NEED_XMM 0
523 +#define NEED_XMM2 0
524 +#define NEED_LM 0
525 +#endif
526 +
527 +#define REQUIRED_MASK0 (NEED_FPU|NEED_PSE|NEED_MSR|NEED_PAE|\
528 + NEED_CX8|NEED_PGE|NEED_FXSR|NEED_CMOV|\
529 + NEED_XMM|NEED_XMM2)
530 +#define SSE_MASK (NEED_XMM|NEED_XMM2)
531 +
532 +#define REQUIRED_MASK1 (NEED_LM|NEED_3DNOW)
533 +
534 +#define REQUIRED_MASK2 0
535 +#define REQUIRED_MASK3 (NEED_NOPL)
536 +#define REQUIRED_MASK4 (NEED_MOVBE)
537 +#define REQUIRED_MASK5 0
538 +#define REQUIRED_MASK6 0
539 +#define REQUIRED_MASK7 0
540 +#define REQUIRED_MASK8 0
541 +#define REQUIRED_MASK9 0
542 +#define REQUIRED_MASK10 0
543 +#define REQUIRED_MASK11 0
544 +#define REQUIRED_MASK12 0
545 +#define REQUIRED_MASK13 0
546 +#define REQUIRED_MASK14 0
547 +#define REQUIRED_MASK15 0
548 +#define REQUIRED_MASK16 0
549 +#define REQUIRED_MASK17 0
550 +#define REQUIRED_MASK18 0
551 +#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 19)
552 +
553 +#endif /* _ASM_X86_REQUIRED_FEATURES_H */
554 --- /dev/null
555 +++ b/tools/arch/x86/lib/memcpy_64.S
556 @@ -0,0 +1,179 @@
557 +/* Copyright 2002 Andi Kleen */
558 +
559 +#include <linux/linkage.h>
560 +#include <asm/cpufeatures.h>
561 +#include <asm/alternative-asm.h>
562 +
563 +/*
564 + * We build a jump to memcpy_orig by default which gets NOPped out on
565 + * the majority of x86 CPUs which set REP_GOOD. In addition, CPUs which
566 + * have the enhanced REP MOVSB/STOSB feature (ERMS), change those NOPs
567 + * to a jmp to memcpy_erms which does the REP; MOVSB mem copy.
568 + */
569 +
570 +.weak memcpy
571 +
572 +/*
573 + * memcpy - Copy a memory block.
574 + *
575 + * Input:
576 + * rdi destination
577 + * rsi source
578 + * rdx count
579 + *
580 + * Output:
581 + * rax original destination
582 + */
583 +ENTRY(__memcpy)
584 +ENTRY(memcpy)
585 + ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
586 + "jmp memcpy_erms", X86_FEATURE_ERMS
587 +
588 + movq %rdi, %rax
589 + movq %rdx, %rcx
590 + shrq $3, %rcx
591 + andl $7, %edx
592 + rep movsq
593 + movl %edx, %ecx
594 + rep movsb
595 + ret
596 +ENDPROC(memcpy)
597 +ENDPROC(__memcpy)
598 +
599 +/*
600 + * memcpy_erms() - enhanced fast string memcpy. This is faster and
601 + * simpler than memcpy. Use memcpy_erms when possible.
602 + */
603 +ENTRY(memcpy_erms)
604 + movq %rdi, %rax
605 + movq %rdx, %rcx
606 + rep movsb
607 + ret
608 +ENDPROC(memcpy_erms)
609 +
610 +ENTRY(memcpy_orig)
611 + movq %rdi, %rax
612 +
613 + cmpq $0x20, %rdx
614 + jb .Lhandle_tail
615 +
616 + /*
617 + * We check whether memory false dependence could occur,
618 + * then jump to corresponding copy mode.
619 + */
620 + cmp %dil, %sil
621 + jl .Lcopy_backward
622 + subq $0x20, %rdx
623 +.Lcopy_forward_loop:
624 + subq $0x20, %rdx
625 +
626 + /*
627 + * Move in blocks of 4x8 bytes:
628 + */
629 + movq 0*8(%rsi), %r8
630 + movq 1*8(%rsi), %r9
631 + movq 2*8(%rsi), %r10
632 + movq 3*8(%rsi), %r11
633 + leaq 4*8(%rsi), %rsi
634 +
635 + movq %r8, 0*8(%rdi)
636 + movq %r9, 1*8(%rdi)
637 + movq %r10, 2*8(%rdi)
638 + movq %r11, 3*8(%rdi)
639 + leaq 4*8(%rdi), %rdi
640 + jae .Lcopy_forward_loop
641 + addl $0x20, %edx
642 + jmp .Lhandle_tail
643 +
644 +.Lcopy_backward:
645 + /*
646 + * Calculate copy position to tail.
647 + */
648 + addq %rdx, %rsi
649 + addq %rdx, %rdi
650 + subq $0x20, %rdx
651 + /*
652 + * At most 3 ALU operations in one cycle,
653 + * so append NOPS in the same 16 bytes trunk.
654 + */
655 + .p2align 4
656 +.Lcopy_backward_loop:
657 + subq $0x20, %rdx
658 + movq -1*8(%rsi), %r8
659 + movq -2*8(%rsi), %r9
660 + movq -3*8(%rsi), %r10
661 + movq -4*8(%rsi), %r11
662 + leaq -4*8(%rsi), %rsi
663 + movq %r8, -1*8(%rdi)
664 + movq %r9, -2*8(%rdi)
665 + movq %r10, -3*8(%rdi)
666 + movq %r11, -4*8(%rdi)
667 + leaq -4*8(%rdi), %rdi
668 + jae .Lcopy_backward_loop
669 +
670 + /*
671 + * Calculate copy position to head.
672 + */
673 + addl $0x20, %edx
674 + subq %rdx, %rsi
675 + subq %rdx, %rdi
676 +.Lhandle_tail:
677 + cmpl $16, %edx
678 + jb .Lless_16bytes
679 +
680 + /*
681 + * Move data from 16 bytes to 31 bytes.
682 + */
683 + movq 0*8(%rsi), %r8
684 + movq 1*8(%rsi), %r9
685 + movq -2*8(%rsi, %rdx), %r10
686 + movq -1*8(%rsi, %rdx), %r11
687 + movq %r8, 0*8(%rdi)
688 + movq %r9, 1*8(%rdi)
689 + movq %r10, -2*8(%rdi, %rdx)
690 + movq %r11, -1*8(%rdi, %rdx)
691 + retq
692 + .p2align 4
693 +.Lless_16bytes:
694 + cmpl $8, %edx
695 + jb .Lless_8bytes
696 + /*
697 + * Move data from 8 bytes to 15 bytes.
698 + */
699 + movq 0*8(%rsi), %r8
700 + movq -1*8(%rsi, %rdx), %r9
701 + movq %r8, 0*8(%rdi)
702 + movq %r9, -1*8(%rdi, %rdx)
703 + retq
704 + .p2align 4
705 +.Lless_8bytes:
706 + cmpl $4, %edx
707 + jb .Lless_3bytes
708 +
709 + /*
710 + * Move data from 4 bytes to 7 bytes.
711 + */
712 + movl (%rsi), %ecx
713 + movl -4(%rsi, %rdx), %r8d
714 + movl %ecx, (%rdi)
715 + movl %r8d, -4(%rdi, %rdx)
716 + retq
717 + .p2align 4
718 +.Lless_3bytes:
719 + subl $1, %edx
720 + jb .Lend
721 + /*
722 + * Move data from 1 bytes to 3 bytes.
723 + */
724 + movzbl (%rsi), %ecx
725 + jz .Lstore_1byte
726 + movzbq 1(%rsi), %r8
727 + movzbq (%rsi, %rdx), %r9
728 + movb %r8b, 1(%rdi)
729 + movb %r9b, (%rdi, %rdx)
730 +.Lstore_1byte:
731 + movb %cl, (%rdi)
732 +
733 +.Lend:
734 + retq
735 +ENDPROC(memcpy_orig)
736 --- /dev/null
737 +++ b/tools/arch/x86/lib/memset_64.S
738 @@ -0,0 +1,138 @@
739 +/* Copyright 2002 Andi Kleen, SuSE Labs */
740 +
741 +#include <linux/linkage.h>
742 +#include <asm/cpufeatures.h>
743 +#include <asm/alternative-asm.h>
744 +
745 +.weak memset
746 +
747 +/*
748 + * ISO C memset - set a memory block to a byte value. This function uses fast
749 + * string to get better performance than the original function. The code is
750 + * simpler and shorter than the original function as well.
751 + *
752 + * rdi destination
753 + * rsi value (char)
754 + * rdx count (bytes)
755 + *
756 + * rax original destination
757 + */
758 +ENTRY(memset)
759 +ENTRY(__memset)
760 + /*
761 + * Some CPUs support enhanced REP MOVSB/STOSB feature. It is recommended
762 + * to use it when possible. If not available, use fast string instructions.
763 + *
764 + * Otherwise, use original memset function.
765 + */
766 + ALTERNATIVE_2 "jmp memset_orig", "", X86_FEATURE_REP_GOOD, \
767 + "jmp memset_erms", X86_FEATURE_ERMS
768 +
769 + movq %rdi,%r9
770 + movq %rdx,%rcx
771 + andl $7,%edx
772 + shrq $3,%rcx
773 + /* expand byte value */
774 + movzbl %sil,%esi
775 + movabs $0x0101010101010101,%rax
776 + imulq %rsi,%rax
777 + rep stosq
778 + movl %edx,%ecx
779 + rep stosb
780 + movq %r9,%rax
781 + ret
782 +ENDPROC(memset)
783 +ENDPROC(__memset)
784 +
785 +/*
786 + * ISO C memset - set a memory block to a byte value. This function uses
787 + * enhanced rep stosb to override the fast string function.
788 + * The code is simpler and shorter than the fast string function as well.
789 + *
790 + * rdi destination
791 + * rsi value (char)
792 + * rdx count (bytes)
793 + *
794 + * rax original destination
795 + */
796 +ENTRY(memset_erms)
797 + movq %rdi,%r9
798 + movb %sil,%al
799 + movq %rdx,%rcx
800 + rep stosb
801 + movq %r9,%rax
802 + ret
803 +ENDPROC(memset_erms)
804 +
805 +ENTRY(memset_orig)
806 + movq %rdi,%r10
807 +
808 + /* expand byte value */
809 + movzbl %sil,%ecx
810 + movabs $0x0101010101010101,%rax
811 + imulq %rcx,%rax
812 +
813 + /* align dst */
814 + movl %edi,%r9d
815 + andl $7,%r9d
816 + jnz .Lbad_alignment
817 +.Lafter_bad_alignment:
818 +
819 + movq %rdx,%rcx
820 + shrq $6,%rcx
821 + jz .Lhandle_tail
822 +
823 + .p2align 4
824 +.Lloop_64:
825 + decq %rcx
826 + movq %rax,(%rdi)
827 + movq %rax,8(%rdi)
828 + movq %rax,16(%rdi)
829 + movq %rax,24(%rdi)
830 + movq %rax,32(%rdi)
831 + movq %rax,40(%rdi)
832 + movq %rax,48(%rdi)
833 + movq %rax,56(%rdi)
834 + leaq 64(%rdi),%rdi
835 + jnz .Lloop_64
836 +
837 + /* Handle tail in loops. The loops should be faster than hard
838 + to predict jump tables. */
839 + .p2align 4
840 +.Lhandle_tail:
841 + movl %edx,%ecx
842 + andl $63&(~7),%ecx
843 + jz .Lhandle_7
844 + shrl $3,%ecx
845 + .p2align 4
846 +.Lloop_8:
847 + decl %ecx
848 + movq %rax,(%rdi)
849 + leaq 8(%rdi),%rdi
850 + jnz .Lloop_8
851 +
852 +.Lhandle_7:
853 + andl $7,%edx
854 + jz .Lende
855 + .p2align 4
856 +.Lloop_1:
857 + decl %edx
858 + movb %al,(%rdi)
859 + leaq 1(%rdi),%rdi
860 + jnz .Lloop_1
861 +
862 +.Lende:
863 + movq %r10,%rax
864 + ret
865 +
866 +.Lbad_alignment:
867 + cmpq $7,%rdx
868 + jbe .Lhandle_7
869 + movq %rax,(%rdi) /* unaligned store */
870 + movq $8,%r8
871 + subq %r9,%r8
872 + addq %r8,%rdi
873 + subq %r8,%rdx
874 + jmp .Lafter_bad_alignment
875 +.Lfinal:
876 +ENDPROC(memset_orig)
877 --- /dev/null
878 +++ b/tools/include/asm/alternative-asm.h
879 @@ -0,0 +1,9 @@
880 +#ifndef _TOOLS_ASM_ALTERNATIVE_ASM_H
881 +#define _TOOLS_ASM_ALTERNATIVE_ASM_H
882 +
883 +/* Just disable it so we can build arch/x86/lib/memcpy_64.S for perf bench: */
884 +
885 +#define altinstruction_entry #
886 +#define ALTERNATIVE_2 #
887 +
888 +#endif
889 --- a/tools/perf/MANIFEST
890 +++ b/tools/perf/MANIFEST
891 @@ -11,6 +11,11 @@ tools/arch/sparc/include/asm/barrier_32.
892 tools/arch/sparc/include/asm/barrier_64.h
893 tools/arch/tile/include/asm/barrier.h
894 tools/arch/x86/include/asm/barrier.h
895 +tools/arch/x86/include/asm/cpufeatures.h
896 +tools/arch/x86/include/asm/disabled-features.h
897 +tools/arch/x86/include/asm/required-features.h
898 +tools/arch/x86/lib/memcpy_64.S
899 +tools/arch/x86/lib/memset_64.S
900 tools/arch/xtensa/include/asm/barrier.h
901 tools/scripts
902 tools/build
903 @@ -25,6 +30,7 @@ tools/lib/rbtree.c
904 tools/lib/symbol/kallsyms.c
905 tools/lib/symbol/kallsyms.h
906 tools/lib/util/find_next_bit.c
907 +tools/include/asm/alternative-asm.h
908 tools/include/asm/atomic.h
909 tools/include/asm/barrier.h
910 tools/include/asm/bug.h
911 @@ -65,8 +71,6 @@ include/linux/swab.h
912 arch/*/include/asm/unistd*.h
913 arch/*/include/uapi/asm/unistd*.h
914 arch/*/include/uapi/asm/perf_regs.h
915 -arch/*/lib/memcpy*.S
916 -arch/*/lib/memset*.S
917 include/linux/poison.h
918 include/linux/hw_breakpoint.h
919 include/uapi/linux/perf_event.h
920 --- a/tools/perf/Makefile.perf
921 +++ b/tools/perf/Makefile.perf
922 @@ -310,6 +310,21 @@ export srctree OUTPUT RM CC LD AR CFLAGS
923 include $(srctree)/tools/build/Makefile.include
924
925 $(PERF_IN): prepare FORCE
926 + @(test -f ../../arch/x86/include/asm/disabled-features.h && ( \
927 + (diff -B ../arch/x86/include/asm/disabled-features.h ../../arch/x86/include/asm/disabled-features.h >/dev/null) \
928 + || echo "Warning: tools/arch/x86/include/asm/disabled-features.h differs from kernel" >&2 )) || true
929 + @(test -f ../../arch/x86/include/asm/required-features.h && ( \
930 + (diff -B ../arch/x86/include/asm/required-features.h ../../arch/x86/include/asm/required-features.h >/dev/null) \
931 + || echo "Warning: tools/arch/x86/include/asm/required-features.h differs from kernel" >&2 )) || true
932 + @(test -f ../../arch/x86/include/asm/cpufeatures.h && ( \
933 + (diff -B ../arch/x86/include/asm/cpufeatures.h ../../arch/x86/include/asm/cpufeatures.h >/dev/null) \
934 + || echo "Warning: tools/arch/x86/include/asm/cpufeatures.h differs from kernel" >&2 )) || true
935 + @(test -f ../../arch/x86/lib/memcpy_64.S && ( \
936 + (diff -B ../arch/x86/lib/memcpy_64.S ../../arch/x86/lib/memcpy_64.S >/dev/null) \
937 + || echo "Warning: tools/arch/x86/lib/memcpy_64.S differs from kernel" >&2 )) || true
938 + @(test -f ../../arch/x86/lib/memset_64.S && ( \
939 + (diff -B ../arch/x86/lib/memset_64.S ../../arch/x86/lib/memset_64.S >/dev/null) \
940 + || echo "Warning: tools/arch/x86/lib/memset_64.S differs from kernel" >&2 )) || true
941 $(Q)$(MAKE) $(build)=perf
942
943 $(OUTPUT)perf: $(PERFLIBS) $(PERF_IN) $(LIBTRACEEVENT_DYNAMIC_LIST)
944 --- a/tools/perf/bench/mem-memcpy-x86-64-asm.S
945 +++ b/tools/perf/bench/mem-memcpy-x86-64-asm.S
946 @@ -1,7 +1,7 @@
947 #define memcpy MEMCPY /* don't hide glibc's memcpy() */
948 #define altinstr_replacement text
949 #define globl p2align 4; .globl
950 -#include "../../../arch/x86/lib/memcpy_64.S"
951 +#include "../../arch/x86/lib/memcpy_64.S"
952 /*
953 * We need to provide note.GNU-stack section, saying that we want
954 * NOT executable stack. Otherwise the final linking will assume that
955 --- a/tools/perf/bench/mem-memset-x86-64-asm.S
956 +++ b/tools/perf/bench/mem-memset-x86-64-asm.S
957 @@ -1,7 +1,7 @@
958 #define memset MEMSET /* don't hide glibc's memset() */
959 #define altinstr_replacement text
960 #define globl p2align 4; .globl
961 -#include "../../../arch/x86/lib/memset_64.S"
962 +#include "../../arch/x86/lib/memset_64.S"
963
964 /*
965 * We need to provide note.GNU-stack section, saying that we want
966 --- a/tools/perf/util/include/asm/alternative-asm.h
967 +++ /dev/null
968 @@ -1,9 +0,0 @@
969 -#ifndef _PERF_ASM_ALTERNATIVE_ASM_H
970 -#define _PERF_ASM_ALTERNATIVE_ASM_H
971 -
972 -/* Just disable it so we can build arch/x86/lib/memcpy_64.S for perf bench: */
973 -
974 -#define altinstruction_entry #
975 -#define ALTERNATIVE_2 #
976 -
977 -#endif
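
With the copies in place, the mem{cpy,set} benchmarks build entirely from tools/ and can be exercised as usual; for example (an illustrative invocation; available routines and output format depend on the perf version):

    # Build perf, then run the memcpy/memset benchmarks that use the copied
    # x86_64 assembly routines alongside the glibc ones.
    $ perf bench mem memcpy
    $ perf bench mem memset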