]>
Commit | Line | Data |
---|---|---|
82b46271 GKH |
1 | From 7d7d1bf1d1dabe435ef50efb051724b8664749cb Mon Sep 17 00:00:00 2001 |
2 | From: Arnaldo Carvalho de Melo <acme@redhat.com> | |
3 | Date: Mon, 11 Jul 2016 12:36:41 -0300 | |
4 | Subject: perf bench: Copy kernel files needed to build mem{cpy,set} x86_64 benchmarks | |
5 | ||
6 | From: Arnaldo Carvalho de Melo <acme@redhat.com> | |
7 | ||
8 | commit 7d7d1bf1d1dabe435ef50efb051724b8664749cb upstream. | |
9 | ||
10 | We can't access kernel files directly from tools/, so copy the required | |
11 | bits, and make sure that we detect when the original files, in the | |
12 | kernel, get modified. | |
13 | ||
14 | Cc: Adrian Hunter <adrian.hunter@intel.com> | |
15 | Cc: David Ahern <dsahern@gmail.com> | |
16 | Cc: Jiri Olsa <jolsa@kernel.org> | |
17 | Cc: Namhyung Kim <namhyung@kernel.org> | |
18 | Cc: Wang Nan <wangnan0@huawei.com> | |
19 | Link: http://lkml.kernel.org/n/tip-z7e76274ch5j4nugv048qacb@git.kernel.org | |
20 | Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> | |
21 | Signed-off-by: Daniel Díaz <daniel.diaz@linaro.org> | |
22 | Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> | |
23 | ||
24 | --- | |
25 | tools/arch/x86/include/asm/cpufeatures.h | 336 +++++++++++++++++++++++++ | |
26 | tools/arch/x86/include/asm/disabled-features.h | 65 ++++ | |
27 | tools/arch/x86/include/asm/required-features.h | 106 +++++++ | |
28 | tools/arch/x86/lib/memcpy_64.S | 179 +++++++++++++ | |
29 | tools/arch/x86/lib/memset_64.S | 138 ++++++++++ | |
30 | tools/include/asm/alternative-asm.h | 9 | |
31 | tools/perf/MANIFEST | 8 | |
32 | tools/perf/Makefile.perf | 15 + | |
33 | tools/perf/bench/mem-memcpy-x86-64-asm.S | 2 | |
34 | tools/perf/bench/mem-memset-x86-64-asm.S | 2 | |
35 | tools/perf/util/include/asm/alternative-asm.h | 9 | |
36 | 11 files changed, 856 insertions(+), 13 deletions(-) | |
37 | ||
38 | --- /dev/null | |
39 | +++ b/tools/arch/x86/include/asm/cpufeatures.h | |
40 | @@ -0,0 +1,336 @@ | |
41 | +#ifndef _ASM_X86_CPUFEATURES_H | |
42 | +#define _ASM_X86_CPUFEATURES_H | |
43 | + | |
44 | +#ifndef _ASM_X86_REQUIRED_FEATURES_H | |
45 | +#include <asm/required-features.h> | |
46 | +#endif | |
47 | + | |
48 | +#ifndef _ASM_X86_DISABLED_FEATURES_H | |
49 | +#include <asm/disabled-features.h> | |
50 | +#endif | |
51 | + | |
52 | +/* | |
53 | + * Defines x86 CPU feature bits | |
54 | + */ | |
55 | +#define NCAPINTS 19 /* N 32-bit words worth of info */ | |
56 | +#define NBUGINTS 1 /* N 32-bit bug flags */ | |
57 | + | |
58 | +/* | |
59 | + * Note: If the comment begins with a quoted string, that string is used | |
60 | + * in /proc/cpuinfo instead of the macro name. If the string is "", | |
61 | + * this feature bit is not displayed in /proc/cpuinfo at all. | |
62 | + */ | |
63 | + | |
64 | +/* Intel-defined CPU features, CPUID level 0x00000001 (edx), word 0 */ | |
65 | +#define X86_FEATURE_FPU ( 0*32+ 0) /* Onboard FPU */ | |
66 | +#define X86_FEATURE_VME ( 0*32+ 1) /* Virtual Mode Extensions */ | |
67 | +#define X86_FEATURE_DE ( 0*32+ 2) /* Debugging Extensions */ | |
68 | +#define X86_FEATURE_PSE ( 0*32+ 3) /* Page Size Extensions */ | |
69 | +#define X86_FEATURE_TSC ( 0*32+ 4) /* Time Stamp Counter */ | |
70 | +#define X86_FEATURE_MSR ( 0*32+ 5) /* Model-Specific Registers */ | |
71 | +#define X86_FEATURE_PAE ( 0*32+ 6) /* Physical Address Extensions */ | |
72 | +#define X86_FEATURE_MCE ( 0*32+ 7) /* Machine Check Exception */ | |
73 | +#define X86_FEATURE_CX8 ( 0*32+ 8) /* CMPXCHG8 instruction */ | |
74 | +#define X86_FEATURE_APIC ( 0*32+ 9) /* Onboard APIC */ | |
75 | +#define X86_FEATURE_SEP ( 0*32+11) /* SYSENTER/SYSEXIT */ | |
76 | +#define X86_FEATURE_MTRR ( 0*32+12) /* Memory Type Range Registers */ | |
77 | +#define X86_FEATURE_PGE ( 0*32+13) /* Page Global Enable */ | |
78 | +#define X86_FEATURE_MCA ( 0*32+14) /* Machine Check Architecture */ | |
79 | +#define X86_FEATURE_CMOV ( 0*32+15) /* CMOV instructions */ | |
80 | + /* (plus FCMOVcc, FCOMI with FPU) */ | |
81 | +#define X86_FEATURE_PAT ( 0*32+16) /* Page Attribute Table */ | |
82 | +#define X86_FEATURE_PSE36 ( 0*32+17) /* 36-bit PSEs */ | |
83 | +#define X86_FEATURE_PN ( 0*32+18) /* Processor serial number */ | |
84 | +#define X86_FEATURE_CLFLUSH ( 0*32+19) /* CLFLUSH instruction */ | |
85 | +#define X86_FEATURE_DS ( 0*32+21) /* "dts" Debug Store */ | |
86 | +#define X86_FEATURE_ACPI ( 0*32+22) /* ACPI via MSR */ | |
87 | +#define X86_FEATURE_MMX ( 0*32+23) /* Multimedia Extensions */ | |
88 | +#define X86_FEATURE_FXSR ( 0*32+24) /* FXSAVE/FXRSTOR, CR4.OSFXSR */ | |
89 | +#define X86_FEATURE_XMM ( 0*32+25) /* "sse" */ | |
90 | +#define X86_FEATURE_XMM2 ( 0*32+26) /* "sse2" */ | |
91 | +#define X86_FEATURE_SELFSNOOP ( 0*32+27) /* "ss" CPU self snoop */ | |
92 | +#define X86_FEATURE_HT ( 0*32+28) /* Hyper-Threading */ | |
93 | +#define X86_FEATURE_ACC ( 0*32+29) /* "tm" Automatic clock control */ | |
94 | +#define X86_FEATURE_IA64 ( 0*32+30) /* IA-64 processor */ | |
95 | +#define X86_FEATURE_PBE ( 0*32+31) /* Pending Break Enable */ | |
96 | + | |
97 | +/* AMD-defined CPU features, CPUID level 0x80000001, word 1 */ | |
98 | +/* Don't duplicate feature flags which are redundant with Intel! */ | |
99 | +#define X86_FEATURE_SYSCALL ( 1*32+11) /* SYSCALL/SYSRET */ | |
100 | +#define X86_FEATURE_MP ( 1*32+19) /* MP Capable. */ | |
101 | +#define X86_FEATURE_NX ( 1*32+20) /* Execute Disable */ | |
102 | +#define X86_FEATURE_MMXEXT ( 1*32+22) /* AMD MMX extensions */ | |
103 | +#define X86_FEATURE_FXSR_OPT ( 1*32+25) /* FXSAVE/FXRSTOR optimizations */ | |
104 | +#define X86_FEATURE_GBPAGES ( 1*32+26) /* "pdpe1gb" GB pages */ | |
105 | +#define X86_FEATURE_RDTSCP ( 1*32+27) /* RDTSCP */ | |
106 | +#define X86_FEATURE_LM ( 1*32+29) /* Long Mode (x86-64) */ | |
107 | +#define X86_FEATURE_3DNOWEXT ( 1*32+30) /* AMD 3DNow! extensions */ | |
108 | +#define X86_FEATURE_3DNOW ( 1*32+31) /* 3DNow! */ | |
109 | + | |
110 | +/* Transmeta-defined CPU features, CPUID level 0x80860001, word 2 */ | |
111 | +#define X86_FEATURE_RECOVERY ( 2*32+ 0) /* CPU in recovery mode */ | |
112 | +#define X86_FEATURE_LONGRUN ( 2*32+ 1) /* Longrun power control */ | |
113 | +#define X86_FEATURE_LRTI ( 2*32+ 3) /* LongRun table interface */ | |
114 | + | |
115 | +/* Other features, Linux-defined mapping, word 3 */ | |
116 | +/* This range is used for feature bits which conflict or are synthesized */ | |
117 | +#define X86_FEATURE_CXMMX ( 3*32+ 0) /* Cyrix MMX extensions */ | |
118 | +#define X86_FEATURE_K6_MTRR ( 3*32+ 1) /* AMD K6 nonstandard MTRRs */ | |
119 | +#define X86_FEATURE_CYRIX_ARR ( 3*32+ 2) /* Cyrix ARRs (= MTRRs) */ | |
120 | +#define X86_FEATURE_CENTAUR_MCR ( 3*32+ 3) /* Centaur MCRs (= MTRRs) */ | |
121 | +/* cpu types for specific tunings: */ | |
122 | +#define X86_FEATURE_K8 ( 3*32+ 4) /* "" Opteron, Athlon64 */ | |
123 | +#define X86_FEATURE_K7 ( 3*32+ 5) /* "" Athlon */ | |
124 | +#define X86_FEATURE_P3 ( 3*32+ 6) /* "" P3 */ | |
125 | +#define X86_FEATURE_P4 ( 3*32+ 7) /* "" P4 */ | |
126 | +#define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* TSC ticks at a constant rate */ | |
127 | +#define X86_FEATURE_UP ( 3*32+ 9) /* smp kernel running on up */ | |
128 | +/* free, was #define X86_FEATURE_FXSAVE_LEAK ( 3*32+10) * "" FXSAVE leaks FOP/FIP/FOP */ | |
129 | +#define X86_FEATURE_ARCH_PERFMON ( 3*32+11) /* Intel Architectural PerfMon */ | |
130 | +#define X86_FEATURE_PEBS ( 3*32+12) /* Precise-Event Based Sampling */ | |
131 | +#define X86_FEATURE_BTS ( 3*32+13) /* Branch Trace Store */ | |
132 | +#define X86_FEATURE_SYSCALL32 ( 3*32+14) /* "" syscall in ia32 userspace */ | |
133 | +#define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in ia32 userspace */ | |
134 | +#define X86_FEATURE_REP_GOOD ( 3*32+16) /* rep microcode works well */ | |
135 | +#define X86_FEATURE_MFENCE_RDTSC ( 3*32+17) /* "" Mfence synchronizes RDTSC */ | |
136 | +#define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" Lfence synchronizes RDTSC */ | |
137 | +/* free, was #define X86_FEATURE_11AP ( 3*32+19) * "" Bad local APIC aka 11AP */ | |
138 | +#define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */ | |
139 | +#define X86_FEATURE_ALWAYS ( 3*32+21) /* "" Always-present feature */ | |
140 | +#define X86_FEATURE_XTOPOLOGY ( 3*32+22) /* cpu topology enum extensions */ | |
141 | +#define X86_FEATURE_TSC_RELIABLE ( 3*32+23) /* TSC is known to be reliable */ | |
142 | +#define X86_FEATURE_NONSTOP_TSC ( 3*32+24) /* TSC does not stop in C states */ | |
143 | +/* free, was #define X86_FEATURE_CLFLUSH_MONITOR ( 3*32+25) * "" clflush reqd with monitor */ | |
144 | +#define X86_FEATURE_EXTD_APICID ( 3*32+26) /* has extended APICID (8 bits) */ | |
145 | +#define X86_FEATURE_AMD_DCM ( 3*32+27) /* multi-node processor */ | |
146 | +#define X86_FEATURE_APERFMPERF ( 3*32+28) /* APERFMPERF */ | |
147 | +#define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */ | |
148 | + | |
149 | +/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ | |
150 | +#define X86_FEATURE_XMM3 ( 4*32+ 0) /* "pni" SSE-3 */ | |
151 | +#define X86_FEATURE_PCLMULQDQ ( 4*32+ 1) /* PCLMULQDQ instruction */ | |
152 | +#define X86_FEATURE_DTES64 ( 4*32+ 2) /* 64-bit Debug Store */ | |
153 | +#define X86_FEATURE_MWAIT ( 4*32+ 3) /* "monitor" Monitor/Mwait support */ | |
154 | +#define X86_FEATURE_DSCPL ( 4*32+ 4) /* "ds_cpl" CPL Qual. Debug Store */ | |
155 | +#define X86_FEATURE_VMX ( 4*32+ 5) /* Hardware virtualization */ | |
156 | +#define X86_FEATURE_SMX ( 4*32+ 6) /* Safer mode */ | |
157 | +#define X86_FEATURE_EST ( 4*32+ 7) /* Enhanced SpeedStep */ | |
158 | +#define X86_FEATURE_TM2 ( 4*32+ 8) /* Thermal Monitor 2 */ | |
159 | +#define X86_FEATURE_SSSE3 ( 4*32+ 9) /* Supplemental SSE-3 */ | |
160 | +#define X86_FEATURE_CID ( 4*32+10) /* Context ID */ | |
161 | +#define X86_FEATURE_SDBG ( 4*32+11) /* Silicon Debug */ | |
162 | +#define X86_FEATURE_FMA ( 4*32+12) /* Fused multiply-add */ | |
163 | +#define X86_FEATURE_CX16 ( 4*32+13) /* CMPXCHG16B */ | |
164 | +#define X86_FEATURE_XTPR ( 4*32+14) /* Send Task Priority Messages */ | |
165 | +#define X86_FEATURE_PDCM ( 4*32+15) /* Performance Capabilities */ | |
166 | +#define X86_FEATURE_PCID ( 4*32+17) /* Process Context Identifiers */ | |
167 | +#define X86_FEATURE_DCA ( 4*32+18) /* Direct Cache Access */ | |
168 | +#define X86_FEATURE_XMM4_1 ( 4*32+19) /* "sse4_1" SSE-4.1 */ | |
169 | +#define X86_FEATURE_XMM4_2 ( 4*32+20) /* "sse4_2" SSE-4.2 */ | |
170 | +#define X86_FEATURE_X2APIC ( 4*32+21) /* x2APIC */ | |
171 | +#define X86_FEATURE_MOVBE ( 4*32+22) /* MOVBE instruction */ | |
172 | +#define X86_FEATURE_POPCNT ( 4*32+23) /* POPCNT instruction */ | |
173 | +#define X86_FEATURE_TSC_DEADLINE_TIMER ( 4*32+24) /* Tsc deadline timer */ | |
174 | +#define X86_FEATURE_AES ( 4*32+25) /* AES instructions */ | |
175 | +#define X86_FEATURE_XSAVE ( 4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */ | |
176 | +#define X86_FEATURE_OSXSAVE ( 4*32+27) /* "" XSAVE enabled in the OS */ | |
177 | +#define X86_FEATURE_AVX ( 4*32+28) /* Advanced Vector Extensions */ | |
178 | +#define X86_FEATURE_F16C ( 4*32+29) /* 16-bit fp conversions */ | |
179 | +#define X86_FEATURE_RDRAND ( 4*32+30) /* The RDRAND instruction */ | |
180 | +#define X86_FEATURE_HYPERVISOR ( 4*32+31) /* Running on a hypervisor */ | |
181 | + | |
182 | +/* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */ | |
183 | +#define X86_FEATURE_XSTORE ( 5*32+ 2) /* "rng" RNG present (xstore) */ | |
184 | +#define X86_FEATURE_XSTORE_EN ( 5*32+ 3) /* "rng_en" RNG enabled */ | |
185 | +#define X86_FEATURE_XCRYPT ( 5*32+ 6) /* "ace" on-CPU crypto (xcrypt) */ | |
186 | +#define X86_FEATURE_XCRYPT_EN ( 5*32+ 7) /* "ace_en" on-CPU crypto enabled */ | |
187 | +#define X86_FEATURE_ACE2 ( 5*32+ 8) /* Advanced Cryptography Engine v2 */ | |
188 | +#define X86_FEATURE_ACE2_EN ( 5*32+ 9) /* ACE v2 enabled */ | |
189 | +#define X86_FEATURE_PHE ( 5*32+10) /* PadLock Hash Engine */ | |
190 | +#define X86_FEATURE_PHE_EN ( 5*32+11) /* PHE enabled */ | |
191 | +#define X86_FEATURE_PMM ( 5*32+12) /* PadLock Montgomery Multiplier */ | |
192 | +#define X86_FEATURE_PMM_EN ( 5*32+13) /* PMM enabled */ | |
193 | + | |
194 | +/* More extended AMD flags: CPUID level 0x80000001, ecx, word 6 */ | |
195 | +#define X86_FEATURE_LAHF_LM ( 6*32+ 0) /* LAHF/SAHF in long mode */ | |
196 | +#define X86_FEATURE_CMP_LEGACY ( 6*32+ 1) /* If yes HyperThreading not valid */ | |
197 | +#define X86_FEATURE_SVM ( 6*32+ 2) /* Secure virtual machine */ | |
198 | +#define X86_FEATURE_EXTAPIC ( 6*32+ 3) /* Extended APIC space */ | |
199 | +#define X86_FEATURE_CR8_LEGACY ( 6*32+ 4) /* CR8 in 32-bit mode */ | |
200 | +#define X86_FEATURE_ABM ( 6*32+ 5) /* Advanced bit manipulation */ | |
201 | +#define X86_FEATURE_SSE4A ( 6*32+ 6) /* SSE-4A */ | |
202 | +#define X86_FEATURE_MISALIGNSSE ( 6*32+ 7) /* Misaligned SSE mode */ | |
203 | +#define X86_FEATURE_3DNOWPREFETCH ( 6*32+ 8) /* 3DNow prefetch instructions */ | |
204 | +#define X86_FEATURE_OSVW ( 6*32+ 9) /* OS Visible Workaround */ | |
205 | +#define X86_FEATURE_IBS ( 6*32+10) /* Instruction Based Sampling */ | |
206 | +#define X86_FEATURE_XOP ( 6*32+11) /* extended AVX instructions */ | |
207 | +#define X86_FEATURE_SKINIT ( 6*32+12) /* SKINIT/STGI instructions */ | |
208 | +#define X86_FEATURE_WDT ( 6*32+13) /* Watchdog timer */ | |
209 | +#define X86_FEATURE_LWP ( 6*32+15) /* Light Weight Profiling */ | |
210 | +#define X86_FEATURE_FMA4 ( 6*32+16) /* 4 operands MAC instructions */ | |
211 | +#define X86_FEATURE_TCE ( 6*32+17) /* translation cache extension */ | |
212 | +#define X86_FEATURE_NODEID_MSR ( 6*32+19) /* NodeId MSR */ | |
213 | +#define X86_FEATURE_TBM ( 6*32+21) /* trailing bit manipulations */ | |
214 | +#define X86_FEATURE_TOPOEXT ( 6*32+22) /* topology extensions CPUID leafs */ | |
215 | +#define X86_FEATURE_PERFCTR_CORE ( 6*32+23) /* core performance counter extensions */ | |
216 | +#define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* NB performance counter extensions */ | |
217 | +#define X86_FEATURE_BPEXT (6*32+26) /* data breakpoint extension */ | |
218 | +#define X86_FEATURE_PERFCTR_L2 ( 6*32+28) /* L2 performance counter extensions */ | |
219 | +#define X86_FEATURE_MWAITX ( 6*32+29) /* MWAIT extension (MONITORX/MWAITX) */ | |
220 | + | |
221 | +/* | |
222 | + * Auxiliary flags: Linux defined - For features scattered in various | |
223 | + * CPUID levels like 0x6, 0xA etc, word 7. | |
224 | + * | |
225 | + * Reuse free bits when adding new feature flags! | |
226 | + */ | |
227 | + | |
228 | +#define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */ | |
229 | +#define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */ | |
230 | +#define X86_FEATURE_INVPCID_SINGLE ( 7*32+ 4) /* Effectively INVPCID && CR4.PCIDE=1 */ | |
231 | + | |
232 | +#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ | |
233 | +#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ | |
234 | + | |
235 | +#define X86_FEATURE_RETPOLINE ( 7*32+12) /* "" Generic Retpoline mitigation for Spectre variant 2 */ | |
236 | +#define X86_FEATURE_RETPOLINE_AMD ( 7*32+13) /* "" AMD Retpoline mitigation for Spectre variant 2 */ | |
237 | + | |
238 | +#define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */ | |
239 | +#define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* "" Fill RSB on context switches */ | |
240 | + | |
241 | +#define X86_FEATURE_MSR_SPEC_CTRL ( 7*32+16) /* "" MSR SPEC_CTRL is implemented */ | |
242 | +#define X86_FEATURE_SSBD ( 7*32+17) /* Speculative Store Bypass Disable */ | |
243 | + | |
244 | +/* Because the ALTERNATIVE scheme is for members of the X86_FEATURE club... */ | |
245 | +#define X86_FEATURE_KAISER ( 7*32+31) /* CONFIG_PAGE_TABLE_ISOLATION w/o nokaiser */ | |
246 | + | |
247 | +#define X86_FEATURE_USE_IBPB ( 7*32+21) /* "" Indirect Branch Prediction Barrier enabled*/ | |
248 | +#define X86_FEATURE_USE_IBRS_FW ( 7*32+22) /* "" Use IBRS during runtime firmware calls */ | |
249 | +#define X86_FEATURE_SPEC_STORE_BYPASS_DISABLE ( 7*32+23) /* "" Disable Speculative Store Bypass. */ | |
250 | +#define X86_FEATURE_LS_CFG_SSBD ( 7*32+24) /* "" AMD SSBD implementation */ | |
251 | + | |
252 | +#define X86_FEATURE_IBRS ( 7*32+25) /* Indirect Branch Restricted Speculation */ | |
253 | +#define X86_FEATURE_IBPB ( 7*32+26) /* Indirect Branch Prediction Barrier */ | |
254 | +#define X86_FEATURE_STIBP ( 7*32+27) /* Single Thread Indirect Branch Predictors */ | |
255 | +#define X86_FEATURE_ZEN ( 7*32+28) /* "" CPU is AMD family 0x17 (Zen) */ | |
256 | +#define X86_FEATURE_L1TF_PTEINV ( 7*32+29) /* "" L1TF workaround PTE inversion */ | |
257 | + | |
258 | +/* Virtualization flags: Linux defined, word 8 */ | |
259 | +#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ | |
260 | +#define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */ | |
261 | +#define X86_FEATURE_FLEXPRIORITY ( 8*32+ 2) /* Intel FlexPriority */ | |
262 | +#define X86_FEATURE_EPT ( 8*32+ 3) /* Intel Extended Page Table */ | |
263 | +#define X86_FEATURE_VPID ( 8*32+ 4) /* Intel Virtual Processor ID */ | |
264 | + | |
265 | +#define X86_FEATURE_VMMCALL ( 8*32+15) /* Prefer vmmcall to vmcall */ | |
266 | +#define X86_FEATURE_XENPV ( 8*32+16) /* "" Xen paravirtual guest */ | |
267 | + | |
268 | + | |
269 | +/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */ | |
270 | +#define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/ | |
271 | +#define X86_FEATURE_TSC_ADJUST ( 9*32+ 1) /* TSC adjustment MSR 0x3b */ | |
272 | +#define X86_FEATURE_BMI1 ( 9*32+ 3) /* 1st group bit manipulation extensions */ | |
273 | +#define X86_FEATURE_HLE ( 9*32+ 4) /* Hardware Lock Elision */ | |
274 | +#define X86_FEATURE_AVX2 ( 9*32+ 5) /* AVX2 instructions */ | |
275 | +#define X86_FEATURE_SMEP ( 9*32+ 7) /* Supervisor Mode Execution Protection */ | |
276 | +#define X86_FEATURE_BMI2 ( 9*32+ 8) /* 2nd group bit manipulation extensions */ | |
277 | +#define X86_FEATURE_ERMS ( 9*32+ 9) /* Enhanced REP MOVSB/STOSB */ | |
278 | +#define X86_FEATURE_INVPCID ( 9*32+10) /* Invalidate Processor Context ID */ | |
279 | +#define X86_FEATURE_RTM ( 9*32+11) /* Restricted Transactional Memory */ | |
280 | +#define X86_FEATURE_CQM ( 9*32+12) /* Cache QoS Monitoring */ | |
281 | +#define X86_FEATURE_MPX ( 9*32+14) /* Memory Protection Extension */ | |
282 | +#define X86_FEATURE_AVX512F ( 9*32+16) /* AVX-512 Foundation */ | |
283 | +#define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */ | |
284 | +#define X86_FEATURE_ADX ( 9*32+19) /* The ADCX and ADOX instructions */ | |
285 | +#define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */ | |
286 | +#define X86_FEATURE_PCOMMIT ( 9*32+22) /* PCOMMIT instruction */ | |
287 | +#define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */ | |
288 | +#define X86_FEATURE_CLWB ( 9*32+24) /* CLWB instruction */ | |
289 | +#define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */ | |
290 | +#define X86_FEATURE_AVX512ER ( 9*32+27) /* AVX-512 Exponential and Reciprocal */ | |
291 | +#define X86_FEATURE_AVX512CD ( 9*32+28) /* AVX-512 Conflict Detection */ | |
292 | +#define X86_FEATURE_SHA_NI ( 9*32+29) /* SHA1/SHA256 Instruction Extensions */ | |
293 | + | |
294 | +/* Extended state features, CPUID level 0x0000000d:1 (eax), word 10 */ | |
295 | +#define X86_FEATURE_XSAVEOPT (10*32+ 0) /* XSAVEOPT */ | |
296 | +#define X86_FEATURE_XSAVEC (10*32+ 1) /* XSAVEC */ | |
297 | +#define X86_FEATURE_XGETBV1 (10*32+ 2) /* XGETBV with ECX = 1 */ | |
298 | +#define X86_FEATURE_XSAVES (10*32+ 3) /* XSAVES/XRSTORS */ | |
299 | + | |
300 | +/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:0 (edx), word 11 */ | |
301 | +#define X86_FEATURE_CQM_LLC (11*32+ 1) /* LLC QoS if 1 */ | |
302 | + | |
303 | +/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (edx), word 12 */ | |
304 | +#define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring if 1 */ | |
305 | + | |
306 | +/* AMD-defined CPU features, CPUID level 0x80000008 (ebx), word 13 */ | |
307 | +#define X86_FEATURE_CLZERO (13*32+0) /* CLZERO instruction */ | |
308 | +#define X86_FEATURE_AMD_IBPB (13*32+12) /* Indirect Branch Prediction Barrier */ | |
309 | +#define X86_FEATURE_AMD_IBRS (13*32+14) /* Indirect Branch Restricted Speculation */ | |
310 | +#define X86_FEATURE_AMD_STIBP (13*32+15) /* Single Thread Indirect Branch Predictors */ | |
311 | +#define X86_FEATURE_VIRT_SSBD (13*32+25) /* Virtualized Speculative Store Bypass Disable */ | |
312 | + | |
313 | +/* Thermal and Power Management Leaf, CPUID level 0x00000006 (eax), word 14 */ | |
314 | +#define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */ | |
315 | +#define X86_FEATURE_IDA (14*32+ 1) /* Intel Dynamic Acceleration */ | |
316 | +#define X86_FEATURE_ARAT (14*32+ 2) /* Always Running APIC Timer */ | |
317 | +#define X86_FEATURE_PLN (14*32+ 4) /* Intel Power Limit Notification */ | |
318 | +#define X86_FEATURE_PTS (14*32+ 6) /* Intel Package Thermal Status */ | |
319 | +#define X86_FEATURE_HWP (14*32+ 7) /* Intel Hardware P-states */ | |
320 | +#define X86_FEATURE_HWP_NOTIFY (14*32+ 8) /* HWP Notification */ | |
321 | +#define X86_FEATURE_HWP_ACT_WINDOW (14*32+ 9) /* HWP Activity Window */ | |
322 | +#define X86_FEATURE_HWP_EPP (14*32+10) /* HWP Energy Perf. Preference */ | |
323 | +#define X86_FEATURE_HWP_PKG_REQ (14*32+11) /* HWP Package Level Request */ | |
324 | + | |
325 | +/* AMD SVM Feature Identification, CPUID level 0x8000000a (edx), word 15 */ | |
326 | +#define X86_FEATURE_NPT (15*32+ 0) /* Nested Page Table support */ | |
327 | +#define X86_FEATURE_LBRV (15*32+ 1) /* LBR Virtualization support */ | |
328 | +#define X86_FEATURE_SVML (15*32+ 2) /* "svm_lock" SVM locking MSR */ | |
329 | +#define X86_FEATURE_NRIPS (15*32+ 3) /* "nrip_save" SVM next_rip save */ | |
330 | +#define X86_FEATURE_TSCRATEMSR (15*32+ 4) /* "tsc_scale" TSC scaling support */ | |
331 | +#define X86_FEATURE_VMCBCLEAN (15*32+ 5) /* "vmcb_clean" VMCB clean bits support */ | |
332 | +#define X86_FEATURE_FLUSHBYASID (15*32+ 6) /* flush-by-ASID support */ | |
333 | +#define X86_FEATURE_DECODEASSISTS (15*32+ 7) /* Decode Assists support */ | |
334 | +#define X86_FEATURE_PAUSEFILTER (15*32+10) /* filtered pause intercept */ | |
335 | +#define X86_FEATURE_PFTHRESHOLD (15*32+12) /* pause filter threshold */ | |
336 | + | |
337 | +/* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx), word 16 */ | |
338 | +#define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */ | |
339 | +#define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */ | |
340 | + | |
341 | +/* AMD-defined CPU features, CPUID level 0x80000007 (ebx), word 17 */ | |
342 | +#define X86_FEATURE_OVERFLOW_RECOV (17*32+0) /* MCA overflow recovery support */ | |
343 | +#define X86_FEATURE_SUCCOR (17*32+1) /* Uncorrectable error containment and recovery */ | |
344 | +#define X86_FEATURE_SMCA (17*32+3) /* Scalable MCA */ | |
345 | + | |
346 | + | |
347 | +/* Intel-defined CPU features, CPUID level 0x00000007:0 (EDX), word 18 */ | |
348 | +#define X86_FEATURE_AVX512_4VNNIW (18*32+ 2) /* AVX-512 Neural Network Instructions */ | |
349 | +#define X86_FEATURE_AVX512_4FMAPS (18*32+ 3) /* AVX-512 Multiply Accumulation Single precision */ | |
350 | +#define X86_FEATURE_SPEC_CTRL (18*32+26) /* "" Speculation Control (IBRS + IBPB) */ | |
351 | +#define X86_FEATURE_INTEL_STIBP (18*32+27) /* "" Single Thread Indirect Branch Predictors */ | |
352 | +#define X86_FEATURE_FLUSH_L1D (18*32+28) /* Flush L1D cache */ | |
353 | +#define X86_FEATURE_ARCH_CAPABILITIES (18*32+29) /* IA32_ARCH_CAPABILITIES MSR (Intel) */ | |
354 | +#define X86_FEATURE_SPEC_CTRL_SSBD (18*32+31) /* "" Speculative Store Bypass Disable */ | |
355 | + | |
356 | +/* | |
357 | + * BUG word(s) | |
358 | + */ | |
359 | +#define X86_BUG(x) (NCAPINTS*32 + (x)) | |
360 | + | |
361 | +#define X86_BUG_F00F X86_BUG(0) /* Intel F00F */ | |
362 | +#define X86_BUG_FDIV X86_BUG(1) /* FPU FDIV */ | |
363 | +#define X86_BUG_COMA X86_BUG(2) /* Cyrix 6x86 coma */ | |
364 | +#define X86_BUG_AMD_TLB_MMATCH X86_BUG(3) /* "tlb_mmatch" AMD Erratum 383 */ | |
365 | +#define X86_BUG_AMD_APIC_C1E X86_BUG(4) /* "apic_c1e" AMD Erratum 400 */ | |
366 | +#define X86_BUG_11AP X86_BUG(5) /* Bad local APIC aka 11AP */ | |
367 | +#define X86_BUG_FXSAVE_LEAK X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */ | |
368 | +#define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */ | |
369 | +#define X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* SYSRET doesn't fix up SS attrs */ | |
370 | +#define X86_BUG_CPU_MELTDOWN X86_BUG(14) /* CPU is affected by meltdown attack and needs kernel page table isolation */ | |
371 | +#define X86_BUG_SPECTRE_V1 X86_BUG(15) /* CPU is affected by Spectre variant 1 attack with conditional branches */ | |
372 | +#define X86_BUG_SPECTRE_V2 X86_BUG(16) /* CPU is affected by Spectre variant 2 attack with indirect branches */ | |
373 | +#define X86_BUG_SPEC_STORE_BYPASS X86_BUG(17) /* CPU is affected by speculative store bypass attack */ | |
374 | +#define X86_BUG_L1TF X86_BUG(18) /* CPU is affected by L1 Terminal Fault */ | |
375 | + | |
376 | +#endif /* _ASM_X86_CPUFEATURES_H */ | |
377 | --- /dev/null | |
378 | +++ b/tools/arch/x86/include/asm/disabled-features.h | |
379 | @@ -0,0 +1,65 @@ | |
380 | +#ifndef _ASM_X86_DISABLED_FEATURES_H | |
381 | +#define _ASM_X86_DISABLED_FEATURES_H | |
382 | + | |
383 | +/* These features, although they might be available in a CPU | |
384 | + * will not be used because the compile options to support | |
385 | + * them are not present. | |
386 | + * | |
387 | + * This code allows them to be checked and disabled at | |
388 | + * compile time without an explicit #ifdef. Use | |
389 | + * cpu_feature_enabled(). | |
390 | + */ | |
391 | + | |
392 | +#ifdef CONFIG_X86_INTEL_MPX | |
393 | +# define DISABLE_MPX 0 | |
394 | +#else | |
395 | +# define DISABLE_MPX (1<<(X86_FEATURE_MPX & 31)) | |
396 | +#endif | |
397 | + | |
398 | +#ifdef CONFIG_X86_64 | |
399 | +# define DISABLE_VME (1<<(X86_FEATURE_VME & 31)) | |
400 | +# define DISABLE_K6_MTRR (1<<(X86_FEATURE_K6_MTRR & 31)) | |
401 | +# define DISABLE_CYRIX_ARR (1<<(X86_FEATURE_CYRIX_ARR & 31)) | |
402 | +# define DISABLE_CENTAUR_MCR (1<<(X86_FEATURE_CENTAUR_MCR & 31)) | |
403 | +# define DISABLE_PCID 0 | |
404 | +#else | |
405 | +# define DISABLE_VME 0 | |
406 | +# define DISABLE_K6_MTRR 0 | |
407 | +# define DISABLE_CYRIX_ARR 0 | |
408 | +# define DISABLE_CENTAUR_MCR 0 | |
409 | +# define DISABLE_PCID (1<<(X86_FEATURE_PCID & 31)) | |
410 | +#endif /* CONFIG_X86_64 */ | |
411 | + | |
412 | +#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS | |
413 | +# define DISABLE_PKU 0 | |
414 | +# define DISABLE_OSPKE 0 | |
415 | +#else | |
416 | +# define DISABLE_PKU (1<<(X86_FEATURE_PKU & 31)) | |
417 | +# define DISABLE_OSPKE (1<<(X86_FEATURE_OSPKE & 31)) | |
418 | +#endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */ | |
419 | + | |
420 | +/* | |
421 | + * Make sure to add features to the correct mask | |
422 | + */ | |
423 | +#define DISABLED_MASK0 (DISABLE_VME) | |
424 | +#define DISABLED_MASK1 0 | |
425 | +#define DISABLED_MASK2 0 | |
426 | +#define DISABLED_MASK3 (DISABLE_CYRIX_ARR|DISABLE_CENTAUR_MCR|DISABLE_K6_MTRR) | |
427 | +#define DISABLED_MASK4 (DISABLE_PCID) | |
428 | +#define DISABLED_MASK5 0 | |
429 | +#define DISABLED_MASK6 0 | |
430 | +#define DISABLED_MASK7 0 | |
431 | +#define DISABLED_MASK8 0 | |
432 | +#define DISABLED_MASK9 (DISABLE_MPX) | |
433 | +#define DISABLED_MASK10 0 | |
434 | +#define DISABLED_MASK11 0 | |
435 | +#define DISABLED_MASK12 0 | |
436 | +#define DISABLED_MASK13 0 | |
437 | +#define DISABLED_MASK14 0 | |
438 | +#define DISABLED_MASK15 0 | |
439 | +#define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE) | |
440 | +#define DISABLED_MASK17 0 | |
441 | +#define DISABLED_MASK18 0 | |
442 | +#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 19) | |
443 | + | |
444 | +#endif /* _ASM_X86_DISABLED_FEATURES_H */ | |
445 | --- /dev/null | |
446 | +++ b/tools/arch/x86/include/asm/required-features.h | |
447 | @@ -0,0 +1,106 @@ | |
448 | +#ifndef _ASM_X86_REQUIRED_FEATURES_H | |
449 | +#define _ASM_X86_REQUIRED_FEATURES_H | |
450 | + | |
451 | +/* Define minimum CPUID feature set for kernel. These bits are checked | |
452 | + really early to actually display a visible error message before the | |
453 | + kernel dies. Make sure to assign features to the proper mask! | |
454 | + | |
455 | + Some requirements that are not in CPUID yet are also in the | |
456 | + CONFIG_X86_MINIMUM_CPU_FAMILY which is checked too. | |
457 | + | |
458 | + The real information is in arch/x86/Kconfig.cpu, this just converts | |
459 | + the CONFIGs into a bitmask */ | |
460 | + | |
461 | +#ifndef CONFIG_MATH_EMULATION | |
462 | +# define NEED_FPU (1<<(X86_FEATURE_FPU & 31)) | |
463 | +#else | |
464 | +# define NEED_FPU 0 | |
465 | +#endif | |
466 | + | |
467 | +#if defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64) | |
468 | +# define NEED_PAE (1<<(X86_FEATURE_PAE & 31)) | |
469 | +#else | |
470 | +# define NEED_PAE 0 | |
471 | +#endif | |
472 | + | |
473 | +#ifdef CONFIG_X86_CMPXCHG64 | |
474 | +# define NEED_CX8 (1<<(X86_FEATURE_CX8 & 31)) | |
475 | +#else | |
476 | +# define NEED_CX8 0 | |
477 | +#endif | |
478 | + | |
479 | +#if defined(CONFIG_X86_CMOV) || defined(CONFIG_X86_64) | |
480 | +# define NEED_CMOV (1<<(X86_FEATURE_CMOV & 31)) | |
481 | +#else | |
482 | +# define NEED_CMOV 0 | |
483 | +#endif | |
484 | + | |
485 | +#ifdef CONFIG_X86_USE_3DNOW | |
486 | +# define NEED_3DNOW (1<<(X86_FEATURE_3DNOW & 31)) | |
487 | +#else | |
488 | +# define NEED_3DNOW 0 | |
489 | +#endif | |
490 | + | |
491 | +#if defined(CONFIG_X86_P6_NOP) || defined(CONFIG_X86_64) | |
492 | +# define NEED_NOPL (1<<(X86_FEATURE_NOPL & 31)) | |
493 | +#else | |
494 | +# define NEED_NOPL 0 | |
495 | +#endif | |
496 | + | |
497 | +#ifdef CONFIG_MATOM | |
498 | +# define NEED_MOVBE (1<<(X86_FEATURE_MOVBE & 31)) | |
499 | +#else | |
500 | +# define NEED_MOVBE 0 | |
501 | +#endif | |
502 | + | |
503 | +#ifdef CONFIG_X86_64 | |
504 | +#ifdef CONFIG_PARAVIRT | |
505 | +/* Paravirtualized systems may not have PSE or PGE available */ | |
506 | +#define NEED_PSE 0 | |
507 | +#define NEED_PGE 0 | |
508 | +#else | |
509 | +#define NEED_PSE (1<<(X86_FEATURE_PSE & 31)) /* mask the bit index first, then shift */ | |
510 | +#define NEED_PGE (1<<(X86_FEATURE_PGE & 31)) /* was ((1<<13) & 31) == 0: PGE never required */ | |
511 | +#endif | |
512 | +#define NEED_MSR (1<<(X86_FEATURE_MSR & 31)) | |
513 | +#define NEED_FXSR (1<<(X86_FEATURE_FXSR & 31)) | |
514 | +#define NEED_XMM (1<<(X86_FEATURE_XMM & 31)) | |
515 | +#define NEED_XMM2 (1<<(X86_FEATURE_XMM2 & 31)) | |
516 | +#define NEED_LM (1<<(X86_FEATURE_LM & 31)) | |
517 | +#else | |
518 | +#define NEED_PSE 0 | |
519 | +#define NEED_MSR 0 | |
520 | +#define NEED_PGE 0 | |
521 | +#define NEED_FXSR 0 | |
522 | +#define NEED_XMM 0 | |
523 | +#define NEED_XMM2 0 | |
524 | +#define NEED_LM 0 | |
525 | +#endif | |
526 | + | |
527 | +#define REQUIRED_MASK0 (NEED_FPU|NEED_PSE|NEED_MSR|NEED_PAE|\ | |
528 | + NEED_CX8|NEED_PGE|NEED_FXSR|NEED_CMOV|\ | |
529 | + NEED_XMM|NEED_XMM2) | |
530 | +#define SSE_MASK (NEED_XMM|NEED_XMM2) | |
531 | + | |
532 | +#define REQUIRED_MASK1 (NEED_LM|NEED_3DNOW) | |
533 | + | |
534 | +#define REQUIRED_MASK2 0 | |
535 | +#define REQUIRED_MASK3 (NEED_NOPL) | |
536 | +#define REQUIRED_MASK4 (NEED_MOVBE) | |
537 | +#define REQUIRED_MASK5 0 | |
538 | +#define REQUIRED_MASK6 0 | |
539 | +#define REQUIRED_MASK7 0 | |
540 | +#define REQUIRED_MASK8 0 | |
541 | +#define REQUIRED_MASK9 0 | |
542 | +#define REQUIRED_MASK10 0 | |
543 | +#define REQUIRED_MASK11 0 | |
544 | +#define REQUIRED_MASK12 0 | |
545 | +#define REQUIRED_MASK13 0 | |
546 | +#define REQUIRED_MASK14 0 | |
547 | +#define REQUIRED_MASK15 0 | |
548 | +#define REQUIRED_MASK16 0 | |
549 | +#define REQUIRED_MASK17 0 | |
550 | +#define REQUIRED_MASK18 0 | |
551 | +#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 19) | |
552 | + | |
553 | +#endif /* _ASM_X86_REQUIRED_FEATURES_H */ | |
554 | --- /dev/null | |
555 | +++ b/tools/arch/x86/lib/memcpy_64.S | |
556 | @@ -0,0 +1,179 @@ | |
557 | +/* Copyright 2002 Andi Kleen */ | |
558 | + | |
559 | +#include <linux/linkage.h> | |
560 | +#include <asm/cpufeatures.h> | |
561 | +#include <asm/alternative-asm.h> | |
562 | + | |
563 | +/* | |
564 | + * We build a jump to memcpy_orig by default which gets NOPped out on | |
565 | + * the majority of x86 CPUs which set REP_GOOD. In addition, CPUs which | |
566 | + * have the enhanced REP MOVSB/STOSB feature (ERMS), change those NOPs | |
567 | + * to a jmp to memcpy_erms which does the REP; MOVSB mem copy. | |
568 | + */ | |
569 | + | |
570 | +.weak memcpy | |
571 | + | |
572 | +/* | |
573 | + * memcpy - Copy a memory block. | |
574 | + * | |
575 | + * Input: | |
576 | + * rdi destination | |
577 | + * rsi source | |
578 | + * rdx count | |
579 | + * | |
580 | + * Output: | |
581 | + * rax original destination | |
582 | + */ | |
583 | +ENTRY(__memcpy) | |
584 | +ENTRY(memcpy) | |
585 | + ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \ | |
586 | + "jmp memcpy_erms", X86_FEATURE_ERMS | |
587 | + | |
588 | + movq %rdi, %rax | |
589 | + movq %rdx, %rcx | |
590 | + shrq $3, %rcx | |
591 | + andl $7, %edx | |
592 | + rep movsq | |
593 | + movl %edx, %ecx | |
594 | + rep movsb | |
595 | + ret | |
596 | +ENDPROC(memcpy) | |
597 | +ENDPROC(__memcpy) | |
598 | + | |
599 | +/* | |
600 | + * memcpy_erms() - enhanced fast string memcpy. This is faster and | |
601 | + * simpler than memcpy. Use memcpy_erms when possible. | |
602 | + */ | |
603 | +ENTRY(memcpy_erms) | |
604 | + movq %rdi, %rax | |
605 | + movq %rdx, %rcx | |
606 | + rep movsb | |
607 | + ret | |
608 | +ENDPROC(memcpy_erms) | |
609 | + | |
610 | +ENTRY(memcpy_orig) | |
611 | + movq %rdi, %rax | |
612 | + | |
613 | + cmpq $0x20, %rdx | |
614 | + jb .Lhandle_tail | |
615 | + | |
616 | + /* | |
617 | + * We check whether memory false dependence could occur, | |
618 | + * then jump to corresponding copy mode. | |
619 | + */ | |
620 | + cmp %dil, %sil | |
621 | + jl .Lcopy_backward | |
622 | + subq $0x20, %rdx | |
623 | +.Lcopy_forward_loop: | |
624 | + subq $0x20, %rdx | |
625 | + | |
626 | + /* | |
627 | + * Move in blocks of 4x8 bytes: | |
628 | + */ | |
629 | + movq 0*8(%rsi), %r8 | |
630 | + movq 1*8(%rsi), %r9 | |
631 | + movq 2*8(%rsi), %r10 | |
632 | + movq 3*8(%rsi), %r11 | |
633 | + leaq 4*8(%rsi), %rsi | |
634 | + | |
635 | + movq %r8, 0*8(%rdi) | |
636 | + movq %r9, 1*8(%rdi) | |
637 | + movq %r10, 2*8(%rdi) | |
638 | + movq %r11, 3*8(%rdi) | |
639 | + leaq 4*8(%rdi), %rdi | |
640 | + jae .Lcopy_forward_loop | |
641 | + addl $0x20, %edx | |
642 | + jmp .Lhandle_tail | |
643 | + | |
644 | +.Lcopy_backward: | |
645 | + /* | |
646 | + * Calculate copy position to tail. | |
647 | + */ | |
648 | + addq %rdx, %rsi | |
649 | + addq %rdx, %rdi | |
650 | + subq $0x20, %rdx | |
651 | + /* | |
652 | + * At most 3 ALU operations in one cycle, | |
653 | + * so append NOPS in the same 16 bytes trunk. | |
654 | + */ | |
655 | + .p2align 4 | |
656 | +.Lcopy_backward_loop: | |
657 | + subq $0x20, %rdx | |
658 | + movq -1*8(%rsi), %r8 | |
659 | + movq -2*8(%rsi), %r9 | |
660 | + movq -3*8(%rsi), %r10 | |
661 | + movq -4*8(%rsi), %r11 | |
662 | + leaq -4*8(%rsi), %rsi | |
663 | + movq %r8, -1*8(%rdi) | |
664 | + movq %r9, -2*8(%rdi) | |
665 | + movq %r10, -3*8(%rdi) | |
666 | + movq %r11, -4*8(%rdi) | |
667 | + leaq -4*8(%rdi), %rdi | |
668 | + jae .Lcopy_backward_loop | |
669 | + | |
670 | + /* | |
671 | + * Calculate copy position to head. | |
672 | + */ | |
673 | + addl $0x20, %edx | |
674 | + subq %rdx, %rsi | |
675 | + subq %rdx, %rdi | |
676 | +.Lhandle_tail: | |
677 | + cmpl $16, %edx | |
678 | + jb .Lless_16bytes | |
679 | + | |
680 | + /* | |
681 | + * Move data from 16 bytes to 31 bytes. | |
682 | + */ | |
683 | + movq 0*8(%rsi), %r8 | |
684 | + movq 1*8(%rsi), %r9 | |
685 | + movq -2*8(%rsi, %rdx), %r10 | |
686 | + movq -1*8(%rsi, %rdx), %r11 | |
687 | + movq %r8, 0*8(%rdi) | |
688 | + movq %r9, 1*8(%rdi) | |
689 | + movq %r10, -2*8(%rdi, %rdx) | |
690 | + movq %r11, -1*8(%rdi, %rdx) | |
691 | + retq | |
692 | + .p2align 4 | |
693 | +.Lless_16bytes: | |
694 | + cmpl $8, %edx | |
695 | + jb .Lless_8bytes | |
696 | + /* | |
697 | + * Move data from 8 bytes to 15 bytes. | |
698 | + */ | |
699 | + movq 0*8(%rsi), %r8 | |
700 | + movq -1*8(%rsi, %rdx), %r9 | |
701 | + movq %r8, 0*8(%rdi) | |
702 | + movq %r9, -1*8(%rdi, %rdx) | |
703 | + retq | |
704 | + .p2align 4 | |
705 | +.Lless_8bytes: | |
706 | + cmpl $4, %edx | |
707 | + jb .Lless_3bytes | |
708 | + | |
709 | + /* | |
710 | + * Move data from 4 bytes to 7 bytes. | |
711 | + */ | |
712 | + movl (%rsi), %ecx | |
713 | + movl -4(%rsi, %rdx), %r8d | |
714 | + movl %ecx, (%rdi) | |
715 | + movl %r8d, -4(%rdi, %rdx) | |
716 | + retq | |
717 | + .p2align 4 | |
718 | +.Lless_3bytes: | |
719 | + subl $1, %edx | |
720 | + jb .Lend | |
721 | + /* | |
722 | + * Move data from 1 byte to 3 bytes. | |
723 | + */ | |
724 | + movzbl (%rsi), %ecx | |
725 | + jz .Lstore_1byte | |
726 | + movzbq 1(%rsi), %r8 | |
727 | + movzbq (%rsi, %rdx), %r9 | |
728 | + movb %r8b, 1(%rdi) | |
729 | + movb %r9b, (%rdi, %rdx) | |
730 | +.Lstore_1byte: | |
731 | + movb %cl, (%rdi) | |
732 | + | |
733 | +.Lend: | |
734 | + retq | |
735 | +ENDPROC(memcpy_orig) | |
736 | --- /dev/null | |
737 | +++ b/tools/arch/x86/lib/memset_64.S | |
738 | @@ -0,0 +1,138 @@ | |
739 | +/* Copyright 2002 Andi Kleen, SuSE Labs */ | |
740 | + | |
741 | +#include <linux/linkage.h> | |
742 | +#include <asm/cpufeatures.h> | |
743 | +#include <asm/alternative-asm.h> | |
744 | + | |
745 | +.weak memset | |
746 | + | |
747 | +/* | |
748 | + * ISO C memset - set a memory block to a byte value. This function uses fast | |
749 | + * string to get better performance than the original function. The code is | |
750 | + * simpler and shorter than the original function as well. | |
751 | + * | |
752 | + * rdi destination | |
753 | + * rsi value (char) | |
754 | + * rdx count (bytes) | |
755 | + * | |
756 | + * rax original destination | |
757 | + */ | |
758 | +ENTRY(memset) | |
759 | +ENTRY(__memset) | |
760 | + /* | |
761 | + * Some CPUs support enhanced REP MOVSB/STOSB feature. It is recommended | |
762 | + * to use it when possible. If not available, use fast string instructions. | |
763 | + * | |
764 | + * Otherwise, use original memset function. | |
765 | + */ | |
766 | + ALTERNATIVE_2 "jmp memset_orig", "", X86_FEATURE_REP_GOOD, \ | |
767 | + "jmp memset_erms", X86_FEATURE_ERMS | |
768 | + | |
769 | + movq %rdi,%r9 | |
770 | + movq %rdx,%rcx | |
771 | + andl $7,%edx | |
772 | + shrq $3,%rcx | |
773 | + /* expand byte value */ | |
774 | + movzbl %sil,%esi | |
775 | + movabs $0x0101010101010101,%rax | |
776 | + imulq %rsi,%rax | |
777 | + rep stosq | |
778 | + movl %edx,%ecx | |
779 | + rep stosb | |
780 | + movq %r9,%rax | |
781 | + ret | |
782 | +ENDPROC(memset) | |
783 | +ENDPROC(__memset) | |
784 | + | |
785 | +/* | |
786 | + * ISO C memset - set a memory block to a byte value. This function uses | |
787 | + * enhanced rep stosb to override the fast string function. | |
788 | + * The code is simpler and shorter than the fast string function as well. | |
789 | + * | |
790 | + * rdi destination | |
791 | + * rsi value (char) | |
792 | + * rdx count (bytes) | |
793 | + * | |
794 | + * rax original destination | |
795 | + */ | |
796 | +ENTRY(memset_erms) | |
797 | + movq %rdi,%r9 | |
798 | + movb %sil,%al | |
799 | + movq %rdx,%rcx | |
800 | + rep stosb | |
801 | + movq %r9,%rax | |
802 | + ret | |
803 | +ENDPROC(memset_erms) | |
804 | + | |
805 | +ENTRY(memset_orig) | |
806 | + movq %rdi,%r10 | |
807 | + | |
808 | + /* expand byte value */ | |
809 | + movzbl %sil,%ecx | |
810 | + movabs $0x0101010101010101,%rax | |
811 | + imulq %rcx,%rax | |
812 | + | |
813 | + /* align dst */ | |
814 | + movl %edi,%r9d | |
815 | + andl $7,%r9d | |
816 | + jnz .Lbad_alignment | |
817 | +.Lafter_bad_alignment: | |
818 | + | |
819 | + movq %rdx,%rcx | |
820 | + shrq $6,%rcx | |
821 | + jz .Lhandle_tail | |
822 | + | |
823 | + .p2align 4 | |
824 | +.Lloop_64: | |
825 | + decq %rcx | |
826 | + movq %rax,(%rdi) | |
827 | + movq %rax,8(%rdi) | |
828 | + movq %rax,16(%rdi) | |
829 | + movq %rax,24(%rdi) | |
830 | + movq %rax,32(%rdi) | |
831 | + movq %rax,40(%rdi) | |
832 | + movq %rax,48(%rdi) | |
833 | + movq %rax,56(%rdi) | |
834 | + leaq 64(%rdi),%rdi | |
835 | + jnz .Lloop_64 | |
836 | + | |
837 | + /* Handle tail in loops. The loops should be faster than hard | |
838 | + to predict jump tables. */ | |
839 | + .p2align 4 | |
840 | +.Lhandle_tail: | |
841 | + movl %edx,%ecx | |
842 | + andl $63&(~7),%ecx | |
843 | + jz .Lhandle_7 | |
844 | + shrl $3,%ecx | |
845 | + .p2align 4 | |
846 | +.Lloop_8: | |
847 | + decl %ecx | |
848 | + movq %rax,(%rdi) | |
849 | + leaq 8(%rdi),%rdi | |
850 | + jnz .Lloop_8 | |
851 | + | |
852 | +.Lhandle_7: | |
853 | + andl $7,%edx | |
854 | + jz .Lende | |
855 | + .p2align 4 | |
856 | +.Lloop_1: | |
857 | + decl %edx | |
858 | + movb %al,(%rdi) | |
859 | + leaq 1(%rdi),%rdi | |
860 | + jnz .Lloop_1 | |
861 | + | |
862 | +.Lende: | |
863 | + movq %r10,%rax | |
864 | + ret | |
865 | + | |
866 | +.Lbad_alignment: | |
867 | + cmpq $7,%rdx | |
868 | + jbe .Lhandle_7 | |
869 | + movq %rax,(%rdi) /* unaligned store */ | |
870 | + movq $8,%r8 | |
871 | + subq %r9,%r8 | |
872 | + addq %r8,%rdi | |
873 | + subq %r8,%rdx | |
874 | + jmp .Lafter_bad_alignment | |
875 | +.Lfinal: | |
876 | +ENDPROC(memset_orig) | |
877 | --- /dev/null | |
878 | +++ b/tools/include/asm/alternative-asm.h | |
879 | @@ -0,0 +1,9 @@ | |
880 | +#ifndef _TOOLS_ASM_ALTERNATIVE_ASM_H | |
881 | +#define _TOOLS_ASM_ALTERNATIVE_ASM_H | |
882 | + | |
883 | +/* Just disable it so we can build tools/arch/x86/lib/mem{cpy,set}_64.S for perf bench: */ | |
884 | + | |
885 | +#define altinstruction_entry # | |
886 | +#define ALTERNATIVE_2 # | |
887 | + | |
888 | +#endif | |
889 | --- a/tools/perf/MANIFEST | |
890 | +++ b/tools/perf/MANIFEST | |
891 | @@ -11,6 +11,11 @@ tools/arch/sparc/include/asm/barrier_32. | |
892 | tools/arch/sparc/include/asm/barrier_64.h | |
893 | tools/arch/tile/include/asm/barrier.h | |
894 | tools/arch/x86/include/asm/barrier.h | |
895 | +tools/arch/x86/include/asm/cpufeatures.h | |
896 | +tools/arch/x86/include/asm/disabled-features.h | |
897 | +tools/arch/x86/include/asm/required-features.h | |
898 | +tools/arch/x86/lib/memcpy_64.S | |
899 | +tools/arch/x86/lib/memset_64.S | |
900 | tools/arch/xtensa/include/asm/barrier.h | |
901 | tools/scripts | |
902 | tools/build | |
903 | @@ -25,6 +30,7 @@ tools/lib/rbtree.c | |
904 | tools/lib/symbol/kallsyms.c | |
905 | tools/lib/symbol/kallsyms.h | |
906 | tools/lib/util/find_next_bit.c | |
907 | +tools/include/asm/alternative-asm.h | |
908 | tools/include/asm/atomic.h | |
909 | tools/include/asm/barrier.h | |
910 | tools/include/asm/bug.h | |
911 | @@ -65,8 +71,6 @@ include/linux/swab.h | |
912 | arch/*/include/asm/unistd*.h | |
913 | arch/*/include/uapi/asm/unistd*.h | |
914 | arch/*/include/uapi/asm/perf_regs.h | |
915 | -arch/*/lib/memcpy*.S | |
916 | -arch/*/lib/memset*.S | |
917 | include/linux/poison.h | |
918 | include/linux/hw_breakpoint.h | |
919 | include/uapi/linux/perf_event.h | |
920 | --- a/tools/perf/Makefile.perf | |
921 | +++ b/tools/perf/Makefile.perf | |
922 | @@ -310,6 +310,21 @@ export srctree OUTPUT RM CC LD AR CFLAGS | |
923 | include $(srctree)/tools/build/Makefile.include | |
924 | ||
925 | $(PERF_IN): prepare FORCE | |
926 | + @(test -f ../../arch/x86/include/asm/disabled-features.h && ( \ | |
927 | + (diff -B ../arch/x86/include/asm/disabled-features.h ../../arch/x86/include/asm/disabled-features.h >/dev/null) \ | |
928 | + || echo "Warning: tools/arch/x86/include/asm/disabled-features.h differs from kernel" >&2 )) || true | |
929 | + @(test -f ../../arch/x86/include/asm/required-features.h && ( \ | |
930 | + (diff -B ../arch/x86/include/asm/required-features.h ../../arch/x86/include/asm/required-features.h >/dev/null) \ | |
931 | + || echo "Warning: tools/arch/x86/include/asm/required-features.h differs from kernel" >&2 )) || true | |
932 | + @(test -f ../../arch/x86/include/asm/cpufeatures.h && ( \ | |
933 | + (diff -B ../arch/x86/include/asm/cpufeatures.h ../../arch/x86/include/asm/cpufeatures.h >/dev/null) \ | |
934 | + || echo "Warning: tools/arch/x86/include/asm/cpufeatures.h differs from kernel" >&2 )) || true | |
935 | + @(test -f ../../arch/x86/lib/memcpy_64.S && ( \ | |
936 | + (diff -B ../arch/x86/lib/memcpy_64.S ../../arch/x86/lib/memcpy_64.S >/dev/null) \ | |
937 | + || echo "Warning: tools/arch/x86/lib/memcpy_64.S differs from kernel" >&2 )) || true | |
938 | + @(test -f ../../arch/x86/lib/memset_64.S && ( \ | |
939 | + (diff -B ../arch/x86/lib/memset_64.S ../../arch/x86/lib/memset_64.S >/dev/null) \ | |
940 | + || echo "Warning: tools/arch/x86/lib/memset_64.S differs from kernel" >&2 )) || true | |
941 | $(Q)$(MAKE) $(build)=perf | |
942 | ||
943 | $(OUTPUT)perf: $(PERFLIBS) $(PERF_IN) $(LIBTRACEEVENT_DYNAMIC_LIST) | |
944 | --- a/tools/perf/bench/mem-memcpy-x86-64-asm.S | |
945 | +++ b/tools/perf/bench/mem-memcpy-x86-64-asm.S | |
946 | @@ -1,7 +1,7 @@ | |
947 | #define memcpy MEMCPY /* don't hide glibc's memcpy() */ | |
948 | #define altinstr_replacement text | |
949 | #define globl p2align 4; .globl | |
950 | -#include "../../../arch/x86/lib/memcpy_64.S" | |
951 | +#include "../../arch/x86/lib/memcpy_64.S" | |
952 | /* | |
953 | * We need to provide note.GNU-stack section, saying that we want | |
954 | * NOT executable stack. Otherwise the final linking will assume that | |
955 | --- a/tools/perf/bench/mem-memset-x86-64-asm.S | |
956 | +++ b/tools/perf/bench/mem-memset-x86-64-asm.S | |
957 | @@ -1,7 +1,7 @@ | |
958 | #define memset MEMSET /* don't hide glibc's memset() */ | |
959 | #define altinstr_replacement text | |
960 | #define globl p2align 4; .globl | |
961 | -#include "../../../arch/x86/lib/memset_64.S" | |
962 | +#include "../../arch/x86/lib/memset_64.S" | |
963 | ||
964 | /* | |
965 | * We need to provide note.GNU-stack section, saying that we want | |
966 | --- a/tools/perf/util/include/asm/alternative-asm.h | |
967 | +++ /dev/null | |
968 | @@ -1,9 +0,0 @@ | |
969 | -#ifndef _PERF_ASM_ALTERNATIVE_ASM_H | |
970 | -#define _PERF_ASM_ALTERNATIVE_ASM_H | |
971 | - | |
972 | -/* Just disable it so we can build arch/x86/lib/memcpy_64.S for perf bench: */ | |
973 | - | |
974 | -#define altinstruction_entry # | |
975 | -#define ALTERNATIVE_2 # | |
976 | - | |
977 | -#endif |