arch/x86/kernel/alternative.c (git blame, thirdparty/kernel/stable.git)
x86/alternative: Optimize returns patching
457c8996 1// SPDX-License-Identifier: GPL-2.0-only
c767a54b
JP
2#define pr_fmt(fmt) "SMP alternatives: " fmt
3
9a0b5817 4#include <linux/module.h>
f6a57033 5#include <linux/sched.h>
d769811c 6#include <linux/perf_event.h>
2f1dafe5 7#include <linux/mutex.h>
9a0b5817 8#include <linux/list.h>
8b5a10fc 9#include <linux/stringify.h>
ca15ca40 10#include <linux/highmem.h>
19d36ccd
AK
11#include <linux/mm.h>
12#include <linux/vmalloc.h>
3945dab4 13#include <linux/memory.h>
3d55cc8a 14#include <linux/stop_machine.h>
5a0e3ad6 15#include <linux/slab.h>
fd4363ff 16#include <linux/kdebug.h>
c13324a5 17#include <linux/kprobes.h>
b3fd8e83 18#include <linux/mmu_context.h>
c0213b0a 19#include <linux/bsearch.h>
9998a983 20#include <linux/sync_core.h>
35de5b06 21#include <asm/text-patching.h>
9a0b5817
GH
22#include <asm/alternative.h>
23#include <asm/sections.h>
8f4e956b
AK
24#include <asm/mce.h>
25#include <asm/nmi.h>
e587cadd 26#include <asm/cacheflush.h>
78ff7fae 27#include <asm/tlbflush.h>
3a125539 28#include <asm/insn.h>
e587cadd 29#include <asm/io.h>
78ff7fae 30#include <asm/fixmap.h>
4e629211 31#include <asm/paravirt.h>
75085009 32#include <asm/asm-prototypes.h>
9a0b5817 33
5e907bb0
IM
34int __read_mostly alternatives_patched;
35
36EXPORT_SYMBOL_GPL(alternatives_patched);
37
ab144f5e
AK
38#define MAX_PATCH_LEN (255-1)
39
6becb502
PZ
40#define DA_ALL (~0)
41#define DA_ALT 0x01
42#define DA_RET 0x02
43#define DA_RETPOLINE 0x04
44#define DA_ENDBR 0x08
45#define DA_SMP 0x10
46
47static unsigned int __initdata_or_module debug_alternative;
b7fb4af0 48
d167a518
GH
49static int __init debug_alt(char *str)
50{
6becb502
PZ
51 if (str && *str == '=')
52 str++;
53
54 if (!str || kstrtouint(str, 0, &debug_alternative))
55 debug_alternative = DA_ALL;
56
d167a518
GH
57 return 1;
58}
d167a518
GH
59__setup("debug-alternative", debug_alt);
60
09488165
JB
61static int noreplace_smp;
62
b7fb4af0
JF
63static int __init setup_noreplace_smp(char *str)
64{
65 noreplace_smp = 1;
66 return 1;
67}
68__setup("noreplace-smp", setup_noreplace_smp);
69
6becb502 70#define DPRINTK(type, fmt, args...) \
db477a33 71do { \
6becb502 72 if (debug_alternative & DA_##type) \
1b2e335e 73 printk(KERN_DEBUG pr_fmt(fmt) "\n", ##args); \
c767a54b 74} while (0)
d167a518 75
6becb502 76#define DUMP_BYTES(type, buf, len, fmt, args...) \
48c7a250 77do { \
6becb502 78 if (unlikely(debug_alternative & DA_##type)) { \
48c7a250
BP
79 int j; \
80 \
81 if (!(len)) \
82 break; \
83 \
1b2e335e 84 printk(KERN_DEBUG pr_fmt(fmt), ##args); \
48c7a250
BP
85 for (j = 0; j < (len) - 1; j++) \
86 printk(KERN_CONT "%02hhx ", buf[j]); \
87 printk(KERN_CONT "%02hhx\n", buf[j]); \
88 } \
89} while (0)
90
64e1f587 91static const unsigned char x86nops[] =
dc326fca 92{
a89dfde3
PZ
93 BYTES_NOP1,
94 BYTES_NOP2,
95 BYTES_NOP3,
96 BYTES_NOP4,
97 BYTES_NOP5,
98 BYTES_NOP6,
99 BYTES_NOP7,
100 BYTES_NOP8,
9a0b5817 101};
d167a518 102
a89dfde3 103const unsigned char * const x86_nops[ASM_NOP_MAX+1] =
dc326fca 104{
32c464f5 105 NULL,
a89dfde3
PZ
106 x86nops,
107 x86nops + 1,
108 x86nops + 1 + 2,
109 x86nops + 1 + 2 + 3,
110 x86nops + 1 + 2 + 3 + 4,
111 x86nops + 1 + 2 + 3 + 4 + 5,
112 x86nops + 1 + 2 + 3 + 4 + 5 + 6,
113 x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
32c464f5 114};
9a0b5817 115
6c480f22 116/*
b6c881b2
PZ
117 * Fill the buffer with a single effective instruction of size @len.
118 *
6c480f22
PZ
119 * In order not to issue an ORC stack depth tracking CFI entry (Call Frame Info)
120 * for every single-byte NOP, try to generate the largest available NOP of
121 * size <= ASM_NOP_MAX such that only a single CFI entry is generated (vs. one
122 * for each single-byte NOP). If @len to fill is > ASM_NOP_MAX, pad with INT3 and
123 * *jump* over instead of executing long and daft NOPs.
124 */
125static void __init_or_module add_nop(u8 *instr, unsigned int len)
139ec7c4 126{
6c480f22
PZ
127 u8 *target = instr + len;
128
129 if (!len)
130 return;
131
132 if (len <= ASM_NOP_MAX) {
133 memcpy(instr, x86_nops[len], len);
134 return;
139ec7c4 135 }
6c480f22
PZ
136
137 if (len < 128) {
138 __text_gen_insn(instr, JMP8_INSN_OPCODE, instr, target, JMP8_INSN_SIZE);
139 instr += JMP8_INSN_SIZE;
140 } else {
141 __text_gen_insn(instr, JMP32_INSN_OPCODE, instr, target, JMP32_INSN_SIZE);
142 instr += JMP32_INSN_SIZE;
143 }
144
145 for (;instr < target; instr++)
146 *instr = INT3_INSN_OPCODE;
139ec7c4
RR
147}
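/*
 * Illustrative sketch (not part of the kernel source): the fill patterns
 * add_nop() produces for a few hole sizes, assuming the usual x86 encodings
 * (0x0f 0x1f ... multi-byte NOP, 0xeb JMP8, 0xe9 JMP32, 0xcc INT3).
 */
static const u8 example_fill_4[]  = { 0x0f, 0x1f, 0x40, 0x00 };		/* len <= ASM_NOP_MAX: one NOP4 */
static const u8 example_fill_10[] = { 0xeb, 0x08, 0xcc, 0xcc, 0xcc,		/* len 10: JMP8 +8 over ...     */
				      0xcc, 0xcc, 0xcc, 0xcc, 0xcc };		/* ... eight INT3 bytes         */
/* For len >= 128 a JMP32 (0xe9, rel32 == len - 5) is used instead of the JMP8. */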
148
75085009 149extern s32 __retpoline_sites[], __retpoline_sites_end[];
15e67227 150extern s32 __return_sites[], __return_sites_end[];
931ab636 151extern s32 __cfi_sites[], __cfi_sites_end[];
ed53a0d9 152extern s32 __ibt_endbr_seal[], __ibt_endbr_seal_end[];
d167a518 153extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
5967ed87 154extern s32 __smp_locks[], __smp_locks_end[];
0a203df5 155void text_poke_early(void *addr, const void *opcode, size_t len);
d167a518 156
b6c881b2
PZ
157/*
158 * Matches NOP and NOPL, not any of the other possible NOPs.
159 */
6c480f22 160static bool insn_is_nop(struct insn *insn)
2b31e8ed 161{
6c480f22
PZ
162 if (insn->opcode.bytes[0] == 0x90)
163 return true;
2b31e8ed 164
6c480f22
PZ
165 if (insn->opcode.bytes[0] == 0x0F && insn->opcode.bytes[1] == 0x1F)
166 return true;
2b31e8ed 167
6c480f22 168 /* TODO: more nops */
2b31e8ed 169
6c480f22
PZ
170 return false;
171}
2b31e8ed 172
b6c881b2
PZ
173/*
174 * Find the offset of the first non-NOP instruction starting at @offset
175 * but no further than @len.
176 */
6c480f22
PZ
177static int skip_nops(u8 *instr, int offset, int len)
178{
179 struct insn insn;
2b31e8ed 180
6c480f22
PZ
181 for (; offset < len; offset += insn.length) {
182 if (insn_decode_kernel(&insn, &instr[offset]))
183 break;
2b31e8ed 184
6c480f22
PZ
185 if (!insn_is_nop(&insn))
186 break;
187 }
2b31e8ed 188
6c480f22 189 return offset;
2b31e8ed
BP
190}
191
b6c881b2
PZ
192/*
193 * Optimize a sequence of NOPs, possibly preceded by an unconditional jump
194 * to the end of the NOP sequence into a single NOP.
195 */
196static bool __optimize_nops(u8 *instr, size_t len, struct insn *insn,
197 int *next, int *prev, int *target)
198{
199 int i = *next - insn->length;
200
201 switch (insn->opcode.bytes[0]) {
202 case JMP8_INSN_OPCODE:
203 case JMP32_INSN_OPCODE:
204 *prev = i;
205 *target = *next + insn->immediate.value;
206 return false;
207 }
208
209 if (insn_is_nop(insn)) {
210 int nop = i;
211
212 *next = skip_nops(instr, *next, len);
213 if (*target && *next == *target)
214 nop = *prev;
215
216 add_nop(instr + nop, *next - nop);
217 DUMP_BYTES(ALT, instr, len, "%px: [%d:%d) optimized NOPs: ", instr, nop, *next);
218 return true;
219 }
220
221 *target = 0;
222 return false;
223}
224
34bfab0e
BP
225/*
226 * "noinline" to cause control flow change and thus invalidate I$ and
227 * cause refetch after modification.
228 */
75085009 229static void __init_or_module noinline optimize_nops(u8 *instr, size_t len)
4fd4b6e5 230{
b6c881b2
PZ
231 int prev, target = 0;
232
6c480f22
PZ
233 for (int next, i = 0; i < len; i = next) {
234 struct insn insn;
66c117d7 235
23c1ad53
PZ
236 if (insn_decode_kernel(&insn, &instr[i]))
237 return;
238
6c480f22 239 next = i + insn.length;
23c1ad53 240
b6c881b2 241 __optimize_nops(instr, len, &insn, &next, &prev, &target);
612e8e93 242 }
4fd4b6e5
BP
243}
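/*
 * Illustrative sketch (not part of the kernel source): what optimize_nops()
 * does to a run of single-byte NOPs left behind by patching, assuming the
 * standard encodings.
 */
static const u8 example_nops_before[] = { 0x90, 0x90, 0x90, 0x90 };	/* four NOP1s, one CFI entry each   */
static const u8 example_nops_after[]  = { 0x0f, 0x1f, 0x40, 0x00 };	/* one NOP4 covering the same bytes */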
244
270a69c4
PZ
245/*
246 * In this context, "source" is where the instructions are placed in the
247 * section .altinstr_replacement, for example during kernel build by the
248 * toolchain.
249 * "Destination" is where the instructions are being patched in by this
250 * machinery.
251 *
252 * The source offset is:
253 *
254 * src_imm = target - src_next_ip (1)
255 *
256 * and the target offset is:
257 *
258 * dst_imm = target - dst_next_ip (2)
259 *
260 * so rework (1) as an expression for target like:
261 *
262 * target = src_imm + src_next_ip (1a)
263 *
264 * and substitute in (2) to get:
265 *
266 * dst_imm = (src_imm + src_next_ip) - dst_next_ip (3)
267 *
268 * Now, since the instruction stream is 'identical' at src and dst (it
269 * is being copied after all) it can be stated that:
270 *
271 * src_next_ip = src + ip_offset
272 * dst_next_ip = dst + ip_offset (4)
273 *
274 * Substitute (4) in (3) and observe ip_offset being cancelled out to
275 * obtain:
276 *
277 * dst_imm = src_imm + (src + ip_offset) - (dst + ip_offset)
278 * = src_imm + src - dst + ip_offset - ip_offset
279 * = src_imm + src - dst (5)
280 *
281 * IOW, only the relative displacement of the code block matters.
282 */
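/*
 * Illustrative sketch (not part of the kernel source): formula (5) with
 * made-up addresses, for a CALL sitting at the very start of the block.
 */
static inline s32 example_reloc_imm(void)
{
	s32 src_imm = 0x2000 - 0x1005;		/* target - src_next_ip          */
	s32 src = 0x1000, dst = 0x3000;		/* where the block was built/put */

	/* (5): dst_imm = src_imm + src - dst == 0x2000 - 0x3005 */
	return src_imm + src - dst;
}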
283
284#define apply_reloc_n(n_, p_, d_) \
285 do { \
286 s32 v = *(s##n_ *)(p_); \
287 v += (d_); \
288 BUG_ON((v >> 31) != (v >> (n_-1))); \
289 *(s##n_ *)(p_) = (s##n_)v; \
290 } while (0)
291
292
293static __always_inline
294void apply_reloc(int n, void *ptr, uintptr_t diff)
295{
296 switch (n) {
297 case 1: apply_reloc_n(8, ptr, diff); break;
298 case 2: apply_reloc_n(16, ptr, diff); break;
299 case 4: apply_reloc_n(32, ptr, diff); break;
300 default: BUG();
301 }
302}
303
304static __always_inline
305bool need_reloc(unsigned long offset, u8 *src, size_t src_len)
306{
307 u8 *target = src + offset;
308 /*
309 * If the target is inside the patched block, it's relative to the
310 * block itself and does not need relocation.
311 */
312 return (target < src || target > src + src_len);
313}
314
315static void __init_or_module noinline
316apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len)
317{
b6c881b2
PZ
318 int prev, target = 0;
319
270a69c4
PZ
320 for (int next, i = 0; i < len; i = next) {
321 struct insn insn;
322
323 if (WARN_ON_ONCE(insn_decode_kernel(&insn, &buf[i])))
324 return;
325
326 next = i + insn.length;
327
b6c881b2
PZ
328 if (__optimize_nops(buf, len, &insn, &next, &prev, &target))
329 continue;
330
270a69c4
PZ
331 switch (insn.opcode.bytes[0]) {
332 case 0x0f:
333 if (insn.opcode.bytes[1] < 0x80 ||
334 insn.opcode.bytes[1] > 0x8f)
335 break;
336
337 fallthrough; /* Jcc.d32 */
338 case 0x70 ... 0x7f: /* Jcc.d8 */
339 case JMP8_INSN_OPCODE:
340 case JMP32_INSN_OPCODE:
341 case CALL_INSN_OPCODE:
342 if (need_reloc(next + insn.immediate.value, src, src_len)) {
343 apply_reloc(insn.immediate.nbytes,
344 buf + i + insn_offset_immediate(&insn),
345 src - dest);
346 }
347
348 /*
349 * Where possible, convert JMP.d32 into JMP.d8.
350 */
351 if (insn.opcode.bytes[0] == JMP32_INSN_OPCODE) {
352 s32 imm = insn.immediate.value;
353 imm += src - dest;
354 imm += JMP32_INSN_SIZE - JMP8_INSN_SIZE;
355 if ((imm >> 31) == (imm >> 7)) {
356 buf[i+0] = JMP8_INSN_OPCODE;
357 buf[i+1] = (s8)imm;
358
359 memset(&buf[i+2], INT3_INSN_OPCODE, insn.length - 2);
360 }
361 }
362 break;
363 }
364
365 if (insn_rip_relative(&insn)) {
366 if (need_reloc(next + insn.displacement.value, src, src_len)) {
367 apply_reloc(insn.displacement.nbytes,
368 buf + i + insn_offset_displacement(&insn),
369 src - dest);
370 }
371 }
270a69c4
PZ
372 }
373}
374
db477a33
BP
375/*
376 * Replace instructions with better alternatives for this CPU type. This runs
377 * before SMP is initialized to avoid SMP problems with self modifying code.
378 * This implies that asymmetric systems where APs have fewer capabilities than
379 * the boot processor are not handled. Tough. Make sure you disable such
380 * features by hand.
34bfab0e
BP
381 *
382 * Marked "noinline" to cause control flow change and thus insn cache
383 * to refetch changed I$ lines.
db477a33 384 */
34bfab0e
BP
385void __init_or_module noinline apply_alternatives(struct alt_instr *start,
386 struct alt_instr *end)
9a0b5817 387{
9a0b5817 388 struct alt_instr *a;
59e97e4d 389 u8 *instr, *replacement;
1fc654cf 390 u8 insn_buff[MAX_PATCH_LEN];
9a0b5817 391
6becb502 392 DPRINTK(ALT, "alt table %px, -> %px", start, end);
50973133
FY
393 /*
394 * The scan order should be from start to end. A later scanned
db477a33 395 * alternative code can overwrite previously scanned alternative code.
50973133
FY
396 * Some kernel functions (e.g. memcpy, memset, etc) use this order to
397 * patch code.
398 *
399 * So be careful if you want to change the scan order to any other
400 * order.
401 */
9a0b5817 402 for (a = start; a < end; a++) {
1fc654cf 403 int insn_buff_sz = 0;
48c7a250 404
59e97e4d
AL
405 instr = (u8 *)&a->instr_offset + a->instr_offset;
406 replacement = (u8 *)&a->repl_offset + a->repl_offset;
1fc654cf 407 BUG_ON(a->instrlen > sizeof(insn_buff));
5d1dd961 408 BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32);
dda7bb76
JG
409
410 /*
411 * Patch if either:
412 * - feature is present
5d1dd961 413 * - feature not present but ALT_FLAG_NOT is set to mean,
dda7bb76
JG
414 * patch if feature is *NOT* present.
415 */
270a69c4
PZ
416 if (!boot_cpu_has(a->cpuid) == !(a->flags & ALT_FLAG_NOT)) {
417 optimize_nops(instr, a->instrlen);
418 continue;
419 }
59e97e4d 420
6becb502 421 DPRINTK(ALT, "feat: %s%d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d)",
5d1dd961
BPA
422 (a->flags & ALT_FLAG_NOT) ? "!" : "",
423 a->cpuid >> 5,
424 a->cpuid & 0x1f,
c1d4e419 425 instr, instr, a->instrlen,
23c1ad53 426 replacement, a->replacementlen);
db477a33 427
1fc654cf
IM
428 memcpy(insn_buff, replacement, a->replacementlen);
429 insn_buff_sz = a->replacementlen;
59e97e4d 430
23c1ad53
PZ
431 for (; insn_buff_sz < a->instrlen; insn_buff_sz++)
432 insn_buff[insn_buff_sz] = 0x90;
433
270a69c4
PZ
434 apply_relocation(insn_buff, a->instrlen, instr, replacement, a->replacementlen);
435
436 DUMP_BYTES(ALT, instr, a->instrlen, "%px: old_insn: ", instr);
437 DUMP_BYTES(ALT, replacement, a->replacementlen, "%px: rpl_insn: ", replacement);
6becb502 438 DUMP_BYTES(ALT, insn_buff, insn_buff_sz, "%px: final_insn: ", instr);
59e97e4d 439
1fc654cf 440 text_poke_early(instr, insn_buff, insn_buff_sz);
9a0b5817
GH
441 }
442}
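/*
 * Illustrative sketch (not part of the kernel source): a typical site that
 * feeds this machinery, close to what __arch_hweight32() does on 64-bit.
 * The oldinstr is a CALL to a fallback; when the CPU has POPCNT the site is
 * patched to the single instruction and the leftover tail is NOP-padded.
 */
static inline unsigned int example_hweight32(unsigned int w)
{
	unsigned int res;

	asm (ALTERNATIVE("call __sw_hweight32", "popcntl %1, %0", X86_FEATURE_POPCNT)
	     : "=a" (res)
	     : "D" (w));

	return res;
}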
443
ac0ee0a9
PZ
444static inline bool is_jcc32(struct insn *insn)
445{
446 /* Jcc.d32 second opcode byte is in the range: 0x80-0x8f */
447 return insn->opcode.bytes[0] == 0x0f && (insn->opcode.bytes[1] & 0xf0) == 0x80;
448}
449
03f16cd0 450#if defined(CONFIG_RETPOLINE) && defined(CONFIG_OBJTOOL)
75085009
PZ
451
452/*
453 * CALL/JMP *%\reg
454 */
455static int emit_indirect(int op, int reg, u8 *bytes)
456{
457 int i = 0;
458 u8 modrm;
459
460 switch (op) {
461 case CALL_INSN_OPCODE:
462 modrm = 0x10; /* Reg = 2; CALL r/m */
463 break;
464
465 case JMP32_INSN_OPCODE:
466 modrm = 0x20; /* Reg = 4; JMP r/m */
467 break;
468
469 default:
470 WARN_ON_ONCE(1);
471 return -1;
472 }
473
474 if (reg >= 8) {
475 bytes[i++] = 0x41; /* REX.B prefix */
476 reg -= 8;
477 }
478
479 modrm |= 0xc0; /* Mod = 3 */
480 modrm += reg;
481
482 bytes[i++] = 0xff; /* opcode */
483 bytes[i++] = modrm;
484
485 return i;
486}
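/*
 * Illustrative sketch (not part of the kernel source): bytes emitted by
 * emit_indirect() for two registers, following the ModRM rules above.
 */
static const u8 example_call_r11[] = { 0x41, 0xff, 0xd3 };	/* call *%r11: REX.B, opcode, mod=3 reg=2 rm=3 */
static const u8 example_jmp_rax[]  = { 0xff, 0xe0 };		/* jmp  *%rax: opcode, mod=3 reg=4 rm=0        */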
487
3b6c1747
PZ
488static int emit_call_track_retpoline(void *addr, struct insn *insn, int reg, u8 *bytes)
489{
490 u8 op = insn->opcode.bytes[0];
491 int i = 0;
492
493 /*
494 * Clang does 'weird' Jcc __x86_indirect_thunk_r11 conditional
495 * tail-calls. Deal with them.
496 */
497 if (is_jcc32(insn)) {
498 bytes[i++] = op;
499 op = insn->opcode.bytes[1];
500 goto clang_jcc;
501 }
502
503 if (insn->length == 6)
504 bytes[i++] = 0x2e; /* CS-prefix */
505
506 switch (op) {
507 case CALL_INSN_OPCODE:
508 __text_gen_insn(bytes+i, op, addr+i,
509 __x86_indirect_call_thunk_array[reg],
510 CALL_INSN_SIZE);
511 i += CALL_INSN_SIZE;
512 break;
513
514 case JMP32_INSN_OPCODE:
515clang_jcc:
516 __text_gen_insn(bytes+i, op, addr+i,
517 __x86_indirect_jump_thunk_array[reg],
518 JMP32_INSN_SIZE);
519 i += JMP32_INSN_SIZE;
520 break;
521
522 default:
ae25e00b 523 WARN(1, "%pS %px %*ph\n", addr, addr, 6, addr);
3b6c1747
PZ
524 return -1;
525 }
526
527 WARN_ON_ONCE(i != insn->length);
528
529 return i;
530}
531
75085009
PZ
532/*
533 * Rewrite the compiler generated retpoline thunk calls.
534 *
535 * For spectre_v2=off (!X86_FEATURE_RETPOLINE), rewrite them into immediate
536 * indirect instructions, avoiding the extra indirection.
537 *
538 * For example, convert:
539 *
540 * CALL __x86_indirect_thunk_\reg
541 *
542 * into:
543 *
544 * CALL *%\reg
545 *
d45476d9 546 * It also tries to inline spectre_v2=retpoline,lfence when size permits.
75085009
PZ
547 */
548static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes)
549{
550 retpoline_thunk_t *target;
2f0cbb2a
PZ
551 int reg, ret, i = 0;
552 u8 op, cc;
75085009
PZ
553
554 target = addr + insn->length + insn->immediate.value;
555 reg = target - __x86_indirect_thunk_array;
556
557 if (WARN_ON_ONCE(reg & ~0xf))
558 return -1;
559
560 /* If anyone ever does: CALL/JMP *%rsp, we're in deep trouble. */
561 BUG_ON(reg == 4);
562
bbe2df3f 563 if (cpu_feature_enabled(X86_FEATURE_RETPOLINE) &&
3b6c1747
PZ
564 !cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) {
565 if (cpu_feature_enabled(X86_FEATURE_CALL_DEPTH))
566 return emit_call_track_retpoline(addr, insn, reg, bytes);
567
75085009 568 return -1;
3b6c1747 569 }
75085009 570
2f0cbb2a
PZ
571 op = insn->opcode.bytes[0];
572
573 /*
574 * Convert:
575 *
576 * Jcc.d32 __x86_indirect_thunk_\reg
577 *
578 * into:
579 *
580 * Jncc.d8 1f
bbe2df3f 581 * [ LFENCE ]
2f0cbb2a 582 * JMP *%\reg
bbe2df3f 583 * [ NOP ]
2f0cbb2a
PZ
584 * 1:
585 */
3b6c1747 586 if (is_jcc32(insn)) {
2f0cbb2a
PZ
587 cc = insn->opcode.bytes[1] & 0xf;
588 cc ^= 1; /* invert condition */
589
590 bytes[i++] = 0x70 + cc; /* Jcc.d8 */
591 bytes[i++] = insn->length - 2; /* sizeof(Jcc.d8) == 2 */
592
593 /* Continue as if: JMP.d32 __x86_indirect_thunk_\reg */
594 op = JMP32_INSN_OPCODE;
595 }
596
bbe2df3f 597 /*
d45476d9 598 * For RETPOLINE_LFENCE: prepend the indirect CALL/JMP with an LFENCE.
bbe2df3f 599 */
d45476d9 600 if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) {
bbe2df3f
PZ
601 bytes[i++] = 0x0f;
602 bytes[i++] = 0xae;
603 bytes[i++] = 0xe8; /* LFENCE */
604 }
605
2f0cbb2a
PZ
606 ret = emit_indirect(op, reg, bytes + i);
607 if (ret < 0)
608 return ret;
609 i += ret;
75085009 610
8c03af3e
PZ
611 /*
612 * The compiler is supposed to EMIT an INT3 after every unconditional
613 * JMP instruction due to AMD BTC. However, if the compiler is too old
614 * or SLS isn't enabled, we still need an INT3 after indirect JMPs
615 * even on Intel.
616 */
617 if (op == JMP32_INSN_OPCODE && i < insn->length)
618 bytes[i++] = INT3_INSN_OPCODE;
619
75085009
PZ
620 for (; i < insn->length;)
621 bytes[i++] = BYTES_NOP1;
622
623 return i;
624}
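/*
 * Illustrative sketch (not part of the kernel source): a 5-byte
 * "call __x86_indirect_thunk_rax" site after patch_retpoline(), assuming
 * the standard encodings (LFENCE 0f ae e8, NOP1 0x90).
 */
static const u8 example_v2_off[]    = { 0xff, 0xd0, 0x90, 0x90, 0x90 };	/* spectre_v2=off: call *%rax; NOP1 x 3 */
static const u8 example_v2_lfence[] = { 0x0f, 0xae, 0xe8, 0xff, 0xd0 };	/* retpoline,lfence: lfence; call *%rax */
/* apply_retpolines() then lets optimize_nops() merge the three NOP1s into a single NOP3. */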
625
626/*
627 * Generated by 'objtool --retpoline'.
628 */
629void __init_or_module noinline apply_retpolines(s32 *start, s32 *end)
630{
631 s32 *s;
632
633 for (s = start; s < end; s++) {
634 void *addr = (void *)s + *s;
635 struct insn insn;
636 int len, ret;
637 u8 bytes[16];
638 u8 op1, op2;
639
640 ret = insn_decode_kernel(&insn, addr);
641 if (WARN_ON_ONCE(ret < 0))
642 continue;
643
644 op1 = insn.opcode.bytes[0];
645 op2 = insn.opcode.bytes[1];
646
647 switch (op1) {
648 case CALL_INSN_OPCODE:
649 case JMP32_INSN_OPCODE:
650 break;
651
2f0cbb2a
PZ
652 case 0x0f: /* escape */
653 if (op2 >= 0x80 && op2 <= 0x8f)
654 break;
655 fallthrough;
75085009
PZ
656 default:
657 WARN_ON_ONCE(1);
658 continue;
659 }
660
6becb502 661 DPRINTK(RETPOLINE, "retpoline at: %pS (%px) len: %d to: %pS",
d4b5a5c9
PZ
662 addr, addr, insn.length,
663 addr + insn.length + insn.immediate.value);
664
75085009
PZ
665 len = patch_retpoline(addr, &insn, bytes);
666 if (len == insn.length) {
667 optimize_nops(bytes, len);
6becb502
PZ
668 DUMP_BYTES(RETPOLINE, ((u8*)addr), len, "%px: orig: ", addr);
669 DUMP_BYTES(RETPOLINE, ((u8*)bytes), len, "%px: repl: ", addr);
75085009
PZ
670 text_poke_early(addr, bytes, len);
671 }
672 }
673}
674
f43b9876 675#ifdef CONFIG_RETHUNK
770ae1b7
PZ
676
677#ifdef CONFIG_CALL_THUNKS
678void (*x86_return_thunk)(void) __ro_after_init = &__x86_return_thunk;
679#endif
680
15e67227
PZ
681/*
682 * Rewrite the compiler generated return thunk tail-calls.
683 *
684 * For example, convert:
685 *
686 * JMP __x86_return_thunk
687 *
688 * into:
689 *
690 * RET
691 */
692static int patch_return(void *addr, struct insn *insn, u8 *bytes)
693{
694 int i = 0;
695
d2408e04 696 /* Patch the custom return thunks... */
770ae1b7 697 if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) {
770ae1b7
PZ
698 i = JMP32_INSN_SIZE;
699 __text_gen_insn(bytes, JMP32_INSN_OPCODE, addr, x86_return_thunk, i);
700 } else {
d2408e04 701 /* ... or patch them out if not needed. */
770ae1b7
PZ
702 bytes[i++] = RET_INSN_OPCODE;
703 }
15e67227
PZ
704
705 for (; i < insn->length;)
706 bytes[i++] = INT3_INSN_OPCODE;
15e67227
PZ
707 return i;
708}
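/*
 * Illustrative sketch (not part of the kernel source): a 5-byte
 * "jmp __x86_return_thunk" site after patch_return(), assuming the
 * standard RET (0xc3) and INT3 (0xcc) encodings.
 */
static const u8 example_no_rethunk[] = { 0xc3, 0xcc, 0xcc, 0xcc, 0xcc };	/* !X86_FEATURE_RETHUNK: ret; int3 x 4 */
/* With X86_FEATURE_RETHUNK the site stays a JMP32, retargeted at x86_return_thunk. */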
709
710void __init_or_module noinline apply_returns(s32 *start, s32 *end)
711{
712 s32 *s;
713
d2408e04
BPA
714 /*
715 * Do not patch out the default return thunks if the ones needed are
716 * exactly those generated by the compiler.
717 */
718 if (cpu_feature_enabled(X86_FEATURE_RETHUNK) &&
719 (x86_return_thunk == __x86_return_thunk))
720 return;
721
15e67227 722 for (s = start; s < end; s++) {
ee88d363 723 void *dest = NULL, *addr = (void *)s + *s;
15e67227
PZ
724 struct insn insn;
725 int len, ret;
726 u8 bytes[16];
ee88d363 727 u8 op;
15e67227
PZ
728
729 ret = insn_decode_kernel(&insn, addr);
730 if (WARN_ON_ONCE(ret < 0))
731 continue;
732
ee88d363
PZ
733 op = insn.opcode.bytes[0];
734 if (op == JMP32_INSN_OPCODE)
735 dest = addr + insn.length + insn.immediate.value;
736
737 if (__static_call_fixup(addr, op, dest) ||
65cdf0d6
KC
738 WARN_ONCE(dest != &__x86_return_thunk,
739 "missing return thunk: %pS-%pS: %*ph",
740 addr, dest, 5, addr))
15e67227
PZ
741 continue;
742
6becb502 743 DPRINTK(RET, "return thunk at: %pS (%px) len: %d to: %pS",
15e67227
PZ
744 addr, addr, insn.length,
745 addr + insn.length + insn.immediate.value);
746
747 len = patch_return(addr, &insn, bytes);
748 if (len == insn.length) {
6becb502
PZ
749 DUMP_BYTES(RET, ((u8*)addr), len, "%px: orig: ", addr);
750 DUMP_BYTES(RET, ((u8*)bytes), len, "%px: repl: ", addr);
15e67227
PZ
751 text_poke_early(addr, bytes, len);
752 }
753 }
754}
f43b9876
PZ
755#else
756void __init_or_module noinline apply_returns(s32 *start, s32 *end) { }
757#endif /* CONFIG_RETHUNK */
758
03f16cd0 759#else /* !CONFIG_RETPOLINE || !CONFIG_OBJTOOL */
75085009
PZ
760
761void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) { }
15e67227 762void __init_or_module noinline apply_returns(s32 *start, s32 *end) { }
75085009 763
03f16cd0 764#endif /* CONFIG_RETPOLINE && CONFIG_OBJTOOL */
75085009 765
ed53a0d9
PZ
766#ifdef CONFIG_X86_KERNEL_IBT
767
931ab636
PZ
768static void poison_endbr(void *addr, bool warn)
769{
770 u32 endbr, poison = gen_endbr_poison();
771
772 if (WARN_ON_ONCE(get_kernel_nofault(endbr, addr)))
773 return;
774
775 if (!is_endbr(endbr)) {
776 WARN_ON_ONCE(warn);
777 return;
778 }
779
6becb502 780 DPRINTK(ENDBR, "ENDBR at: %pS (%px)", addr, addr);
931ab636
PZ
781
782 /*
783 * When we have IBT, the lack of ENDBR will trigger #CP
784 */
6becb502
PZ
785 DUMP_BYTES(ENDBR, ((u8*)addr), 4, "%px: orig: ", addr);
786 DUMP_BYTES(ENDBR, ((u8*)&poison), 4, "%px: repl: ", addr);
931ab636
PZ
787 text_poke_early(addr, &poison, 4);
788}
789
ed53a0d9
PZ
790/*
791 * Generated by: objtool --ibt
792 */
793void __init_or_module noinline apply_ibt_endbr(s32 *start, s32 *end)
794{
795 s32 *s;
796
797 for (s = start; s < end; s++) {
ed53a0d9
PZ
798 void *addr = (void *)s + *s;
799
931ab636
PZ
800 poison_endbr(addr, true);
801 if (IS_ENABLED(CONFIG_FINEIBT))
802 poison_endbr(addr - 16, false);
803 }
804}
805
806#else
807
94a85511 808void __init_or_module apply_ibt_endbr(s32 *start, s32 *end) { }
931ab636
PZ
809
810#endif /* CONFIG_X86_KERNEL_IBT */
811
812#ifdef CONFIG_FINEIBT
082c4c81
PZ
813
814enum cfi_mode {
815 CFI_DEFAULT,
816 CFI_OFF,
817 CFI_KCFI,
818 CFI_FINEIBT,
819};
820
821static enum cfi_mode cfi_mode __ro_after_init = CFI_DEFAULT;
0c3e806e
PZ
822static bool cfi_rand __ro_after_init = true;
823static u32 cfi_seed __ro_after_init;
824
825/*
826 * Re-hash the CFI hash with a boot-time seed while making sure the result is
827 * not a valid ENDBR instruction.
828 */
829static u32 cfi_rehash(u32 hash)
830{
831 hash ^= cfi_seed;
832 while (unlikely(is_endbr(hash) || is_endbr(-hash))) {
833 bool lsb = hash & 1;
834 hash >>= 1;
835 if (lsb)
836 hash ^= 0x80200003;
837 }
838 return hash;
839}
082c4c81
PZ
840
841static __init int cfi_parse_cmdline(char *str)
842{
843 if (!str)
844 return -EINVAL;
845
846 while (str) {
847 char *next = strchr(str, ',');
848 if (next) {
849 *next = 0;
850 next++;
851 }
852
853 if (!strcmp(str, "auto")) {
854 cfi_mode = CFI_DEFAULT;
855 } else if (!strcmp(str, "off")) {
856 cfi_mode = CFI_OFF;
0c3e806e 857 cfi_rand = false;
082c4c81
PZ
858 } else if (!strcmp(str, "kcfi")) {
859 cfi_mode = CFI_KCFI;
860 } else if (!strcmp(str, "fineibt")) {
861 cfi_mode = CFI_FINEIBT;
0c3e806e
PZ
862 } else if (!strcmp(str, "norand")) {
863 cfi_rand = false;
082c4c81
PZ
864 } else {
865 pr_err("Ignoring unknown cfi option (%s).", str);
866 }
867
868 str = next;
869 }
870
871 return 0;
872}
873early_param("cfi", cfi_parse_cmdline);
874
931ab636
PZ
875/*
876 * kCFI FineIBT
877 *
878 * __cfi_\func: __cfi_\func:
879 * movl $0x12345678,%eax // 5 endbr64 // 4
880 * nop subl $0x12345678,%r10d // 7
881 * nop jz 1f // 2
882 * nop ud2 // 2
883 * nop 1: nop // 1
884 * nop
885 * nop
886 * nop
887 * nop
888 * nop
889 * nop
890 * nop
891 *
892 *
893 * caller: caller:
894 * movl $(-0x12345678),%r10d // 6 movl $0x12345678,%r10d // 6
895 * addl $-15(%r11),%r10d // 4 sub $16,%r11 // 4
896 * je 1f // 2 nop4 // 4
897 * ud2 // 2
898 * 1: call __x86_indirect_thunk_r11 // 5 call *%r11; nop2; // 5
899 *
900 */
901
902asm( ".pushsection .rodata \n"
903 "fineibt_preamble_start: \n"
904 " endbr64 \n"
905 " subl $0x12345678, %r10d \n"
906 " je fineibt_preamble_end \n"
907 " ud2 \n"
908 " nop \n"
909 "fineibt_preamble_end: \n"
910 ".popsection\n"
911);
912
913extern u8 fineibt_preamble_start[];
914extern u8 fineibt_preamble_end[];
915
916#define fineibt_preamble_size (fineibt_preamble_end - fineibt_preamble_start)
917#define fineibt_preamble_hash 7
918
919asm( ".pushsection .rodata \n"
920 "fineibt_caller_start: \n"
921 " movl $0x12345678, %r10d \n"
922 " sub $16, %r11 \n"
923 ASM_NOP4
924 "fineibt_caller_end: \n"
925 ".popsection \n"
926);
927
928extern u8 fineibt_caller_start[];
929extern u8 fineibt_caller_end[];
930
931#define fineibt_caller_size (fineibt_caller_end - fineibt_caller_start)
932#define fineibt_caller_hash 2
933
934#define fineibt_caller_jmp (fineibt_caller_size - 2)
935
936static u32 decode_preamble_hash(void *addr)
937{
938 u8 *p = addr;
939
940 /* b8 78 56 34 12 mov $0x12345678,%eax */
941 if (p[0] == 0xb8)
942 return *(u32 *)(addr + 1);
943
944 return 0; /* invalid hash value */
945}
946
947static u32 decode_caller_hash(void *addr)
948{
949 u8 *p = addr;
950
951 /* 41 ba 78 56 34 12 mov $0x12345678,%r10d */
952 if (p[0] == 0x41 && p[1] == 0xba)
953 return -*(u32 *)(addr + 2);
954
955 /* eb 0c 78 56 34 12 jmp.d8 +12 */
956 if (p[0] == JMP8_INSN_OPCODE && p[1] == fineibt_caller_jmp)
957 return -*(u32 *)(addr + 2);
958
959 return 0; /* invalid hash value */
960}
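/*
 * Illustrative sketch (not part of the kernel source): the example byte
 * sequences from the comments above, as seen by the decoders; 0x12345678
 * stands in for the real hash.
 */
static const u8 example_preamble[] = { 0xb8, 0x78, 0x56, 0x34, 0x12 };		/* decode_preamble_hash() ->  0x12345678 */
static const u8 example_caller[]   = { 0x41, 0xba, 0x78, 0x56, 0x34, 0x12 };	/* decode_caller_hash()   -> -0x12345678 */
static const u8 example_disabled[] = { 0xeb, 0x0c, 0x78, 0x56, 0x34, 0x12 };	/* caller after cfi_disable_callers(); hash still recoverable */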
961
962/* .retpoline_sites */
963static int cfi_disable_callers(s32 *start, s32 *end)
964{
965 /*
966 * Disable kCFI by patching in a JMP.d8; this leaves the hash immediate
967 * intact for later usage. Also see decode_caller_hash() and
968 * cfi_rewrite_callers().
969 */
970 const u8 jmp[] = { JMP8_INSN_OPCODE, fineibt_caller_jmp };
971 s32 *s;
ed53a0d9 972
931ab636
PZ
973 for (s = start; s < end; s++) {
974 void *addr = (void *)s + *s;
975 u32 hash;
976
977 addr -= fineibt_caller_size;
978 hash = decode_caller_hash(addr);
979 if (!hash) /* nocfi callers */
ed53a0d9
PZ
980 continue;
981
931ab636
PZ
982 text_poke_early(addr, jmp, 2);
983 }
ed53a0d9 984
931ab636
PZ
985 return 0;
986}
987
0c3e806e
PZ
988static int cfi_enable_callers(s32 *start, s32 *end)
989{
990 /*
991 * Re-enable kCFI, undo what cfi_disable_callers() did.
992 */
993 const u8 mov[] = { 0x41, 0xba };
994 s32 *s;
995
996 for (s = start; s < end; s++) {
997 void *addr = (void *)s + *s;
998 u32 hash;
999
1000 addr -= fineibt_caller_size;
1001 hash = decode_caller_hash(addr);
1002 if (!hash) /* nocfi callers */
ed53a0d9
PZ
1003 continue;
1004
0c3e806e
PZ
1005 text_poke_early(addr, mov, 2);
1006 }
ed53a0d9 1007
0c3e806e
PZ
1008 return 0;
1009}
1010
931ab636 1011/* .cfi_sites */
0c3e806e
PZ
1012static int cfi_rand_preamble(s32 *start, s32 *end)
1013{
1014 s32 *s;
1015
1016 for (s = start; s < end; s++) {
1017 void *addr = (void *)s + *s;
1018 u32 hash;
1019
1020 hash = decode_preamble_hash(addr);
1021 if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n",
1022 addr, addr, 5, addr))
1023 return -EINVAL;
1024
1025 hash = cfi_rehash(hash);
1026 text_poke_early(addr + 1, &hash, 4);
1027 }
1028
1029 return 0;
1030}
1031
931ab636
PZ
1032static int cfi_rewrite_preamble(s32 *start, s32 *end)
1033{
1034 s32 *s;
1035
1036 for (s = start; s < end; s++) {
1037 void *addr = (void *)s + *s;
1038 u32 hash;
1039
1040 hash = decode_preamble_hash(addr);
1041 if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n",
1042 addr, addr, 5, addr))
1043 return -EINVAL;
1044
1045 text_poke_early(addr, fineibt_preamble_start, fineibt_preamble_size);
1046 WARN_ON(*(u32 *)(addr + fineibt_preamble_hash) != 0x12345678);
1047 text_poke_early(addr + fineibt_preamble_hash, &hash, 4);
ed53a0d9 1048 }
931ab636
PZ
1049
1050 return 0;
1051}
1052
1053/* .retpoline_sites */
0c3e806e
PZ
1054static int cfi_rand_callers(s32 *start, s32 *end)
1055{
1056 s32 *s;
1057
1058 for (s = start; s < end; s++) {
1059 void *addr = (void *)s + *s;
1060 u32 hash;
1061
1062 addr -= fineibt_caller_size;
1063 hash = decode_caller_hash(addr);
1064 if (hash) {
1065 hash = -cfi_rehash(hash);
1066 text_poke_early(addr + 2, &hash, 4);
1067 }
1068 }
1069
1070 return 0;
1071}
1072
931ab636
PZ
1073static int cfi_rewrite_callers(s32 *start, s32 *end)
1074{
1075 s32 *s;
1076
1077 for (s = start; s < end; s++) {
1078 void *addr = (void *)s + *s;
1079 u32 hash;
1080
1081 addr -= fineibt_caller_size;
1082 hash = decode_caller_hash(addr);
1083 if (hash) {
1084 text_poke_early(addr, fineibt_caller_start, fineibt_caller_size);
1085 WARN_ON(*(u32 *)(addr + fineibt_caller_hash) != 0x12345678);
1086 text_poke_early(addr + fineibt_caller_hash, &hash, 4);
1087 }
1088 /* rely on apply_retpolines() */
1089 }
1090
1091 return 0;
1092}
1093
1094static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
1095 s32 *start_cfi, s32 *end_cfi, bool builtin)
1096{
1097 int ret;
1098
1099 if (WARN_ONCE(fineibt_preamble_size != 16,
1100 "FineIBT preamble wrong size: %ld", fineibt_preamble_size))
1101 return;
1102
082c4c81
PZ
1103 if (cfi_mode == CFI_DEFAULT) {
1104 cfi_mode = CFI_KCFI;
1105 if (HAS_KERNEL_IBT && cpu_feature_enabled(X86_FEATURE_IBT))
1106 cfi_mode = CFI_FINEIBT;
1107 }
1108
0c3e806e
PZ
1109 /*
1110 * Rewrite the callers to not use the __cfi_ stubs, such that we might
1111 * rewrite them. This disables all CFI. If this succeeds but any of the
1112 * later stages fails, we're without CFI.
1113 */
1114 ret = cfi_disable_callers(start_retpoline, end_retpoline);
1115 if (ret)
1116 goto err;
1117
1118 if (cfi_rand) {
1119 if (builtin)
1120 cfi_seed = get_random_u32();
1121
1122 ret = cfi_rand_preamble(start_cfi, end_cfi);
082c4c81
PZ
1123 if (ret)
1124 goto err;
1125
0c3e806e
PZ
1126 ret = cfi_rand_callers(start_retpoline, end_retpoline);
1127 if (ret)
1128 goto err;
ed53a0d9 1129 }
0c3e806e
PZ
1130
1131 switch (cfi_mode) {
1132 case CFI_OFF:
082c4c81
PZ
1133 if (builtin)
1134 pr_info("Disabling CFI\n");
931ab636
PZ
1135 return;
1136
082c4c81 1137 case CFI_KCFI:
0c3e806e
PZ
1138 ret = cfi_enable_callers(start_retpoline, end_retpoline);
1139 if (ret)
1140 goto err;
1141
082c4c81
PZ
1142 if (builtin)
1143 pr_info("Using kCFI\n");
1144 return;
931ab636 1145
082c4c81 1146 case CFI_FINEIBT:
082c4c81
PZ
1147 ret = cfi_rewrite_preamble(start_cfi, end_cfi);
1148 if (ret)
1149 goto err;
931ab636 1150
082c4c81
PZ
1151 ret = cfi_rewrite_callers(start_retpoline, end_retpoline);
1152 if (ret)
1153 goto err;
931ab636 1154
082c4c81
PZ
1155 if (builtin)
1156 pr_info("Using FineIBT CFI\n");
1157 return;
931ab636 1158
082c4c81
PZ
1159 default:
1160 break;
1161 }
931ab636
PZ
1162
1163err:
1164 pr_err("Something went horribly wrong trying to rewrite the CFI implementation.\n");
ed53a0d9
PZ
1165}
1166
1167#else
1168
931ab636
PZ
1169static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
1170 s32 *start_cfi, s32 *end_cfi, bool builtin)
1171{
1172}
ed53a0d9 1173
931ab636
PZ
1174#endif
1175
1176void apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
1177 s32 *start_cfi, s32 *end_cfi)
1178{
1179 return __apply_fineibt(start_retpoline, end_retpoline,
1180 start_cfi, end_cfi,
1181 /* .builtin = */ false);
1182}
ed53a0d9 1183
8ec4d41f 1184#ifdef CONFIG_SMP
5967ed87
JB
1185static void alternatives_smp_lock(const s32 *start, const s32 *end,
1186 u8 *text, u8 *text_end)
9a0b5817 1187{
5967ed87 1188 const s32 *poff;
9a0b5817 1189
5967ed87
JB
1190 for (poff = start; poff < end; poff++) {
1191 u8 *ptr = (u8 *)poff + *poff;
1192
1193 if (!*poff || ptr < text || ptr >= text_end)
9a0b5817 1194 continue;
f88f07e0 1195 /* turn DS segment override prefix into lock prefix */
d9c5841e
PA
1196 if (*ptr == 0x3e)
1197 text_poke(ptr, ((unsigned char []){0xf0}), 1);
4b8073e4 1198 }
9a0b5817
GH
1199}
1200
5967ed87
JB
1201static void alternatives_smp_unlock(const s32 *start, const s32 *end,
1202 u8 *text, u8 *text_end)
9a0b5817 1203{
5967ed87 1204 const s32 *poff;
9a0b5817 1205
5967ed87
JB
1206 for (poff = start; poff < end; poff++) {
1207 u8 *ptr = (u8 *)poff + *poff;
1208
1209 if (!*poff || ptr < text || ptr >= text_end)
9a0b5817 1210 continue;
f88f07e0 1211 /* turn lock prefix into DS segment override prefix */
d9c5841e
PA
1212 if (*ptr == 0xf0)
1213 text_poke(ptr, ((unsigned char []){0x3E}), 1);
4b8073e4 1214 }
9a0b5817
GH
1215}
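/*
 * Illustrative sketch (not part of the kernel source): roughly how a lock
 * prefix gets recorded in .smp_locks so the two functions above can find
 * it (mirrors LOCK_PREFIX in <asm/alternative.h>).
 */
#define EXAMPLE_LOCK_PREFIX					\
		".pushsection .smp_locks, \"a\"\n"		\
		".balign 4\n"					\
		".long 671f - .\n" /* offset to the prefix */	\
		".popsection\n"					\
		"671:\n\tlock; "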
1216
1217struct smp_alt_module {
1218 /* what is this ??? */
1219 struct module *mod;
1220 char *name;
1221
1222 /* ptrs to lock prefixes */
5967ed87
JB
1223 const s32 *locks;
1224 const s32 *locks_end;
9a0b5817
GH
1225
1226 /* .text segment, needed to avoid patching init code ;) */
1227 u8 *text;
1228 u8 *text_end;
1229
1230 struct list_head next;
1231};
1232static LIST_HEAD(smp_alt_modules);
e846d139 1233static bool uniproc_patched = false; /* protected by text_mutex */
9a0b5817 1234
8b5a10fc
JB
1235void __init_or_module alternatives_smp_module_add(struct module *mod,
1236 char *name,
1237 void *locks, void *locks_end,
1238 void *text, void *text_end)
9a0b5817
GH
1239{
1240 struct smp_alt_module *smp;
9a0b5817 1241
e846d139 1242 mutex_lock(&text_mutex);
816afe4f
RR
1243 if (!uniproc_patched)
1244 goto unlock;
b7fb4af0 1245
816afe4f
RR
1246 if (num_possible_cpus() == 1)
1247 /* Don't bother remembering, we'll never have to undo it. */
1248 goto smp_unlock;
9a0b5817
GH
1249
1250 smp = kzalloc(sizeof(*smp), GFP_KERNEL);
1251 if (NULL == smp)
816afe4f
RR
1252 /* we'll run the (safe but slow) SMP code then ... */
1253 goto unlock;
9a0b5817
GH
1254
1255 smp->mod = mod;
1256 smp->name = name;
1257 smp->locks = locks;
1258 smp->locks_end = locks_end;
1259 smp->text = text;
1260 smp->text_end = text_end;
6becb502 1261 DPRINTK(SMP, "locks %p -> %p, text %p -> %p, name %s\n",
db477a33 1262 smp->locks, smp->locks_end,
9a0b5817
GH
1263 smp->text, smp->text_end, smp->name);
1264
9a0b5817 1265 list_add_tail(&smp->next, &smp_alt_modules);
816afe4f
RR
1266smp_unlock:
1267 alternatives_smp_unlock(locks, locks_end, text, text_end);
1268unlock:
e846d139 1269 mutex_unlock(&text_mutex);
9a0b5817
GH
1270}
1271
8b5a10fc 1272void __init_or_module alternatives_smp_module_del(struct module *mod)
9a0b5817
GH
1273{
1274 struct smp_alt_module *item;
9a0b5817 1275
e846d139 1276 mutex_lock(&text_mutex);
9a0b5817
GH
1277 list_for_each_entry(item, &smp_alt_modules, next) {
1278 if (mod != item->mod)
1279 continue;
1280 list_del(&item->next);
9a0b5817 1281 kfree(item);
816afe4f 1282 break;
9a0b5817 1283 }
e846d139 1284 mutex_unlock(&text_mutex);
9a0b5817
GH
1285}
1286
816afe4f 1287void alternatives_enable_smp(void)
9a0b5817
GH
1288{
1289 struct smp_alt_module *mod;
9a0b5817 1290
816afe4f
RR
1291 /* Why bother if there are no other CPUs? */
1292 BUG_ON(num_possible_cpus() == 1);
9a0b5817 1293
e846d139 1294 mutex_lock(&text_mutex);
ca74a6f8 1295
816afe4f 1296 if (uniproc_patched) {
c767a54b 1297 pr_info("switching to SMP code\n");
816afe4f 1298 BUG_ON(num_online_cpus() != 1);
53756d37
JF
1299 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
1300 clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
9a0b5817
GH
1301 list_for_each_entry(mod, &smp_alt_modules, next)
1302 alternatives_smp_lock(mod->locks, mod->locks_end,
1303 mod->text, mod->text_end);
816afe4f 1304 uniproc_patched = false;
9a0b5817 1305 }
e846d139 1306 mutex_unlock(&text_mutex);
9a0b5817
GH
1307}
1308
e846d139
ZC
1309/*
1310 * Return 1 if the address range is reserved for SMP-alternatives.
1311 * Must hold text_mutex.
1312 */
2cfa1978
MH
1313int alternatives_text_reserved(void *start, void *end)
1314{
1315 struct smp_alt_module *mod;
5967ed87 1316 const s32 *poff;
076dc4a6
MH
1317 u8 *text_start = start;
1318 u8 *text_end = end;
2cfa1978 1319
e846d139
ZC
1320 lockdep_assert_held(&text_mutex);
1321
2cfa1978 1322 list_for_each_entry(mod, &smp_alt_modules, next) {
076dc4a6 1323 if (mod->text > text_end || mod->text_end < text_start)
2cfa1978 1324 continue;
5967ed87
JB
1325 for (poff = mod->locks; poff < mod->locks_end; poff++) {
1326 const u8 *ptr = (const u8 *)poff + *poff;
1327
1328 if (text_start <= ptr && text_end > ptr)
2cfa1978 1329 return 1;
5967ed87 1330 }
2cfa1978
MH
1331 }
1332
1333 return 0;
1334}
48c7a250 1335#endif /* CONFIG_SMP */
8ec4d41f 1336
139ec7c4 1337#ifdef CONFIG_PARAVIRT
6c480f22
PZ
1338
1339/* Use this to add nops to a buffer, then text_poke the whole buffer. */
1340static void __init_or_module add_nops(void *insns, unsigned int len)
1341{
1342 while (len > 0) {
1343 unsigned int noplen = len;
1344 if (noplen > ASM_NOP_MAX)
1345 noplen = ASM_NOP_MAX;
1346 memcpy(insns, x86_nops[noplen], noplen);
1347 insns += noplen;
1348 len -= noplen;
1349 }
1350}
1351
8b5a10fc
JB
1352void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
1353 struct paravirt_patch_site *end)
139ec7c4 1354{
98de032b 1355 struct paravirt_patch_site *p;
1fc654cf 1356 char insn_buff[MAX_PATCH_LEN];
139ec7c4
RR
1357
1358 for (p = start; p < end; p++) {
1359 unsigned int used;
1360
ab144f5e 1361 BUG_ON(p->len > MAX_PATCH_LEN);
d34fda4a 1362 /* prep the buffer with the original instructions */
1fc654cf 1363 memcpy(insn_buff, p->instr, p->len);
054ac8ad 1364 used = paravirt_patch(p->type, insn_buff, (unsigned long)p->instr, p->len);
7f63c41c 1365
63f70270
JF
1366 BUG_ON(used > p->len);
1367
139ec7c4 1368 /* Pad the rest with nops */
1fc654cf
IM
1369 add_nops(insn_buff + used, p->len - used);
1370 text_poke_early(p->instr, insn_buff, p->len);
139ec7c4 1371 }
139ec7c4 1372}
98de032b 1373extern struct paravirt_patch_site __start_parainstructions[],
139ec7c4
RR
1374 __stop_parainstructions[];
1375#endif /* CONFIG_PARAVIRT */
1376
7457c0da
PZ
1377/*
1378 * Self-test for the INT3 based CALL emulation code.
1379 *
1380 * This exercises int3_emulate_call() to make sure INT3 pt_regs are set up
1381 * properly and that there is a stack gap between the INT3 frame and the
1382 * previous context. Without this gap doing a virtual PUSH on the interrupted
1383 * stack would corrupt the INT3 IRET frame.
1384 *
1385 * See entry_{32,64}.S for more details.
1386 */
ecc60610
PZ
1387
1388/*
1389 * We define the int3_magic() function in assembly to control the calling
1390 * convention such that we can 'call' it from assembly.
1391 */
1392
1393extern void int3_magic(unsigned int *ptr); /* defined in asm */
1394
1395asm (
1396" .pushsection .init.text, \"ax\", @progbits\n"
1397" .type int3_magic, @function\n"
1398"int3_magic:\n"
3e3f0695 1399 ANNOTATE_NOENDBR
ecc60610 1400" movl $1, (%" _ASM_ARG1 ")\n"
b17c2baa 1401 ASM_RET
ecc60610
PZ
1402" .size int3_magic, .-int3_magic\n"
1403" .popsection\n"
1404);
7457c0da 1405
99c95c5d 1406extern void int3_selftest_ip(void); /* defined in asm below */
7457c0da
PZ
1407
1408static int __init
1409int3_exception_notify(struct notifier_block *self, unsigned long val, void *data)
1410{
3e3f0695 1411 unsigned long selftest = (unsigned long)&int3_selftest_ip;
7457c0da
PZ
1412 struct die_args *args = data;
1413 struct pt_regs *regs = args->regs;
1414
3e3f0695
PZ
1415 OPTIMIZER_HIDE_VAR(selftest);
1416
7457c0da
PZ
1417 if (!regs || user_mode(regs))
1418 return NOTIFY_DONE;
1419
1420 if (val != DIE_INT3)
1421 return NOTIFY_DONE;
1422
3e3f0695 1423 if (regs->ip - INT3_INSN_SIZE != selftest)
7457c0da
PZ
1424 return NOTIFY_DONE;
1425
1426 int3_emulate_call(regs, (unsigned long)&int3_magic);
1427 return NOTIFY_STOP;
1428}
1429
99c95c5d
PZ
1430/* Must be noinline to ensure uniqueness of int3_selftest_ip. */
1431static noinline void __init int3_selftest(void)
7457c0da
PZ
1432{
1433 static __initdata struct notifier_block int3_exception_nb = {
1434 .notifier_call = int3_exception_notify,
1435 .priority = INT_MAX-1, /* last */
1436 };
1437 unsigned int val = 0;
1438
1439 BUG_ON(register_die_notifier(&int3_exception_nb));
1440
1441 /*
1442 * Basically: int3_magic(&val); but really complicated :-)
1443 *
99c95c5d
PZ
1444 * INT3 padded with NOP to CALL_INSN_SIZE. The int3_exception_nb
1445 * notifier above will emulate CALL for us.
7457c0da 1446 */
3e3f0695
PZ
1447 asm volatile ("int3_selftest_ip:\n\t"
1448 ANNOTATE_NOENDBR
1449 " int3; nop; nop; nop; nop\n\t"
ecc60610
PZ
1450 : ASM_CALL_CONSTRAINT
1451 : __ASM_SEL_RAW(a, D) (&val)
1452 : "memory");
7457c0da
PZ
1453
1454 BUG_ON(val != 1);
1455
1456 unregister_die_notifier(&int3_exception_nb);
1457}
1458
270a69c4
PZ
1459static __initdata int __alt_reloc_selftest_addr;
1460
1461__visible noinline void __init __alt_reloc_selftest(void *arg)
1462{
1463 WARN_ON(arg != &__alt_reloc_selftest_addr);
1464}
1465
1466static noinline void __init alt_reloc_selftest(void)
1467{
1468 /*
1469 * Tests apply_relocation().
1470 *
1471 * This has a relative immediate (CALL) in a place other than the first
1472 * instruction and additionally on x86_64 we get a RIP-relative LEA:
1473 *
1474 * lea 0x0(%rip),%rdi # 5d0: R_X86_64_PC32 .init.data+0x5566c
1475 * call +0 # 5d5: R_X86_64_PLT32 __alt_reloc_selftest-0x4
1476 *
1477 * Getting this wrong will either crash and burn or tickle the WARN
1478 * above.
1479 */
1480 asm_inline volatile (
1481 ALTERNATIVE("", "lea %[mem], %%" _ASM_ARG1 "; call __alt_reloc_selftest;", X86_FEATURE_ALWAYS)
1482 : /* output */
1483 : [mem] "m" (__alt_reloc_selftest_addr)
1484 : _ASM_ARG1
1485 );
1486}
1487
9a0b5817
GH
1488void __init alternative_instructions(void)
1489{
7457c0da
PZ
1490 int3_selftest();
1491
1492 /*
1493 * The patching is not fully atomic, so try to avoid local
1494 * interruptions that might execute the to-be-patched code.
1495 * Other CPUs are not running.
1496 */
8f4e956b 1497 stop_nmi();
123aa76e
AK
1498
1499 /*
1500 * Don't stop machine check exceptions while patching.
1501 * MCEs only happen when something got corrupted and in this
1502 * case we must do something about the corruption.
32b1cbe3 1503 * Ignoring it is worse than an unlikely patching race.
123aa76e
AK
1504 * Also machine checks tend to be broadcast and if one CPU
1505 * goes into machine check the others follow quickly, so we don't
1506 * expect a machine check to cause undue problems during code
1507 * patching.
1508 */
8f4e956b 1509
4e629211
JG
1510 /*
1511 * Paravirt patching and alternative patching can be combined to
1512 * replace a function call with a short direct code sequence (e.g.
1513 * by setting a constant return value instead of doing that in an
1514 * external function).
1515 * In order to make this work the following sequence is required:
1516 * 1. set (artificial) features depending on used paravirt
1517 * functions which can later influence alternative patching
1518 * 2. apply paravirt patching (generally replacing an indirect
1519 * function call with a direct one)
1520 * 3. apply alternative patching (e.g. replacing a direct function
1521 * call with a custom code sequence)
1522 * Doing paravirt patching after alternative patching would clobber
1523 * the optimization of the custom code with a function call again.
1524 */
1525 paravirt_set_cap();
1526
1527 /*
1528 * First patch paravirt functions, such that we overwrite the indirect
1529 * call with the direct call.
1530 */
1531 apply_paravirt(__parainstructions, __parainstructions_end);
1532
931ab636
PZ
1533 __apply_fineibt(__retpoline_sites, __retpoline_sites_end,
1534 __cfi_sites, __cfi_sites_end, true);
1535
75085009
PZ
1536 /*
1537 * Rewrite the retpolines, must be done before alternatives since
1538 * those can rewrite the retpoline thunks.
1539 */
1540 apply_retpolines(__retpoline_sites, __retpoline_sites_end);
15e67227 1541 apply_returns(__return_sites, __return_sites_end);
75085009 1542
4e629211
JG
1543 /*
1544 * Then patch alternatives, such that those paravirt calls that are in
1545 * alternatives can be overwritten by their immediate fragments.
1546 */
9a0b5817
GH
1547 apply_alternatives(__alt_instructions, __alt_instructions_end);
1548
e81dc127
TG
1549 /*
1550 * Now all calls are established. Apply the call thunks if
1551 * required.
1552 */
1553 callthunks_patch_builtin_calls();
1554
ed53a0d9
PZ
1555 apply_ibt_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end);
1556
8ec4d41f 1557#ifdef CONFIG_SMP
816afe4f
RR
1558 /* Patch to UP if other cpus not imminent. */
1559 if (!noreplace_smp && (num_present_cpus() == 1 || setup_max_cpus <= 1)) {
1560 uniproc_patched = true;
9a0b5817
GH
1561 alternatives_smp_module_add(NULL, "core kernel",
1562 __smp_locks, __smp_locks_end,
1563 _text, _etext);
9a0b5817 1564 }
8f4e956b 1565
7457c0da 1566 if (!uniproc_patched || num_possible_cpus() == 1) {
f68fd5f4
FW
1567 free_init_pages("SMP alternatives",
1568 (unsigned long)__smp_locks,
1569 (unsigned long)__smp_locks_end);
7457c0da 1570 }
816afe4f
RR
1571#endif
1572
8f4e956b 1573 restart_nmi();
5e907bb0 1574 alternatives_patched = 1;
270a69c4
PZ
1575
1576 alt_reloc_selftest();
9a0b5817 1577}
19d36ccd 1578
e587cadd
MD
1579/**
1580 * text_poke_early - Update instructions on a live kernel at boot time
1581 * @addr: address to modify
1582 * @opcode: source of the copy
1583 * @len: length to copy
1584 *
19d36ccd
AK
1585 * When you use this code to patch more than one byte of an instruction
1586 * you need to make sure that other CPUs cannot execute this code in parallel.
e587cadd 1587 * Also, no thread may be preempted in the middle of these
32b1cbe3
MA
1588 * instructions. And on the local CPU you need to be protected against NMI or
1589 * MCE handlers seeing an inconsistent instruction while you patch.
19d36ccd 1590 */
0a203df5
NA
1591void __init_or_module text_poke_early(void *addr, const void *opcode,
1592 size_t len)
19d36ccd 1593{
e587cadd 1594 unsigned long flags;
f2c65fb3
NA
1595
1596 if (boot_cpu_has(X86_FEATURE_NX) &&
1597 is_module_text_address((unsigned long)addr)) {
1598 /*
1599 * Module text is initially marked non-executable, so the
1600 * code cannot be running and speculative code-fetches are
1601 * prevented. Just change the code.
1602 */
1603 memcpy(addr, opcode, len);
1604 } else {
1605 local_irq_save(flags);
1606 memcpy(addr, opcode, len);
1607 local_irq_restore(flags);
1608 sync_core();
1609
1610 /*
1611 * Could also do a CLFLUSH here to speed up CPU recovery; but
1612 * that causes hangs on some VIA CPUs.
1613 */
1614 }
e587cadd
MD
1615}
1616
9020d395
TG
1617typedef struct {
1618 struct mm_struct *mm;
1619} temp_mm_state_t;
1620
1621/*
1622 * Using a temporary mm allows setting temporary mappings that are not accessible
1623 * by other CPUs. Such mappings are needed to perform sensitive memory writes
1624 * that override the kernel memory protections (e.g., W^X), without exposing the
1625 * temporary page-table mappings that are required for these write operations to
1626 * other CPUs. Using a temporary mm also allows avoiding TLB shootdowns when the
1627 * mapping is torn down.
1628 *
1629 * Context: The temporary mm needs to be used exclusively by a single core. To
1630 * harden security, IRQs must be disabled while the temporary mm is
1631 * loaded, thereby preventing interrupt handler bugs from overriding
1632 * the kernel memory protection.
1633 */
1634static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm)
1635{
1636 temp_mm_state_t temp_state;
1637
1638 lockdep_assert_irqs_disabled();
abee7c49
JG
1639
1640 /*
1641 * Make sure not to be in TLB lazy mode, as otherwise we'll end up
1642 * with a stale address space WITHOUT being in lazy mode after
1643 * restoring the previous mm.
1644 */
2f4305b1 1645 if (this_cpu_read(cpu_tlbstate_shared.is_lazy))
abee7c49
JG
1646 leave_mm(smp_processor_id());
1647
9020d395
TG
1648 temp_state.mm = this_cpu_read(cpu_tlbstate.loaded_mm);
1649 switch_mm_irqs_off(NULL, mm, current);
1650
1651 /*
1652 * If breakpoints are enabled, disable them while the temporary mm is
1653 * used. Userspace might set up watchpoints on addresses that are used
1654 * in the temporary mm, which would lead to wrong signals being sent or
1655 * crashes.
1656 *
1657 * Note that breakpoints are not disabled selectively, which also causes
1658 * kernel breakpoints (e.g., perf's) to be disabled. This might be
1659 * undesirable, but still seems reasonable as the code that runs in the
1660 * temporary mm should be short.
1661 */
1662 if (hw_breakpoint_active())
1663 hw_breakpoint_disable();
1664
1665 return temp_state;
1666}
1667
1668static inline void unuse_temporary_mm(temp_mm_state_t prev_state)
1669{
1670 lockdep_assert_irqs_disabled();
1671 switch_mm_irqs_off(NULL, prev_state.mm, current);
1672
1673 /*
1674 * Restore the breakpoints if they were disabled before the temporary mm
1675 * was loaded.
1676 */
1677 if (hw_breakpoint_active())
1678 hw_breakpoint_restore();
1679}
1680
4fc19708
NA
1681__ro_after_init struct mm_struct *poking_mm;
1682__ro_after_init unsigned long poking_addr;
1683
aadd1b67
SL
1684static void text_poke_memcpy(void *dst, const void *src, size_t len)
1685{
1686 memcpy(dst, src, len);
1687}
1688
1689static void text_poke_memset(void *dst, const void *src, size_t len)
1690{
1691 int c = *(const int *)src;
1692
1693 memset(dst, c, len);
1694}
1695
1696typedef void text_poke_f(void *dst, const void *src, size_t len);
1697
1698static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t len)
e587cadd 1699{
b3fd8e83
NA
1700 bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE;
1701 struct page *pages[2] = {NULL};
1702 temp_mm_state_t prev;
78ff7fae 1703 unsigned long flags;
b3fd8e83
NA
1704 pte_t pte, *ptep;
1705 spinlock_t *ptl;
1706 pgprot_t pgprot;
e587cadd 1707
6fffacb3 1708 /*
b3fd8e83
NA
1709 * While the boot memory allocator is running we cannot use struct pages as
1710 * they are not yet initialized. There is no way to recover.
6fffacb3
PT
1711 */
1712 BUG_ON(!after_bootmem);
1713
b7b66baa
MD
1714 if (!core_kernel_text((unsigned long)addr)) {
1715 pages[0] = vmalloc_to_page(addr);
b3fd8e83
NA
1716 if (cross_page_boundary)
1717 pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
15a601eb 1718 } else {
b7b66baa 1719 pages[0] = virt_to_page(addr);
00c6b2d5 1720 WARN_ON(!PageReserved(pages[0]));
b3fd8e83
NA
1721 if (cross_page_boundary)
1722 pages[1] = virt_to_page(addr + PAGE_SIZE);
e587cadd 1723 }
b3fd8e83
NA
1724 /*
1725 * If something went wrong, crash and burn since recovery paths are not
1726 * implemented.
1727 */
1728 BUG_ON(!pages[0] || (cross_page_boundary && !pages[1]));
1729
b3fd8e83
NA
1730 /*
1731 * Map the page without the global bit, as TLB flushing is done with
1732 * flush_tlb_mm_range(), which is intended for non-global PTEs.
1733 */
1734 pgprot = __pgprot(pgprot_val(PAGE_KERNEL) & ~_PAGE_GLOBAL);
1735
1736 /*
1737 * The lock is not really needed, but it avoids open-coding.
1738 */
1739 ptep = get_locked_pte(poking_mm, poking_addr, &ptl);
1740
1741 /*
1742 * This must not fail; preallocated in poking_init().
1743 */
1744 VM_BUG_ON(!ptep);
1745
a6d996cb
SAS
1746 local_irq_save(flags);
1747
b3fd8e83
NA
1748 pte = mk_pte(pages[0], pgprot);
1749 set_pte_at(poking_mm, poking_addr, ptep, pte);
1750
1751 if (cross_page_boundary) {
1752 pte = mk_pte(pages[1], pgprot);
1753 set_pte_at(poking_mm, poking_addr + PAGE_SIZE, ptep + 1, pte);
1754 }
1755
1756 /*
1757 * Loading the temporary mm behaves as a compiler barrier, which
1758 * guarantees that the PTE will be set at the time memcpy() is done.
1759 */
1760 prev = use_temporary_mm(poking_mm);
1761
1762 kasan_disable_current();
aadd1b67 1763 func((u8 *)poking_addr + offset_in_page(addr), src, len);
b3fd8e83
NA
1764 kasan_enable_current();
1765
1766 /*
1767 * Ensure that the PTE is only cleared after the instructions of memcpy
1768 * were issued by using a compiler barrier.
1769 */
1770 barrier();
1771
1772 pte_clear(poking_mm, poking_addr, ptep);
1773 if (cross_page_boundary)
1774 pte_clear(poking_mm, poking_addr + PAGE_SIZE, ptep + 1);
1775
1776 /*
1777 * Loading the previous page-table hierarchy requires a serializing
1778 * instruction that already allows the core to see the updated version.
1779 * Xen-PV is assumed to serialize execution in a similar manner.
1780 */
1781 unuse_temporary_mm(prev);
1782
1783 /*
1784 * Flushing the TLB might involve IPIs, which would require enabled
1786 * IRQs, but not if the mm is not used, as is the case at this point.
1786 */
1787 flush_tlb_mm_range(poking_mm, poking_addr, poking_addr +
1788 (cross_page_boundary ? 2 : 1) * PAGE_SIZE,
1789 PAGE_SHIFT, false);
1790
aadd1b67
SL
1791 if (func == text_poke_memcpy) {
1792 /*
1793 * If the text does not match what we just wrote then something is
1794 * fundamentally screwy; there's nothing we can really do about that.
1795 */
1796 BUG_ON(memcmp(addr, src, len));
1797 }
b3fd8e83 1798
7cf49427 1799 local_irq_restore(flags);
a6d996cb 1800 pte_unmap_unlock(ptep, ptl);
e587cadd 1801 return addr;
19d36ccd 1802}
3d55cc8a 1803
e836673c
NA
1804/**
1805 * text_poke - Update instructions on a live kernel
1806 * @addr: address to modify
1807 * @opcode: source of the copy
1808 * @len: length to copy
1809 *
1810 * Only atomic text poke/set should be allowed when not doing early patching.
1811 * It means the size must be writable atomically and the address must be aligned
1812 * in a way that permits an atomic write. It also makes sure we fit on a single
1813 * page.
3950746d
NA
1814 *
1815 * Note that the caller must ensure that if the modified code is part of a
1816 * module, the module would not be removed during poking. This can be achieved
1817 * by registering a module notifier, and ordering module removal and patching
1818 * through a mutex.
e836673c
NA
1819 */
1820void *text_poke(void *addr, const void *opcode, size_t len)
1821{
1822 lockdep_assert_held(&text_mutex);
1823
aadd1b67 1824 return __text_poke(text_poke_memcpy, addr, opcode, len);
e836673c
NA
1825}
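/*
 * Illustrative sketch (not part of the kernel source): a minimal caller,
 * e.g. planting a breakpoint byte the way the text_poke_bp() machinery
 * starts out (the address and the surrounding error handling are made up).
 */
static inline void example_poke_int3(void *addr)
{
	u8 int3 = INT3_INSN_OPCODE;

	mutex_lock(&text_mutex);
	text_poke(addr, &int3, INT3_INSN_SIZE);
	text_poke_sync();		/* make every CPU re-fetch the patched byte */
	mutex_unlock(&text_mutex);
}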
1826
1827/**
1828 * text_poke_kgdb - Update instructions on a live kernel by kgdb
1829 * @addr: address to modify
1830 * @opcode: source of the copy
1831 * @len: length to copy
1832 *
1833 * Only atomic text poke/set should be allowed when not doing early patching.
1834 * It means the size must be writable atomically and the address must be aligned
1835 * in a way that permits an atomic write. It also makes sure we fit on a single
1836 * page.
1837 *
1838 * Context: should only be used by kgdb, which ensures no other core is running,
1839 * despite the fact it does not hold the text_mutex.
1840 */
1841void *text_poke_kgdb(void *addr, const void *opcode, size_t len)
1842{
aadd1b67 1843 return __text_poke(text_poke_memcpy, addr, opcode, len);
e836673c
NA
1844}
1845
fe54d079
TG
1846void *text_poke_copy_locked(void *addr, const void *opcode, size_t len,
1847 bool core_ok)
0e06b403
SL
1848{
1849 unsigned long start = (unsigned long)addr;
1850 size_t patched = 0;
1851
fe54d079 1852 if (WARN_ON_ONCE(!core_ok && core_kernel_text(start)))
0e06b403
SL
1853 return NULL;
1854
0e06b403
SL
1855 while (patched < len) {
1856 unsigned long ptr = start + patched;
1857 size_t s;
1858
1859 s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched);
1860
aadd1b67
SL
1861 __text_poke(text_poke_memcpy, (void *)ptr, opcode + patched, s);
1862 patched += s;
1863 }
fe54d079
TG
1864 return addr;
1865}
1866
1867/**
1868 * text_poke_copy - Copy instructions into (an unused part of) RX memory
1869 * @addr: address to modify
1870 * @opcode: source of the copy
1871 * @len: length to copy, could be more than 2x PAGE_SIZE
1872 *
1873 * Not safe against concurrent execution; useful for JITs to dump
1874 * new code blocks into unused regions of RX memory. Can be used in
1875 * conjunction with synchronize_rcu_tasks() to wait for existing
 1876 * execution to quiesce after having made sure no existing function
 1877 * pointers are live.
1878 */
1879void *text_poke_copy(void *addr, const void *opcode, size_t len)
1880{
1881 mutex_lock(&text_mutex);
1882 addr = text_poke_copy_locked(addr, opcode, len, false);
aadd1b67
SL
1883 mutex_unlock(&text_mutex);
1884 return addr;
1885}
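/*
 * Illustrative sketch of the JIT flow described above (names are
 * hypothetical): copy a freshly generated blob into not-yet-executed RX
 * memory with text_poke_copy(), and only afterwards publish a pointer to
 * it so nothing can race with the copy.
 */
static void __maybe_unused example_jit_install(void *rx_buf, const u8 *blob,
					       size_t size)
{
	/* @rx_buf must be unused RX memory; nothing may execute it yet. */
	text_poke_copy(rx_buf, blob, size);
	/* Publish a function pointer to @rx_buf only after this returns. */
}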
1886
1887/**
1888 * text_poke_set - memset into (an unused part of) RX memory
1889 * @addr: address to modify
1890 * @c: the byte to fill the area with
1891 * @len: length to copy, could be more than 2x PAGE_SIZE
1892 *
1893 * This is useful to overwrite unused regions of RX memory with illegal
1894 * instructions.
1895 */
1896void *text_poke_set(void *addr, int c, size_t len)
1897{
1898 unsigned long start = (unsigned long)addr;
1899 size_t patched = 0;
1900
1901 if (WARN_ON_ONCE(core_kernel_text(start)))
1902 return NULL;
1903
1904 mutex_lock(&text_mutex);
1905 while (patched < len) {
1906 unsigned long ptr = start + patched;
1907 size_t s;
1908
1909 s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched);
1910
1911 __text_poke(text_poke_memset, (void *)ptr, (void *)&c, s);
0e06b403
SL
1912 patched += s;
1913 }
1914 mutex_unlock(&text_mutex);
1915 return addr;
1916}
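/*
 * Illustrative sketch (hypothetical region): poison a retired RX region
 * with INT3 using text_poke_set(), so stale references trap instead of
 * executing leftover bytes.
 */
static void __maybe_unused example_poison_region(void *rx_buf, size_t size)
{
	text_poke_set(rx_buf, INT3_INSN_OPCODE, size);
}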
1917
fd4363ff
JK
1918static void do_sync_core(void *info)
1919{
1920 sync_core();
1921}
1922
5c02ece8
PZ
1923void text_poke_sync(void)
1924{
1925 on_each_cpu(do_sync_core, NULL, 1);
1926}
1927
ac0ee0a9
PZ
1928/*
 1929 * NOTE: crazy scheme to allow patching Jcc.d32 without increasing the size
 1930 * of this struct. When len == 6 everything is prefixed with 0x0f and we map
 1931 * the opcode to Jcc.d8, using len to distinguish.
1932 */
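/*
 * Worked example (illustrative): "jz rel32" encodes as 0f 84 <rel32>, so
 * len == 6. text_poke_loc_init() drops the leading 0x0f when filling
 * ->text and stores the Jcc.d8 opcode (0x84 - 0x10 == 0x74) in ->opcode;
 * len == 6 is what later signals that the 0x0f prefix must be re-emitted.
 */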
18cbc8be 1933struct text_poke_loc {
26c44b77
PZ
1934 /* addr := _stext + rel_addr */
1935 s32 rel_addr;
1936 s32 disp;
1937 u8 len;
18cbc8be
PZ
1938 u8 opcode;
1939 const u8 text[POKE_MAX_OPCODE_SIZE];
26c44b77 1940 /* see text_poke_bp_batch() */
d769811c 1941 u8 old;
18cbc8be
PZ
1942};
1943
1f676247 1944struct bp_patching_desc {
c0213b0a
DBO
1945 struct text_poke_loc *vec;
1946 int nr_entries;
1f676247
PZ
1947 atomic_t refs;
1948};
1949
efd608fa 1950static struct bp_patching_desc bp_desc;
1f676247 1951
4979fb53 1952static __always_inline
efd608fa 1953struct bp_patching_desc *try_get_desc(void)
1f676247 1954{
efd608fa 1955 struct bp_patching_desc *desc = &bp_desc;
1f676247 1956
efd608fa 1957 if (!arch_atomic_inc_not_zero(&desc->refs))
1f676247
PZ
1958 return NULL;
1959
1960 return desc;
1961}
1962
efd608fa 1963static __always_inline void put_desc(void)
1f676247 1964{
efd608fa
NA
1965 struct bp_patching_desc *desc = &bp_desc;
1966
1f676247 1967 smp_mb__before_atomic();
ef882bfe 1968 arch_atomic_dec(&desc->refs);
1f676247 1969}
c0213b0a 1970
4979fb53 1971static __always_inline void *text_poke_addr(struct text_poke_loc *tp)
4531ef6a
PZ
1972{
1973 return _stext + tp->rel_addr;
1974}
1975
f64366ef 1976static __always_inline int patch_cmp(const void *key, const void *elt)
c0213b0a
DBO
1977{
1978 struct text_poke_loc *tp = (struct text_poke_loc *) elt;
1979
4531ef6a 1980 if (key < text_poke_addr(tp))
c0213b0a 1981 return -1;
4531ef6a 1982 if (key > text_poke_addr(tp))
c0213b0a
DBO
1983 return 1;
1984 return 0;
1985}
fd4363ff 1986
7f6fa101 1987noinstr int poke_int3_handler(struct pt_regs *regs)
fd4363ff 1988{
1f676247 1989 struct bp_patching_desc *desc;
c0213b0a 1990 struct text_poke_loc *tp;
26c44b77 1991 int ret = 0;
c0213b0a 1992 void *ip;
1f676247
PZ
1993
1994 if (user_mode(regs))
1995 return 0;
c0213b0a 1996
01651324
PZ
1997 /*
1998 * Having observed our INT3 instruction, we now must observe
efd608fa 1999 * bp_desc with non-zero refcount:
01651324 2000 *
efd608fa 2001 * bp_desc.refs = 1 INT3
c3d6324f 2002 * WMB RMB
efd608fa 2003 * write INT3 if (bp_desc.refs != 0)
01651324 2004 */
fd4363ff
JK
2005 smp_rmb();
2006
efd608fa 2007 desc = try_get_desc();
1f676247 2008 if (!desc)
17f41571 2009 return 0;
fd4363ff 2010
c0213b0a 2011 /*
c3d6324f 2012 * Discount the INT3. See text_poke_bp_batch().
c0213b0a 2013 */
c3d6324f 2014 ip = (void *) regs->ip - INT3_INSN_SIZE;
c0213b0a
DBO
2015
2016 /*
2017 * Skip the binary search if there is a single member in the vector.
2018 */
1f676247 2019 if (unlikely(desc->nr_entries > 1)) {
f64366ef
PZ
2020 tp = __inline_bsearch(ip, desc->vec, desc->nr_entries,
2021 sizeof(struct text_poke_loc),
2022 patch_cmp);
c0213b0a 2023 if (!tp)
1f676247 2024 goto out_put;
c0213b0a 2025 } else {
1f676247 2026 tp = desc->vec;
4531ef6a 2027 if (text_poke_addr(tp) != ip)
1f676247 2028 goto out_put;
c0213b0a
DBO
2029 }
2030
26c44b77 2031 ip += tp->len;
c3d6324f
PZ
2032
2033 switch (tp->opcode) {
2034 case INT3_INSN_OPCODE:
2035 /*
2036 * Someone poked an explicit INT3, they'll want to handle it,
2037 * do not consume.
2038 */
1f676247 2039 goto out_put;
c3d6324f 2040
c43a43e4
PZ
2041 case RET_INSN_OPCODE:
2042 int3_emulate_ret(regs);
2043 break;
2044
c3d6324f 2045 case CALL_INSN_OPCODE:
26c44b77 2046 int3_emulate_call(regs, (long)ip + tp->disp);
c3d6324f
PZ
2047 break;
2048
2049 case JMP32_INSN_OPCODE:
2050 case JMP8_INSN_OPCODE:
26c44b77 2051 int3_emulate_jmp(regs, (long)ip + tp->disp);
c3d6324f
PZ
2052 break;
2053
ac0ee0a9
PZ
2054 case 0x70 ... 0x7f: /* Jcc */
2055 int3_emulate_jcc(regs, tp->opcode & 0xf, (long)ip, tp->disp);
2056 break;
2057
c3d6324f
PZ
2058 default:
2059 BUG();
2060 }
17f41571 2061
1f676247
PZ
2062 ret = 1;
2063
2064out_put:
efd608fa 2065 put_desc();
1f676247 2066 return ret;
fd4363ff 2067}
17f41571 2068
18cbc8be
PZ
2069#define TP_VEC_MAX (PAGE_SIZE / sizeof(struct text_poke_loc))
2070static struct text_poke_loc tp_vec[TP_VEC_MAX];
2071static int tp_vec_nr;
2072
fd4363ff 2073/**
c0213b0a
DBO
2074 * text_poke_bp_batch() -- update instructions on live kernel on SMP
2075 * @tp: vector of instructions to patch
2076 * @nr_entries: number of entries in the vector
fd4363ff
JK
2077 *
2078 * Modify multi-byte instruction by using int3 breakpoint on SMP.
ea8596bb
MH
2079 * We completely avoid stop_machine() here, and achieve the
 2080 * synchronization using an int3 breakpoint.
fd4363ff
JK
2081 *
2082 * The way it is done:
c3d6324f 2083 * - For each entry in the vector:
c0213b0a 2084 * - add a int3 trap to the address that will be patched
fd4363ff 2085 * - sync cores
c0213b0a
DBO
2086 * - For each entry in the vector:
2087 * - update all but the first byte of the patched range
fd4363ff 2088 * - sync cores
c0213b0a
DBO
2089 * - For each entry in the vector:
 2090 * - replace the first byte (int3) with the first byte of
 2091 * the replacing opcode
fd4363ff 2092 * - sync cores
fd4363ff 2093 */
18cbc8be 2094static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries)
fd4363ff 2095{
c3d6324f 2096 unsigned char int3 = INT3_INSN_OPCODE;
c0213b0a 2097 unsigned int i;
c3d6324f 2098 int do_sync;
9222f606
JK
2099
2100 lockdep_assert_held(&text_mutex);
2101
efd608fa
NA
2102 bp_desc.vec = tp;
2103 bp_desc.nr_entries = nr_entries;
2104
2105 /*
2106 * Corresponds to the implicit memory barrier in try_get_desc() to
 2107 * ensure reading a non-zero refcount provides up-to-date bp_desc data.
2108 */
2109 atomic_set_release(&bp_desc.refs, 1);
c0213b0a 2110
fd4363ff 2111 /*
01651324 2112 * Corresponding read barrier in int3 notifier for making sure the
c0213b0a 2113 * nr_entries and handler are correctly ordered wrt. patching.
fd4363ff
JK
2114 */
2115 smp_wmb();
2116
c0213b0a
DBO
2117 /*
 2118 * First step: add an int3 trap to the address that will be patched.
2119 */
d769811c
AH
2120 for (i = 0; i < nr_entries; i++) {
2121 tp[i].old = *(u8 *)text_poke_addr(&tp[i]);
76ffa720 2122 text_poke(text_poke_addr(&tp[i]), &int3, INT3_INSN_SIZE);
d769811c 2123 }
fd4363ff 2124
5c02ece8 2125 text_poke_sync();
fd4363ff 2126
c0213b0a
DBO
2127 /*
2128 * Second step: update all but the first byte of the patched range.
2129 */
c3d6324f 2130 for (do_sync = 0, i = 0; i < nr_entries; i++) {
ac0ee0a9
PZ
2131 u8 old[POKE_MAX_OPCODE_SIZE+1] = { tp[i].old, };
2132 u8 _new[POKE_MAX_OPCODE_SIZE+1];
2133 const u8 *new = tp[i].text;
26c44b77 2134 int len = tp[i].len;
97e6c977 2135
76ffa720 2136 if (len - INT3_INSN_SIZE > 0) {
d769811c
AH
2137 memcpy(old + INT3_INSN_SIZE,
2138 text_poke_addr(&tp[i]) + INT3_INSN_SIZE,
2139 len - INT3_INSN_SIZE);
ac0ee0a9
PZ
2140
2141 if (len == 6) {
2142 _new[0] = 0x0f;
2143 memcpy(_new + 1, new, 5);
2144 new = _new;
2145 }
2146
76ffa720 2147 text_poke(text_poke_addr(&tp[i]) + INT3_INSN_SIZE,
ac0ee0a9 2148 new + INT3_INSN_SIZE,
76ffa720 2149 len - INT3_INSN_SIZE);
ac0ee0a9 2150
c3d6324f 2151 do_sync++;
c0213b0a 2152 }
d769811c
AH
2153
2154 /*
2155 * Emit a perf event to record the text poke, primarily to
2156 * support Intel PT decoding which must walk the executable code
2157 * to reconstruct the trace. The flow up to here is:
2158 * - write INT3 byte
2159 * - IPI-SYNC
2160 * - write instruction tail
2161 * At this point the actual control flow will be through the
2162 * INT3 and handler and not hit the old or new instruction.
2163 * Intel PT outputs FUP/TIP packets for the INT3, so the flow
2164 * can still be decoded. Subsequently:
2165 * - emit RECORD_TEXT_POKE with the new instruction
2166 * - IPI-SYNC
2167 * - write first byte
2168 * - IPI-SYNC
2169 * So before the text poke event timestamp, the decoder will see
2170 * either the old instruction flow or FUP/TIP of INT3. After the
2171 * text poke event timestamp, the decoder will see either the
2172 * new instruction flow or FUP/TIP of INT3. Thus decoders can
2173 * use the timestamp as the point at which to modify the
2174 * executable code.
2175 * The old instruction is recorded so that the event can be
2176 * processed forwards or backwards.
2177 */
ac0ee0a9 2178 perf_event_text_poke(text_poke_addr(&tp[i]), old, len, new, len);
c0213b0a
DBO
2179 }
2180
c3d6324f 2181 if (do_sync) {
fd4363ff
JK
2182 /*
2183 * According to Intel, this core syncing is very likely
2184 * not necessary and we'd be safe even without it. But
2185 * better safe than sorry (plus there's not only Intel).
2186 */
5c02ece8 2187 text_poke_sync();
fd4363ff
JK
2188 }
2189
c0213b0a
DBO
2190 /*
 2191 * Third step: replace the first byte (int3) with the first byte of
 2192 * the replacing opcode.
2193 */
c3d6324f 2194 for (do_sync = 0, i = 0; i < nr_entries; i++) {
ac0ee0a9
PZ
2195 u8 byte = tp[i].text[0];
2196
2197 if (tp[i].len == 6)
2198 byte = 0x0f;
2199
2200 if (byte == INT3_INSN_OPCODE)
c3d6324f
PZ
2201 continue;
2202
ac0ee0a9 2203 text_poke(text_poke_addr(&tp[i]), &byte, INT3_INSN_SIZE);
c3d6324f
PZ
2204 do_sync++;
2205 }
2206
2207 if (do_sync)
5c02ece8 2208 text_poke_sync();
fd4363ff 2209
01651324 2210 /*
efd608fa 2211 * Remove and wait for refs to be zero.
01651324 2212 */
efd608fa
NA
2213 if (!atomic_dec_and_test(&bp_desc.refs))
2214 atomic_cond_read_acquire(&bp_desc.refs, !VAL);
fd4363ff
JK
2215}
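/*
 * Worked example (illustrative): patching a NOP5 (0f 1f 44 00 00) into
 * "call foo" (e8 <rel32>) goes through three observable states:
 *   cc 1f 44 00 00  - INT3 written, sync; concurrent hits are emulated
 *                     as the new call by poke_int3_handler()
 *   cc <rel32>      - tail written, sync; the first byte still traps
 *   e8 <rel32>      - first byte written, sync; the new call is live
 */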
2216
244febbe
QH
2217static void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
2218 const void *opcode, size_t len, const void *emulate)
c3d6324f
PZ
2219{
2220 struct insn insn;
ac0ee0a9 2221 int ret, i = 0;
c3d6324f 2222
ac0ee0a9
PZ
2223 if (len == 6)
2224 i = 1;
2225 memcpy((void *)tp->text, opcode+i, len-i);
c3d6324f
PZ
2226 if (!emulate)
2227 emulate = opcode;
2228
52fa82c2 2229 ret = insn_decode_kernel(&insn, emulate);
63c66cde 2230 BUG_ON(ret < 0);
c3d6324f 2231
4531ef6a 2232 tp->rel_addr = addr - (void *)_stext;
26c44b77 2233 tp->len = len;
c3d6324f
PZ
2234 tp->opcode = insn.opcode.bytes[0];
2235
ac0ee0a9
PZ
2236 if (is_jcc32(&insn)) {
2237 /*
2238 * Map Jcc.d32 onto Jcc.d8 and use len to distinguish.
2239 */
2240 tp->opcode = insn.opcode.bytes[1] - 0x10;
2241 }
2242
26c44b77
PZ
2243 switch (tp->opcode) {
2244 case RET_INSN_OPCODE:
2245 case JMP32_INSN_OPCODE:
2246 case JMP8_INSN_OPCODE:
2247 /*
2248 * Control flow instructions without implied execution of the
2249 * next instruction can be padded with INT3.
2250 */
2251 for (i = insn.length; i < len; i++)
2252 BUG_ON(tp->text[i] != INT3_INSN_OPCODE);
2253 break;
2254
2255 default:
2256 BUG_ON(len != insn.length);
64267734 2257 }
26c44b77 2258
c3d6324f
PZ
2259 switch (tp->opcode) {
2260 case INT3_INSN_OPCODE:
c43a43e4 2261 case RET_INSN_OPCODE:
c3d6324f
PZ
2262 break;
2263
2264 case CALL_INSN_OPCODE:
2265 case JMP32_INSN_OPCODE:
2266 case JMP8_INSN_OPCODE:
ac0ee0a9 2267 case 0x70 ... 0x7f: /* Jcc */
26c44b77 2268 tp->disp = insn.immediate.value;
c3d6324f
PZ
2269 break;
2270
2271 default: /* assume NOP */
2272 switch (len) {
2273 case 2: /* NOP2 -- emulate as JMP8+0 */
a89dfde3 2274 BUG_ON(memcmp(emulate, x86_nops[len], len));
c3d6324f 2275 tp->opcode = JMP8_INSN_OPCODE;
26c44b77 2276 tp->disp = 0;
c3d6324f
PZ
2277 break;
2278
2279 case 5: /* NOP5 -- emulate as JMP32+0 */
a89dfde3 2280 BUG_ON(memcmp(emulate, x86_nops[len], len));
c3d6324f 2281 tp->opcode = JMP32_INSN_OPCODE;
26c44b77 2282 tp->disp = 0;
c3d6324f
PZ
2283 break;
2284
2285 default: /* unknown instruction */
2286 BUG();
2287 }
2288 break;
2289 }
2290}
2291
18cbc8be
PZ
2292/*
 2293 * We rely strictly on tp_vec being ordered; ensure this is so by flushing
2294 * early if needed.
2295 */
2296static bool tp_order_fail(void *addr)
2297{
2298 struct text_poke_loc *tp;
2299
2300 if (!tp_vec_nr)
2301 return false;
2302
2303 if (!addr) /* force */
2304 return true;
2305
2306 tp = &tp_vec[tp_vec_nr - 1];
4531ef6a 2307 if ((unsigned long)text_poke_addr(tp) > (unsigned long)addr)
18cbc8be
PZ
2308 return true;
2309
2310 return false;
2311}
2312
2313static void text_poke_flush(void *addr)
2314{
2315 if (tp_vec_nr == TP_VEC_MAX || tp_order_fail(addr)) {
2316 text_poke_bp_batch(tp_vec, tp_vec_nr);
2317 tp_vec_nr = 0;
2318 }
2319}
2320
2321void text_poke_finish(void)
2322{
2323 text_poke_flush(NULL);
2324}
2325
768ae440 2326void __ref text_poke_queue(void *addr, const void *opcode, size_t len, const void *emulate)
18cbc8be
PZ
2327{
2328 struct text_poke_loc *tp;
2329
2330 text_poke_flush(addr);
2331
2332 tp = &tp_vec[tp_vec_nr++];
2333 text_poke_loc_init(tp, addr, opcode, len, emulate);
2334}
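/*
 * Illustrative sketch (hypothetical sites array): batching pokes so that
 * one set of sync IPIs covers them all. Addresses must be queued in
 * ascending order or the vector is flushed early, see tp_order_fail().
 */
static void __maybe_unused example_patch_batch(void **site, int nr)
{
	int i;

	lockdep_assert_held(&text_mutex);

	for (i = 0; i < nr; i++)
		text_poke_queue(site[i], x86_nops[5], 5, NULL);

	text_poke_finish();
}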
2335
c0213b0a
DBO
2336/**
2337 * text_poke_bp() -- update instructions on live kernel on SMP
2338 * @addr: address to patch
2339 * @opcode: opcode of new instruction
2340 * @len: length to copy
72ebb5ff 2341 * @emulate: instruction to be emulated
c0213b0a
DBO
2342 *
 2343 * Update a single instruction using a one-entry vector on the stack,
 2344 * avoiding dynamically allocated memory. This function should be used
 2345 * when it is not possible to allocate memory.
2346 */
768ae440 2347void __ref text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate)
c0213b0a 2348{
c3d6324f 2349 struct text_poke_loc tp;
c0213b0a 2350
c3d6324f 2351 text_poke_loc_init(&tp, addr, opcode, len, emulate);
c0213b0a
DBO
2352 text_poke_bp_batch(&tp, 1);
2353}
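/*
 * Illustrative sketch (hypothetical site/target pair): redirect a 5-byte
 * patch site to @target with a single text_poke_bp() call. The JMP32
 * displacement is relative to the end of the instruction.
 */
static void __maybe_unused example_redirect(void *site, void *target)
{
	u8 insn[5];

	insn[0] = JMP32_INSN_OPCODE;
	*(s32 *)&insn[1] = (s32)(target - site - JMP32_INSN_SIZE);

	mutex_lock(&text_mutex);
	text_poke_bp(site, insn, sizeof(insn), NULL);
	mutex_unlock(&text_mutex);
}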