tcg/i386/tcg-target.inc.c (thirdparty/qemu.git, commit "tcg/i386: Allow bmi2 shiftx to have non-matching operands")
1 /*
2 * Tiny Code Generator for QEMU
3 *
4 * Copyright (c) 2008 Fabrice Bellard
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
24
25 #include "tcg-be-ldst.h"
26
27 #ifdef CONFIG_DEBUG_TCG
28 static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
29 #if TCG_TARGET_REG_BITS == 64
30 "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
31 "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
32 #else
33 "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
34 #endif
35 };
36 #endif
37
38 static const int tcg_target_reg_alloc_order[] = {
39 #if TCG_TARGET_REG_BITS == 64
40 TCG_REG_RBP,
41 TCG_REG_RBX,
42 TCG_REG_R12,
43 TCG_REG_R13,
44 TCG_REG_R14,
45 TCG_REG_R15,
46 TCG_REG_R10,
47 TCG_REG_R11,
48 TCG_REG_R9,
49 TCG_REG_R8,
50 TCG_REG_RCX,
51 TCG_REG_RDX,
52 TCG_REG_RSI,
53 TCG_REG_RDI,
54 TCG_REG_RAX,
55 #else
56 TCG_REG_EBX,
57 TCG_REG_ESI,
58 TCG_REG_EDI,
59 TCG_REG_EBP,
60 TCG_REG_ECX,
61 TCG_REG_EDX,
62 TCG_REG_EAX,
63 #endif
64 };
65
66 static const int tcg_target_call_iarg_regs[] = {
67 #if TCG_TARGET_REG_BITS == 64
68 #if defined(_WIN64)
69 TCG_REG_RCX,
70 TCG_REG_RDX,
71 #else
72 TCG_REG_RDI,
73 TCG_REG_RSI,
74 TCG_REG_RDX,
75 TCG_REG_RCX,
76 #endif
77 TCG_REG_R8,
78 TCG_REG_R9,
79 #else
80 /* 32 bit mode uses stack based calling convention (GCC default). */
81 #endif
82 };
83
84 static const int tcg_target_call_oarg_regs[] = {
85 TCG_REG_EAX,
86 #if TCG_TARGET_REG_BITS == 32
87 TCG_REG_EDX
88 #endif
89 };
90
91 /* Constants we accept. */
92 #define TCG_CT_CONST_S32 0x100
93 #define TCG_CT_CONST_U32 0x200
94 #define TCG_CT_CONST_I32 0x400
95
96 /* Registers used with L constraint, which are the first two argument
97 registers on x86_64, and two arbitrary call-clobbered registers on
98 i386. */
99 #if TCG_TARGET_REG_BITS == 64
100 # define TCG_REG_L0 tcg_target_call_iarg_regs[0]
101 # define TCG_REG_L1 tcg_target_call_iarg_regs[1]
102 #else
103 # define TCG_REG_L0 TCG_REG_EAX
104 # define TCG_REG_L1 TCG_REG_EDX
105 #endif
106
107 /* The host compiler should supply <cpuid.h> to enable runtime feature
108 detection, as we're not going to go so far as our own inline assembly.
109 If it is not available, default values will be assumed. */
110 #if defined(CONFIG_CPUID_H)
111 #include <cpuid.h>
112 #endif
113
114 /* For 32-bit, we are going to attempt to determine at runtime whether cmov
115 is available. */
116 #if TCG_TARGET_REG_BITS == 64
117 # define have_cmov 1
118 #elif defined(CONFIG_CPUID_H) && defined(bit_CMOV)
119 static bool have_cmov;
120 #else
121 # define have_cmov 0
122 #endif
123
124 /* If bit_MOVBE is defined in cpuid.h (added in GCC version 4.6), we are
125 going to attempt to determine at runtime whether movbe is available. */
126 #if defined(CONFIG_CPUID_H) && defined(bit_MOVBE)
127 static bool have_movbe;
128 #else
129 # define have_movbe 0
130 #endif
131
132 /* We need this symbol in tcg-target.h, and we can't properly conditionalize
133 it there. Therefore we always define the variable. */
134 bool have_bmi1;
135
136 #if defined(CONFIG_CPUID_H) && defined(bit_BMI2)
137 static bool have_bmi2;
138 #else
139 # define have_bmi2 0
140 #endif
141
142 static tcg_insn_unit *tb_ret_addr;
143
144 static void patch_reloc(tcg_insn_unit *code_ptr, int type,
145 intptr_t value, intptr_t addend)
146 {
147 value += addend;
148 switch(type) {
149 case R_386_PC32:
150 value -= (uintptr_t)code_ptr;
151 if (value != (int32_t)value) {
152 tcg_abort();
153 }
154 tcg_patch32(code_ptr, value);
155 break;
156 case R_386_PC8:
157 value -= (uintptr_t)code_ptr;
158 if (value != (int8_t)value) {
159 tcg_abort();
160 }
161 tcg_patch8(code_ptr, value);
162 break;
163 default:
164 tcg_abort();
165 }
166 }
167
168 /* parse target specific constraints */
169 static const char *target_parse_constraint(TCGArgConstraint *ct,
170 const char *ct_str, TCGType type)
171 {
172 switch(*ct_str++) {
173 case 'a':
174 ct->ct |= TCG_CT_REG;
175 tcg_regset_set_reg(ct->u.regs, TCG_REG_EAX);
176 break;
177 case 'b':
178 ct->ct |= TCG_CT_REG;
179 tcg_regset_set_reg(ct->u.regs, TCG_REG_EBX);
180 break;
181 case 'c':
182 ct->ct |= TCG_CT_REG;
183 tcg_regset_set_reg(ct->u.regs, TCG_REG_ECX);
184 break;
185 case 'd':
186 ct->ct |= TCG_CT_REG;
187 tcg_regset_set_reg(ct->u.regs, TCG_REG_EDX);
188 break;
189 case 'S':
190 ct->ct |= TCG_CT_REG;
191 tcg_regset_set_reg(ct->u.regs, TCG_REG_ESI);
192 break;
193 case 'D':
194 ct->ct |= TCG_CT_REG;
195 tcg_regset_set_reg(ct->u.regs, TCG_REG_EDI);
196 break;
197 case 'q':
198 ct->ct |= TCG_CT_REG;
199 if (TCG_TARGET_REG_BITS == 64) {
200 tcg_regset_set32(ct->u.regs, 0, 0xffff);
201 } else {
202 tcg_regset_set32(ct->u.regs, 0, 0xf);
203 }
204 break;
205 case 'Q':
206 ct->ct |= TCG_CT_REG;
207 tcg_regset_set32(ct->u.regs, 0, 0xf);
208 break;
209 case 'r':
210 ct->ct |= TCG_CT_REG;
211 if (TCG_TARGET_REG_BITS == 64) {
212 tcg_regset_set32(ct->u.regs, 0, 0xffff);
213 } else {
214 tcg_regset_set32(ct->u.regs, 0, 0xff);
215 }
216 break;
217
218 /* qemu_ld/st address constraint */
219 case 'L':
220 ct->ct |= TCG_CT_REG;
221 if (TCG_TARGET_REG_BITS == 64) {
222 tcg_regset_set32(ct->u.regs, 0, 0xffff);
223 } else {
224 tcg_regset_set32(ct->u.regs, 0, 0xff);
225 }
226 tcg_regset_reset_reg(ct->u.regs, TCG_REG_L0);
227 tcg_regset_reset_reg(ct->u.regs, TCG_REG_L1);
228 break;
229
230 case 'e':
231 ct->ct |= (type == TCG_TYPE_I32 ? TCG_CT_CONST : TCG_CT_CONST_S32);
232 break;
233 case 'Z':
234 ct->ct |= (type == TCG_TYPE_I32 ? TCG_CT_CONST : TCG_CT_CONST_U32);
235 break;
236 case 'I':
237 ct->ct |= (type == TCG_TYPE_I32 ? TCG_CT_CONST : TCG_CT_CONST_I32);
238 break;
239
240 default:
241 return NULL;
242 }
243 return ct_str;
244 }
245
246 /* test if a constant matches the constraint */
247 static inline int tcg_target_const_match(tcg_target_long val, TCGType type,
248 const TCGArgConstraint *arg_ct)
249 {
250 int ct = arg_ct->ct;
251 if (ct & TCG_CT_CONST) {
252 return 1;
253 }
254 if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) {
255 return 1;
256 }
257 if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) {
258 return 1;
259 }
260 if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) {
261 return 1;
262 }
263 return 0;
264 }
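/* Worked examples for the checks above, taking a 64-bit tcg_target_long:
   'e' (TCG_CT_CONST_S32) accepts -1 and 0x7fffffff but rejects 0x80000000,
   which is not representable as a sign-extended 32-bit immediate;
   'Z' (TCG_CT_CONST_U32) accepts 0xffffffff but rejects -1;
   'I' (TCG_CT_CONST_I32) accepts any value whose bitwise complement fits in
   a sign-extended 32-bit immediate, which is what the andc expansion below
   relies on when it ANDs with ~constant.  */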
265
266 #if TCG_TARGET_REG_BITS == 64
267 # define LOWREGMASK(x) ((x) & 7)
268 #else
269 # define LOWREGMASK(x) (x)
270 #endif
271
272 #define P_EXT 0x100 /* 0x0f opcode prefix */
273 #define P_EXT38 0x200 /* 0x0f 0x38 opcode prefix */
274 #define P_DATA16 0x400 /* 0x66 opcode prefix */
275 #if TCG_TARGET_REG_BITS == 64
276 # define P_ADDR32 0x800 /* 0x67 opcode prefix */
277 # define P_REXW 0x1000 /* Set REX.W = 1 */
278 # define P_REXB_R 0x2000 /* REG field as byte register */
279 # define P_REXB_RM 0x4000 /* R/M field as byte register */
280 # define P_GS 0x8000 /* gs segment override */
281 #else
282 # define P_ADDR32 0
283 # define P_REXW 0
284 # define P_REXB_R 0
285 # define P_REXB_RM 0
286 # define P_GS 0
287 #endif
288 #define P_SIMDF3 0x10000 /* 0xf3 opcode prefix */
289 #define P_SIMDF2 0x20000 /* 0xf2 opcode prefix */
290
291 #define OPC_ARITH_EvIz (0x81)
292 #define OPC_ARITH_EvIb (0x83)
293 #define OPC_ARITH_GvEv (0x03) /* ... plus (ARITH_FOO << 3) */
294 #define OPC_ANDN (0xf2 | P_EXT38)
295 #define OPC_ADD_GvEv (OPC_ARITH_GvEv | (ARITH_ADD << 3))
296 #define OPC_BSWAP (0xc8 | P_EXT)
297 #define OPC_CALL_Jz (0xe8)
298 #define OPC_CMOVCC (0x40 | P_EXT) /* ... plus condition code */
299 #define OPC_CMP_GvEv (OPC_ARITH_GvEv | (ARITH_CMP << 3))
300 #define OPC_DEC_r32 (0x48)
301 #define OPC_IMUL_GvEv (0xaf | P_EXT)
302 #define OPC_IMUL_GvEvIb (0x6b)
303 #define OPC_IMUL_GvEvIz (0x69)
304 #define OPC_INC_r32 (0x40)
305 #define OPC_JCC_long (0x80 | P_EXT) /* ... plus condition code */
306 #define OPC_JCC_short (0x70) /* ... plus condition code */
307 #define OPC_JMP_long (0xe9)
308 #define OPC_JMP_short (0xeb)
309 #define OPC_LEA (0x8d)
310 #define OPC_MOVB_EvGv (0x88) /* stores, more or less */
311 #define OPC_MOVL_EvGv (0x89) /* stores, more or less */
312 #define OPC_MOVL_GvEv (0x8b) /* loads, more or less */
313 #define OPC_MOVB_EvIz (0xc6)
314 #define OPC_MOVL_EvIz (0xc7)
315 #define OPC_MOVL_Iv (0xb8)
316 #define OPC_MOVBE_GyMy (0xf0 | P_EXT38)
317 #define OPC_MOVBE_MyGy (0xf1 | P_EXT38)
318 #define OPC_MOVSBL (0xbe | P_EXT)
319 #define OPC_MOVSWL (0xbf | P_EXT)
320 #define OPC_MOVSLQ (0x63 | P_REXW)
321 #define OPC_MOVZBL (0xb6 | P_EXT)
322 #define OPC_MOVZWL (0xb7 | P_EXT)
323 #define OPC_POP_r32 (0x58)
324 #define OPC_PUSH_r32 (0x50)
325 #define OPC_PUSH_Iv (0x68)
326 #define OPC_PUSH_Ib (0x6a)
327 #define OPC_RET (0xc3)
328 #define OPC_SETCC (0x90 | P_EXT | P_REXB_RM) /* ... plus cc */
329 #define OPC_SHIFT_1 (0xd1)
330 #define OPC_SHIFT_Ib (0xc1)
331 #define OPC_SHIFT_cl (0xd3)
332 #define OPC_SARX (0xf7 | P_EXT38 | P_SIMDF3)
333 #define OPC_SHLX (0xf7 | P_EXT38 | P_DATA16)
334 #define OPC_SHRX (0xf7 | P_EXT38 | P_SIMDF2)
335 #define OPC_TESTL (0x85)
336 #define OPC_XCHG_ax_r32 (0x90)
337
338 #define OPC_GRP3_Ev (0xf7)
339 #define OPC_GRP5 (0xff)
340
341 /* Group 1 opcode extensions for 0x80-0x83.
342 These are also used as modifiers for OPC_ARITH. */
343 #define ARITH_ADD 0
344 #define ARITH_OR 1
345 #define ARITH_ADC 2
346 #define ARITH_SBB 3
347 #define ARITH_AND 4
348 #define ARITH_SUB 5
349 #define ARITH_XOR 6
350 #define ARITH_CMP 7
351
352 /* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3. */
353 #define SHIFT_ROL 0
354 #define SHIFT_ROR 1
355 #define SHIFT_SHL 4
356 #define SHIFT_SHR 5
357 #define SHIFT_SAR 7
358
359 /* Group 3 opcode extensions for 0xf6, 0xf7. To be used with OPC_GRP3. */
360 #define EXT3_NOT 2
361 #define EXT3_NEG 3
362 #define EXT3_MUL 4
363 #define EXT3_IMUL 5
364 #define EXT3_DIV 6
365 #define EXT3_IDIV 7
366
367 /* Group 5 opcode extensions for 0xff. To be used with OPC_GRP5. */
368 #define EXT5_INC_Ev 0
369 #define EXT5_DEC_Ev 1
370 #define EXT5_CALLN_Ev 2
371 #define EXT5_JMPN_Ev 4
372
373 /* Condition codes to be added to OPC_JCC_{long,short}. */
374 #define JCC_JMP (-1)
375 #define JCC_JO 0x0
376 #define JCC_JNO 0x1
377 #define JCC_JB 0x2
378 #define JCC_JAE 0x3
379 #define JCC_JE 0x4
380 #define JCC_JNE 0x5
381 #define JCC_JBE 0x6
382 #define JCC_JA 0x7
383 #define JCC_JS 0x8
384 #define JCC_JNS 0x9
385 #define JCC_JP 0xa
386 #define JCC_JNP 0xb
387 #define JCC_JL 0xc
388 #define JCC_JGE 0xd
389 #define JCC_JLE 0xe
390 #define JCC_JG 0xf
391
392 static const uint8_t tcg_cond_to_jcc[] = {
393 [TCG_COND_EQ] = JCC_JE,
394 [TCG_COND_NE] = JCC_JNE,
395 [TCG_COND_LT] = JCC_JL,
396 [TCG_COND_GE] = JCC_JGE,
397 [TCG_COND_LE] = JCC_JLE,
398 [TCG_COND_GT] = JCC_JG,
399 [TCG_COND_LTU] = JCC_JB,
400 [TCG_COND_GEU] = JCC_JAE,
401 [TCG_COND_LEU] = JCC_JBE,
402 [TCG_COND_GTU] = JCC_JA,
403 };
404
405 #if TCG_TARGET_REG_BITS == 64
406 static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
407 {
408 int rex;
409
410 if (opc & P_GS) {
411 tcg_out8(s, 0x65);
412 }
413 if (opc & P_DATA16) {
414 /* We should never be asking for both 16 and 64-bit operation. */
415 tcg_debug_assert((opc & P_REXW) == 0);
416 tcg_out8(s, 0x66);
417 }
418 if (opc & P_ADDR32) {
419 tcg_out8(s, 0x67);
420 }
421
422 rex = 0;
423 rex |= (opc & P_REXW) ? 0x8 : 0x0; /* REX.W */
424 rex |= (r & 8) >> 1; /* REX.R */
425 rex |= (x & 8) >> 2; /* REX.X */
426 rex |= (rm & 8) >> 3; /* REX.B */
427
428 /* P_REXB_{R,RM} indicates that the given register is the low byte.
429 For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do,
430 as otherwise the encoding indicates %[abcd]h. Note that the values
431 that are ORed in merely indicate that the REX byte must be present;
432 those bits get discarded in output. */
433 rex |= opc & (r >= 4 ? P_REXB_R : 0);
434 rex |= opc & (rm >= 4 ? P_REXB_RM : 0);
435
436 if (rex) {
437 tcg_out8(s, (uint8_t)(rex | 0x40));
438 }
439
440 if (opc & (P_EXT | P_EXT38)) {
441 tcg_out8(s, 0x0f);
442 if (opc & P_EXT38) {
443 tcg_out8(s, 0x38);
444 }
445 }
446
447 tcg_out8(s, opc);
448 }
449 #else
450 static void tcg_out_opc(TCGContext *s, int opc)
451 {
452 if (opc & P_DATA16) {
453 tcg_out8(s, 0x66);
454 }
455 if (opc & (P_EXT | P_EXT38)) {
456 tcg_out8(s, 0x0f);
457 if (opc & P_EXT38) {
458 tcg_out8(s, 0x38);
459 }
460 }
461 tcg_out8(s, opc);
462 }
463 /* Discard the register arguments to tcg_out_opc early, so as not to penalize
464 the 32-bit compilation paths. This method works with all versions of gcc,
465 whereas relying on compiler optimization to eliminate them might not. */
466 #define tcg_out_opc(s, opc, r, rm, x) (tcg_out_opc)(s, opc)
467 #endif
468
469 static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm)
470 {
471 tcg_out_opc(s, opc, r, rm, 0);
472 tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
473 }
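/* A concrete encoding example for the helpers above: with the opcodes
   defined earlier,

       tcg_out_modrm(s, OPC_ADD_GvEv + P_REXW, TCG_REG_RAX, TCG_REG_R8);

   emits 0x49 0x03 0xc0, i.e. "addq %r8, %rax": a REX prefix with W set and
   B taken from the high bit of %r8, the 0x03 ADD opcode, and a
   register-direct ModRM byte (mod=11, reg=rax, rm=r8&7).  */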
474
475 static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
476 {
477 int tmp;
478
479 if ((opc & (P_REXW | P_EXT | P_EXT38)) || (rm & 8)) {
480 /* Three byte VEX prefix. */
481 tcg_out8(s, 0xc4);
482
483 /* VEX.m-mmmm */
484 if (opc & P_EXT38) {
485 tmp = 2;
486 } else if (opc & P_EXT) {
487 tmp = 1;
488 } else {
489 tcg_abort();
490 }
491 tmp |= 0x40; /* VEX.X */
492 tmp |= (r & 8 ? 0 : 0x80); /* VEX.R */
493 tmp |= (rm & 8 ? 0 : 0x20); /* VEX.B */
494 tcg_out8(s, tmp);
495
496 tmp = (opc & P_REXW ? 0x80 : 0); /* VEX.W */
497 } else {
498 /* Two byte VEX prefix. */
499 tcg_out8(s, 0xc5);
500
501 tmp = (r & 8 ? 0 : 0x80); /* VEX.R */
502 }
503 /* VEX.pp */
504 if (opc & P_DATA16) {
505 tmp |= 1; /* 0x66 */
506 } else if (opc & P_SIMDF3) {
507 tmp |= 2; /* 0xf3 */
508 } else if (opc & P_SIMDF2) {
509 tmp |= 3; /* 0xf2 */
510 }
511 tmp |= (~v & 15) << 3; /* VEX.vvvv */
512 tcg_out8(s, tmp);
513 tcg_out8(s, opc);
514 tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
515 }
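/* Example of the VEX path, as used by the BMI2 shifts further down.
   Assuming have_bmi2,

       tcg_out_vex_modrm(s, OPC_SHLX + P_REXW,
                         TCG_REG_RAX, TCG_REG_RCX, TCG_REG_RBX);

   emits 0xc4 0xe2 0xf1 0xf7 0xc3, i.e. "shlx %rcx, %rbx, %rax"
   (rax = rbx << (rcx & 63)): a three-byte VEX prefix selecting the
   0f 38 map, with W=1, the 0x66 "pp" field, and VEX.vvvv holding the
   inverted count register, followed by the 0xf7 opcode and a ModRM byte.  */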
516
517 /* Output an opcode with a full "rm + (index<<shift) + offset" address mode.
518 Either RM or INDEX may be omitted by passing a negative value. In 64-bit
519 mode for absolute addresses, ~RM is the size of the immediate operand
520 that will follow the instruction. */
521
522 static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
523 int index, int shift, intptr_t offset)
524 {
525 int mod, len;
526
527 if (index < 0 && rm < 0) {
528 if (TCG_TARGET_REG_BITS == 64) {
529 /* Try for a rip-relative addressing mode. This has replaced
530 the 32-bit-mode absolute addressing encoding. */
531 intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm;
532 intptr_t disp = offset - pc;
533 if (disp == (int32_t)disp) {
534 tcg_out_opc(s, opc, r, 0, 0);
535 tcg_out8(s, (LOWREGMASK(r) << 3) | 5);
536 tcg_out32(s, disp);
537 return;
538 }
539
540 /* Try for an absolute address encoding. This requires the
541 use of the MODRM+SIB encoding and is therefore larger than
542 rip-relative addressing. */
543 if (offset == (int32_t)offset) {
544 tcg_out_opc(s, opc, r, 0, 0);
545 tcg_out8(s, (LOWREGMASK(r) << 3) | 4);
546 tcg_out8(s, (4 << 3) | 5);
547 tcg_out32(s, offset);
548 return;
549 }
550
551 /* ??? The memory isn't directly addressable. */
552 tcg_abort();
553 } else {
554 /* Absolute address. */
555 tcg_out_opc(s, opc, r, 0, 0);
556 tcg_out8(s, (r << 3) | 5);
557 tcg_out32(s, offset);
558 return;
559 }
560 }
561
562 /* Find the length of the immediate addend. Note that the encoding
563 that would be used for (%ebp) indicates absolute addressing. */
564 if (rm < 0) {
565 mod = 0, len = 4, rm = 5;
566 } else if (offset == 0 && LOWREGMASK(rm) != TCG_REG_EBP) {
567 mod = 0, len = 0;
568 } else if (offset == (int8_t)offset) {
569 mod = 0x40, len = 1;
570 } else {
571 mod = 0x80, len = 4;
572 }
573
574 /* Use a single byte MODRM format if possible. Note that the encoding
575 that would be used for %esp is the escape to the two byte form. */
576 if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) {
577 /* Single byte MODRM format. */
578 tcg_out_opc(s, opc, r, rm, 0);
579 tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
580 } else {
581 /* Two byte MODRM+SIB format. */
582
583 /* Note that the encoding that would place %esp into the index
584 field indicates no index register. In 64-bit mode, the REX.X
585 bit counts, so %r12 can be used as the index. */
586 if (index < 0) {
587 index = 4;
588 } else {
589 tcg_debug_assert(index != TCG_REG_ESP);
590 }
591
592 tcg_out_opc(s, opc, r, rm, index);
593 tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4);
594 tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm));
595 }
596
597 if (len == 1) {
598 tcg_out8(s, offset);
599 } else if (len == 4) {
600 tcg_out32(s, offset);
601 }
602 }
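/* A worked example of the SIB form above:

       tcg_out_modrm_sib_offset(s, OPC_LEA + P_REXW, TCG_REG_RAX,
                                TCG_REG_RBP, TCG_REG_RBX, 2, 8);

   emits 0x48 0x8d 0x44 0x9d 0x08, i.e. "leaq 8(%rbp,%rbx,4), %rax":
   mod=01 with a disp8, rm=100 escaping to the SIB byte, and the SIB byte
   packing scale=4, index=%rbx, base=%rbp.  */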
603
604 /* A simplification of the above with no index or shift. */
605 static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r,
606 int rm, intptr_t offset)
607 {
608 tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset);
609 }
610
611 /* Generate dest op= src. Uses the same ARITH_* codes as tgen_arithi. */
612 static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)
613 {
614 /* Propagate an opcode prefix, such as P_REXW. */
615 int ext = subop & ~0x7;
616 subop &= 0x7;
617
618 tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src);
619 }
620
621 static inline void tcg_out_mov(TCGContext *s, TCGType type,
622 TCGReg ret, TCGReg arg)
623 {
624 if (arg != ret) {
625 int opc = OPC_MOVL_GvEv + (type == TCG_TYPE_I64 ? P_REXW : 0);
626 tcg_out_modrm(s, opc, ret, arg);
627 }
628 }
629
630 static void tcg_out_movi(TCGContext *s, TCGType type,
631 TCGReg ret, tcg_target_long arg)
632 {
633 tcg_target_long diff;
634
635 if (arg == 0) {
636 tgen_arithr(s, ARITH_XOR, ret, ret);
637 return;
638 }
639 if (arg == (uint32_t)arg || type == TCG_TYPE_I32) {
640 tcg_out_opc(s, OPC_MOVL_Iv + LOWREGMASK(ret), 0, ret, 0);
641 tcg_out32(s, arg);
642 return;
643 }
644 if (arg == (int32_t)arg) {
645 tcg_out_modrm(s, OPC_MOVL_EvIz + P_REXW, 0, ret);
646 tcg_out32(s, arg);
647 return;
648 }
649
650 /* Try a 7 byte pc-relative lea before the 10 byte movq. */
651 diff = arg - ((uintptr_t)s->code_ptr + 7);
652 if (diff == (int32_t)diff) {
653 tcg_out_opc(s, OPC_LEA | P_REXW, ret, 0, 0);
654 tcg_out8(s, (LOWREGMASK(ret) << 3) | 5);
655 tcg_out32(s, diff);
656 return;
657 }
658
659 tcg_out_opc(s, OPC_MOVL_Iv + P_REXW + LOWREGMASK(ret), 0, ret, 0);
660 tcg_out64(s, arg);
661 }
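/* For reference, the cases above are ordered by size: xor is 2-3 bytes,
   the zero-extended mov-immediate 5-6 bytes, the sign-extended 64-bit
   mov 7 bytes, the pc-relative lea 7 bytes, and the full movq immediate
   10 bytes.  */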
662
663 static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
664 {
665 if (val == (int8_t)val) {
666 tcg_out_opc(s, OPC_PUSH_Ib, 0, 0, 0);
667 tcg_out8(s, val);
668 } else if (val == (int32_t)val) {
669 tcg_out_opc(s, OPC_PUSH_Iv, 0, 0, 0);
670 tcg_out32(s, val);
671 } else {
672 tcg_abort();
673 }
674 }
675
676 static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
677 {
678 /* Given the strength of x86 memory ordering, we only need to care about
679 store-load ordering. Experimentally, "lock orl $0,0(%esp)" is
680 faster than "mfence", so don't bother with the SSE insn. */
681 if (a0 & TCG_MO_ST_LD) {
682 tcg_out8(s, 0xf0);
683 tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
684 tcg_out8(s, 0);
685 }
686 }
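/* For reference, the barrier above assembles to 0xf0 0x83 0x0c 0x24 0x00,
   i.e. "lock orl $0, (%esp)"; the SIB byte appears because using %esp as a
   base register requires the two-byte ModRM form.  */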
687
688 static inline void tcg_out_push(TCGContext *s, int reg)
689 {
690 tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
691 }
692
693 static inline void tcg_out_pop(TCGContext *s, int reg)
694 {
695 tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0);
696 }
697
698 static inline void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
699 TCGReg arg1, intptr_t arg2)
700 {
701 int opc = OPC_MOVL_GvEv + (type == TCG_TYPE_I64 ? P_REXW : 0);
702 tcg_out_modrm_offset(s, opc, ret, arg1, arg2);
703 }
704
705 static inline void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
706 TCGReg arg1, intptr_t arg2)
707 {
708 int opc = OPC_MOVL_EvGv + (type == TCG_TYPE_I64 ? P_REXW : 0);
709 tcg_out_modrm_offset(s, opc, arg, arg1, arg2);
710 }
711
712 static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
713 TCGReg base, intptr_t ofs)
714 {
715 int rexw = 0;
716 if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64) {
717 if (val != (int32_t)val) {
718 return false;
719 }
720 rexw = P_REXW;
721 }
722 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs);
723 tcg_out32(s, val);
724 return true;
725 }
726
727 static void tcg_out_shifti(TCGContext *s, int subopc, int reg, int count)
728 {
729 /* Propagate an opcode prefix, such as P_DATA16. */
730 int ext = subopc & ~0x7;
731 subopc &= 0x7;
732
733 if (count == 1) {
734 tcg_out_modrm(s, OPC_SHIFT_1 + ext, subopc, reg);
735 } else {
736 tcg_out_modrm(s, OPC_SHIFT_Ib + ext, subopc, reg);
737 tcg_out8(s, count);
738 }
739 }
740
741 static inline void tcg_out_bswap32(TCGContext *s, int reg)
742 {
743 tcg_out_opc(s, OPC_BSWAP + LOWREGMASK(reg), 0, reg, 0);
744 }
745
746 static inline void tcg_out_rolw_8(TCGContext *s, int reg)
747 {
748 tcg_out_shifti(s, SHIFT_ROL + P_DATA16, reg, 8);
749 }
750
751 static inline void tcg_out_ext8u(TCGContext *s, int dest, int src)
752 {
753 /* movzbl */
754 tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
755 tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, dest, src);
756 }
757
758 static void tcg_out_ext8s(TCGContext *s, int dest, int src, int rexw)
759 {
760 /* movsbl */
761 tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
762 tcg_out_modrm(s, OPC_MOVSBL + P_REXB_RM + rexw, dest, src);
763 }
764
765 static inline void tcg_out_ext16u(TCGContext *s, int dest, int src)
766 {
767 /* movzwl */
768 tcg_out_modrm(s, OPC_MOVZWL, dest, src);
769 }
770
771 static inline void tcg_out_ext16s(TCGContext *s, int dest, int src, int rexw)
772 {
773 /* movsw[lq] */
774 tcg_out_modrm(s, OPC_MOVSWL + rexw, dest, src);
775 }
776
777 static inline void tcg_out_ext32u(TCGContext *s, int dest, int src)
778 {
779 /* 32-bit mov zero extends. */
780 tcg_out_modrm(s, OPC_MOVL_GvEv, dest, src);
781 }
782
783 static inline void tcg_out_ext32s(TCGContext *s, int dest, int src)
784 {
785 tcg_out_modrm(s, OPC_MOVSLQ, dest, src);
786 }
787
788 static inline void tcg_out_bswap64(TCGContext *s, int reg)
789 {
790 tcg_out_opc(s, OPC_BSWAP + P_REXW + LOWREGMASK(reg), 0, reg, 0);
791 }
792
793 static void tgen_arithi(TCGContext *s, int c, int r0,
794 tcg_target_long val, int cf)
795 {
796 int rexw = 0;
797
798 if (TCG_TARGET_REG_BITS == 64) {
799 rexw = c & -8;
800 c &= 7;
801 }
802
803 /* ??? While INC/DEC are 2 bytes shorter than ADDL/SUBL $1, they also
804 induce partial flags update stalls on Pentium4 and are not recommended
805 by current Intel optimization manuals. */
806 if (!cf && (c == ARITH_ADD || c == ARITH_SUB) && (val == 1 || val == -1)) {
807 int is_inc = (c == ARITH_ADD) ^ (val < 0);
808 if (TCG_TARGET_REG_BITS == 64) {
809 /* The single-byte increment encodings are re-tasked as the
810 REX prefixes. Use the MODRM encoding. */
811 tcg_out_modrm(s, OPC_GRP5 + rexw,
812 (is_inc ? EXT5_INC_Ev : EXT5_DEC_Ev), r0);
813 } else {
814 tcg_out8(s, (is_inc ? OPC_INC_r32 : OPC_DEC_r32) + r0);
815 }
816 return;
817 }
818
819 if (c == ARITH_AND) {
820 if (TCG_TARGET_REG_BITS == 64) {
821 if (val == 0xffffffffu) {
822 tcg_out_ext32u(s, r0, r0);
823 return;
824 }
825 if (val == (uint32_t)val) {
826 /* AND with no high bits set can use a 32-bit operation. */
827 rexw = 0;
828 }
829 }
830 if (val == 0xffu && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
831 tcg_out_ext8u(s, r0, r0);
832 return;
833 }
834 if (val == 0xffffu) {
835 tcg_out_ext16u(s, r0, r0);
836 return;
837 }
838 }
839
840 if (val == (int8_t)val) {
841 tcg_out_modrm(s, OPC_ARITH_EvIb + rexw, c, r0);
842 tcg_out8(s, val);
843 return;
844 }
845 if (rexw == 0 || val == (int32_t)val) {
846 tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0);
847 tcg_out32(s, val);
848 return;
849 }
850
851 tcg_abort();
852 }
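/* Examples of the special cases above: tgen_arithi(s, ARITH_AND, r, 0xff, 0)
   becomes a movzbl when r is byte-addressable; an ADD/SUB of +/-1 with the
   flags unused becomes inc/dec (or its ModRM form on x86-64); and
   tgen_arithi(s, ARITH_ADD + P_REXW, TCG_REG_RAX, 8, 0) takes the imm8
   path, emitting 0x48 0x83 0xc0 0x08, i.e. "addq $8, %rax".  */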
853
854 static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
855 {
856 if (val != 0) {
857 tgen_arithi(s, ARITH_ADD + P_REXW, reg, val, 0);
858 }
859 }
860
861 /* Use SMALL != 0 to force a short forward branch. */
862 static void tcg_out_jxx(TCGContext *s, int opc, TCGLabel *l, int small)
863 {
864 int32_t val, val1;
865
866 if (l->has_value) {
867 val = tcg_pcrel_diff(s, l->u.value_ptr);
868 val1 = val - 2;
869 if ((int8_t)val1 == val1) {
870 if (opc == -1) {
871 tcg_out8(s, OPC_JMP_short);
872 } else {
873 tcg_out8(s, OPC_JCC_short + opc);
874 }
875 tcg_out8(s, val1);
876 } else {
877 if (small) {
878 tcg_abort();
879 }
880 if (opc == -1) {
881 tcg_out8(s, OPC_JMP_long);
882 tcg_out32(s, val - 5);
883 } else {
884 tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
885 tcg_out32(s, val - 6);
886 }
887 }
888 } else if (small) {
889 if (opc == -1) {
890 tcg_out8(s, OPC_JMP_short);
891 } else {
892 tcg_out8(s, OPC_JCC_short + opc);
893 }
894 tcg_out_reloc(s, s->code_ptr, R_386_PC8, l, -1);
895 s->code_ptr += 1;
896 } else {
897 if (opc == -1) {
898 tcg_out8(s, OPC_JMP_long);
899 } else {
900 tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
901 }
902 tcg_out_reloc(s, s->code_ptr, R_386_PC32, l, -4);
903 s->code_ptr += 4;
904 }
905 }
906
907 static void tcg_out_cmp(TCGContext *s, TCGArg arg1, TCGArg arg2,
908 int const_arg2, int rexw)
909 {
910 if (const_arg2) {
911 if (arg2 == 0) {
912 /* test r, r */
913 tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg1);
914 } else {
915 tgen_arithi(s, ARITH_CMP + rexw, arg1, arg2, 0);
916 }
917 } else {
918 tgen_arithr(s, ARITH_CMP + rexw, arg1, arg2);
919 }
920 }
921
922 static void tcg_out_brcond32(TCGContext *s, TCGCond cond,
923 TCGArg arg1, TCGArg arg2, int const_arg2,
924 TCGLabel *label, int small)
925 {
926 tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
927 tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
928 }
929
930 #if TCG_TARGET_REG_BITS == 64
931 static void tcg_out_brcond64(TCGContext *s, TCGCond cond,
932 TCGArg arg1, TCGArg arg2, int const_arg2,
933 TCGLabel *label, int small)
934 {
935 tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
936 tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
937 }
938 #else
939 /* XXX: we implement it at the target level to avoid having to
940 handle temporaries that live across basic blocks. */
941 static void tcg_out_brcond2(TCGContext *s, const TCGArg *args,
942 const int *const_args, int small)
943 {
944 TCGLabel *label_next = gen_new_label();
945 TCGLabel *label_this = arg_label(args[5]);
946
947 switch(args[4]) {
948 case TCG_COND_EQ:
949 tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
950 label_next, 1);
951 tcg_out_brcond32(s, TCG_COND_EQ, args[1], args[3], const_args[3],
952 label_this, small);
953 break;
954 case TCG_COND_NE:
955 tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
956 label_this, small);
957 tcg_out_brcond32(s, TCG_COND_NE, args[1], args[3], const_args[3],
958 label_this, small);
959 break;
960 case TCG_COND_LT:
961 tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
962 label_this, small);
963 tcg_out_jxx(s, JCC_JNE, label_next, 1);
964 tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
965 label_this, small);
966 break;
967 case TCG_COND_LE:
968 tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
969 label_this, small);
970 tcg_out_jxx(s, JCC_JNE, label_next, 1);
971 tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
972 label_this, small);
973 break;
974 case TCG_COND_GT:
975 tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
976 label_this, small);
977 tcg_out_jxx(s, JCC_JNE, label_next, 1);
978 tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
979 label_this, small);
980 break;
981 case TCG_COND_GE:
982 tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
983 label_this, small);
984 tcg_out_jxx(s, JCC_JNE, label_next, 1);
985 tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
986 label_this, small);
987 break;
988 case TCG_COND_LTU:
989 tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
990 label_this, small);
991 tcg_out_jxx(s, JCC_JNE, label_next, 1);
992 tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
993 label_this, small);
994 break;
995 case TCG_COND_LEU:
996 tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
997 label_this, small);
998 tcg_out_jxx(s, JCC_JNE, label_next, 1);
999 tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
1000 label_this, small);
1001 break;
1002 case TCG_COND_GTU:
1003 tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
1004 label_this, small);
1005 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1006 tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
1007 label_this, small);
1008 break;
1009 case TCG_COND_GEU:
1010 tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
1011 label_this, small);
1012 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1013 tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
1014 label_this, small);
1015 break;
1016 default:
1017 tcg_abort();
1018 }
1019 tcg_out_label(s, label_next, s->code_ptr);
1020 }
1021 #endif
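/* The double-word comparisons above all follow the same pattern, e.g. for
   a signed 64-bit LT on this 32-bit host:
       branch to the target if   hi(a) <  hi(b)   (signed),
       skip to the fallthrough if hi(a) != hi(b),
       branch to the target if   lo(a) <u lo(b)   (unsigned),
   i.e. (ah < bh) || (ah == bh && al <u bl).  */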
1022
1023 static void tcg_out_setcond32(TCGContext *s, TCGCond cond, TCGArg dest,
1024 TCGArg arg1, TCGArg arg2, int const_arg2)
1025 {
1026 tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
1027 tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
1028 tcg_out_ext8u(s, dest, dest);
1029 }
1030
1031 #if TCG_TARGET_REG_BITS == 64
1032 static void tcg_out_setcond64(TCGContext *s, TCGCond cond, TCGArg dest,
1033 TCGArg arg1, TCGArg arg2, int const_arg2)
1034 {
1035 tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
1036 tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
1037 tcg_out_ext8u(s, dest, dest);
1038 }
1039 #else
1040 static void tcg_out_setcond2(TCGContext *s, const TCGArg *args,
1041 const int *const_args)
1042 {
1043 TCGArg new_args[6];
1044 TCGLabel *label_true, *label_over;
1045
1046 memcpy(new_args, args+1, 5*sizeof(TCGArg));
1047
1048 if (args[0] == args[1] || args[0] == args[2]
1049 || (!const_args[3] && args[0] == args[3])
1050 || (!const_args[4] && args[0] == args[4])) {
1051 /* When the destination overlaps with one of the argument
1052 registers, don't do anything tricky. */
1053 label_true = gen_new_label();
1054 label_over = gen_new_label();
1055
1056 new_args[5] = label_arg(label_true);
1057 tcg_out_brcond2(s, new_args, const_args+1, 1);
1058
1059 tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1060 tcg_out_jxx(s, JCC_JMP, label_over, 1);
1061 tcg_out_label(s, label_true, s->code_ptr);
1062
1063 tcg_out_movi(s, TCG_TYPE_I32, args[0], 1);
1064 tcg_out_label(s, label_over, s->code_ptr);
1065 } else {
1066 /* When the destination does not overlap one of the arguments,
1067 clear the destination first, jump if cond false, and emit an
1068 increment in the true case. This results in smaller code. */
1069
1070 tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1071
1072 label_over = gen_new_label();
1073 new_args[4] = tcg_invert_cond(new_args[4]);
1074 new_args[5] = label_arg(label_over);
1075 tcg_out_brcond2(s, new_args, const_args+1, 1);
1076
1077 tgen_arithi(s, ARITH_ADD, args[0], 1, 0);
1078 tcg_out_label(s, label_over, s->code_ptr);
1079 }
1080 }
1081 #endif
1082
1083 static void tcg_out_movcond32(TCGContext *s, TCGCond cond, TCGArg dest,
1084 TCGArg c1, TCGArg c2, int const_c2,
1085 TCGArg v1)
1086 {
1087 tcg_out_cmp(s, c1, c2, const_c2, 0);
1088 if (have_cmov) {
1089 tcg_out_modrm(s, OPC_CMOVCC | tcg_cond_to_jcc[cond], dest, v1);
1090 } else {
1091 TCGLabel *over = gen_new_label();
1092 tcg_out_jxx(s, tcg_cond_to_jcc[tcg_invert_cond(cond)], over, 1);
1093 tcg_out_mov(s, TCG_TYPE_I32, dest, v1);
1094 tcg_out_label(s, over, s->code_ptr);
1095 }
1096 }
1097
1098 #if TCG_TARGET_REG_BITS == 64
1099 static void tcg_out_movcond64(TCGContext *s, TCGCond cond, TCGArg dest,
1100 TCGArg c1, TCGArg c2, int const_c2,
1101 TCGArg v1)
1102 {
1103 tcg_out_cmp(s, c1, c2, const_c2, P_REXW);
1104 tcg_out_modrm(s, OPC_CMOVCC | tcg_cond_to_jcc[cond] | P_REXW, dest, v1);
1105 }
1106 #endif
1107
1108 static void tcg_out_branch(TCGContext *s, int call, tcg_insn_unit *dest)
1109 {
1110 intptr_t disp = tcg_pcrel_diff(s, dest) - 5;
1111
1112 if (disp == (int32_t)disp) {
1113 tcg_out_opc(s, call ? OPC_CALL_Jz : OPC_JMP_long, 0, 0, 0);
1114 tcg_out32(s, disp);
1115 } else {
1116 tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R10, (uintptr_t)dest);
1117 tcg_out_modrm(s, OPC_GRP5,
1118 call ? EXT5_CALLN_Ev : EXT5_JMPN_Ev, TCG_REG_R10);
1119 }
1120 }
1121
1122 static inline void tcg_out_call(TCGContext *s, tcg_insn_unit *dest)
1123 {
1124 tcg_out_branch(s, 1, dest);
1125 }
1126
1127 static void tcg_out_jmp(TCGContext *s, tcg_insn_unit *dest)
1128 {
1129 tcg_out_branch(s, 0, dest);
1130 }
1131
1132 static void tcg_out_nopn(TCGContext *s, int n)
1133 {
1134 int i;
1135 /* Emit 1 or 2 operand size prefixes for the standard one byte nop,
1136 * "xchg %eax,%eax", forming "xchg %ax,%ax". All cores accept the
1137 * duplicate prefix, and all of the interesting recent cores can
1138 * decode and discard the duplicates in a single cycle.
1139 */
1140 tcg_debug_assert(n >= 1);
1141 for (i = 1; i < n; ++i) {
1142 tcg_out8(s, 0x66);
1143 }
1144 tcg_out8(s, 0x90);
1145 }
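/* E.g. tcg_out_nopn(s, 1) emits 0x90, n == 2 emits 0x66 0x90, and n == 3
   emits 0x66 0x66 0x90; goto_tb below uses this to align the jump
   displacement for atomic patching.  */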
1146
1147 #if defined(CONFIG_SOFTMMU)
1148 /* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
1149 * int mmu_idx, uintptr_t ra)
1150 */
1151 static void * const qemu_ld_helpers[16] = {
1152 [MO_UB] = helper_ret_ldub_mmu,
1153 [MO_LEUW] = helper_le_lduw_mmu,
1154 [MO_LEUL] = helper_le_ldul_mmu,
1155 [MO_LEQ] = helper_le_ldq_mmu,
1156 [MO_BEUW] = helper_be_lduw_mmu,
1157 [MO_BEUL] = helper_be_ldul_mmu,
1158 [MO_BEQ] = helper_be_ldq_mmu,
1159 };
1160
1161 /* helper signature: helper_ret_st_mmu(CPUState *env, target_ulong addr,
1162 * uintxx_t val, int mmu_idx, uintptr_t ra)
1163 */
1164 static void * const qemu_st_helpers[16] = {
1165 [MO_UB] = helper_ret_stb_mmu,
1166 [MO_LEUW] = helper_le_stw_mmu,
1167 [MO_LEUL] = helper_le_stl_mmu,
1168 [MO_LEQ] = helper_le_stq_mmu,
1169 [MO_BEUW] = helper_be_stw_mmu,
1170 [MO_BEUL] = helper_be_stl_mmu,
1171 [MO_BEQ] = helper_be_stq_mmu,
1172 };
1173
1174 /* Perform the TLB load and compare.
1175
1176 Inputs:
1177 ADDRLO and ADDRHI contain the low and high part of the address.
1178
1179 MEM_INDEX and S_BITS are the memory context and log2 size of the load.
1180
1181 WHICH is the offset into the CPUTLBEntry structure of the slot to read.
1182 This should be offsetof addr_read or addr_write.
1183
1184 Outputs:
1185 LABEL_PTRS is filled with 1 (32-bit addresses) or 2 (64-bit addresses)
1186 positions of the displacements of forward jumps to the TLB miss case.
1187
1188 Second argument register is loaded with the low part of the address.
1189 In the TLB hit case, it has been adjusted as indicated by the TLB
1190 and so is a host address. In the TLB miss case, it continues to
1191 hold a guest address.
1192
1193 First argument register is clobbered. */
1194
1195 static inline void tcg_out_tlb_load(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
1196 int mem_index, TCGMemOp opc,
1197 tcg_insn_unit **label_ptr, int which)
1198 {
1199 const TCGReg r0 = TCG_REG_L0;
1200 const TCGReg r1 = TCG_REG_L1;
1201 TCGType ttype = TCG_TYPE_I32;
1202 TCGType tlbtype = TCG_TYPE_I32;
1203 int trexw = 0, hrexw = 0, tlbrexw = 0;
1204 unsigned a_bits = get_alignment_bits(opc);
1205 unsigned s_bits = opc & MO_SIZE;
1206 unsigned a_mask = (1 << a_bits) - 1;
1207 unsigned s_mask = (1 << s_bits) - 1;
1208 target_ulong tlb_mask;
1209
1210 if (TCG_TARGET_REG_BITS == 64) {
1211 if (TARGET_LONG_BITS == 64) {
1212 ttype = TCG_TYPE_I64;
1213 trexw = P_REXW;
1214 }
1215 if (TCG_TYPE_PTR == TCG_TYPE_I64) {
1216 hrexw = P_REXW;
1217 if (TARGET_PAGE_BITS + CPU_TLB_BITS > 32) {
1218 tlbtype = TCG_TYPE_I64;
1219 tlbrexw = P_REXW;
1220 }
1221 }
1222 }
1223
1224 tcg_out_mov(s, tlbtype, r0, addrlo);
1225 /* If the required alignment is at least as large as the access, simply
1226 copy the address and mask. For lesser alignments, check that we don't
1227 cross pages for the complete access. */
1228 if (a_bits >= s_bits) {
1229 tcg_out_mov(s, ttype, r1, addrlo);
1230 } else {
1231 tcg_out_modrm_offset(s, OPC_LEA + trexw, r1, addrlo, s_mask - a_mask);
1232 }
1233 tlb_mask = (target_ulong)TARGET_PAGE_MASK | a_mask;
1234
1235 tcg_out_shifti(s, SHIFT_SHR + tlbrexw, r0,
1236 TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
1237
1238 tgen_arithi(s, ARITH_AND + trexw, r1, tlb_mask, 0);
1239 tgen_arithi(s, ARITH_AND + tlbrexw, r0,
1240 (CPU_TLB_SIZE - 1) << CPU_TLB_ENTRY_BITS, 0);
1241
1242 tcg_out_modrm_sib_offset(s, OPC_LEA + hrexw, r0, TCG_AREG0, r0, 0,
1243 offsetof(CPUArchState, tlb_table[mem_index][0])
1244 + which);
1245
1246 /* cmp 0(r0), r1 */
1247 tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw, r1, r0, 0);
1248
1249 /* Prepare for both the fast path add of the tlb addend, and the slow
1250 path function argument setup. There are two cases worth note:
1251 For 32-bit guest and x86_64 host, MOVL zero-extends the guest address
1252 before the fastpath ADDQ below. For 64-bit guest and x32 host, MOVQ
1253 copies the entire guest address for the slow path, while truncation
1254 for the 32-bit host happens with the fastpath ADDL below. */
1255 tcg_out_mov(s, ttype, r1, addrlo);
1256
1257 /* jne slow_path */
1258 tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
1259 label_ptr[0] = s->code_ptr;
1260 s->code_ptr += 4;
1261
1262 if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1263 /* cmp 4(r0), addrhi */
1264 tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi, r0, 4);
1265
1266 /* jne slow_path */
1267 tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
1268 label_ptr[1] = s->code_ptr;
1269 s->code_ptr += 4;
1270 }
1271
1272 /* TLB Hit. */
1273
1274 /* add addend(r0), r1 */
1275 tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, r1, r0,
1276 offsetof(CPUTLBEntry, addend) - which);
1277 }
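/* Roughly, for a 64-bit guest on a 64-bit host the fast path emitted above
   looks like (r0 = TCG_REG_L0, r1 = TCG_REG_L1):

       mov   addrlo, r0
       lea   (s_mask - a_mask)(addrlo), r1      # or mov, if a_bits >= s_bits
       shr   $(TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS), r0
       and   $(TARGET_PAGE_MASK | a_mask), r1
       and   $((CPU_TLB_SIZE - 1) << CPU_TLB_ENTRY_BITS), r0
       lea   tlb_table_offset(env, r0), r0
       cmp   (r0), r1
       mov   addrlo, r1
       jne   slow_path
       add   addend_offset(r0), r1              # r1 is now the host address

   where "tlb_table_offset" and "addend_offset" just stand for the two
   offsetof() expressions used above.  */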
1278
1279 /*
1280 * Record the context of a call to the out of line helper code for the slow path
1281 * for a load or store, so that we can later generate the correct helper code
1282 */
1283 static void add_qemu_ldst_label(TCGContext *s, bool is_ld, TCGMemOpIdx oi,
1284 TCGReg datalo, TCGReg datahi,
1285 TCGReg addrlo, TCGReg addrhi,
1286 tcg_insn_unit *raddr,
1287 tcg_insn_unit **label_ptr)
1288 {
1289 TCGLabelQemuLdst *label = new_ldst_label(s);
1290
1291 label->is_ld = is_ld;
1292 label->oi = oi;
1293 label->datalo_reg = datalo;
1294 label->datahi_reg = datahi;
1295 label->addrlo_reg = addrlo;
1296 label->addrhi_reg = addrhi;
1297 label->raddr = raddr;
1298 label->label_ptr[0] = label_ptr[0];
1299 if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1300 label->label_ptr[1] = label_ptr[1];
1301 }
1302 }
1303
1304 /*
1305 * Generate code for the slow path for a load at the end of block
1306 */
1307 static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1308 {
1309 TCGMemOpIdx oi = l->oi;
1310 TCGMemOp opc = get_memop(oi);
1311 TCGReg data_reg;
1312 tcg_insn_unit **label_ptr = &l->label_ptr[0];
1313
1314 /* resolve label address */
1315 tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1316 if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1317 tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1318 }
1319
1320 if (TCG_TARGET_REG_BITS == 32) {
1321 int ofs = 0;
1322
1323 tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
1324 ofs += 4;
1325
1326 tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
1327 ofs += 4;
1328
1329 if (TARGET_LONG_BITS == 64) {
1330 tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
1331 ofs += 4;
1332 }
1333
1334 tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs);
1335 ofs += 4;
1336
1337 tcg_out_sti(s, TCG_TYPE_PTR, (uintptr_t)l->raddr, TCG_REG_ESP, ofs);
1338 } else {
1339 tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
1340 /* The second argument is already loaded with addrlo. */
1341 tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[2], oi);
1342 tcg_out_movi(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[3],
1343 (uintptr_t)l->raddr);
1344 }
1345
1346 tcg_out_call(s, qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)]);
1347
1348 data_reg = l->datalo_reg;
1349 switch (opc & MO_SSIZE) {
1350 case MO_SB:
1351 tcg_out_ext8s(s, data_reg, TCG_REG_EAX, P_REXW);
1352 break;
1353 case MO_SW:
1354 tcg_out_ext16s(s, data_reg, TCG_REG_EAX, P_REXW);
1355 break;
1356 #if TCG_TARGET_REG_BITS == 64
1357 case MO_SL:
1358 tcg_out_ext32s(s, data_reg, TCG_REG_EAX);
1359 break;
1360 #endif
1361 case MO_UB:
1362 case MO_UW:
1363 /* Note that the helpers have zero-extended to tcg_target_long. */
1364 case MO_UL:
1365 tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
1366 break;
1367 case MO_Q:
1368 if (TCG_TARGET_REG_BITS == 64) {
1369 tcg_out_mov(s, TCG_TYPE_I64, data_reg, TCG_REG_RAX);
1370 } else if (data_reg == TCG_REG_EDX) {
1371 /* xchg %edx, %eax */
1372 tcg_out_opc(s, OPC_XCHG_ax_r32 + TCG_REG_EDX, 0, 0, 0);
1373 tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EAX);
1374 } else {
1375 tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
1376 tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EDX);
1377 }
1378 break;
1379 default:
1380 tcg_abort();
1381 }
1382
1383 /* Jump back to the code following the qemu_ld. */
1384 tcg_out_jmp(s, l->raddr);
1385 }
1386
1387 /*
1388 * Generate code for the slow path for a store at the end of block
1389 */
1390 static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1391 {
1392 TCGMemOpIdx oi = l->oi;
1393 TCGMemOp opc = get_memop(oi);
1394 TCGMemOp s_bits = opc & MO_SIZE;
1395 tcg_insn_unit **label_ptr = &l->label_ptr[0];
1396 TCGReg retaddr;
1397
1398 /* resolve label address */
1399 tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1400 if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1401 tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1402 }
1403
1404 if (TCG_TARGET_REG_BITS == 32) {
1405 int ofs = 0;
1406
1407 tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
1408 ofs += 4;
1409
1410 tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
1411 ofs += 4;
1412
1413 if (TARGET_LONG_BITS == 64) {
1414 tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
1415 ofs += 4;
1416 }
1417
1418 tcg_out_st(s, TCG_TYPE_I32, l->datalo_reg, TCG_REG_ESP, ofs);
1419 ofs += 4;
1420
1421 if (s_bits == MO_64) {
1422 tcg_out_st(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_ESP, ofs);
1423 ofs += 4;
1424 }
1425
1426 tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs);
1427 ofs += 4;
1428
1429 retaddr = TCG_REG_EAX;
1430 tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1431 tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP, ofs);
1432 } else {
1433 tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
1434 /* The second argument is already loaded with addrlo. */
1435 tcg_out_mov(s, (s_bits == MO_64 ? TCG_TYPE_I64 : TCG_TYPE_I32),
1436 tcg_target_call_iarg_regs[2], l->datalo_reg);
1437 tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[3], oi);
1438
1439 if (ARRAY_SIZE(tcg_target_call_iarg_regs) > 4) {
1440 retaddr = tcg_target_call_iarg_regs[4];
1441 tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1442 } else {
1443 retaddr = TCG_REG_RAX;
1444 tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1445 tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP,
1446 TCG_TARGET_CALL_STACK_OFFSET);
1447 }
1448 }
1449
1450 /* "Tail call" to the helper, with the return address back inline. */
1451 tcg_out_push(s, retaddr);
1452 tcg_out_jmp(s, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)]);
1453 }
1454 #elif defined(__x86_64__) && defined(__linux__)
1455 # include <asm/prctl.h>
1456 # include <sys/prctl.h>
1457
1458 int arch_prctl(int code, unsigned long addr);
1459
1460 static int guest_base_flags;
1461 static inline void setup_guest_base_seg(void)
1462 {
1463 if (arch_prctl(ARCH_SET_GS, guest_base) == 0) {
1464 guest_base_flags = P_GS;
1465 }
1466 }
1467 #else
1468 # define guest_base_flags 0
1469 static inline void setup_guest_base_seg(void) { }
1470 #endif /* SOFTMMU */
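/* For user-only emulation on x86-64 Linux, the arch_prctl(ARCH_SET_GS, ...)
   call above lets us address guest memory as %gs-relative: guest_base_flags
   becomes P_GS, which tcg_out_opc turns into the 0x65 segment-override
   prefix, so the qemu_ld/st paths below can pass the guest address through
   unchanged instead of adding guest_base explicitly.  */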
1471
1472 static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
1473 TCGReg base, int index, intptr_t ofs,
1474 int seg, TCGMemOp memop)
1475 {
1476 const TCGMemOp real_bswap = memop & MO_BSWAP;
1477 TCGMemOp bswap = real_bswap;
1478 int movop = OPC_MOVL_GvEv;
1479
1480 if (have_movbe && real_bswap) {
1481 bswap = 0;
1482 movop = OPC_MOVBE_GyMy;
1483 }
1484
1485 switch (memop & MO_SSIZE) {
1486 case MO_UB:
1487 tcg_out_modrm_sib_offset(s, OPC_MOVZBL + seg, datalo,
1488 base, index, 0, ofs);
1489 break;
1490 case MO_SB:
1491 tcg_out_modrm_sib_offset(s, OPC_MOVSBL + P_REXW + seg, datalo,
1492 base, index, 0, ofs);
1493 break;
1494 case MO_UW:
1495 tcg_out_modrm_sib_offset(s, OPC_MOVZWL + seg, datalo,
1496 base, index, 0, ofs);
1497 if (real_bswap) {
1498 tcg_out_rolw_8(s, datalo);
1499 }
1500 break;
1501 case MO_SW:
1502 if (real_bswap) {
1503 if (have_movbe) {
1504 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
1505 datalo, base, index, 0, ofs);
1506 } else {
1507 tcg_out_modrm_sib_offset(s, OPC_MOVZWL + seg, datalo,
1508 base, index, 0, ofs);
1509 tcg_out_rolw_8(s, datalo);
1510 }
1511 tcg_out_modrm(s, OPC_MOVSWL + P_REXW, datalo, datalo);
1512 } else {
1513 tcg_out_modrm_sib_offset(s, OPC_MOVSWL + P_REXW + seg,
1514 datalo, base, index, 0, ofs);
1515 }
1516 break;
1517 case MO_UL:
1518 tcg_out_modrm_sib_offset(s, movop + seg, datalo, base, index, 0, ofs);
1519 if (bswap) {
1520 tcg_out_bswap32(s, datalo);
1521 }
1522 break;
1523 #if TCG_TARGET_REG_BITS == 64
1524 case MO_SL:
1525 if (real_bswap) {
1526 tcg_out_modrm_sib_offset(s, movop + seg, datalo,
1527 base, index, 0, ofs);
1528 if (bswap) {
1529 tcg_out_bswap32(s, datalo);
1530 }
1531 tcg_out_ext32s(s, datalo, datalo);
1532 } else {
1533 tcg_out_modrm_sib_offset(s, OPC_MOVSLQ + seg, datalo,
1534 base, index, 0, ofs);
1535 }
1536 break;
1537 #endif
1538 case MO_Q:
1539 if (TCG_TARGET_REG_BITS == 64) {
1540 tcg_out_modrm_sib_offset(s, movop + P_REXW + seg, datalo,
1541 base, index, 0, ofs);
1542 if (bswap) {
1543 tcg_out_bswap64(s, datalo);
1544 }
1545 } else {
1546 if (real_bswap) {
1547 int t = datalo;
1548 datalo = datahi;
1549 datahi = t;
1550 }
1551 if (base != datalo) {
1552 tcg_out_modrm_sib_offset(s, movop + seg, datalo,
1553 base, index, 0, ofs);
1554 tcg_out_modrm_sib_offset(s, movop + seg, datahi,
1555 base, index, 0, ofs + 4);
1556 } else {
1557 tcg_out_modrm_sib_offset(s, movop + seg, datahi,
1558 base, index, 0, ofs + 4);
1559 tcg_out_modrm_sib_offset(s, movop + seg, datalo,
1560 base, index, 0, ofs);
1561 }
1562 if (bswap) {
1563 tcg_out_bswap32(s, datalo);
1564 tcg_out_bswap32(s, datahi);
1565 }
1566 }
1567 break;
1568 default:
1569 tcg_abort();
1570 }
1571 }
1572
1573 /* XXX: qemu_ld and qemu_st could be modified to clobber only EDX and
1574 EAX. That will be useful once fixed-register globals are less
1575 common. */
1576 static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64)
1577 {
1578 TCGReg datalo, datahi, addrlo;
1579 TCGReg addrhi __attribute__((unused));
1580 TCGMemOpIdx oi;
1581 TCGMemOp opc;
1582 #if defined(CONFIG_SOFTMMU)
1583 int mem_index;
1584 tcg_insn_unit *label_ptr[2];
1585 #endif
1586
1587 datalo = *args++;
1588 datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
1589 addrlo = *args++;
1590 addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
1591 oi = *args++;
1592 opc = get_memop(oi);
1593
1594 #if defined(CONFIG_SOFTMMU)
1595 mem_index = get_mmuidx(oi);
1596
1597 tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
1598 label_ptr, offsetof(CPUTLBEntry, addr_read));
1599
1600 /* TLB Hit. */
1601 tcg_out_qemu_ld_direct(s, datalo, datahi, TCG_REG_L1, -1, 0, 0, opc);
1602
1603 /* Record the current context of a load into ldst label */
1604 add_qemu_ldst_label(s, true, oi, datalo, datahi, addrlo, addrhi,
1605 s->code_ptr, label_ptr);
1606 #else
1607 {
1608 int32_t offset = guest_base;
1609 TCGReg base = addrlo;
1610 int index = -1;
1611 int seg = 0;
1612
1613 /* For a 32-bit guest, the high 32 bits of the address register may
1614 contain garbage, so the address must be truncated to 32 bits.
1615 The ADDR32 prefix does that for us if we're not using a guest base,
1616 or when using segmentation. Otherwise we need to zero-extend manually. */
1617 if (guest_base == 0 || guest_base_flags) {
1618 seg = guest_base_flags;
1619 offset = 0;
1620 if (TCG_TARGET_REG_BITS > TARGET_LONG_BITS) {
1621 seg |= P_ADDR32;
1622 }
1623 } else if (TCG_TARGET_REG_BITS == 64) {
1624 if (TARGET_LONG_BITS == 32) {
1625 tcg_out_ext32u(s, TCG_REG_L0, base);
1626 base = TCG_REG_L0;
1627 }
1628 if (offset != guest_base) {
1629 tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_L1, guest_base);
1630 index = TCG_REG_L1;
1631 offset = 0;
1632 }
1633 }
1634
1635 tcg_out_qemu_ld_direct(s, datalo, datahi,
1636 base, index, offset, seg, opc);
1637 }
1638 #endif
1639 }
1640
1641 static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
1642 TCGReg base, intptr_t ofs, int seg,
1643 TCGMemOp memop)
1644 {
1645 /* ??? Ideally we wouldn't need a scratch register. For user-only,
1646 we could perform the bswap twice to restore the original value
1647 instead of moving to the scratch. But as it is, the L constraint
1648 means that TCG_REG_L0 is definitely free here. */
1649 const TCGReg scratch = TCG_REG_L0;
1650 const TCGMemOp real_bswap = memop & MO_BSWAP;
1651 TCGMemOp bswap = real_bswap;
1652 int movop = OPC_MOVL_EvGv;
1653
1654 if (have_movbe && real_bswap) {
1655 bswap = 0;
1656 movop = OPC_MOVBE_MyGy;
1657 }
1658
1659 switch (memop & MO_SIZE) {
1660 case MO_8:
1661 /* In 32-bit mode, 8-bit stores can only happen from %[abcd]l,
1662 i.e. registers 0-3. Use the scratch register if necessary. */
1663 if (TCG_TARGET_REG_BITS == 32 && datalo >= 4) {
1664 tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
1665 datalo = scratch;
1666 }
1667 tcg_out_modrm_offset(s, OPC_MOVB_EvGv + P_REXB_R + seg,
1668 datalo, base, ofs);
1669 break;
1670 case MO_16:
1671 if (bswap) {
1672 tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
1673 tcg_out_rolw_8(s, scratch);
1674 datalo = scratch;
1675 }
1676 tcg_out_modrm_offset(s, movop + P_DATA16 + seg, datalo, base, ofs);
1677 break;
1678 case MO_32:
1679 if (bswap) {
1680 tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
1681 tcg_out_bswap32(s, scratch);
1682 datalo = scratch;
1683 }
1684 tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
1685 break;
1686 case MO_64:
1687 if (TCG_TARGET_REG_BITS == 64) {
1688 if (bswap) {
1689 tcg_out_mov(s, TCG_TYPE_I64, scratch, datalo);
1690 tcg_out_bswap64(s, scratch);
1691 datalo = scratch;
1692 }
1693 tcg_out_modrm_offset(s, movop + P_REXW + seg, datalo, base, ofs);
1694 } else if (bswap) {
1695 tcg_out_mov(s, TCG_TYPE_I32, scratch, datahi);
1696 tcg_out_bswap32(s, scratch);
1697 tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, scratch, base, ofs);
1698 tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
1699 tcg_out_bswap32(s, scratch);
1700 tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, scratch, base, ofs+4);
1701 } else {
1702 if (real_bswap) {
1703 int t = datalo;
1704 datalo = datahi;
1705 datahi = t;
1706 }
1707 tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
1708 tcg_out_modrm_offset(s, movop + seg, datahi, base, ofs+4);
1709 }
1710 break;
1711 default:
1712 tcg_abort();
1713 }
1714 }
1715
1716 static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
1717 {
1718 TCGReg datalo, datahi, addrlo;
1719 TCGReg addrhi __attribute__((unused));
1720 TCGMemOpIdx oi;
1721 TCGMemOp opc;
1722 #if defined(CONFIG_SOFTMMU)
1723 int mem_index;
1724 tcg_insn_unit *label_ptr[2];
1725 #endif
1726
1727 datalo = *args++;
1728 datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
1729 addrlo = *args++;
1730 addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
1731 oi = *args++;
1732 opc = get_memop(oi);
1733
1734 #if defined(CONFIG_SOFTMMU)
1735 mem_index = get_mmuidx(oi);
1736
1737 tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
1738 label_ptr, offsetof(CPUTLBEntry, addr_write));
1739
1740 /* TLB Hit. */
1741 tcg_out_qemu_st_direct(s, datalo, datahi, TCG_REG_L1, 0, 0, opc);
1742
1743 /* Record the current context of a store into ldst label */
1744 add_qemu_ldst_label(s, false, oi, datalo, datahi, addrlo, addrhi,
1745 s->code_ptr, label_ptr);
1746 #else
1747 {
1748 int32_t offset = guest_base;
1749 TCGReg base = addrlo;
1750 int seg = 0;
1751
1752 /* See comment in tcg_out_qemu_ld re zero-extension of addrlo. */
1753 if (guest_base == 0 || guest_base_flags) {
1754 seg = guest_base_flags;
1755 offset = 0;
1756 if (TCG_TARGET_REG_BITS > TARGET_LONG_BITS) {
1757 seg |= P_ADDR32;
1758 }
1759 } else if (TCG_TARGET_REG_BITS == 64) {
1760 /* ??? Note that we can't use the same SIB addressing scheme
1761 as for loads, since we require L0 free for bswap. */
1762 if (offset != guest_base) {
1763 if (TARGET_LONG_BITS == 32) {
1764 tcg_out_ext32u(s, TCG_REG_L0, base);
1765 base = TCG_REG_L0;
1766 }
1767 tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_L1, guest_base);
1768 tgen_arithr(s, ARITH_ADD + P_REXW, TCG_REG_L1, base);
1769 base = TCG_REG_L1;
1770 offset = 0;
1771 } else if (TARGET_LONG_BITS == 32) {
1772 tcg_out_ext32u(s, TCG_REG_L1, base);
1773 base = TCG_REG_L1;
1774 }
1775 }
1776
1777 tcg_out_qemu_st_direct(s, datalo, datahi, base, offset, seg, opc);
1778 }
1779 #endif
1780 }
1781
1782 static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
1783 const TCGArg *args, const int *const_args)
1784 {
1785 TCGArg a0, a1, a2;
1786 int c, const_a2, vexop, rexw = 0;
1787
1788 #if TCG_TARGET_REG_BITS == 64
1789 # define OP_32_64(x) \
1790 case glue(glue(INDEX_op_, x), _i64): \
1791 rexw = P_REXW; /* FALLTHRU */ \
1792 case glue(glue(INDEX_op_, x), _i32)
1793 #else
1794 # define OP_32_64(x) \
1795 case glue(glue(INDEX_op_, x), _i32)
1796 #endif
1797
1798 /* Hoist the loads of the most common arguments. */
1799 a0 = args[0];
1800 a1 = args[1];
1801 a2 = args[2];
1802 const_a2 = const_args[2];
1803
1804 switch (opc) {
1805 case INDEX_op_exit_tb:
1806 tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0);
1807 tcg_out_jmp(s, tb_ret_addr);
1808 break;
1809 case INDEX_op_goto_tb:
1810 if (s->tb_jmp_insn_offset) {
1811 /* direct jump method */
1812 int gap;
1813 /* jump displacement must be aligned for atomic patching;
1814 * see if we need to add extra nops before jump
1815 */
1816 gap = tcg_pcrel_diff(s, QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4));
1817 if (gap != 1) {
1818 tcg_out_nopn(s, gap - 1);
1819 }
1820 tcg_out8(s, OPC_JMP_long); /* jmp im */
1821 s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s);
1822 tcg_out32(s, 0);
1823 } else {
1824 /* indirect jump method */
1825 tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, -1,
1826 (intptr_t)(s->tb_jmp_target_addr + a0));
1827 }
1828 s->tb_jmp_reset_offset[a0] = tcg_current_code_size(s);
1829 break;
1830 case INDEX_op_br:
1831 tcg_out_jxx(s, JCC_JMP, arg_label(a0), 0);
1832 break;
1833 OP_32_64(ld8u):
1834 /* Note that we can ignore REXW for the zero-extend to 64-bit. */
1835 tcg_out_modrm_offset(s, OPC_MOVZBL, a0, a1, a2);
1836 break;
1837 OP_32_64(ld8s):
1838 tcg_out_modrm_offset(s, OPC_MOVSBL + rexw, a0, a1, a2);
1839 break;
1840 OP_32_64(ld16u):
1841 /* Note that we can ignore REXW for the zero-extend to 64-bit. */
1842 tcg_out_modrm_offset(s, OPC_MOVZWL, a0, a1, a2);
1843 break;
1844 OP_32_64(ld16s):
1845 tcg_out_modrm_offset(s, OPC_MOVSWL + rexw, a0, a1, a2);
1846 break;
1847 #if TCG_TARGET_REG_BITS == 64
1848 case INDEX_op_ld32u_i64:
1849 #endif
1850 case INDEX_op_ld_i32:
1851 tcg_out_ld(s, TCG_TYPE_I32, a0, a1, a2);
1852 break;
1853
1854 OP_32_64(st8):
1855 if (const_args[0]) {
1856 tcg_out_modrm_offset(s, OPC_MOVB_EvIz, 0, a1, a2);
1857 tcg_out8(s, a0);
1858 } else {
1859 tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R, a0, a1, a2);
1860 }
1861 break;
1862 OP_32_64(st16):
1863 if (const_args[0]) {
1864 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_DATA16, 0, a1, a2);
1865 tcg_out16(s, a0);
1866 } else {
1867 tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16, a0, a1, a2);
1868 }
1869 break;
1870 #if TCG_TARGET_REG_BITS == 64
1871 case INDEX_op_st32_i64:
1872 #endif
1873 case INDEX_op_st_i32:
1874 if (const_args[0]) {
1875 tcg_out_modrm_offset(s, OPC_MOVL_EvIz, 0, a1, a2);
1876 tcg_out32(s, a0);
1877 } else {
1878 tcg_out_st(s, TCG_TYPE_I32, a0, a1, a2);
1879 }
1880 break;
1881
1882 OP_32_64(add):
1883 /* For 3-operand addition, use LEA. */
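/* e.g. add d,s,$imm becomes lea imm(s),d and add d,s1,s2 becomes
   lea (s1,s2),d, so no input register is clobbered.  */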
1884 if (a0 != a1) {
1885 TCGArg c3 = 0;
1886 if (const_a2) {
1887 c3 = a2, a2 = -1;
1888 } else if (a0 == a2) {
1889 /* Watch out for dest = src + dest, since we've removed
1890 the matching constraint on the add. */
1891 tgen_arithr(s, ARITH_ADD + rexw, a0, a1);
1892 break;
1893 }
1894
1895 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a2, 0, c3);
1896 break;
1897 }
1898 c = ARITH_ADD;
1899 goto gen_arith;
1900 OP_32_64(sub):
1901 c = ARITH_SUB;
1902 goto gen_arith;
1903 OP_32_64(and):
1904 c = ARITH_AND;
1905 goto gen_arith;
1906 OP_32_64(or):
1907 c = ARITH_OR;
1908 goto gen_arith;
1909 OP_32_64(xor):
1910 c = ARITH_XOR;
1911 goto gen_arith;
1912 gen_arith:
1913 if (const_a2) {
1914 tgen_arithi(s, c + rexw, a0, a2, 0);
1915 } else {
1916 tgen_arithr(s, c + rexw, a0, a2);
1917 }
1918 break;
1919
1920 OP_32_64(andc):
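/* With a constant operand, synthesize andc as a move plus AND with the
   complemented immediate; otherwise emit the VEX-encoded BMI1 ANDN,
   which does not require a matching destination.  */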
1921 if (const_a2) {
1922 tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
1923 tgen_arithi(s, ARITH_AND + rexw, a0, ~a2, 0);
1924 } else {
1925 tcg_out_vex_modrm(s, OPC_ANDN + rexw, a0, a2, a1);
1926 }
1927 break;
1928
1929 OP_32_64(mul):
1930 if (const_a2) {
1931 int32_t val;
1932 val = a2;
1933 if (val == (int8_t)val) {
1934 tcg_out_modrm(s, OPC_IMUL_GvEvIb + rexw, a0, a0);
1935 tcg_out8(s, val);
1936 } else {
1937 tcg_out_modrm(s, OPC_IMUL_GvEvIz + rexw, a0, a0);
1938 tcg_out32(s, val);
1939 }
1940 } else {
1941 tcg_out_modrm(s, OPC_IMUL_GvEv + rexw, a0, a2);
1942 }
1943 break;
1944
1945 OP_32_64(div2):
1946 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IDIV, args[4]);
1947 break;
1948 OP_32_64(divu2):
1949 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_DIV, args[4]);
1950 break;
1951
1952 OP_32_64(shl):
1953 /* For a small constant 3-operand shift, use LEA. */
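/* TCGArg is unsigned, so (a2 - 1) < 3 accepts only shift counts 1..3,
   which correspond to the LEA scale factors 2, 4 and 8.  */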
1954 if (const_a2 && a0 != a1 && (a2 - 1) < 3) {
1955 if (a2 - 1 == 0) {
1956 /* shl $1,a1,a0 -> lea (a1,a1),a0 */
1957 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a1, 0, 0);
1958 } else {
1959 /* shl $n,a1,a0 -> lea 0(,a1,1<<n),a0 */
1960 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, -1, a1, a2, 0);
1961 }
1962 break;
1963 }
1964 c = SHIFT_SHL;
1965 vexop = OPC_SHLX;
1966 goto gen_shift_maybe_vex;
1967 OP_32_64(shr):
1968 c = SHIFT_SHR;
1969 vexop = OPC_SHRX;
1970 goto gen_shift_maybe_vex;
1971 OP_32_64(sar):
1972 c = SHIFT_SAR;
1973 vexop = OPC_SARX;
1974 goto gen_shift_maybe_vex;
1975 OP_32_64(rotl):
1976 c = SHIFT_ROL;
1977 goto gen_shift;
1978 OP_32_64(rotr):
1979 c = SHIFT_ROR;
1980 goto gen_shift;
1981 gen_shift_maybe_vex:
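/* BMI2 SHLX/SHRX/SARX take the count in a third register and allow a
   non-matching destination, so the register-count case needs no extra
   mov; a constant count still goes through the classic shift below,
   after copying a1 into a0.  */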
1982 if (have_bmi2) {
1983 if (!const_a2) {
1984 tcg_out_vex_modrm(s, vexop + rexw, a0, a2, a1);
1985 break;
1986 }
1987 tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
1988 }
1989 /* FALLTHRU */
1990 gen_shift:
1991 if (const_a2) {
1992 tcg_out_shifti(s, c + rexw, a0, a2);
1993 } else {
1994 tcg_out_modrm(s, OPC_SHIFT_cl + rexw, c, a0);
1995 }
1996 break;
1997
1998 case INDEX_op_brcond_i32:
1999 tcg_out_brcond32(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
2000 break;
2001 case INDEX_op_setcond_i32:
2002 tcg_out_setcond32(s, args[3], a0, a1, a2, const_a2);
2003 break;
2004 case INDEX_op_movcond_i32:
2005 tcg_out_movcond32(s, args[5], a0, a1, a2, const_a2, args[3]);
2006 break;
2007
2008 OP_32_64(bswap16):
2009 tcg_out_rolw_8(s, a0);
2010 break;
2011 OP_32_64(bswap32):
2012 tcg_out_bswap32(s, a0);
2013 break;
2014
2015 OP_32_64(neg):
2016 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, a0);
2017 break;
2018 OP_32_64(not):
2019 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, a0);
2020 break;
2021
2022 OP_32_64(ext8s):
2023 tcg_out_ext8s(s, a0, a1, rexw);
2024 break;
2025 OP_32_64(ext16s):
2026 tcg_out_ext16s(s, a0, a1, rexw);
2027 break;
2028 OP_32_64(ext8u):
2029 tcg_out_ext8u(s, a0, a1);
2030 break;
2031 OP_32_64(ext16u):
2032 tcg_out_ext16u(s, a0, a1);
2033 break;
2034
2035 case INDEX_op_qemu_ld_i32:
2036 tcg_out_qemu_ld(s, args, 0);
2037 break;
2038 case INDEX_op_qemu_ld_i64:
2039 tcg_out_qemu_ld(s, args, 1);
2040 break;
2041 case INDEX_op_qemu_st_i32:
2042 tcg_out_qemu_st(s, args, 0);
2043 break;
2044 case INDEX_op_qemu_st_i64:
2045 tcg_out_qemu_st(s, args, 1);
2046 break;
2047
2048 OP_32_64(mulu2):
2049 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, args[3]);
2050 break;
2051 OP_32_64(muls2):
2052 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IMUL, args[3]);
2053 break;
2054 OP_32_64(add2):
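/* Double-word add: ADD the low halves, then ADC the high halves; the
   constraints tie the low/high inputs to the two outputs.  sub2 below
   is the same pattern with SUB/SBB.  */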
2055 if (const_args[4]) {
2056 tgen_arithi(s, ARITH_ADD + rexw, a0, args[4], 1);
2057 } else {
2058 tgen_arithr(s, ARITH_ADD + rexw, a0, args[4]);
2059 }
2060 if (const_args[5]) {
2061 tgen_arithi(s, ARITH_ADC + rexw, a1, args[5], 1);
2062 } else {
2063 tgen_arithr(s, ARITH_ADC + rexw, a1, args[5]);
2064 }
2065 break;
2066 OP_32_64(sub2):
2067 if (const_args[4]) {
2068 tgen_arithi(s, ARITH_SUB + rexw, a0, args[4], 1);
2069 } else {
2070 tgen_arithr(s, ARITH_SUB + rexw, a0, args[4]);
2071 }
2072 if (const_args[5]) {
2073 tgen_arithi(s, ARITH_SBB + rexw, a1, args[5], 1);
2074 } else {
2075 tgen_arithr(s, ARITH_SBB + rexw, a1, args[5]);
2076 }
2077 break;
2078
2079 #if TCG_TARGET_REG_BITS == 32
2080 case INDEX_op_brcond2_i32:
2081 tcg_out_brcond2(s, args, const_args, 0);
2082 break;
2083 case INDEX_op_setcond2_i32:
2084 tcg_out_setcond2(s, args, const_args);
2085 break;
2086 #else /* TCG_TARGET_REG_BITS == 64 */
2087 case INDEX_op_ld32s_i64:
2088 tcg_out_modrm_offset(s, OPC_MOVSLQ, a0, a1, a2);
2089 break;
2090 case INDEX_op_ld_i64:
2091 tcg_out_ld(s, TCG_TYPE_I64, a0, a1, a2);
2092 break;
2093 case INDEX_op_st_i64:
2094 if (const_args[0]) {
2095 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_REXW, 0, a1, a2);
2096 tcg_out32(s, a0);
2097 } else {
2098 tcg_out_st(s, TCG_TYPE_I64, a0, a1, a2);
2099 }
2100 break;
2101
2102 case INDEX_op_brcond_i64:
2103 tcg_out_brcond64(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
2104 break;
2105 case INDEX_op_setcond_i64:
2106 tcg_out_setcond64(s, args[3], a0, a1, a2, const_a2);
2107 break;
2108 case INDEX_op_movcond_i64:
2109 tcg_out_movcond64(s, args[5], a0, a1, a2, const_a2, args[3]);
2110 break;
2111
2112 case INDEX_op_bswap64_i64:
2113 tcg_out_bswap64(s, a0);
2114 break;
2115 case INDEX_op_extu_i32_i64:
2116 case INDEX_op_ext32u_i64:
2117 tcg_out_ext32u(s, a0, a1);
2118 break;
2119 case INDEX_op_ext_i32_i64:
2120 case INDEX_op_ext32s_i64:
2121 tcg_out_ext32s(s, a0, a1);
2122 break;
2123 #endif
2124
2125 OP_32_64(deposit):
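/* Only byte and word deposits at bit 0 or 8 are handled; the "Q"
   operand constraint keeps the values in byte-addressable registers,
   so that a0 + 4 encodes a0's high-byte register (AH, CH, DH or BH).  */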
2126 if (args[3] == 0 && args[4] == 8) {
2127 /* deposit a2 into bits 0..7 of a0 */
2128 tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0);
2129 } else if (args[3] == 8 && args[4] == 8) {
2130 /* deposit a2 into bits 8..15 of a0 */
2131 tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4);
2132 } else if (args[3] == 0 && args[4] == 16) {
2133 /* deposit a2 into bits 0..15 of a0 */
2134 tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0);
2135 } else {
2136 tcg_abort();
2137 }
2138 break;
2139
2140 case INDEX_op_extract_i64:
2141 if (a2 + args[3] == 32) {
2142 /* This is a 32-bit zero-extending right shift. */
2143 tcg_out_mov(s, TCG_TYPE_I32, a0, a1);
2144 tcg_out_shifti(s, SHIFT_SHR, a0, a2);
2145 break;
2146 }
2147 /* FALLTHRU */
2148 case INDEX_op_extract_i32:
2149 /* Use the high-byte registers when we can.  Otherwise emit the
2150 same ext16 + shift pattern that we would have gotten from the
2151 normal tcg-op.c expansion.  */
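/* e.g. extract d,s,8,8 with s in EAX..EBX and d among the first eight
   registers becomes a single MOVZBL from s's high byte: without a REX
   prefix, register codes 4..7 name AH/CH/DH/BH.  */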
2152 tcg_debug_assert(a2 == 8 && args[3] == 8);
2153 if (a1 < 4 && a0 < 8) {
2154 tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4);
2155 } else {
2156 tcg_out_ext16u(s, a0, a1);
2157 tcg_out_shifti(s, SHIFT_SHR, a0, 8);
2158 }
2159 break;
2160
2161 case INDEX_op_sextract_i32:
2162 /* We don't implement sextract_i64, as we cannot sign-extend to
2163 64 bits without using a REX prefix, and a REX prefix makes the
2164 high-byte registers inaccessible.  */
2165 tcg_debug_assert(a2 == 8 && args[3] == 8);
2166 if (a1 < 4 && a0 < 8) {
2167 tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4);
2168 } else {
2169 tcg_out_ext16s(s, a0, a1, 0);
2170 tcg_out_shifti(s, SHIFT_SAR, a0, 8);
2171 }
2172 break;
2173
2174 case INDEX_op_mb:
2175 tcg_out_mb(s, a0);
2176 break;
2177 case INDEX_op_mov_i32: /* Always emitted via tcg_out_mov. */
2178 case INDEX_op_mov_i64:
2179 case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi. */
2180 case INDEX_op_movi_i64:
2181 case INDEX_op_call: /* Always emitted via tcg_out_call. */
2182 default:
2183 tcg_abort();
2184 }
2185
2186 #undef OP_32_64
2187 }
2188
2189 static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
2190 {
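/* Roughly, the constraint letters used below (parsed by
   target_parse_constraint earlier in this file): "r" any register,
   "q" a byte-addressable register, "Q" a register with an addressable
   second byte, "a"/"c"/"d" EAX/ECX/EDX, "L" a register usable around
   the qemu_ld/st helpers, "0"/"1" an alias of that output operand, and
   "i"/"e"/"Z"/"I" the various immediate forms.  */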
2191 static const TCGTargetOpDef ri_r = { .args_ct_str = { "ri", "r" } };
2192 static const TCGTargetOpDef re_r = { .args_ct_str = { "re", "r" } };
2193 static const TCGTargetOpDef qi_r = { .args_ct_str = { "qi", "r" } };
2194 static const TCGTargetOpDef r_r = { .args_ct_str = { "r", "r" } };
2195 static const TCGTargetOpDef r_q = { .args_ct_str = { "r", "q" } };
2196 static const TCGTargetOpDef r_re = { .args_ct_str = { "r", "re" } };
2197 static const TCGTargetOpDef r_0 = { .args_ct_str = { "r", "0" } };
2198 static const TCGTargetOpDef r_r_ri = { .args_ct_str = { "r", "r", "ri" } };
2199 static const TCGTargetOpDef r_r_re = { .args_ct_str = { "r", "r", "re" } };
2200 static const TCGTargetOpDef r_0_re = { .args_ct_str = { "r", "0", "re" } };
2201 static const TCGTargetOpDef r_0_ci = { .args_ct_str = { "r", "0", "ci" } };
2202 static const TCGTargetOpDef r_L = { .args_ct_str = { "r", "L" } };
2203 static const TCGTargetOpDef L_L = { .args_ct_str = { "L", "L" } };
2204 static const TCGTargetOpDef r_L_L = { .args_ct_str = { "r", "L", "L" } };
2205 static const TCGTargetOpDef r_r_L = { .args_ct_str = { "r", "r", "L" } };
2206 static const TCGTargetOpDef L_L_L = { .args_ct_str = { "L", "L", "L" } };
2207 static const TCGTargetOpDef r_r_L_L
2208 = { .args_ct_str = { "r", "r", "L", "L" } };
2209 static const TCGTargetOpDef L_L_L_L
2210 = { .args_ct_str = { "L", "L", "L", "L" } };
2211
2212 switch (op) {
2213 case INDEX_op_ld8u_i32:
2214 case INDEX_op_ld8u_i64:
2215 case INDEX_op_ld8s_i32:
2216 case INDEX_op_ld8s_i64:
2217 case INDEX_op_ld16u_i32:
2218 case INDEX_op_ld16u_i64:
2219 case INDEX_op_ld16s_i32:
2220 case INDEX_op_ld16s_i64:
2221 case INDEX_op_ld_i32:
2222 case INDEX_op_ld32u_i64:
2223 case INDEX_op_ld32s_i64:
2224 case INDEX_op_ld_i64:
2225 return &r_r;
2226
2227 case INDEX_op_st8_i32:
2228 case INDEX_op_st8_i64:
2229 return &qi_r;
2230 case INDEX_op_st16_i32:
2231 case INDEX_op_st16_i64:
2232 case INDEX_op_st_i32:
2233 case INDEX_op_st32_i64:
2234 return &ri_r;
2235 case INDEX_op_st_i64:
2236 return &re_r;
2237
2238 case INDEX_op_add_i32:
2239 case INDEX_op_add_i64:
2240 return &r_r_re;
2241 case INDEX_op_sub_i32:
2242 case INDEX_op_sub_i64:
2243 case INDEX_op_mul_i32:
2244 case INDEX_op_mul_i64:
2245 case INDEX_op_or_i32:
2246 case INDEX_op_or_i64:
2247 case INDEX_op_xor_i32:
2248 case INDEX_op_xor_i64:
2249 return &r_0_re;
2250
2251 case INDEX_op_and_i32:
2252 case INDEX_op_and_i64:
2253 {
2254 static const TCGTargetOpDef and
2255 = { .args_ct_str = { "r", "0", "reZ" } };
2256 return &and;
2257 }
2258 break;
2259 case INDEX_op_andc_i32:
2260 case INDEX_op_andc_i64:
2261 {
2262 static const TCGTargetOpDef andc
2263 = { .args_ct_str = { "r", "r", "rI" } };
2264 return &andc;
2265 }
2266 break;
2267
2268 case INDEX_op_shl_i32:
2269 case INDEX_op_shl_i64:
2270 case INDEX_op_shr_i32:
2271 case INDEX_op_shr_i64:
2272 case INDEX_op_sar_i32:
2273 case INDEX_op_sar_i64:
2274 return have_bmi2 ? &r_r_ri : &r_0_ci;
2275 case INDEX_op_rotl_i32:
2276 case INDEX_op_rotl_i64:
2277 case INDEX_op_rotr_i32:
2278 case INDEX_op_rotr_i64:
2279 return &r_0_ci;
2280
2281 case INDEX_op_brcond_i32:
2282 case INDEX_op_brcond_i64:
2283 return &r_re;
2284
2285 case INDEX_op_bswap16_i32:
2286 case INDEX_op_bswap16_i64:
2287 case INDEX_op_bswap32_i32:
2288 case INDEX_op_bswap32_i64:
2289 case INDEX_op_bswap64_i64:
2290 case INDEX_op_neg_i32:
2291 case INDEX_op_neg_i64:
2292 case INDEX_op_not_i32:
2293 case INDEX_op_not_i64:
2294 return &r_0;
2295
2296 case INDEX_op_ext8s_i32:
2297 case INDEX_op_ext8s_i64:
2298 case INDEX_op_ext8u_i32:
2299 case INDEX_op_ext8u_i64:
2300 return &r_q;
2301 case INDEX_op_ext16s_i32:
2302 case INDEX_op_ext16s_i64:
2303 case INDEX_op_ext16u_i32:
2304 case INDEX_op_ext16u_i64:
2305 case INDEX_op_ext32s_i64:
2306 case INDEX_op_ext32u_i64:
2307 case INDEX_op_ext_i32_i64:
2308 case INDEX_op_extu_i32_i64:
2309 case INDEX_op_extract_i32:
2310 case INDEX_op_extract_i64:
2311 case INDEX_op_sextract_i32:
2312 return &r_r;
2313
2314 case INDEX_op_deposit_i32:
2315 case INDEX_op_deposit_i64:
2316 {
2317 static const TCGTargetOpDef dep
2318 = { .args_ct_str = { "Q", "0", "Q" } };
2319 return &dep;
2320 }
2321 case INDEX_op_setcond_i32:
2322 case INDEX_op_setcond_i64:
2323 {
2324 static const TCGTargetOpDef setc
2325 = { .args_ct_str = { "q", "r", "re" } };
2326 return &setc;
2327 }
2328 case INDEX_op_movcond_i32:
2329 case INDEX_op_movcond_i64:
2330 {
2331 static const TCGTargetOpDef movc
2332 = { .args_ct_str = { "r", "r", "re", "r", "0" } };
2333 return &movc;
2334 }
2335 case INDEX_op_div2_i32:
2336 case INDEX_op_div2_i64:
2337 case INDEX_op_divu2_i32:
2338 case INDEX_op_divu2_i64:
2339 {
2340 static const TCGTargetOpDef div2
2341 = { .args_ct_str = { "a", "d", "0", "1", "r" } };
2342 return &div2;
2343 }
2344 case INDEX_op_mulu2_i32:
2345 case INDEX_op_mulu2_i64:
2346 case INDEX_op_muls2_i32:
2347 case INDEX_op_muls2_i64:
2348 {
2349 static const TCGTargetOpDef mul2
2350 = { .args_ct_str = { "a", "d", "a", "r" } };
2351 return &mul2;
2352 }
2353 case INDEX_op_add2_i32:
2354 case INDEX_op_add2_i64:
2355 case INDEX_op_sub2_i32:
2356 case INDEX_op_sub2_i64:
2357 {
2358 static const TCGTargetOpDef arith2
2359 = { .args_ct_str = { "r", "r", "0", "1", "re", "re" } };
2360 return &arith2;
2361 }
2362
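/* For qemu_ld/st the guest address needs one "L" register per host
   word, and 64-bit data needs a register pair on a 32-bit host; the
   "L" constraint also keeps TCG_REG_L0/L1 available as scratch.  */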
2363 case INDEX_op_qemu_ld_i32:
2364 return TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &r_L : &r_L_L;
2365 case INDEX_op_qemu_st_i32:
2366 return TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &L_L : &L_L_L;
2367 case INDEX_op_qemu_ld_i64:
2368 return (TCG_TARGET_REG_BITS == 64 ? &r_L
2369 : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &r_r_L
2370 : &r_r_L_L);
2371 case INDEX_op_qemu_st_i64:
2372 return (TCG_TARGET_REG_BITS == 64 ? &L_L
2373 : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &L_L_L
2374 : &L_L_L_L);
2375
2376 case INDEX_op_brcond2_i32:
2377 {
2378 static const TCGTargetOpDef b2
2379 = { .args_ct_str = { "r", "r", "ri", "ri" } };
2380 return &b2;
2381 }
2382 case INDEX_op_setcond2_i32:
2383 {
2384 static const TCGTargetOpDef s2
2385 = { .args_ct_str = { "r", "r", "r", "ri", "ri" } };
2386 return &s2;
2387 }
2388
2389 default:
2390 break;
2391 }
2392 return NULL;
2393 }
2394
2395 static int tcg_target_callee_save_regs[] = {
2396 #if TCG_TARGET_REG_BITS == 64
2397 TCG_REG_RBP,
2398 TCG_REG_RBX,
2399 #if defined(_WIN64)
2400 TCG_REG_RDI,
2401 TCG_REG_RSI,
2402 #endif
2403 TCG_REG_R12,
2404 TCG_REG_R13,
2405 TCG_REG_R14, /* Currently used for the global env. */
2406 TCG_REG_R15,
2407 #else
2408 TCG_REG_EBP, /* Currently used for the global env. */
2409 TCG_REG_EBX,
2410 TCG_REG_ESI,
2411 TCG_REG_EDI,
2412 #endif
2413 };
2414
2415 /* Compute frame size via macros, to share between tcg_target_qemu_prologue
2416 and tcg_register_jit. */
2417
2418 #define PUSH_SIZE \
2419 ((1 + ARRAY_SIZE(tcg_target_callee_save_regs)) \
2420 * (TCG_TARGET_REG_BITS / 8))
2421
2422 #define FRAME_SIZE \
2423 ((PUSH_SIZE \
2424 + TCG_STATIC_CALL_ARGS_SIZE \
2425 + CPU_TEMP_BUF_NLONGS * sizeof(long) \
2426 + TCG_TARGET_STACK_ALIGN - 1) \
2427 & ~(TCG_TARGET_STACK_ALIGN - 1))
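/* PUSH_SIZE covers the return address plus the callee-saved pushes;
   FRAME_SIZE adds the static call-argument area and the TCG temp
   buffer, rounded up to TCG_TARGET_STACK_ALIGN.  */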
2428
2429 /* Generate global QEMU prologue and epilogue code */
2430 static void tcg_target_qemu_prologue(TCGContext *s)
2431 {
2432 int i, stack_addend;
2433
2434 /* TB prologue */
2435
2436 /* Reserve some stack space, also for TCG temps. */
2437 stack_addend = FRAME_SIZE - PUSH_SIZE;
2438 tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
2439 CPU_TEMP_BUF_NLONGS * sizeof(long));
2440
2441 /* Save all callee saved registers. */
2442 for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
2443 tcg_out_push(s, tcg_target_callee_save_regs[i]);
2444 }
2445
2446 #if TCG_TARGET_REG_BITS == 32
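/* On i386 both arguments arrive on the stack: env sits just above the
   return address and the callee-saved pushes, and the tb pointer one
   slot higher; the jmp offset below adds stack_addend because ESP has
   already been lowered by that point.  */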
2447 tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP,
2448 (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4);
2449 tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
2450 /* jmp *tb. */
2451 tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP,
2452 (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
2453 + stack_addend);
2454 #else
2455 tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
2456 tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
2457 /* jmp *tb. */
2458 tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
2459 #endif
2460
2461 /* TB epilogue */
2462 tb_ret_addr = s->code_ptr;
2463
2464 tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);
2465
2466 for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
2467 tcg_out_pop(s, tcg_target_callee_save_regs[i]);
2468 }
2469 tcg_out_opc(s, OPC_RET, 0, 0, 0);
2470
2471 #if !defined(CONFIG_SOFTMMU)
2472 /* Try to set up a segment register to point to guest_base. */
2473 if (guest_base) {
2474 setup_guest_base_seg();
2475 }
2476 #endif
2477 }
2478
2479 static void tcg_target_init(TCGContext *s)
2480 {
2481 #ifdef CONFIG_CPUID_H
2482 unsigned a, b, c, d;
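/* __get_cpuid_max(0, 0) returns the highest supported basic cpuid
   leaf, or 0 when cpuid itself is unavailable.  */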
2483 int max = __get_cpuid_max(0, 0);
2484
2485 if (max >= 1) {
2486 __cpuid(1, a, b, c, d);
2487 #ifndef have_cmov
2488 /* For 32-bit, it is almost certain that the host supports cmov,
2489 but we still need to check.  If cmov is not available, we fall
2490 back to a small forward branch.  */
2491 have_cmov = (d & bit_CMOV) != 0;
2492 #endif
2493 #ifndef have_movbe
2494 /* MOVBE is only available on some relatively recent CPUs (e.g. Intel
2495 Atom and Haswell), so we need to probe for it.  */
2496 have_movbe = (c & bit_MOVBE) != 0;
2497 #endif
2498 }
2499
2500 if (max >= 7) {
2501 /* BMI1 is available on AMD Piledriver and on Intel Haswell and newer CPUs. */
2502 __cpuid_count(7, 0, a, b, c, d);
2503 #ifdef bit_BMI
2504 have_bmi1 = (b & bit_BMI) != 0;
2505 #endif
2506 #ifndef have_bmi2
2507 have_bmi2 = (b & bit_BMI2) != 0;
2508 #endif
2509 }
2510 #endif
2511
2512 if (TCG_TARGET_REG_BITS == 64) {
2513 tcg_regset_set32(tcg_target_available_regs[TCG_TYPE_I32], 0, 0xffff);
2514 tcg_regset_set32(tcg_target_available_regs[TCG_TYPE_I64], 0, 0xffff);
2515 } else {
2516 tcg_regset_set32(tcg_target_available_regs[TCG_TYPE_I32], 0, 0xff);
2517 }
2518
2519 tcg_regset_clear(tcg_target_call_clobber_regs);
2520 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX);
2521 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX);
2522 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX);
2523 if (TCG_TARGET_REG_BITS == 64) {
2524 #if !defined(_WIN64)
2525 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RDI);
2526 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RSI);
2527 #endif
2528 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R8);
2529 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R9);
2530 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R10);
2531 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11);
2532 }
2533
2534 tcg_regset_clear(s->reserved_regs);
2535 tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
2536 }
2537
2538 typedef struct {
2539 DebugFrameHeader h;
2540 uint8_t fde_def_cfa[4];
2541 uint8_t fde_reg_ofs[14];
2542 } DebugFrame;
2543
2544 /* We're expecting a 2-byte uleb128-encoded value.  */
2545 QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14));
2546
2547 #if !defined(__ELF__)
2548 /* Host machine without ELF. */
2549 #elif TCG_TARGET_REG_BITS == 64
2550 #define ELF_HOST_MACHINE EM_X86_64
2551 static const DebugFrame debug_frame = {
2552 .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
2553 .h.cie.id = -1,
2554 .h.cie.version = 1,
2555 .h.cie.code_align = 1,
2556 .h.cie.data_align = 0x78, /* sleb128 -8 */
2557 .h.cie.return_column = 16,
2558
2559 /* Total FDE size does not include the "len" member. */
2560 .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
2561
2562 .fde_def_cfa = {
2563 12, 7, /* DW_CFA_def_cfa %rsp, ... */
2564 (FRAME_SIZE & 0x7f) | 0x80, /* ... uleb128 FRAME_SIZE */
2565 (FRAME_SIZE >> 7)
2566 },
2567 .fde_reg_ofs = {
2568 0x90, 1, /* DW_CFA_offset, %rip, -8 */
2569 /* The following ordering must match tcg_target_callee_save_regs. */
2570 0x86, 2, /* DW_CFA_offset, %rbp, -16 */
2571 0x83, 3, /* DW_CFA_offset, %rbx, -24 */
2572 0x8c, 4, /* DW_CFA_offset, %r12, -32 */
2573 0x8d, 5, /* DW_CFA_offset, %r13, -40 */
2574 0x8e, 6, /* DW_CFA_offset, %r14, -48 */
2575 0x8f, 7, /* DW_CFA_offset, %r15, -56 */
2576 }
2577 };
2578 #else
2579 #define ELF_HOST_MACHINE EM_386
2580 static const DebugFrame debug_frame = {
2581 .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
2582 .h.cie.id = -1,
2583 .h.cie.version = 1,
2584 .h.cie.code_align = 1,
2585 .h.cie.data_align = 0x7c, /* sleb128 -4 */
2586 .h.cie.return_column = 8,
2587
2588 /* Total FDE size does not include the "len" member. */
2589 .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
2590
2591 .fde_def_cfa = {
2592 12, 4, /* DW_CFA_def_cfa %esp, ... */
2593 (FRAME_SIZE & 0x7f) | 0x80, /* ... uleb128 FRAME_SIZE */
2594 (FRAME_SIZE >> 7)
2595 },
2596 .fde_reg_ofs = {
2597 0x88, 1, /* DW_CFA_offset, %eip, -4 */
2598 /* The following ordering must match tcg_target_callee_save_regs. */
2599 0x85, 2, /* DW_CFA_offset, %ebp, -8 */
2600 0x83, 3, /* DW_CFA_offset, %ebx, -12 */
2601 0x86, 4, /* DW_CFA_offset, %esi, -16 */
2602 0x87, 5, /* DW_CFA_offset, %edi, -20 */
2603 }
2604 };
2605 #endif
2606
2607 #if defined(ELF_HOST_MACHINE)
2608 void tcg_register_jit(void *buf, size_t buf_size)
2609 {
2610 tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));
2611 }
2612 #endif