1 /*
2 * Tiny Code Generator for QEMU
3 *
4 * Copyright (c) 2008 Fabrice Bellard
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
24
25 #include "tcg-pool.inc.c"
26
27 #ifdef CONFIG_DEBUG_TCG
28 static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
29 #if TCG_TARGET_REG_BITS == 64
30 "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
31 #else
32 "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
33 #endif
34 "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
35 "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
36 #if TCG_TARGET_REG_BITS == 64
37 "%xmm8", "%xmm9", "%xmm10", "%xmm11",
38 "%xmm12", "%xmm13", "%xmm14", "%xmm15",
39 #endif
40 };
41 #endif
42
43 static const int tcg_target_reg_alloc_order[] = {
44 #if TCG_TARGET_REG_BITS == 64
45 TCG_REG_RBP,
46 TCG_REG_RBX,
47 TCG_REG_R12,
48 TCG_REG_R13,
49 TCG_REG_R14,
50 TCG_REG_R15,
51 TCG_REG_R10,
52 TCG_REG_R11,
53 TCG_REG_R9,
54 TCG_REG_R8,
55 TCG_REG_RCX,
56 TCG_REG_RDX,
57 TCG_REG_RSI,
58 TCG_REG_RDI,
59 TCG_REG_RAX,
60 #else
61 TCG_REG_EBX,
62 TCG_REG_ESI,
63 TCG_REG_EDI,
64 TCG_REG_EBP,
65 TCG_REG_ECX,
66 TCG_REG_EDX,
67 TCG_REG_EAX,
68 #endif
69 TCG_REG_XMM0,
70 TCG_REG_XMM1,
71 TCG_REG_XMM2,
72 TCG_REG_XMM3,
73 TCG_REG_XMM4,
74 TCG_REG_XMM5,
75 #ifndef _WIN64
76 /* The Win64 ABI has xmm6-xmm15 as callee-saves, and we do not save
77 any of them. Therefore only allow xmm0-xmm5 to be allocated. */
78 TCG_REG_XMM6,
79 TCG_REG_XMM7,
80 #if TCG_TARGET_REG_BITS == 64
81 TCG_REG_XMM8,
82 TCG_REG_XMM9,
83 TCG_REG_XMM10,
84 TCG_REG_XMM11,
85 TCG_REG_XMM12,
86 TCG_REG_XMM13,
87 TCG_REG_XMM14,
88 TCG_REG_XMM15,
89 #endif
90 #endif
91 };
92
93 static const int tcg_target_call_iarg_regs[] = {
94 #if TCG_TARGET_REG_BITS == 64
95 #if defined(_WIN64)
96 TCG_REG_RCX,
97 TCG_REG_RDX,
98 #else
99 TCG_REG_RDI,
100 TCG_REG_RSI,
101 TCG_REG_RDX,
102 TCG_REG_RCX,
103 #endif
104 TCG_REG_R8,
105 TCG_REG_R9,
106 #else
107 /* 32-bit mode uses a stack-based calling convention (GCC default). */
108 #endif
109 };
110
111 static const int tcg_target_call_oarg_regs[] = {
112 TCG_REG_EAX,
113 #if TCG_TARGET_REG_BITS == 32
114 TCG_REG_EDX
115 #endif
116 };
117
118 /* Constants we accept. */
119 #define TCG_CT_CONST_S32 0x100
120 #define TCG_CT_CONST_U32 0x200
121 #define TCG_CT_CONST_I32 0x400
122 #define TCG_CT_CONST_WSZ 0x800
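/* These constraint bits correspond to the 'e', 'Z', 'I' and 'W' letters
   parsed by target_parse_constraint and are tested in
   tcg_target_const_match below. */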
123
124 /* Registers used with the L constraint, which are the first two argument
125 registers on x86_64, and two call-clobbered registers (EAX and EDX) on
126 i386. */
127 #if TCG_TARGET_REG_BITS == 64
128 # define TCG_REG_L0 tcg_target_call_iarg_regs[0]
129 # define TCG_REG_L1 tcg_target_call_iarg_regs[1]
130 #else
131 # define TCG_REG_L0 TCG_REG_EAX
132 # define TCG_REG_L1 TCG_REG_EDX
133 #endif
134
135 /* The host compiler should supply <cpuid.h> to enable runtime feature
136 detection, as we're not going to go so far as writing our own inline assembly.
137 If it is not available, default values will be assumed. */
138 #if defined(CONFIG_CPUID_H)
139 #include "qemu/cpuid.h"
140 #endif
141
142 /* For 64-bit, we always know that CMOV is available. */
143 #if TCG_TARGET_REG_BITS == 64
144 # define have_cmov 1
145 #elif defined(CONFIG_CPUID_H)
146 static bool have_cmov;
147 #else
148 # define have_cmov 0
149 #endif
150
151 /* We need these symbols in tcg-target.h, and we can't properly conditionalize
152 it there. Therefore we always define the variables. */
153 bool have_bmi1;
154 bool have_popcnt;
155 bool have_avx1;
156 bool have_avx2;
157
158 #ifdef CONFIG_CPUID_H
159 static bool have_movbe;
160 static bool have_bmi2;
161 static bool have_lzcnt;
162 #else
163 # define have_movbe 0
164 # define have_bmi2 0
165 # define have_lzcnt 0
166 #endif
167
168 static tcg_insn_unit *tb_ret_addr;
169
170 static void patch_reloc(tcg_insn_unit *code_ptr, int type,
171 intptr_t value, intptr_t addend)
172 {
173 value += addend;
174 switch(type) {
175 case R_386_PC32:
176 value -= (uintptr_t)code_ptr;
177 if (value != (int32_t)value) {
178 tcg_abort();
179 }
180 /* FALLTHRU */
181 case R_386_32:
182 tcg_patch32(code_ptr, value);
183 break;
184 case R_386_PC8:
185 value -= (uintptr_t)code_ptr;
186 if (value != (int8_t)value) {
187 tcg_abort();
188 }
189 tcg_patch8(code_ptr, value);
190 break;
191 default:
192 tcg_abort();
193 }
194 }
195
196 #if TCG_TARGET_REG_BITS == 64
197 #define ALL_GENERAL_REGS 0x0000ffffu
198 #define ALL_VECTOR_REGS 0xffff0000u
199 #else
200 #define ALL_GENERAL_REGS 0x000000ffu
201 #define ALL_VECTOR_REGS 0x00ff0000u
202 #endif
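/* The general registers occupy register numbers 0-15 and the xmm registers
   start at 16, hence the split between the two masks above. */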
203
204 /* parse target specific constraints */
205 static const char *target_parse_constraint(TCGArgConstraint *ct,
206 const char *ct_str, TCGType type)
207 {
208 switch(*ct_str++) {
209 case 'a':
210 ct->ct |= TCG_CT_REG;
211 tcg_regset_set_reg(ct->u.regs, TCG_REG_EAX);
212 break;
213 case 'b':
214 ct->ct |= TCG_CT_REG;
215 tcg_regset_set_reg(ct->u.regs, TCG_REG_EBX);
216 break;
217 case 'c':
218 ct->ct |= TCG_CT_REG;
219 tcg_regset_set_reg(ct->u.regs, TCG_REG_ECX);
220 break;
221 case 'd':
222 ct->ct |= TCG_CT_REG;
223 tcg_regset_set_reg(ct->u.regs, TCG_REG_EDX);
224 break;
225 case 'S':
226 ct->ct |= TCG_CT_REG;
227 tcg_regset_set_reg(ct->u.regs, TCG_REG_ESI);
228 break;
229 case 'D':
230 ct->ct |= TCG_CT_REG;
231 tcg_regset_set_reg(ct->u.regs, TCG_REG_EDI);
232 break;
233 case 'q':
234 /* A register that can be used as a byte operand. */
235 ct->ct |= TCG_CT_REG;
236 ct->u.regs = TCG_TARGET_REG_BITS == 64 ? 0xffff : 0xf;
237 break;
238 case 'Q':
239 /* A register with an addressable second byte (e.g. %ah). */
240 ct->ct |= TCG_CT_REG;
241 ct->u.regs = 0xf;
242 break;
243 case 'r':
244 /* A general register. */
245 ct->ct |= TCG_CT_REG;
246 ct->u.regs |= ALL_GENERAL_REGS;
247 break;
248 case 'W':
249 /* With TZCNT/LZCNT, we can have operand-size as an input. */
250 ct->ct |= TCG_CT_CONST_WSZ;
251 break;
252 case 'x':
253 /* A vector register. */
254 ct->ct |= TCG_CT_REG;
255 ct->u.regs |= ALL_VECTOR_REGS;
256 break;
257
258 /* qemu_ld/st address constraint */
259 case 'L':
260 ct->ct |= TCG_CT_REG;
261 ct->u.regs = TCG_TARGET_REG_BITS == 64 ? 0xffff : 0xff;
262 tcg_regset_reset_reg(ct->u.regs, TCG_REG_L0);
263 tcg_regset_reset_reg(ct->u.regs, TCG_REG_L1);
264 break;
265
266 case 'e':
267 ct->ct |= (type == TCG_TYPE_I32 ? TCG_CT_CONST : TCG_CT_CONST_S32);
268 break;
269 case 'Z':
270 ct->ct |= (type == TCG_TYPE_I32 ? TCG_CT_CONST : TCG_CT_CONST_U32);
271 break;
272 case 'I':
273 ct->ct |= (type == TCG_TYPE_I32 ? TCG_CT_CONST : TCG_CT_CONST_I32);
274 break;
275
276 default:
277 return NULL;
278 }
279 return ct_str;
280 }
281
282 /* test if a constant matches the constraint */
283 static inline int tcg_target_const_match(tcg_target_long val, TCGType type,
284 const TCGArgConstraint *arg_ct)
285 {
286 int ct = arg_ct->ct;
287 if (ct & TCG_CT_CONST) {
288 return 1;
289 }
290 if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) {
291 return 1;
292 }
293 if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) {
294 return 1;
295 }
296 if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) {
297 return 1;
298 }
299 if ((ct & TCG_CT_CONST_WSZ) && val == (type == TCG_TYPE_I32 ? 32 : 64)) {
300 return 1;
301 }
302 return 0;
303 }
304
305 #if TCG_TARGET_REG_BITS == 64
306 # define LOWREGMASK(x) ((x) & 7)
307 #else
308 # define LOWREGMASK(x) (x)
309 #endif
310
311 #define P_EXT 0x100 /* 0x0f opcode prefix */
312 #define P_EXT38 0x200 /* 0x0f 0x38 opcode prefix */
313 #define P_DATA16 0x400 /* 0x66 opcode prefix */
314 #if TCG_TARGET_REG_BITS == 64
315 # define P_ADDR32 0x800 /* 0x67 opcode prefix */
316 # define P_REXW 0x1000 /* Set REX.W = 1 */
317 # define P_REXB_R 0x2000 /* REG field as byte register */
318 # define P_REXB_RM 0x4000 /* R/M field as byte register */
319 # define P_GS 0x8000 /* gs segment override */
320 #else
321 # define P_ADDR32 0
322 # define P_REXW 0
323 # define P_REXB_R 0
324 # define P_REXB_RM 0
325 # define P_GS 0
326 #endif
327 #define P_EXT3A 0x10000 /* 0x0f 0x3a opcode prefix */
328 #define P_SIMDF3 0x20000 /* 0xf3 opcode prefix */
329 #define P_SIMDF2 0x40000 /* 0xf2 opcode prefix */
330 #define P_VEXL 0x80000 /* Set VEX.L = 1 */
331
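/* Each OPC_* value below packs the primary opcode byte into the low 8 bits
   and ORs in the P_* flags above; tcg_out_opc and tcg_out_vex_opc expand
   those flags into the 0x66/0xf3/0xf2 prefixes, the 0x0f/0x38/0x3a escape
   bytes, and the REX or VEX fields. */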
332 #define OPC_ARITH_EvIz (0x81)
333 #define OPC_ARITH_EvIb (0x83)
334 #define OPC_ARITH_GvEv (0x03) /* ... plus (ARITH_FOO << 3) */
335 #define OPC_ANDN (0xf2 | P_EXT38)
336 #define OPC_ADD_GvEv (OPC_ARITH_GvEv | (ARITH_ADD << 3))
337 #define OPC_BLENDPS (0x0c | P_EXT3A | P_DATA16)
338 #define OPC_BSF (0xbc | P_EXT)
339 #define OPC_BSR (0xbd | P_EXT)
340 #define OPC_BSWAP (0xc8 | P_EXT)
341 #define OPC_CALL_Jz (0xe8)
342 #define OPC_CMOVCC (0x40 | P_EXT) /* ... plus condition code */
343 #define OPC_CMP_GvEv (OPC_ARITH_GvEv | (ARITH_CMP << 3))
344 #define OPC_DEC_r32 (0x48)
345 #define OPC_IMUL_GvEv (0xaf | P_EXT)
346 #define OPC_IMUL_GvEvIb (0x6b)
347 #define OPC_IMUL_GvEvIz (0x69)
348 #define OPC_INC_r32 (0x40)
349 #define OPC_JCC_long (0x80 | P_EXT) /* ... plus condition code */
350 #define OPC_JCC_short (0x70) /* ... plus condition code */
351 #define OPC_JMP_long (0xe9)
352 #define OPC_JMP_short (0xeb)
353 #define OPC_LEA (0x8d)
354 #define OPC_LZCNT (0xbd | P_EXT | P_SIMDF3)
355 #define OPC_MOVB_EvGv (0x88) /* stores, more or less */
356 #define OPC_MOVL_EvGv (0x89) /* stores, more or less */
357 #define OPC_MOVL_GvEv (0x8b) /* loads, more or less */
358 #define OPC_MOVB_EvIz (0xc6)
359 #define OPC_MOVL_EvIz (0xc7)
360 #define OPC_MOVL_Iv (0xb8)
361 #define OPC_MOVBE_GyMy (0xf0 | P_EXT38)
362 #define OPC_MOVBE_MyGy (0xf1 | P_EXT38)
363 #define OPC_MOVD_VyEy (0x6e | P_EXT | P_DATA16)
364 #define OPC_MOVD_EyVy (0x7e | P_EXT | P_DATA16)
365 #define OPC_MOVDDUP (0x12 | P_EXT | P_SIMDF2)
366 #define OPC_MOVDQA_VxWx (0x6f | P_EXT | P_DATA16)
367 #define OPC_MOVDQA_WxVx (0x7f | P_EXT | P_DATA16)
368 #define OPC_MOVDQU_VxWx (0x6f | P_EXT | P_SIMDF3)
369 #define OPC_MOVDQU_WxVx (0x7f | P_EXT | P_SIMDF3)
370 #define OPC_MOVQ_VqWq (0x7e | P_EXT | P_SIMDF3)
371 #define OPC_MOVQ_WqVq (0xd6 | P_EXT | P_DATA16)
372 #define OPC_MOVSBL (0xbe | P_EXT)
373 #define OPC_MOVSWL (0xbf | P_EXT)
374 #define OPC_MOVSLQ (0x63 | P_REXW)
375 #define OPC_MOVZBL (0xb6 | P_EXT)
376 #define OPC_MOVZWL (0xb7 | P_EXT)
377 #define OPC_PACKSSDW (0x6b | P_EXT | P_DATA16)
378 #define OPC_PACKSSWB (0x63 | P_EXT | P_DATA16)
379 #define OPC_PACKUSDW (0x2b | P_EXT38 | P_DATA16)
380 #define OPC_PACKUSWB (0x67 | P_EXT | P_DATA16)
381 #define OPC_PADDB (0xfc | P_EXT | P_DATA16)
382 #define OPC_PADDW (0xfd | P_EXT | P_DATA16)
383 #define OPC_PADDD (0xfe | P_EXT | P_DATA16)
384 #define OPC_PADDQ (0xd4 | P_EXT | P_DATA16)
385 #define OPC_PAND (0xdb | P_EXT | P_DATA16)
386 #define OPC_PANDN (0xdf | P_EXT | P_DATA16)
387 #define OPC_PBLENDW (0x0e | P_EXT3A | P_DATA16)
388 #define OPC_PCMPEQB (0x74 | P_EXT | P_DATA16)
389 #define OPC_PCMPEQW (0x75 | P_EXT | P_DATA16)
390 #define OPC_PCMPEQD (0x76 | P_EXT | P_DATA16)
391 #define OPC_PCMPEQQ (0x29 | P_EXT38 | P_DATA16)
392 #define OPC_PCMPGTB (0x64 | P_EXT | P_DATA16)
393 #define OPC_PCMPGTW (0x65 | P_EXT | P_DATA16)
394 #define OPC_PCMPGTD (0x66 | P_EXT | P_DATA16)
395 #define OPC_PCMPGTQ (0x37 | P_EXT38 | P_DATA16)
396 #define OPC_PMOVSXBW (0x20 | P_EXT38 | P_DATA16)
397 #define OPC_PMOVSXWD (0x23 | P_EXT38 | P_DATA16)
398 #define OPC_PMOVSXDQ (0x25 | P_EXT38 | P_DATA16)
399 #define OPC_PMOVZXBW (0x30 | P_EXT38 | P_DATA16)
400 #define OPC_PMOVZXWD (0x33 | P_EXT38 | P_DATA16)
401 #define OPC_PMOVZXDQ (0x35 | P_EXT38 | P_DATA16)
402 #define OPC_PMULLW (0xd5 | P_EXT | P_DATA16)
403 #define OPC_PMULLD (0x40 | P_EXT38 | P_DATA16)
404 #define OPC_POR (0xeb | P_EXT | P_DATA16)
405 #define OPC_PSHUFB (0x00 | P_EXT38 | P_DATA16)
406 #define OPC_PSHUFD (0x70 | P_EXT | P_DATA16)
407 #define OPC_PSHUFLW (0x70 | P_EXT | P_SIMDF2)
408 #define OPC_PSHUFHW (0x70 | P_EXT | P_SIMDF3)
409 #define OPC_PSHIFTW_Ib (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */
410 #define OPC_PSHIFTD_Ib (0x72 | P_EXT | P_DATA16) /* /2 /6 /4 */
411 #define OPC_PSHIFTQ_Ib (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */
412 #define OPC_PSUBB (0xf8 | P_EXT | P_DATA16)
413 #define OPC_PSUBW (0xf9 | P_EXT | P_DATA16)
414 #define OPC_PSUBD (0xfa | P_EXT | P_DATA16)
415 #define OPC_PSUBQ (0xfb | P_EXT | P_DATA16)
416 #define OPC_PUNPCKLBW (0x60 | P_EXT | P_DATA16)
417 #define OPC_PUNPCKLWD (0x61 | P_EXT | P_DATA16)
418 #define OPC_PUNPCKLDQ (0x62 | P_EXT | P_DATA16)
419 #define OPC_PUNPCKLQDQ (0x6c | P_EXT | P_DATA16)
420 #define OPC_PUNPCKHBW (0x68 | P_EXT | P_DATA16)
421 #define OPC_PUNPCKHWD (0x69 | P_EXT | P_DATA16)
422 #define OPC_PUNPCKHDQ (0x6a | P_EXT | P_DATA16)
423 #define OPC_PUNPCKHQDQ (0x6d | P_EXT | P_DATA16)
424 #define OPC_PXOR (0xef | P_EXT | P_DATA16)
425 #define OPC_POP_r32 (0x58)
426 #define OPC_POPCNT (0xb8 | P_EXT | P_SIMDF3)
427 #define OPC_PUSH_r32 (0x50)
428 #define OPC_PUSH_Iv (0x68)
429 #define OPC_PUSH_Ib (0x6a)
430 #define OPC_RET (0xc3)
431 #define OPC_SETCC (0x90 | P_EXT | P_REXB_RM) /* ... plus cc */
432 #define OPC_SHIFT_1 (0xd1)
433 #define OPC_SHIFT_Ib (0xc1)
434 #define OPC_SHIFT_cl (0xd3)
435 #define OPC_SARX (0xf7 | P_EXT38 | P_SIMDF3)
436 #define OPC_SHUFPS (0xc6 | P_EXT)
437 #define OPC_SHLX (0xf7 | P_EXT38 | P_DATA16)
438 #define OPC_SHRX (0xf7 | P_EXT38 | P_SIMDF2)
439 #define OPC_TESTL (0x85)
440 #define OPC_TZCNT (0xbc | P_EXT | P_SIMDF3)
441 #define OPC_UD2 (0x0b | P_EXT)
442 #define OPC_VPBLENDD (0x02 | P_EXT3A | P_DATA16)
443 #define OPC_VPBLENDVB (0x4c | P_EXT3A | P_DATA16)
444 #define OPC_VPBROADCASTB (0x78 | P_EXT38 | P_DATA16)
445 #define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16)
446 #define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16)
447 #define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16)
448 #define OPC_VPERMQ (0x00 | P_EXT3A | P_DATA16 | P_REXW)
449 #define OPC_VPERM2I128 (0x46 | P_EXT3A | P_DATA16 | P_VEXL)
450 #define OPC_VZEROUPPER (0x77 | P_EXT)
451 #define OPC_XCHG_ax_r32 (0x90)
452
453 #define OPC_GRP3_Ev (0xf7)
454 #define OPC_GRP5 (0xff)
455 #define OPC_GRP14 (0x73 | P_EXT | P_DATA16)
456
457 /* Group 1 opcode extensions for 0x80-0x83.
458 These are also used as modifiers for OPC_ARITH. */
459 #define ARITH_ADD 0
460 #define ARITH_OR 1
461 #define ARITH_ADC 2
462 #define ARITH_SBB 3
463 #define ARITH_AND 4
464 #define ARITH_SUB 5
465 #define ARITH_XOR 6
466 #define ARITH_CMP 7
467
468 /* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3. */
469 #define SHIFT_ROL 0
470 #define SHIFT_ROR 1
471 #define SHIFT_SHL 4
472 #define SHIFT_SHR 5
473 #define SHIFT_SAR 7
474
475 /* Group 3 opcode extensions for 0xf6, 0xf7. To be used with OPC_GRP3. */
476 #define EXT3_NOT 2
477 #define EXT3_NEG 3
478 #define EXT3_MUL 4
479 #define EXT3_IMUL 5
480 #define EXT3_DIV 6
481 #define EXT3_IDIV 7
482
483 /* Group 5 opcode extensions for 0xff. To be used with OPC_GRP5. */
484 #define EXT5_INC_Ev 0
485 #define EXT5_DEC_Ev 1
486 #define EXT5_CALLN_Ev 2
487 #define EXT5_JMPN_Ev 4
488
489 /* Condition codes to be added to OPC_JCC_{long,short}. */
490 #define JCC_JMP (-1)
491 #define JCC_JO 0x0
492 #define JCC_JNO 0x1
493 #define JCC_JB 0x2
494 #define JCC_JAE 0x3
495 #define JCC_JE 0x4
496 #define JCC_JNE 0x5
497 #define JCC_JBE 0x6
498 #define JCC_JA 0x7
499 #define JCC_JS 0x8
500 #define JCC_JNS 0x9
501 #define JCC_JP 0xa
502 #define JCC_JNP 0xb
503 #define JCC_JL 0xc
504 #define JCC_JGE 0xd
505 #define JCC_JLE 0xe
506 #define JCC_JG 0xf
507
508 static const uint8_t tcg_cond_to_jcc[] = {
509 [TCG_COND_EQ] = JCC_JE,
510 [TCG_COND_NE] = JCC_JNE,
511 [TCG_COND_LT] = JCC_JL,
512 [TCG_COND_GE] = JCC_JGE,
513 [TCG_COND_LE] = JCC_JLE,
514 [TCG_COND_GT] = JCC_JG,
515 [TCG_COND_LTU] = JCC_JB,
516 [TCG_COND_GEU] = JCC_JAE,
517 [TCG_COND_LEU] = JCC_JBE,
518 [TCG_COND_GTU] = JCC_JA,
519 };
520
521 #if TCG_TARGET_REG_BITS == 64
522 static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
523 {
524 int rex;
525
526 if (opc & P_GS) {
527 tcg_out8(s, 0x65);
528 }
529 if (opc & P_DATA16) {
530 /* We should never be asking for both 16 and 64-bit operation. */
531 tcg_debug_assert((opc & P_REXW) == 0);
532 tcg_out8(s, 0x66);
533 }
534 if (opc & P_ADDR32) {
535 tcg_out8(s, 0x67);
536 }
537 if (opc & P_SIMDF3) {
538 tcg_out8(s, 0xf3);
539 } else if (opc & P_SIMDF2) {
540 tcg_out8(s, 0xf2);
541 }
542
543 rex = 0;
544 rex |= (opc & P_REXW) ? 0x8 : 0x0; /* REX.W */
545 rex |= (r & 8) >> 1; /* REX.R */
546 rex |= (x & 8) >> 2; /* REX.X */
547 rex |= (rm & 8) >> 3; /* REX.B */
548
549 /* P_REXB_{R,RM} indicates that the given register is the low byte.
550 For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do,
551 as otherwise the encoding indicates %[abcd]h. Note that the values
552 that are ORed in merely indicate that the REX byte must be present;
553 those bits get discarded in output. */
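/* For example, OPC_MOVZBL + P_REXB_RM with TCG_REG_ESI (6) as the R/M
   operand needs the (otherwise empty) REX prefix, since without REX an
   R/M value of 6 selects %dh rather than %sil. */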
554 rex |= opc & (r >= 4 ? P_REXB_R : 0);
555 rex |= opc & (rm >= 4 ? P_REXB_RM : 0);
556
557 if (rex) {
558 tcg_out8(s, (uint8_t)(rex | 0x40));
559 }
560
561 if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
562 tcg_out8(s, 0x0f);
563 if (opc & P_EXT38) {
564 tcg_out8(s, 0x38);
565 } else if (opc & P_EXT3A) {
566 tcg_out8(s, 0x3a);
567 }
568 }
569
570 tcg_out8(s, opc);
571 }
572 #else
573 static void tcg_out_opc(TCGContext *s, int opc)
574 {
575 if (opc & P_DATA16) {
576 tcg_out8(s, 0x66);
577 }
578 if (opc & P_SIMDF3) {
579 tcg_out8(s, 0xf3);
580 } else if (opc & P_SIMDF2) {
581 tcg_out8(s, 0xf2);
582 }
583 if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
584 tcg_out8(s, 0x0f);
585 if (opc & P_EXT38) {
586 tcg_out8(s, 0x38);
587 } else if (opc & P_EXT3A) {
588 tcg_out8(s, 0x3a);
589 }
590 }
591 tcg_out8(s, opc);
592 }
593 /* Discard the register arguments to tcg_out_opc early, so as not to penalize
594 the 32-bit compilation paths. This method works with all versions of gcc,
595 whereas relying on the optimizer may not manage to eliminate them. */
596 #define tcg_out_opc(s, opc, r, rm, x) (tcg_out_opc)(s, opc)
597 #endif
598
599 static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm)
600 {
601 tcg_out_opc(s, opc, r, rm, 0);
602 tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
603 }
604
605 static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v,
606 int rm, int index)
607 {
608 int tmp;
609
610 /* Use the two byte form if possible, which cannot encode
611 VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT. */
612 if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_REXW)) == P_EXT
613 && ((rm | index) & 8) == 0) {
614 /* Two byte VEX prefix. */
615 tcg_out8(s, 0xc5);
616
617 tmp = (r & 8 ? 0 : 0x80); /* VEX.R */
618 } else {
619 /* Three byte VEX prefix. */
620 tcg_out8(s, 0xc4);
621
622 /* VEX.m-mmmm */
623 if (opc & P_EXT3A) {
624 tmp = 3;
625 } else if (opc & P_EXT38) {
626 tmp = 2;
627 } else if (opc & P_EXT) {
628 tmp = 1;
629 } else {
630 g_assert_not_reached();
631 }
632 tmp |= (r & 8 ? 0 : 0x80); /* VEX.R */
633 tmp |= (index & 8 ? 0 : 0x40); /* VEX.X */
634 tmp |= (rm & 8 ? 0 : 0x20); /* VEX.B */
635 tcg_out8(s, tmp);
636
637 tmp = (opc & P_REXW ? 0x80 : 0); /* VEX.W */
638 }
639
640 tmp |= (opc & P_VEXL ? 0x04 : 0); /* VEX.L */
641 /* VEX.pp */
642 if (opc & P_DATA16) {
643 tmp |= 1; /* 0x66 */
644 } else if (opc & P_SIMDF3) {
645 tmp |= 2; /* 0xf3 */
646 } else if (opc & P_SIMDF2) {
647 tmp |= 3; /* 0xf2 */
648 }
649 tmp |= (~v & 15) << 3; /* VEX.vvvv */
650 tcg_out8(s, tmp);
651 tcg_out8(s, opc);
652 }
653
654 static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
655 {
656 tcg_out_vex_opc(s, opc, r, v, rm, 0);
657 tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
658 }
659
660 /* Output an opcode with a full "rm + (index<<shift) + offset" address mode.
661 We handle RM or INDEX being absent by passing a negative value. In 64-bit
662 mode for absolute addresses, ~RM is the size of the immediate operand
663 that will follow the instruction. */
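/* For instance, rm == -1 (~rm == 0) means no immediate follows, while
   rm == -5 (~rm == 4) accounts for a 4-byte immediate when the
   rip-relative displacement is computed below. */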
664
665 static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index,
666 int shift, intptr_t offset)
667 {
668 int mod, len;
669
670 if (index < 0 && rm < 0) {
671 if (TCG_TARGET_REG_BITS == 64) {
672 /* Try for a rip-relative addressing mode. This has replaced
673 the 32-bit-mode absolute addressing encoding. */
674 intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm;
675 intptr_t disp = offset - pc;
676 if (disp == (int32_t)disp) {
677 tcg_out8(s, (LOWREGMASK(r) << 3) | 5);
678 tcg_out32(s, disp);
679 return;
680 }
681
682 /* Try for an absolute address encoding. This requires the
683 use of the MODRM+SIB encoding and is therefore larger than
684 rip-relative addressing. */
685 if (offset == (int32_t)offset) {
686 tcg_out8(s, (LOWREGMASK(r) << 3) | 4);
687 tcg_out8(s, (4 << 3) | 5);
688 tcg_out32(s, offset);
689 return;
690 }
691
692 /* ??? The memory isn't directly addressable. */
693 g_assert_not_reached();
694 } else {
695 /* Absolute address. */
696 tcg_out8(s, (r << 3) | 5);
697 tcg_out32(s, offset);
698 return;
699 }
700 }
701
702 /* Find the length of the immediate addend. Note that the encoding
703 that would be used for (%ebp) indicates absolute addressing. */
704 if (rm < 0) {
705 mod = 0, len = 4, rm = 5;
706 } else if (offset == 0 && LOWREGMASK(rm) != TCG_REG_EBP) {
707 mod = 0, len = 0;
708 } else if (offset == (int8_t)offset) {
709 mod = 0x40, len = 1;
710 } else {
711 mod = 0x80, len = 4;
712 }
713
714 /* Use a single byte MODRM format if possible. Note that the encoding
715 that would be used for %esp is the escape to the two byte form. */
716 if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) {
717 /* Single byte MODRM format. */
718 tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
719 } else {
720 /* Two byte MODRM+SIB format. */
721
722 /* Note that the encoding that would place %esp into the index
723 field indicates no index register. In 64-bit mode, the REX.X
724 bit counts, so %r12 can be used as the index. */
725 if (index < 0) {
726 index = 4;
727 } else {
728 tcg_debug_assert(index != TCG_REG_ESP);
729 }
730
731 tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4);
732 tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm));
733 }
734
735 if (len == 1) {
736 tcg_out8(s, offset);
737 } else if (len == 4) {
738 tcg_out32(s, offset);
739 }
740 }
741
742 static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
743 int index, int shift, intptr_t offset)
744 {
745 tcg_out_opc(s, opc, r, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
746 tcg_out_sib_offset(s, r, rm, index, shift, offset);
747 }
748
749 static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v,
750 int rm, int index, int shift,
751 intptr_t offset)
752 {
753 tcg_out_vex_opc(s, opc, r, v, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
754 tcg_out_sib_offset(s, r, rm, index, shift, offset);
755 }
756
757 /* A simplification of the above with no index or shift. */
758 static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r,
759 int rm, intptr_t offset)
760 {
761 tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset);
762 }
763
764 static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r,
765 int v, int rm, intptr_t offset)
766 {
767 tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset);
768 }
769
770 /* Output an opcode with an expected reference to the constant pool. */
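/* The 32-bit displacement emitted here is a placeholder; the caller
   registers it with new_pool_label (see tcg_out_dupi_vec) so that it is
   patched to reference the pool entry. */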
771 static inline void tcg_out_modrm_pool(TCGContext *s, int opc, int r)
772 {
773 tcg_out_opc(s, opc, r, 0, 0);
774 /* Absolute for 32-bit, pc-relative for 64-bit. */
775 tcg_out8(s, LOWREGMASK(r) << 3 | 5);
776 tcg_out32(s, 0);
777 }
778
779 /* Output an opcode with an expected reference to the constant pool. */
780 static inline void tcg_out_vex_modrm_pool(TCGContext *s, int opc, int r)
781 {
782 tcg_out_vex_opc(s, opc, r, 0, 0, 0);
783 /* Absolute for 32-bit, pc-relative for 64-bit. */
784 tcg_out8(s, LOWREGMASK(r) << 3 | 5);
785 tcg_out32(s, 0);
786 }
787
788 /* Generate dest op= src. Uses the same ARITH_* codes as tgen_arithi. */
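/* For example, tgen_arithr(s, ARITH_ADD + P_REXW, dest, src) emits the
   64-bit "addq %src, %dest", i.e. dest += src. */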
789 static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)
790 {
791 /* Propagate an opcode prefix, such as P_REXW. */
792 int ext = subop & ~0x7;
793 subop &= 0x7;
794
795 tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src);
796 }
797
798 static void tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
799 {
800 int rexw = 0;
801
802 if (arg == ret) {
803 return;
804 }
805 switch (type) {
806 case TCG_TYPE_I64:
807 rexw = P_REXW;
808 /* fallthru */
809 case TCG_TYPE_I32:
810 if (ret < 16) {
811 if (arg < 16) {
812 tcg_out_modrm(s, OPC_MOVL_GvEv + rexw, ret, arg);
813 } else {
814 tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, arg, 0, ret);
815 }
816 } else {
817 if (arg < 16) {
818 tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, ret, 0, arg);
819 } else {
820 tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
821 }
822 }
823 break;
824
825 case TCG_TYPE_V64:
826 tcg_debug_assert(ret >= 16 && arg >= 16);
827 tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
828 break;
829 case TCG_TYPE_V128:
830 tcg_debug_assert(ret >= 16 && arg >= 16);
831 tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx, ret, 0, arg);
832 break;
833 case TCG_TYPE_V256:
834 tcg_debug_assert(ret >= 16 && arg >= 16);
835 tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx | P_VEXL, ret, 0, arg);
836 break;
837
838 default:
839 g_assert_not_reached();
840 }
841 }
842
843 static void tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
844 TCGReg r, TCGReg a)
845 {
846 if (have_avx2) {
847 static const int dup_insn[4] = {
848 OPC_VPBROADCASTB, OPC_VPBROADCASTW,
849 OPC_VPBROADCASTD, OPC_VPBROADCASTQ,
850 };
851 int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
852 tcg_out_vex_modrm(s, dup_insn[vece] + vex_l, r, 0, a);
853 } else {
854 switch (vece) {
855 case MO_8:
856 /* ??? With zero in a register, use PSHUFB. */
857 tcg_out_vex_modrm(s, OPC_PUNPCKLBW, r, 0, a);
858 a = r;
859 /* FALLTHRU */
860 case MO_16:
861 tcg_out_vex_modrm(s, OPC_PUNPCKLWD, r, 0, a);
862 a = r;
863 /* FALLTHRU */
864 case MO_32:
865 tcg_out_vex_modrm(s, OPC_PSHUFD, r, 0, a);
866 /* imm8 operand: all output lanes selected from input lane 0. */
867 tcg_out8(s, 0);
868 break;
869 case MO_64:
870 tcg_out_vex_modrm(s, OPC_PUNPCKLQDQ, r, 0, a);
871 break;
872 default:
873 g_assert_not_reached();
874 }
875 }
876 }
877
878 static void tcg_out_dupi_vec(TCGContext *s, TCGType type,
879 TCGReg ret, tcg_target_long arg)
880 {
881 int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
882
883 if (arg == 0) {
884 tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
885 return;
886 }
887 if (arg == -1) {
888 tcg_out_vex_modrm(s, OPC_PCMPEQB + vex_l, ret, ret, ret);
889 return;
890 }
891
892 if (TCG_TARGET_REG_BITS == 64) {
893 if (type == TCG_TYPE_V64) {
894 tcg_out_vex_modrm_pool(s, OPC_MOVQ_VqWq, ret);
895 } else if (have_avx2) {
896 tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTQ + vex_l, ret);
897 } else {
898 tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret);
899 }
900 new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
901 } else if (have_avx2) {
902 tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTD + vex_l, ret);
903 new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
904 } else {
905 tcg_out_vex_modrm_pool(s, OPC_MOVD_VyEy, ret);
906 new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
907 tcg_out_dup_vec(s, type, MO_32, ret, ret);
908 }
909 }
910
911 static void tcg_out_movi(TCGContext *s, TCGType type,
912 TCGReg ret, tcg_target_long arg)
913 {
914 tcg_target_long diff;
915
916 switch (type) {
917 case TCG_TYPE_I32:
918 #if TCG_TARGET_REG_BITS == 64
919 case TCG_TYPE_I64:
920 #endif
921 if (ret < 16) {
922 break;
923 }
924 /* fallthru */
925 case TCG_TYPE_V64:
926 case TCG_TYPE_V128:
927 case TCG_TYPE_V256:
928 tcg_debug_assert(ret >= 16);
929 tcg_out_dupi_vec(s, type, ret, arg);
930 return;
931 default:
932 g_assert_not_reached();
933 }
934
935 if (arg == 0) {
936 tgen_arithr(s, ARITH_XOR, ret, ret);
937 return;
938 }
939 if (arg == (uint32_t)arg || type == TCG_TYPE_I32) {
940 tcg_out_opc(s, OPC_MOVL_Iv + LOWREGMASK(ret), 0, ret, 0);
941 tcg_out32(s, arg);
942 return;
943 }
944 if (arg == (int32_t)arg) {
945 tcg_out_modrm(s, OPC_MOVL_EvIz + P_REXW, 0, ret);
946 tcg_out32(s, arg);
947 return;
948 }
949
950 /* Try a 7 byte pc-relative lea before the 10 byte movq. */
951 diff = arg - ((uintptr_t)s->code_ptr + 7);
952 if (diff == (int32_t)diff) {
953 tcg_out_opc(s, OPC_LEA | P_REXW, ret, 0, 0);
954 tcg_out8(s, (LOWREGMASK(ret) << 3) | 5);
955 tcg_out32(s, diff);
956 return;
957 }
958
959 tcg_out_opc(s, OPC_MOVL_Iv + P_REXW + LOWREGMASK(ret), 0, ret, 0);
960 tcg_out64(s, arg);
961 }
962
963 static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
964 {
965 if (val == (int8_t)val) {
966 tcg_out_opc(s, OPC_PUSH_Ib, 0, 0, 0);
967 tcg_out8(s, val);
968 } else if (val == (int32_t)val) {
969 tcg_out_opc(s, OPC_PUSH_Iv, 0, 0, 0);
970 tcg_out32(s, val);
971 } else {
972 tcg_abort();
973 }
974 }
975
976 static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
977 {
978 /* Given the strength of x86 memory ordering, we need only care about
979 store-load ordering. Experimentally, "lock orl $0,0(%esp)" is
980 faster than "mfence", so don't bother with the SSE insn. */
981 if (a0 & TCG_MO_ST_LD) {
982 tcg_out8(s, 0xf0);
983 tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
984 tcg_out8(s, 0);
985 }
986 }
987
988 static inline void tcg_out_push(TCGContext *s, int reg)
989 {
990 tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
991 }
992
993 static inline void tcg_out_pop(TCGContext *s, int reg)
994 {
995 tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0);
996 }
997
998 static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
999 TCGReg arg1, intptr_t arg2)
1000 {
1001 switch (type) {
1002 case TCG_TYPE_I32:
1003 if (ret < 16) {
1004 tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2);
1005 } else {
1006 tcg_out_vex_modrm_offset(s, OPC_MOVD_VyEy, ret, 0, arg1, arg2);
1007 }
1008 break;
1009 case TCG_TYPE_I64:
1010 if (ret < 16) {
1011 tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2);
1012 break;
1013 }
1014 /* FALLTHRU */
1015 case TCG_TYPE_V64:
1016 tcg_debug_assert(ret >= 16);
1017 tcg_out_vex_modrm_offset(s, OPC_MOVQ_VqWq, ret, 0, arg1, arg2);
1018 break;
1019 case TCG_TYPE_V128:
1020 tcg_debug_assert(ret >= 16);
1021 tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx, ret, 0, arg1, arg2);
1022 break;
1023 case TCG_TYPE_V256:
1024 tcg_debug_assert(ret >= 16);
1025 tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx | P_VEXL,
1026 ret, 0, arg1, arg2);
1027 break;
1028 default:
1029 g_assert_not_reached();
1030 }
1031 }
1032
1033 static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
1034 TCGReg arg1, intptr_t arg2)
1035 {
1036 switch (type) {
1037 case TCG_TYPE_I32:
1038 if (arg < 16) {
1039 tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2);
1040 } else {
1041 tcg_out_vex_modrm_offset(s, OPC_MOVD_EyVy, arg, 0, arg1, arg2);
1042 }
1043 break;
1044 case TCG_TYPE_I64:
1045 if (arg < 16) {
1046 tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2);
1047 break;
1048 }
1049 /* FALLTHRU */
1050 case TCG_TYPE_V64:
1051 tcg_debug_assert(arg >= 16);
1052 tcg_out_vex_modrm_offset(s, OPC_MOVQ_WqVq, arg, 0, arg1, arg2);
1053 break;
1054 case TCG_TYPE_V128:
1055 tcg_debug_assert(arg >= 16);
1056 tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx, arg, 0, arg1, arg2);
1057 break;
1058 case TCG_TYPE_V256:
1059 tcg_debug_assert(arg >= 16);
1060 tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx | P_VEXL,
1061 arg, 0, arg1, arg2);
1062 break;
1063 default:
1064 g_assert_not_reached();
1065 }
1066 }
1067
1068 static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
1069 TCGReg base, intptr_t ofs)
1070 {
1071 int rexw = 0;
1072 if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64) {
1073 if (val != (int32_t)val) {
1074 return false;
1075 }
1076 rexw = P_REXW;
1077 } else if (type != TCG_TYPE_I32) {
1078 return false;
1079 }
1080 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs);
1081 tcg_out32(s, val);
1082 return true;
1083 }
1084
1085 static void tcg_out_shifti(TCGContext *s, int subopc, int reg, int count)
1086 {
1087 /* Propagate an opcode prefix, such as P_DATA16. */
1088 int ext = subopc & ~0x7;
1089 subopc &= 0x7;
1090
1091 if (count == 1) {
1092 tcg_out_modrm(s, OPC_SHIFT_1 + ext, subopc, reg);
1093 } else {
1094 tcg_out_modrm(s, OPC_SHIFT_Ib + ext, subopc, reg);
1095 tcg_out8(s, count);
1096 }
1097 }
1098
1099 static inline void tcg_out_bswap32(TCGContext *s, int reg)
1100 {
1101 tcg_out_opc(s, OPC_BSWAP + LOWREGMASK(reg), 0, reg, 0);
1102 }
1103
1104 static inline void tcg_out_rolw_8(TCGContext *s, int reg)
1105 {
1106 tcg_out_shifti(s, SHIFT_ROL + P_DATA16, reg, 8);
1107 }
1108
1109 static inline void tcg_out_ext8u(TCGContext *s, int dest, int src)
1110 {
1111 /* movzbl */
1112 tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1113 tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, dest, src);
1114 }
1115
1116 static void tcg_out_ext8s(TCGContext *s, int dest, int src, int rexw)
1117 {
1118 /* movsbl */
1119 tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1120 tcg_out_modrm(s, OPC_MOVSBL + P_REXB_RM + rexw, dest, src);
1121 }
1122
1123 static inline void tcg_out_ext16u(TCGContext *s, int dest, int src)
1124 {
1125 /* movzwl */
1126 tcg_out_modrm(s, OPC_MOVZWL, dest, src);
1127 }
1128
1129 static inline void tcg_out_ext16s(TCGContext *s, int dest, int src, int rexw)
1130 {
1131 /* movsw[lq] */
1132 tcg_out_modrm(s, OPC_MOVSWL + rexw, dest, src);
1133 }
1134
1135 static inline void tcg_out_ext32u(TCGContext *s, int dest, int src)
1136 {
1137 /* 32-bit mov zero extends. */
1138 tcg_out_modrm(s, OPC_MOVL_GvEv, dest, src);
1139 }
1140
1141 static inline void tcg_out_ext32s(TCGContext *s, int dest, int src)
1142 {
1143 tcg_out_modrm(s, OPC_MOVSLQ, dest, src);
1144 }
1145
1146 static inline void tcg_out_bswap64(TCGContext *s, int reg)
1147 {
1148 tcg_out_opc(s, OPC_BSWAP + P_REXW + LOWREGMASK(reg), 0, reg, 0);
1149 }
1150
1151 static void tgen_arithi(TCGContext *s, int c, int r0,
1152 tcg_target_long val, int cf)
1153 {
1154 int rexw = 0;
1155
1156 if (TCG_TARGET_REG_BITS == 64) {
1157 rexw = c & -8;
1158 c &= 7;
1159 }
1160
1161 /* ??? While INC/DEC are 2 bytes shorter than ADDL $1/SUBL $1, they also
1162 induce partial-flags-update stalls on Pentium 4 and are not recommended
1163 by current Intel optimization manuals. */
1164 if (!cf && (c == ARITH_ADD || c == ARITH_SUB) && (val == 1 || val == -1)) {
1165 int is_inc = (c == ARITH_ADD) ^ (val < 0);
1166 if (TCG_TARGET_REG_BITS == 64) {
1167 /* The single-byte increment encodings are re-tasked as the
1168 REX prefixes. Use the MODRM encoding. */
1169 tcg_out_modrm(s, OPC_GRP5 + rexw,
1170 (is_inc ? EXT5_INC_Ev : EXT5_DEC_Ev), r0);
1171 } else {
1172 tcg_out8(s, (is_inc ? OPC_INC_r32 : OPC_DEC_r32) + r0);
1173 }
1174 return;
1175 }
1176
1177 if (c == ARITH_AND) {
1178 if (TCG_TARGET_REG_BITS == 64) {
1179 if (val == 0xffffffffu) {
1180 tcg_out_ext32u(s, r0, r0);
1181 return;
1182 }
1183 if (val == (uint32_t)val) {
1184 /* AND with no high bits set can use a 32-bit operation. */
1185 rexw = 0;
1186 }
1187 }
1188 if (val == 0xffu && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
1189 tcg_out_ext8u(s, r0, r0);
1190 return;
1191 }
1192 if (val == 0xffffu) {
1193 tcg_out_ext16u(s, r0, r0);
1194 return;
1195 }
1196 }
1197
1198 if (val == (int8_t)val) {
1199 tcg_out_modrm(s, OPC_ARITH_EvIb + rexw, c, r0);
1200 tcg_out8(s, val);
1201 return;
1202 }
1203 if (rexw == 0 || val == (int32_t)val) {
1204 tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0);
1205 tcg_out32(s, val);
1206 return;
1207 }
1208
1209 tcg_abort();
1210 }
1211
1212 static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
1213 {
1214 if (val != 0) {
1215 tgen_arithi(s, ARITH_ADD + P_REXW, reg, val, 0);
1216 }
1217 }
1218
1219 /* Use SMALL != 0 to force a short forward branch. */
1220 static void tcg_out_jxx(TCGContext *s, int opc, TCGLabel *l, int small)
1221 {
1222 int32_t val, val1;
1223
1224 if (l->has_value) {
1225 val = tcg_pcrel_diff(s, l->u.value_ptr);
1226 val1 = val - 2;
1227 if ((int8_t)val1 == val1) {
1228 if (opc == -1) {
1229 tcg_out8(s, OPC_JMP_short);
1230 } else {
1231 tcg_out8(s, OPC_JCC_short + opc);
1232 }
1233 tcg_out8(s, val1);
1234 } else {
1235 if (small) {
1236 tcg_abort();
1237 }
1238 if (opc == -1) {
1239 tcg_out8(s, OPC_JMP_long);
1240 tcg_out32(s, val - 5);
1241 } else {
1242 tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1243 tcg_out32(s, val - 6);
1244 }
1245 }
1246 } else if (small) {
1247 if (opc == -1) {
1248 tcg_out8(s, OPC_JMP_short);
1249 } else {
1250 tcg_out8(s, OPC_JCC_short + opc);
1251 }
1252 tcg_out_reloc(s, s->code_ptr, R_386_PC8, l, -1);
1253 s->code_ptr += 1;
1254 } else {
1255 if (opc == -1) {
1256 tcg_out8(s, OPC_JMP_long);
1257 } else {
1258 tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1259 }
1260 tcg_out_reloc(s, s->code_ptr, R_386_PC32, l, -4);
1261 s->code_ptr += 4;
1262 }
1263 }
1264
1265 static void tcg_out_cmp(TCGContext *s, TCGArg arg1, TCGArg arg2,
1266 int const_arg2, int rexw)
1267 {
1268 if (const_arg2) {
1269 if (arg2 == 0) {
1270 /* test r, r */
1271 tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg1);
1272 } else {
1273 tgen_arithi(s, ARITH_CMP + rexw, arg1, arg2, 0);
1274 }
1275 } else {
1276 tgen_arithr(s, ARITH_CMP + rexw, arg1, arg2);
1277 }
1278 }
1279
1280 static void tcg_out_brcond32(TCGContext *s, TCGCond cond,
1281 TCGArg arg1, TCGArg arg2, int const_arg2,
1282 TCGLabel *label, int small)
1283 {
1284 tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
1285 tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
1286 }
1287
1288 #if TCG_TARGET_REG_BITS == 64
1289 static void tcg_out_brcond64(TCGContext *s, TCGCond cond,
1290 TCGArg arg1, TCGArg arg2, int const_arg2,
1291 TCGLabel *label, int small)
1292 {
1293 tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
1294 tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
1295 }
1296 #else
1297 /* XXX: we implement it at the target level to avoid having to
1298 handle cross-basic-block temporaries. */
1299 static void tcg_out_brcond2(TCGContext *s, const TCGArg *args,
1300 const int *const_args, int small)
1301 {
1302 TCGLabel *label_next = gen_new_label();
1303 TCGLabel *label_this = arg_label(args[5]);
1304
1305 switch(args[4]) {
1306 case TCG_COND_EQ:
1307 tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
1308 label_next, 1);
1309 tcg_out_brcond32(s, TCG_COND_EQ, args[1], args[3], const_args[3],
1310 label_this, small);
1311 break;
1312 case TCG_COND_NE:
1313 tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
1314 label_this, small);
1315 tcg_out_brcond32(s, TCG_COND_NE, args[1], args[3], const_args[3],
1316 label_this, small);
1317 break;
1318 case TCG_COND_LT:
1319 tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
1320 label_this, small);
1321 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1322 tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
1323 label_this, small);
1324 break;
1325 case TCG_COND_LE:
1326 tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
1327 label_this, small);
1328 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1329 tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
1330 label_this, small);
1331 break;
1332 case TCG_COND_GT:
1333 tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
1334 label_this, small);
1335 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1336 tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
1337 label_this, small);
1338 break;
1339 case TCG_COND_GE:
1340 tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
1341 label_this, small);
1342 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1343 tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
1344 label_this, small);
1345 break;
1346 case TCG_COND_LTU:
1347 tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
1348 label_this, small);
1349 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1350 tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
1351 label_this, small);
1352 break;
1353 case TCG_COND_LEU:
1354 tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
1355 label_this, small);
1356 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1357 tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
1358 label_this, small);
1359 break;
1360 case TCG_COND_GTU:
1361 tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
1362 label_this, small);
1363 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1364 tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
1365 label_this, small);
1366 break;
1367 case TCG_COND_GEU:
1368 tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
1369 label_this, small);
1370 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1371 tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
1372 label_this, small);
1373 break;
1374 default:
1375 tcg_abort();
1376 }
1377 tcg_out_label(s, label_next, s->code_ptr);
1378 }
1379 #endif
1380
1381 static void tcg_out_setcond32(TCGContext *s, TCGCond cond, TCGArg dest,
1382 TCGArg arg1, TCGArg arg2, int const_arg2)
1383 {
1384 tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
1385 tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
1386 tcg_out_ext8u(s, dest, dest);
1387 }
1388
1389 #if TCG_TARGET_REG_BITS == 64
1390 static void tcg_out_setcond64(TCGContext *s, TCGCond cond, TCGArg dest,
1391 TCGArg arg1, TCGArg arg2, int const_arg2)
1392 {
1393 tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
1394 tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
1395 tcg_out_ext8u(s, dest, dest);
1396 }
1397 #else
1398 static void tcg_out_setcond2(TCGContext *s, const TCGArg *args,
1399 const int *const_args)
1400 {
1401 TCGArg new_args[6];
1402 TCGLabel *label_true, *label_over;
1403
1404 memcpy(new_args, args+1, 5*sizeof(TCGArg));
1405
1406 if (args[0] == args[1] || args[0] == args[2]
1407 || (!const_args[3] && args[0] == args[3])
1408 || (!const_args[4] && args[0] == args[4])) {
1409 /* When the destination overlaps with one of the argument
1410 registers, don't do anything tricky. */
1411 label_true = gen_new_label();
1412 label_over = gen_new_label();
1413
1414 new_args[5] = label_arg(label_true);
1415 tcg_out_brcond2(s, new_args, const_args+1, 1);
1416
1417 tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1418 tcg_out_jxx(s, JCC_JMP, label_over, 1);
1419 tcg_out_label(s, label_true, s->code_ptr);
1420
1421 tcg_out_movi(s, TCG_TYPE_I32, args[0], 1);
1422 tcg_out_label(s, label_over, s->code_ptr);
1423 } else {
1424 /* When the destination does not overlap one of the arguments,
1425 clear the destination first, jump if cond false, and emit an
1426 increment in the true case. This results in smaller code. */
1427
1428 tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1429
1430 label_over = gen_new_label();
1431 new_args[4] = tcg_invert_cond(new_args[4]);
1432 new_args[5] = label_arg(label_over);
1433 tcg_out_brcond2(s, new_args, const_args+1, 1);
1434
1435 tgen_arithi(s, ARITH_ADD, args[0], 1, 0);
1436 tcg_out_label(s, label_over, s->code_ptr);
1437 }
1438 }
1439 #endif
1440
1441 static void tcg_out_cmov(TCGContext *s, TCGCond cond, int rexw,
1442 TCGReg dest, TCGReg v1)
1443 {
1444 if (have_cmov) {
1445 tcg_out_modrm(s, OPC_CMOVCC | tcg_cond_to_jcc[cond] | rexw, dest, v1);
1446 } else {
1447 TCGLabel *over = gen_new_label();
1448 tcg_out_jxx(s, tcg_cond_to_jcc[tcg_invert_cond(cond)], over, 1);
1449 tcg_out_mov(s, TCG_TYPE_I32, dest, v1);
1450 tcg_out_label(s, over, s->code_ptr);
1451 }
1452 }
1453
1454 static void tcg_out_movcond32(TCGContext *s, TCGCond cond, TCGReg dest,
1455 TCGReg c1, TCGArg c2, int const_c2,
1456 TCGReg v1)
1457 {
1458 tcg_out_cmp(s, c1, c2, const_c2, 0);
1459 tcg_out_cmov(s, cond, 0, dest, v1);
1460 }
1461
1462 #if TCG_TARGET_REG_BITS == 64
1463 static void tcg_out_movcond64(TCGContext *s, TCGCond cond, TCGReg dest,
1464 TCGReg c1, TCGArg c2, int const_c2,
1465 TCGReg v1)
1466 {
1467 tcg_out_cmp(s, c1, c2, const_c2, P_REXW);
1468 tcg_out_cmov(s, cond, P_REXW, dest, v1);
1469 }
1470 #endif
1471
1472 static void tcg_out_ctz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1473 TCGArg arg2, bool const_a2)
1474 {
1475 if (have_bmi1) {
1476 tcg_out_modrm(s, OPC_TZCNT + rexw, dest, arg1);
1477 if (const_a2) {
1478 tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1479 } else {
1480 tcg_debug_assert(dest != arg2);
1481 tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
1482 }
1483 } else {
1484 tcg_debug_assert(dest != arg2);
1485 tcg_out_modrm(s, OPC_BSF + rexw, dest, arg1);
1486 tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
1487 }
1488 }
1489
1490 static void tcg_out_clz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1491 TCGArg arg2, bool const_a2)
1492 {
1493 if (have_lzcnt) {
1494 tcg_out_modrm(s, OPC_LZCNT + rexw, dest, arg1);
1495 if (const_a2) {
1496 tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1497 } else {
1498 tcg_debug_assert(dest != arg2);
1499 tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
1500 }
1501 } else {
1502 tcg_debug_assert(!const_a2);
1503 tcg_debug_assert(dest != arg1);
1504 tcg_debug_assert(dest != arg2);
1505
1506 /* Recall that the output of BSR is the index not the count. */
1507 tcg_out_modrm(s, OPC_BSR + rexw, dest, arg1);
1508 tgen_arithi(s, ARITH_XOR + rexw, dest, rexw ? 63 : 31, 0);
1509
1510 /* Since we have destroyed the flags from BSR, we have to re-test. */
1511 tcg_out_cmp(s, arg1, 0, 1, rexw);
1512 tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
1513 }
1514 }
1515
1516 static void tcg_out_branch(TCGContext *s, int call, tcg_insn_unit *dest)
1517 {
1518 intptr_t disp = tcg_pcrel_diff(s, dest) - 5;
1519
1520 if (disp == (int32_t)disp) {
1521 tcg_out_opc(s, call ? OPC_CALL_Jz : OPC_JMP_long, 0, 0, 0);
1522 tcg_out32(s, disp);
1523 } else {
1524 /* rip-relative addressing into the constant pool.
1525 This is 6 + 8 = 14 bytes, as compared to using an
1526 immediate load, 10 + 6 = 16 bytes, plus we may
1527 be able to re-use the pool constant for more calls. */
1528 tcg_out_opc(s, OPC_GRP5, 0, 0, 0);
1529 tcg_out8(s, (call ? EXT5_CALLN_Ev : EXT5_JMPN_Ev) << 3 | 5);
1530 new_pool_label(s, (uintptr_t)dest, R_386_PC32, s->code_ptr, -4);
1531 tcg_out32(s, 0);
1532 }
1533 }
1534
1535 static inline void tcg_out_call(TCGContext *s, tcg_insn_unit *dest)
1536 {
1537 tcg_out_branch(s, 1, dest);
1538 }
1539
1540 static void tcg_out_jmp(TCGContext *s, tcg_insn_unit *dest)
1541 {
1542 tcg_out_branch(s, 0, dest);
1543 }
1544
1545 static void tcg_out_nopn(TCGContext *s, int n)
1546 {
1547 int i;
1548 /* Emit 1 or 2 operand size prefixes for the standard one byte nop,
1549 * "xchg %eax,%eax", forming "xchg %ax,%ax". All cores accept the
1550 * duplicate prefix, and all of the interesting recent cores can
1551 * decode and discard the duplicates in a single cycle.
1552 */
1553 tcg_debug_assert(n >= 1);
1554 for (i = 1; i < n; ++i) {
1555 tcg_out8(s, 0x66);
1556 }
1557 tcg_out8(s, 0x90);
1558 }
1559
1560 #if defined(CONFIG_SOFTMMU)
1561 #include "tcg-ldst.inc.c"
1562
1563 /* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
1564 * int mmu_idx, uintptr_t ra)
1565 */
1566 static void * const qemu_ld_helpers[16] = {
1567 [MO_UB] = helper_ret_ldub_mmu,
1568 [MO_LEUW] = helper_le_lduw_mmu,
1569 [MO_LEUL] = helper_le_ldul_mmu,
1570 [MO_LEQ] = helper_le_ldq_mmu,
1571 [MO_BEUW] = helper_be_lduw_mmu,
1572 [MO_BEUL] = helper_be_ldul_mmu,
1573 [MO_BEQ] = helper_be_ldq_mmu,
1574 };
1575
1576 /* helper signature: helper_ret_st_mmu(CPUState *env, target_ulong addr,
1577 * uintxx_t val, int mmu_idx, uintptr_t ra)
1578 */
1579 static void * const qemu_st_helpers[16] = {
1580 [MO_UB] = helper_ret_stb_mmu,
1581 [MO_LEUW] = helper_le_stw_mmu,
1582 [MO_LEUL] = helper_le_stl_mmu,
1583 [MO_LEQ] = helper_le_stq_mmu,
1584 [MO_BEUW] = helper_be_stw_mmu,
1585 [MO_BEUL] = helper_be_stl_mmu,
1586 [MO_BEQ] = helper_be_stq_mmu,
1587 };
1588
1589 /* Perform the TLB load and compare.
1590
1591 Inputs:
1592 ADDRLO and ADDRHI contain the low and high part of the address.
1593
1594 MEM_INDEX and S_BITS are the memory context and log2 size of the load.
1595
1596 WHICH is the offset into the CPUTLBEntry structure of the slot to read.
1597 This should be offsetof addr_read or addr_write.
1598
1599 Outputs:
1600 LABEL_PTRS is filled with 1 (32-bit addresses) or 2 (64-bit addresses)
1601 positions of the displacements of forward jumps to the TLB miss case.
1602
1603 Second argument register is loaded with the low part of the address.
1604 In the TLB hit case, it has been adjusted as indicated by the TLB
1605 and so is a host address. In the TLB miss case, it continues to
1606 hold a guest address.
1607
1608 First argument register is clobbered. */
1609
1610 static inline void tcg_out_tlb_load(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
1611 int mem_index, TCGMemOp opc,
1612 tcg_insn_unit **label_ptr, int which)
1613 {
1614 const TCGReg r0 = TCG_REG_L0;
1615 const TCGReg r1 = TCG_REG_L1;
1616 TCGType ttype = TCG_TYPE_I32;
1617 TCGType tlbtype = TCG_TYPE_I32;
1618 int trexw = 0, hrexw = 0, tlbrexw = 0;
1619 unsigned a_bits = get_alignment_bits(opc);
1620 unsigned s_bits = opc & MO_SIZE;
1621 unsigned a_mask = (1 << a_bits) - 1;
1622 unsigned s_mask = (1 << s_bits) - 1;
1623 target_ulong tlb_mask;
1624
1625 if (TCG_TARGET_REG_BITS == 64) {
1626 if (TARGET_LONG_BITS == 64) {
1627 ttype = TCG_TYPE_I64;
1628 trexw = P_REXW;
1629 }
1630 if (TCG_TYPE_PTR == TCG_TYPE_I64) {
1631 hrexw = P_REXW;
1632 if (TARGET_PAGE_BITS + CPU_TLB_BITS > 32) {
1633 tlbtype = TCG_TYPE_I64;
1634 tlbrexw = P_REXW;
1635 }
1636 }
1637 }
1638
1639 tcg_out_mov(s, tlbtype, r0, addrlo);
1640 /* If the required alignment is at least as large as the access, simply
1641 copy the address and mask. For lesser alignments, check that we don't
1642 cross pages for the complete access. */
1643 if (a_bits >= s_bits) {
1644 tcg_out_mov(s, ttype, r1, addrlo);
1645 } else {
1646 tcg_out_modrm_offset(s, OPC_LEA + trexw, r1, addrlo, s_mask - a_mask);
1647 }
1648 tlb_mask = (target_ulong)TARGET_PAGE_MASK | a_mask;
1649
1650 tcg_out_shifti(s, SHIFT_SHR + tlbrexw, r0,
1651 TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
1652
1653 tgen_arithi(s, ARITH_AND + trexw, r1, tlb_mask, 0);
1654 tgen_arithi(s, ARITH_AND + tlbrexw, r0,
1655 (CPU_TLB_SIZE - 1) << CPU_TLB_ENTRY_BITS, 0);
1656
1657 tcg_out_modrm_sib_offset(s, OPC_LEA + hrexw, r0, TCG_AREG0, r0, 0,
1658 offsetof(CPUArchState, tlb_table[mem_index][0])
1659 + which);
1660
1661 /* cmp 0(r0), r1 */
1662 tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw, r1, r0, 0);
1663
1664 /* Prepare for both the fast path add of the tlb addend, and the slow
1665 path function argument setup. There are two cases worth noting:
1666 For 32-bit guest and x86_64 host, MOVL zero-extends the guest address
1667 before the fastpath ADDQ below. For 64-bit guest and x32 host, MOVQ
1668 copies the entire guest address for the slow path, while truncation
1669 for the 32-bit host happens with the fastpath ADDL below. */
1670 tcg_out_mov(s, ttype, r1, addrlo);
1671
1672 /* jne slow_path */
1673 tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
1674 label_ptr[0] = s->code_ptr;
1675 s->code_ptr += 4;
1676
1677 if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1678 /* cmp 4(r0), addrhi */
1679 tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi, r0, 4);
1680
1681 /* jne slow_path */
1682 tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
1683 label_ptr[1] = s->code_ptr;
1684 s->code_ptr += 4;
1685 }
1686
1687 /* TLB Hit. */
1688
1689 /* add addend(r0), r1 */
1690 tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, r1, r0,
1691 offsetof(CPUTLBEntry, addend) - which);
1692 }
1693
1694 /*
1695 * Record the context of a call to the out-of-line helper code for the slow path
1696 * for a load or store, so that we can later generate the correct helper code.
1697 */
1698 static void add_qemu_ldst_label(TCGContext *s, bool is_ld, TCGMemOpIdx oi,
1699 TCGReg datalo, TCGReg datahi,
1700 TCGReg addrlo, TCGReg addrhi,
1701 tcg_insn_unit *raddr,
1702 tcg_insn_unit **label_ptr)
1703 {
1704 TCGLabelQemuLdst *label = new_ldst_label(s);
1705
1706 label->is_ld = is_ld;
1707 label->oi = oi;
1708 label->datalo_reg = datalo;
1709 label->datahi_reg = datahi;
1710 label->addrlo_reg = addrlo;
1711 label->addrhi_reg = addrhi;
1712 label->raddr = raddr;
1713 label->label_ptr[0] = label_ptr[0];
1714 if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1715 label->label_ptr[1] = label_ptr[1];
1716 }
1717 }
1718
1719 /*
1720 * Generate code for the slow path for a load at the end of block
1721 */
1722 static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1723 {
1724 TCGMemOpIdx oi = l->oi;
1725 TCGMemOp opc = get_memop(oi);
1726 TCGReg data_reg;
1727 tcg_insn_unit **label_ptr = &l->label_ptr[0];
1728
1729 /* resolve label address */
1730 tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1731 if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1732 tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1733 }
1734
1735 if (TCG_TARGET_REG_BITS == 32) {
1736 int ofs = 0;
1737
1738 tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
1739 ofs += 4;
1740
1741 tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
1742 ofs += 4;
1743
1744 if (TARGET_LONG_BITS == 64) {
1745 tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
1746 ofs += 4;
1747 }
1748
1749 tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs);
1750 ofs += 4;
1751
1752 tcg_out_sti(s, TCG_TYPE_PTR, (uintptr_t)l->raddr, TCG_REG_ESP, ofs);
1753 } else {
1754 tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
1755 /* The second argument is already loaded with addrlo. */
1756 tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[2], oi);
1757 tcg_out_movi(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[3],
1758 (uintptr_t)l->raddr);
1759 }
1760
1761 tcg_out_call(s, qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)]);
1762
1763 data_reg = l->datalo_reg;
1764 switch (opc & MO_SSIZE) {
1765 case MO_SB:
1766 tcg_out_ext8s(s, data_reg, TCG_REG_EAX, P_REXW);
1767 break;
1768 case MO_SW:
1769 tcg_out_ext16s(s, data_reg, TCG_REG_EAX, P_REXW);
1770 break;
1771 #if TCG_TARGET_REG_BITS == 64
1772 case MO_SL:
1773 tcg_out_ext32s(s, data_reg, TCG_REG_EAX);
1774 break;
1775 #endif
1776 case MO_UB:
1777 case MO_UW:
1778 /* Note that the helpers have zero-extended to tcg_target_long. */
1779 case MO_UL:
1780 tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
1781 break;
1782 case MO_Q:
1783 if (TCG_TARGET_REG_BITS == 64) {
1784 tcg_out_mov(s, TCG_TYPE_I64, data_reg, TCG_REG_RAX);
1785 } else if (data_reg == TCG_REG_EDX) {
1786 /* xchg %edx, %eax */
1787 tcg_out_opc(s, OPC_XCHG_ax_r32 + TCG_REG_EDX, 0, 0, 0);
1788 tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EAX);
1789 } else {
1790 tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
1791 tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EDX);
1792 }
1793 break;
1794 default:
1795 tcg_abort();
1796 }
1797
1798 /* Jump back to the code following the original qemu_ld. */
1799 tcg_out_jmp(s, l->raddr);
1800 }
1801
1802 /*
1803 * Generate code for the slow path for a store at the end of block
1804 */
1805 static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1806 {
1807 TCGMemOpIdx oi = l->oi;
1808 TCGMemOp opc = get_memop(oi);
1809 TCGMemOp s_bits = opc & MO_SIZE;
1810 tcg_insn_unit **label_ptr = &l->label_ptr[0];
1811 TCGReg retaddr;
1812
1813 /* resolve label address */
1814 tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1815 if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1816 tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1817 }
1818
1819 if (TCG_TARGET_REG_BITS == 32) {
1820 int ofs = 0;
1821
1822 tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
1823 ofs += 4;
1824
1825 tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
1826 ofs += 4;
1827
1828 if (TARGET_LONG_BITS == 64) {
1829 tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
1830 ofs += 4;
1831 }
1832
1833 tcg_out_st(s, TCG_TYPE_I32, l->datalo_reg, TCG_REG_ESP, ofs);
1834 ofs += 4;
1835
1836 if (s_bits == MO_64) {
1837 tcg_out_st(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_ESP, ofs);
1838 ofs += 4;
1839 }
1840
1841 tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs);
1842 ofs += 4;
1843
1844 retaddr = TCG_REG_EAX;
1845 tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1846 tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP, ofs);
1847 } else {
1848 tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
1849 /* The second argument is already loaded with addrlo. */
1850 tcg_out_mov(s, (s_bits == MO_64 ? TCG_TYPE_I64 : TCG_TYPE_I32),
1851 tcg_target_call_iarg_regs[2], l->datalo_reg);
1852 tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[3], oi);
1853
1854 if (ARRAY_SIZE(tcg_target_call_iarg_regs) > 4) {
1855 retaddr = tcg_target_call_iarg_regs[4];
1856 tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1857 } else {
1858 retaddr = TCG_REG_RAX;
1859 tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1860 tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP,
1861 TCG_TARGET_CALL_STACK_OFFSET);
1862 }
1863 }
1864
1865 /* "Tail call" to the helper, with the return address back inline. */
1866 tcg_out_push(s, retaddr);
1867 tcg_out_jmp(s, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)]);
1868 }
1869 #elif defined(__x86_64__) && defined(__linux__)
1870 # include <asm/prctl.h>
1871 # include <sys/prctl.h>
1872
1873 int arch_prctl(int code, unsigned long addr);
1874
1875 static int guest_base_flags;
1876 static inline void setup_guest_base_seg(void)
1877 {
1878 if (arch_prctl(ARCH_SET_GS, guest_base) == 0) {
1879 guest_base_flags = P_GS;
1880 }
1881 }
1882 #else
1883 # define guest_base_flags 0
1884 static inline void setup_guest_base_seg(void) { }
1885 #endif /* SOFTMMU */
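/* When the GS base can be set to guest_base above, guest memory accesses
   are emitted with the P_GS flag, i.e. a 0x65 %gs segment-override prefix,
   so the hardware adds guest_base to every guest address for free;
   e.g. a load that would otherwise be "movl (%rdi), %eax" becomes
   "movl %gs:(%rdi), %eax". */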
1886
1887 static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
1888 TCGReg base, int index, intptr_t ofs,
1889 int seg, TCGMemOp memop)
1890 {
1891 const TCGMemOp real_bswap = memop & MO_BSWAP;
1892 TCGMemOp bswap = real_bswap;
1893 int movop = OPC_MOVL_GvEv;
1894
1895 if (have_movbe && real_bswap) {
1896 bswap = 0;
1897 movop = OPC_MOVBE_GyMy;
1898 }
1899
1900 switch (memop & MO_SSIZE) {
1901 case MO_UB:
1902 tcg_out_modrm_sib_offset(s, OPC_MOVZBL + seg, datalo,
1903 base, index, 0, ofs);
1904 break;
1905 case MO_SB:
1906 tcg_out_modrm_sib_offset(s, OPC_MOVSBL + P_REXW + seg, datalo,
1907 base, index, 0, ofs);
1908 break;
1909 case MO_UW:
1910 tcg_out_modrm_sib_offset(s, OPC_MOVZWL + seg, datalo,
1911 base, index, 0, ofs);
1912 if (real_bswap) {
1913 tcg_out_rolw_8(s, datalo);
1914 }
1915 break;
1916 case MO_SW:
1917 if (real_bswap) {
1918 if (have_movbe) {
1919 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
1920 datalo, base, index, 0, ofs);
1921 } else {
1922 tcg_out_modrm_sib_offset(s, OPC_MOVZWL + seg, datalo,
1923 base, index, 0, ofs);
1924 tcg_out_rolw_8(s, datalo);
1925 }
1926 tcg_out_modrm(s, OPC_MOVSWL + P_REXW, datalo, datalo);
1927 } else {
1928 tcg_out_modrm_sib_offset(s, OPC_MOVSWL + P_REXW + seg,
1929 datalo, base, index, 0, ofs);
1930 }
1931 break;
1932 case MO_UL:
1933 tcg_out_modrm_sib_offset(s, movop + seg, datalo, base, index, 0, ofs);
1934 if (bswap) {
1935 tcg_out_bswap32(s, datalo);
1936 }
1937 break;
1938 #if TCG_TARGET_REG_BITS == 64
1939 case MO_SL:
1940 if (real_bswap) {
1941 tcg_out_modrm_sib_offset(s, movop + seg, datalo,
1942 base, index, 0, ofs);
1943 if (bswap) {
1944 tcg_out_bswap32(s, datalo);
1945 }
1946 tcg_out_ext32s(s, datalo, datalo);
1947 } else {
1948 tcg_out_modrm_sib_offset(s, OPC_MOVSLQ + seg, datalo,
1949 base, index, 0, ofs);
1950 }
1951 break;
1952 #endif
1953 case MO_Q:
1954 if (TCG_TARGET_REG_BITS == 64) {
1955 tcg_out_modrm_sib_offset(s, movop + P_REXW + seg, datalo,
1956 base, index, 0, ofs);
1957 if (bswap) {
1958 tcg_out_bswap64(s, datalo);
1959 }
1960 } else {
1961 if (real_bswap) {
1962 int t = datalo;
1963 datalo = datahi;
1964 datahi = t;
1965 }
1966 if (base != datalo) {
1967 tcg_out_modrm_sib_offset(s, movop + seg, datalo,
1968 base, index, 0, ofs);
1969 tcg_out_modrm_sib_offset(s, movop + seg, datahi,
1970 base, index, 0, ofs + 4);
1971 } else {
1972 tcg_out_modrm_sib_offset(s, movop + seg, datahi,
1973 base, index, 0, ofs + 4);
1974 tcg_out_modrm_sib_offset(s, movop + seg, datalo,
1975 base, index, 0, ofs);
1976 }
1977 if (bswap) {
1978 tcg_out_bswap32(s, datalo);
1979 tcg_out_bswap32(s, datahi);
1980 }
1981 }
1982 break;
1983 default:
1984 tcg_abort();
1985 }
1986 }
1987
1988 /* XXX: qemu_ld and qemu_st could be modified to clobber only EDX and
1989    EAX.  It will be useful once fixed-register globals are less
1990 common. */
1991 static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64)
1992 {
1993 TCGReg datalo, datahi, addrlo;
1994 TCGReg addrhi __attribute__((unused));
1995 TCGMemOpIdx oi;
1996 TCGMemOp opc;
1997 #if defined(CONFIG_SOFTMMU)
1998 int mem_index;
1999 tcg_insn_unit *label_ptr[2];
2000 #endif
2001
2002 datalo = *args++;
2003 datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
2004 addrlo = *args++;
2005 addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
2006 oi = *args++;
2007 opc = get_memop(oi);
2008
2009 #if defined(CONFIG_SOFTMMU)
2010 mem_index = get_mmuidx(oi);
2011
2012 tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
2013 label_ptr, offsetof(CPUTLBEntry, addr_read));
2014
2015 /* TLB Hit. */
2016 tcg_out_qemu_ld_direct(s, datalo, datahi, TCG_REG_L1, -1, 0, 0, opc);
2017
2018 /* Record the current context of a load into ldst label */
2019 add_qemu_ldst_label(s, true, oi, datalo, datahi, addrlo, addrhi,
2020 s->code_ptr, label_ptr);
2021 #else
2022 {
2023 int32_t offset = guest_base;
2024 TCGReg base = addrlo;
2025 int index = -1;
2026 int seg = 0;
2027
2028         /* For a 32-bit guest, the high 32 bits of the address may contain
2029            garbage.  We can let the ADDR32 prefix truncate the address if
2030            we're not using a guest base, or when using segmentation.
2031            Otherwise we need to zero-extend the address manually. */
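        /* P_ADDR32 makes the emitter prefix the access with the 0x67
           address-size override, so the effective address is computed in
           32 bits and the garbage in the high half is never used;
           e.g. "movl %gs:(%edi), %eax" instead of "movl %gs:(%rdi), %eax". */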
2032 if (guest_base == 0 || guest_base_flags) {
2033 seg = guest_base_flags;
2034 offset = 0;
2035 if (TCG_TARGET_REG_BITS > TARGET_LONG_BITS) {
2036 seg |= P_ADDR32;
2037 }
2038 } else if (TCG_TARGET_REG_BITS == 64) {
2039 if (TARGET_LONG_BITS == 32) {
2040 tcg_out_ext32u(s, TCG_REG_L0, base);
2041 base = TCG_REG_L0;
2042 }
2043 if (offset != guest_base) {
2044 tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_L1, guest_base);
2045 index = TCG_REG_L1;
2046 offset = 0;
2047 }
2048 }
2049
2050 tcg_out_qemu_ld_direct(s, datalo, datahi,
2051 base, index, offset, seg, opc);
2052 }
2053 #endif
2054 }
2055
2056 static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
2057 TCGReg base, intptr_t ofs, int seg,
2058 TCGMemOp memop)
2059 {
2060 /* ??? Ideally we wouldn't need a scratch register. For user-only,
2061 we could perform the bswap twice to restore the original value
2062 instead of moving to the scratch. But as it is, the L constraint
2063 means that TCG_REG_L0 is definitely free here. */
2064 const TCGReg scratch = TCG_REG_L0;
2065 const TCGMemOp real_bswap = memop & MO_BSWAP;
2066 TCGMemOp bswap = real_bswap;
2067 int movop = OPC_MOVL_EvGv;
2068
2069 if (have_movbe && real_bswap) {
2070 bswap = 0;
2071 movop = OPC_MOVBE_MyGy;
2072 }
2073
2074 switch (memop & MO_SIZE) {
2075 case MO_8:
2076 /* In 32-bit mode, 8-bit stores can only happen from [abcd]x.
2077 Use the scratch register if necessary. */
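        /* Without a REX prefix only %al/%cl/%dl/%bl have byte encodings,
           so e.g. a byte store from %esi must first be copied to the
           scratch register, which does have a byte form. */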
2078 if (TCG_TARGET_REG_BITS == 32 && datalo >= 4) {
2079 tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
2080 datalo = scratch;
2081 }
2082 tcg_out_modrm_offset(s, OPC_MOVB_EvGv + P_REXB_R + seg,
2083 datalo, base, ofs);
2084 break;
2085 case MO_16:
2086 if (bswap) {
2087 tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
2088 tcg_out_rolw_8(s, scratch);
2089 datalo = scratch;
2090 }
2091 tcg_out_modrm_offset(s, movop + P_DATA16 + seg, datalo, base, ofs);
2092 break;
2093 case MO_32:
2094 if (bswap) {
2095 tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
2096 tcg_out_bswap32(s, scratch);
2097 datalo = scratch;
2098 }
2099 tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
2100 break;
2101 case MO_64:
2102 if (TCG_TARGET_REG_BITS == 64) {
2103 if (bswap) {
2104 tcg_out_mov(s, TCG_TYPE_I64, scratch, datalo);
2105 tcg_out_bswap64(s, scratch);
2106 datalo = scratch;
2107 }
2108 tcg_out_modrm_offset(s, movop + P_REXW + seg, datalo, base, ofs);
2109 } else if (bswap) {
2110 tcg_out_mov(s, TCG_TYPE_I32, scratch, datahi);
2111 tcg_out_bswap32(s, scratch);
2112 tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, scratch, base, ofs);
2113 tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
2114 tcg_out_bswap32(s, scratch);
2115 tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, scratch, base, ofs+4);
2116 } else {
2117 if (real_bswap) {
2118 int t = datalo;
2119 datalo = datahi;
2120 datahi = t;
2121 }
2122 tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
2123 tcg_out_modrm_offset(s, movop + seg, datahi, base, ofs+4);
2124 }
2125 break;
2126 default:
2127 tcg_abort();
2128 }
2129 }
2130
2131 static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
2132 {
2133 TCGReg datalo, datahi, addrlo;
2134 TCGReg addrhi __attribute__((unused));
2135 TCGMemOpIdx oi;
2136 TCGMemOp opc;
2137 #if defined(CONFIG_SOFTMMU)
2138 int mem_index;
2139 tcg_insn_unit *label_ptr[2];
2140 #endif
2141
2142 datalo = *args++;
2143 datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
2144 addrlo = *args++;
2145 addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
2146 oi = *args++;
2147 opc = get_memop(oi);
2148
2149 #if defined(CONFIG_SOFTMMU)
2150 mem_index = get_mmuidx(oi);
2151
2152 tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
2153 label_ptr, offsetof(CPUTLBEntry, addr_write));
2154
2155 /* TLB Hit. */
2156 tcg_out_qemu_st_direct(s, datalo, datahi, TCG_REG_L1, 0, 0, opc);
2157
2158 /* Record the current context of a store into ldst label */
2159 add_qemu_ldst_label(s, false, oi, datalo, datahi, addrlo, addrhi,
2160 s->code_ptr, label_ptr);
2161 #else
2162 {
2163 int32_t offset = guest_base;
2164 TCGReg base = addrlo;
2165 int seg = 0;
2166
2167 /* See comment in tcg_out_qemu_ld re zero-extension of addrlo. */
2168 if (guest_base == 0 || guest_base_flags) {
2169 seg = guest_base_flags;
2170 offset = 0;
2171 if (TCG_TARGET_REG_BITS > TARGET_LONG_BITS) {
2172 seg |= P_ADDR32;
2173 }
2174 } else if (TCG_TARGET_REG_BITS == 64) {
2175 /* ??? Note that we can't use the same SIB addressing scheme
2176 as for loads, since we require L0 free for bswap. */
2177 if (offset != guest_base) {
2178 if (TARGET_LONG_BITS == 32) {
2179 tcg_out_ext32u(s, TCG_REG_L0, base);
2180 base = TCG_REG_L0;
2181 }
2182 tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_L1, guest_base);
2183 tgen_arithr(s, ARITH_ADD + P_REXW, TCG_REG_L1, base);
2184 base = TCG_REG_L1;
2185 offset = 0;
2186 } else if (TARGET_LONG_BITS == 32) {
2187 tcg_out_ext32u(s, TCG_REG_L1, base);
2188 base = TCG_REG_L1;
2189 }
2190 }
2191
2192 tcg_out_qemu_st_direct(s, datalo, datahi, base, offset, seg, opc);
2193 }
2194 #endif
2195 }
2196
2197 static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
2198 const TCGArg *args, const int *const_args)
2199 {
2200 TCGArg a0, a1, a2;
2201 int c, const_a2, vexop, rexw = 0;
2202
2203 #if TCG_TARGET_REG_BITS == 64
2204 # define OP_32_64(x) \
2205 case glue(glue(INDEX_op_, x), _i64): \
2206 rexw = P_REXW; /* FALLTHRU */ \
2207 case glue(glue(INDEX_op_, x), _i32)
2208 #else
2209 # define OP_32_64(x) \
2210 case glue(glue(INDEX_op_, x), _i32)
2211 #endif
2212
2213 /* Hoist the loads of the most common arguments. */
2214 a0 = args[0];
2215 a1 = args[1];
2216 a2 = args[2];
2217 const_a2 = const_args[2];
2218
2219 switch (opc) {
2220 case INDEX_op_exit_tb:
2221 /* Reuse the zeroing that exists for goto_ptr. */
2222 if (a0 == 0) {
2223 tcg_out_jmp(s, s->code_gen_epilogue);
2224 } else {
2225 tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0);
2226 tcg_out_jmp(s, tb_ret_addr);
2227 }
2228 break;
2229 case INDEX_op_goto_tb:
2230 if (s->tb_jmp_insn_offset) {
2231 /* direct jump method */
2232 int gap;
2233 /* jump displacement must be aligned for atomic patching;
2234 * see if we need to add extra nops before jump
2235 */
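            /* OPC_JMP_long is one opcode byte followed by a 32-bit
               displacement; aligning code_ptr + 1 to 4 guarantees that the
               displacement itself is 4-byte aligned, so it can later be
               patched with a single aligned 32-bit store. */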
2236 gap = tcg_pcrel_diff(s, QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4));
2237 if (gap != 1) {
2238 tcg_out_nopn(s, gap - 1);
2239 }
2240             tcg_out8(s, OPC_JMP_long); /* jmp imm32 */
2241 s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s);
2242 tcg_out32(s, 0);
2243 } else {
2244 /* indirect jump method */
2245 tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, -1,
2246 (intptr_t)(s->tb_jmp_target_addr + a0));
2247 }
2248 s->tb_jmp_reset_offset[a0] = tcg_current_code_size(s);
2249 break;
2250 case INDEX_op_goto_ptr:
2251 /* jmp to the given host address (could be epilogue) */
2252 tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, a0);
2253 break;
2254 case INDEX_op_br:
2255 tcg_out_jxx(s, JCC_JMP, arg_label(a0), 0);
2256 break;
2257 OP_32_64(ld8u):
2258 /* Note that we can ignore REXW for the zero-extend to 64-bit. */
2259 tcg_out_modrm_offset(s, OPC_MOVZBL, a0, a1, a2);
2260 break;
2261 OP_32_64(ld8s):
2262 tcg_out_modrm_offset(s, OPC_MOVSBL + rexw, a0, a1, a2);
2263 break;
2264 OP_32_64(ld16u):
2265 /* Note that we can ignore REXW for the zero-extend to 64-bit. */
2266 tcg_out_modrm_offset(s, OPC_MOVZWL, a0, a1, a2);
2267 break;
2268 OP_32_64(ld16s):
2269 tcg_out_modrm_offset(s, OPC_MOVSWL + rexw, a0, a1, a2);
2270 break;
2271 #if TCG_TARGET_REG_BITS == 64
2272 case INDEX_op_ld32u_i64:
2273 #endif
2274 case INDEX_op_ld_i32:
2275 tcg_out_ld(s, TCG_TYPE_I32, a0, a1, a2);
2276 break;
2277
2278 OP_32_64(st8):
2279 if (const_args[0]) {
2280 tcg_out_modrm_offset(s, OPC_MOVB_EvIz, 0, a1, a2);
2281 tcg_out8(s, a0);
2282 } else {
2283 tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R, a0, a1, a2);
2284 }
2285 break;
2286 OP_32_64(st16):
2287 if (const_args[0]) {
2288 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_DATA16, 0, a1, a2);
2289 tcg_out16(s, a0);
2290 } else {
2291 tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16, a0, a1, a2);
2292 }
2293 break;
2294 #if TCG_TARGET_REG_BITS == 64
2295 case INDEX_op_st32_i64:
2296 #endif
2297 case INDEX_op_st_i32:
2298 if (const_args[0]) {
2299 tcg_out_modrm_offset(s, OPC_MOVL_EvIz, 0, a1, a2);
2300 tcg_out32(s, a0);
2301 } else {
2302 tcg_out_st(s, TCG_TYPE_I32, a0, a1, a2);
2303 }
2304 break;
2305
2306 OP_32_64(add):
2307 /* For 3-operand addition, use LEA. */
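        /* E.g. add_i32 d, s, $imm becomes "leal imm(s), d" and
           add_i32 d, s1, s2 becomes "leal (s1,s2), d", neither of
           which requires d to match an input operand. */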
2308 if (a0 != a1) {
2309 TCGArg c3 = 0;
2310 if (const_a2) {
2311 c3 = a2, a2 = -1;
2312 } else if (a0 == a2) {
2313 /* Watch out for dest = src + dest, since we've removed
2314 the matching constraint on the add. */
2315 tgen_arithr(s, ARITH_ADD + rexw, a0, a1);
2316 break;
2317 }
2318
2319 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a2, 0, c3);
2320 break;
2321 }
2322 c = ARITH_ADD;
2323 goto gen_arith;
2324 OP_32_64(sub):
2325 c = ARITH_SUB;
2326 goto gen_arith;
2327 OP_32_64(and):
2328 c = ARITH_AND;
2329 goto gen_arith;
2330 OP_32_64(or):
2331 c = ARITH_OR;
2332 goto gen_arith;
2333 OP_32_64(xor):
2334 c = ARITH_XOR;
2335 goto gen_arith;
2336 gen_arith:
2337 if (const_a2) {
2338 tgen_arithi(s, c + rexw, a0, a2, 0);
2339 } else {
2340 tgen_arithr(s, c + rexw, a0, a2);
2341 }
2342 break;
2343
2344 OP_32_64(andc):
2345 if (const_a2) {
2346 tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2347 tgen_arithi(s, ARITH_AND + rexw, a0, ~a2, 0);
2348 } else {
2349 tcg_out_vex_modrm(s, OPC_ANDN + rexw, a0, a2, a1);
2350 }
2351 break;
2352
2353 OP_32_64(mul):
2354 if (const_a2) {
2355 int32_t val;
2356 val = a2;
2357 if (val == (int8_t)val) {
2358 tcg_out_modrm(s, OPC_IMUL_GvEvIb + rexw, a0, a0);
2359 tcg_out8(s, val);
2360 } else {
2361 tcg_out_modrm(s, OPC_IMUL_GvEvIz + rexw, a0, a0);
2362 tcg_out32(s, val);
2363 }
2364 } else {
2365 tcg_out_modrm(s, OPC_IMUL_GvEv + rexw, a0, a2);
2366 }
2367 break;
2368
2369 OP_32_64(div2):
2370 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IDIV, args[4]);
2371 break;
2372 OP_32_64(divu2):
2373 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_DIV, args[4]);
2374 break;
2375
2376 OP_32_64(shl):
2377 /* For small constant 3-operand shift, use LEA. */
2378 if (const_a2 && a0 != a1 && (a2 - 1) < 3) {
2379 if (a2 - 1 == 0) {
2380 /* shl $1,a1,a0 -> lea (a1,a1),a0 */
2381 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a1, 0, 0);
2382 } else {
2383 /* shl $n,a1,a0 -> lea 0(,a1,n),a0 */
2384 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, -1, a1, a2, 0);
2385 }
2386 break;
2387 }
2388 c = SHIFT_SHL;
2389 vexop = OPC_SHLX;
2390 goto gen_shift_maybe_vex;
2391 OP_32_64(shr):
2392 c = SHIFT_SHR;
2393 vexop = OPC_SHRX;
2394 goto gen_shift_maybe_vex;
2395 OP_32_64(sar):
2396 c = SHIFT_SAR;
2397 vexop = OPC_SARX;
2398 goto gen_shift_maybe_vex;
2399 OP_32_64(rotl):
2400 c = SHIFT_ROL;
2401 goto gen_shift;
2402 OP_32_64(rotr):
2403 c = SHIFT_ROR;
2404 goto gen_shift;
2405 gen_shift_maybe_vex:
2406 if (have_bmi2) {
2407 if (!const_a2) {
2408 tcg_out_vex_modrm(s, vexop + rexw, a0, a2, a1);
2409 break;
2410 }
2411 tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2412 }
2413 /* FALLTHRU */
2414 gen_shift:
2415 if (const_a2) {
2416 tcg_out_shifti(s, c + rexw, a0, a2);
2417 } else {
2418 tcg_out_modrm(s, OPC_SHIFT_cl + rexw, c, a0);
2419 }
2420 break;
2421
2422 OP_32_64(ctz):
2423 tcg_out_ctz(s, rexw, args[0], args[1], args[2], const_args[2]);
2424 break;
2425 OP_32_64(clz):
2426 tcg_out_clz(s, rexw, args[0], args[1], args[2], const_args[2]);
2427 break;
2428 OP_32_64(ctpop):
2429 tcg_out_modrm(s, OPC_POPCNT + rexw, a0, a1);
2430 break;
2431
2432 case INDEX_op_brcond_i32:
2433 tcg_out_brcond32(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
2434 break;
2435 case INDEX_op_setcond_i32:
2436 tcg_out_setcond32(s, args[3], a0, a1, a2, const_a2);
2437 break;
2438 case INDEX_op_movcond_i32:
2439 tcg_out_movcond32(s, args[5], a0, a1, a2, const_a2, args[3]);
2440 break;
2441
2442 OP_32_64(bswap16):
2443 tcg_out_rolw_8(s, a0);
2444 break;
2445 OP_32_64(bswap32):
2446 tcg_out_bswap32(s, a0);
2447 break;
2448
2449 OP_32_64(neg):
2450 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, a0);
2451 break;
2452 OP_32_64(not):
2453 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, a0);
2454 break;
2455
2456 OP_32_64(ext8s):
2457 tcg_out_ext8s(s, a0, a1, rexw);
2458 break;
2459 OP_32_64(ext16s):
2460 tcg_out_ext16s(s, a0, a1, rexw);
2461 break;
2462 OP_32_64(ext8u):
2463 tcg_out_ext8u(s, a0, a1);
2464 break;
2465 OP_32_64(ext16u):
2466 tcg_out_ext16u(s, a0, a1);
2467 break;
2468
2469 case INDEX_op_qemu_ld_i32:
2470 tcg_out_qemu_ld(s, args, 0);
2471 break;
2472 case INDEX_op_qemu_ld_i64:
2473 tcg_out_qemu_ld(s, args, 1);
2474 break;
2475 case INDEX_op_qemu_st_i32:
2476 tcg_out_qemu_st(s, args, 0);
2477 break;
2478 case INDEX_op_qemu_st_i64:
2479 tcg_out_qemu_st(s, args, 1);
2480 break;
2481
2482 OP_32_64(mulu2):
2483 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, args[3]);
2484 break;
2485 OP_32_64(muls2):
2486 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IMUL, args[3]);
2487 break;
2488 OP_32_64(add2):
2489 if (const_args[4]) {
2490 tgen_arithi(s, ARITH_ADD + rexw, a0, args[4], 1);
2491 } else {
2492 tgen_arithr(s, ARITH_ADD + rexw, a0, args[4]);
2493 }
2494 if (const_args[5]) {
2495 tgen_arithi(s, ARITH_ADC + rexw, a1, args[5], 1);
2496 } else {
2497 tgen_arithr(s, ARITH_ADC + rexw, a1, args[5]);
2498 }
2499 break;
2500 OP_32_64(sub2):
2501 if (const_args[4]) {
2502 tgen_arithi(s, ARITH_SUB + rexw, a0, args[4], 1);
2503 } else {
2504 tgen_arithr(s, ARITH_SUB + rexw, a0, args[4]);
2505 }
2506 if (const_args[5]) {
2507 tgen_arithi(s, ARITH_SBB + rexw, a1, args[5], 1);
2508 } else {
2509 tgen_arithr(s, ARITH_SBB + rexw, a1, args[5]);
2510 }
2511 break;
2512
2513 #if TCG_TARGET_REG_BITS == 32
2514 case INDEX_op_brcond2_i32:
2515 tcg_out_brcond2(s, args, const_args, 0);
2516 break;
2517 case INDEX_op_setcond2_i32:
2518 tcg_out_setcond2(s, args, const_args);
2519 break;
2520 #else /* TCG_TARGET_REG_BITS == 64 */
2521 case INDEX_op_ld32s_i64:
2522 tcg_out_modrm_offset(s, OPC_MOVSLQ, a0, a1, a2);
2523 break;
2524 case INDEX_op_ld_i64:
2525 tcg_out_ld(s, TCG_TYPE_I64, a0, a1, a2);
2526 break;
2527 case INDEX_op_st_i64:
2528 if (const_args[0]) {
2529 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_REXW, 0, a1, a2);
2530 tcg_out32(s, a0);
2531 } else {
2532 tcg_out_st(s, TCG_TYPE_I64, a0, a1, a2);
2533 }
2534 break;
2535
2536 case INDEX_op_brcond_i64:
2537 tcg_out_brcond64(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
2538 break;
2539 case INDEX_op_setcond_i64:
2540 tcg_out_setcond64(s, args[3], a0, a1, a2, const_a2);
2541 break;
2542 case INDEX_op_movcond_i64:
2543 tcg_out_movcond64(s, args[5], a0, a1, a2, const_a2, args[3]);
2544 break;
2545
2546 case INDEX_op_bswap64_i64:
2547 tcg_out_bswap64(s, a0);
2548 break;
2549 case INDEX_op_extu_i32_i64:
2550 case INDEX_op_ext32u_i64:
2551 tcg_out_ext32u(s, a0, a1);
2552 break;
2553 case INDEX_op_ext_i32_i64:
2554 case INDEX_op_ext32s_i64:
2555 tcg_out_ext32s(s, a0, a1);
2556 break;
2557 #endif
2558
2559 OP_32_64(deposit):
2560 if (args[3] == 0 && args[4] == 8) {
2561             /* deposit into bits 0..7 */
2562 tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0);
2563 } else if (args[3] == 8 && args[4] == 8) {
2564             /* deposit into bits 8..15 */
2565 tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4);
2566 } else if (args[3] == 0 && args[4] == 16) {
2567             /* deposit into bits 0..15 */
2568 tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0);
2569 } else {
2570 tcg_abort();
2571 }
2572 break;
2573
2574 case INDEX_op_extract_i64:
2575 if (a2 + args[3] == 32) {
2576 /* This is a 32-bit zero-extending right shift. */
2577 tcg_out_mov(s, TCG_TYPE_I32, a0, a1);
2578 tcg_out_shifti(s, SHIFT_SHR, a0, a2);
2579 break;
2580 }
2581 /* FALLTHRU */
2582 case INDEX_op_extract_i32:
2583         /* On the off-chance that we can use the high-byte registers, do so.
2584            Otherwise we emit the same ext16 + shift pattern that we
2585            would have gotten from the normal tcg-op.c expansion. */
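        /* E.g. with a1 in %eax, bits 8..15 are extracted with a single
           "movzbl %ah, a0"; the a1 + 4 below selects the matching
           high-byte register (%ah/%ch/%dh/%bh). */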
2586 tcg_debug_assert(a2 == 8 && args[3] == 8);
2587 if (a1 < 4 && a0 < 8) {
2588 tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4);
2589 } else {
2590 tcg_out_ext16u(s, a0, a1);
2591 tcg_out_shifti(s, SHIFT_SHR, a0, 8);
2592 }
2593 break;
2594
2595 case INDEX_op_sextract_i32:
2596 /* We don't implement sextract_i64, as we cannot sign-extend to
2597 64-bits without using the REX prefix that explicitly excludes
2598 access to the high-byte registers. */
2599 tcg_debug_assert(a2 == 8 && args[3] == 8);
2600 if (a1 < 4 && a0 < 8) {
2601 tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4);
2602 } else {
2603 tcg_out_ext16s(s, a0, a1, 0);
2604 tcg_out_shifti(s, SHIFT_SAR, a0, 8);
2605 }
2606 break;
2607
2608 case INDEX_op_mb:
2609 tcg_out_mb(s, a0);
2610 break;
2611 case INDEX_op_mov_i32: /* Always emitted via tcg_out_mov. */
2612 case INDEX_op_mov_i64:
2613 case INDEX_op_mov_vec:
2614 case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi. */
2615 case INDEX_op_movi_i64:
2616 case INDEX_op_dupi_vec:
2617 case INDEX_op_call: /* Always emitted via tcg_out_call. */
2618 default:
2619 tcg_abort();
2620 }
2621
2622 #undef OP_32_64
2623 }
2624
2625 static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
2626 unsigned vecl, unsigned vece,
2627 const TCGArg *args, const int *const_args)
2628 {
2629 static int const add_insn[4] = {
2630 OPC_PADDB, OPC_PADDW, OPC_PADDD, OPC_PADDQ
2631 };
2632 static int const sub_insn[4] = {
2633 OPC_PSUBB, OPC_PSUBW, OPC_PSUBD, OPC_PSUBQ
2634 };
2635 static int const mul_insn[4] = {
2636 OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_UD2
2637 };
2638 static int const shift_imm_insn[4] = {
2639 OPC_UD2, OPC_PSHIFTW_Ib, OPC_PSHIFTD_Ib, OPC_PSHIFTQ_Ib
2640 };
2641 static int const cmpeq_insn[4] = {
2642 OPC_PCMPEQB, OPC_PCMPEQW, OPC_PCMPEQD, OPC_PCMPEQQ
2643 };
2644 static int const cmpgt_insn[4] = {
2645 OPC_PCMPGTB, OPC_PCMPGTW, OPC_PCMPGTD, OPC_PCMPGTQ
2646 };
2647 static int const punpckl_insn[4] = {
2648 OPC_PUNPCKLBW, OPC_PUNPCKLWD, OPC_PUNPCKLDQ, OPC_PUNPCKLQDQ
2649 };
2650 static int const punpckh_insn[4] = {
2651 OPC_PUNPCKHBW, OPC_PUNPCKHWD, OPC_PUNPCKHDQ, OPC_PUNPCKHQDQ
2652 };
2653 static int const packss_insn[4] = {
2654 OPC_PACKSSWB, OPC_PACKSSDW, OPC_UD2, OPC_UD2
2655 };
2656 static int const packus_insn[4] = {
2657 OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2
2658 };
2659
2660 TCGType type = vecl + TCG_TYPE_V64;
2661 int insn, sub;
2662 TCGArg a0, a1, a2;
2663
2664 a0 = args[0];
2665 a1 = args[1];
2666 a2 = args[2];
2667
2668 switch (opc) {
2669 case INDEX_op_add_vec:
2670 insn = add_insn[vece];
2671 goto gen_simd;
2672 case INDEX_op_sub_vec:
2673 insn = sub_insn[vece];
2674 goto gen_simd;
2675 case INDEX_op_mul_vec:
2676 insn = mul_insn[vece];
2677 goto gen_simd;
2678 case INDEX_op_and_vec:
2679 insn = OPC_PAND;
2680 goto gen_simd;
2681 case INDEX_op_or_vec:
2682 insn = OPC_POR;
2683 goto gen_simd;
2684 case INDEX_op_xor_vec:
2685 insn = OPC_PXOR;
2686 goto gen_simd;
2687 case INDEX_op_x86_punpckl_vec:
2688 insn = punpckl_insn[vece];
2689 goto gen_simd;
2690 case INDEX_op_x86_punpckh_vec:
2691 insn = punpckh_insn[vece];
2692 goto gen_simd;
2693 case INDEX_op_x86_packss_vec:
2694 insn = packss_insn[vece];
2695 goto gen_simd;
2696 case INDEX_op_x86_packus_vec:
2697 insn = packus_insn[vece];
2698 goto gen_simd;
2699 #if TCG_TARGET_REG_BITS == 32
2700 case INDEX_op_dup2_vec:
2701 /* Constraints have already placed both 32-bit inputs in xmm regs. */
2702 insn = OPC_PUNPCKLDQ;
2703 goto gen_simd;
2704 #endif
2705 gen_simd:
2706 tcg_debug_assert(insn != OPC_UD2);
2707 if (type == TCG_TYPE_V256) {
2708 insn |= P_VEXL;
2709 }
2710 tcg_out_vex_modrm(s, insn, a0, a1, a2);
2711 break;
2712
2713 case INDEX_op_cmp_vec:
2714 sub = args[3];
2715 if (sub == TCG_COND_EQ) {
2716 insn = cmpeq_insn[vece];
2717 } else if (sub == TCG_COND_GT) {
2718 insn = cmpgt_insn[vece];
2719 } else {
2720 g_assert_not_reached();
2721 }
2722 goto gen_simd;
2723
2724 case INDEX_op_andc_vec:
2725 insn = OPC_PANDN;
2726 if (type == TCG_TYPE_V256) {
2727 insn |= P_VEXL;
2728 }
2729 tcg_out_vex_modrm(s, insn, a0, a2, a1);
2730 break;
2731
2732 case INDEX_op_shli_vec:
2733 sub = 6;
2734 goto gen_shift;
2735 case INDEX_op_shri_vec:
2736 sub = 2;
2737 goto gen_shift;
2738 case INDEX_op_sari_vec:
2739 tcg_debug_assert(vece != MO_64);
2740 sub = 4;
2741 gen_shift:
2742 tcg_debug_assert(vece != MO_8);
2743 insn = shift_imm_insn[vece];
2744 if (type == TCG_TYPE_V256) {
2745 insn |= P_VEXL;
2746 }
2747 tcg_out_vex_modrm(s, insn, sub, a0, a1);
2748 tcg_out8(s, a2);
2749 break;
2750
2751 case INDEX_op_ld_vec:
2752 tcg_out_ld(s, type, a0, a1, a2);
2753 break;
2754 case INDEX_op_st_vec:
2755 tcg_out_st(s, type, a0, a1, a2);
2756 break;
2757 case INDEX_op_dup_vec:
2758 tcg_out_dup_vec(s, type, vece, a0, a1);
2759 break;
2760
2761 case INDEX_op_x86_shufps_vec:
2762 insn = OPC_SHUFPS;
2763 sub = args[3];
2764 goto gen_simd_imm8;
2765 case INDEX_op_x86_blend_vec:
2766 if (vece == MO_16) {
2767 insn = OPC_PBLENDW;
2768 } else if (vece == MO_32) {
2769 insn = (have_avx2 ? OPC_VPBLENDD : OPC_BLENDPS);
2770 } else {
2771 g_assert_not_reached();
2772 }
2773 sub = args[3];
2774 goto gen_simd_imm8;
2775 case INDEX_op_x86_vperm2i128_vec:
2776 insn = OPC_VPERM2I128;
2777 sub = args[3];
2778 goto gen_simd_imm8;
2779 gen_simd_imm8:
2780 if (type == TCG_TYPE_V256) {
2781 insn |= P_VEXL;
2782 }
2783 tcg_out_vex_modrm(s, insn, a0, a1, a2);
2784 tcg_out8(s, sub);
2785 break;
2786
2787 case INDEX_op_x86_vpblendvb_vec:
2788 insn = OPC_VPBLENDVB;
2789 if (type == TCG_TYPE_V256) {
2790 insn |= P_VEXL;
2791 }
2792 tcg_out_vex_modrm(s, insn, a0, a1, a2);
2793 tcg_out8(s, args[3] << 4);
2794 break;
2795
2796 case INDEX_op_x86_psrldq_vec:
2797 tcg_out_vex_modrm(s, OPC_GRP14, 3, a0, a1);
2798 tcg_out8(s, a2);
2799 break;
2800
2801 default:
2802 g_assert_not_reached();
2803 }
2804 }
2805
2806 static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
2807 {
2808 static const TCGTargetOpDef r = { .args_ct_str = { "r" } };
2809 static const TCGTargetOpDef ri_r = { .args_ct_str = { "ri", "r" } };
2810 static const TCGTargetOpDef re_r = { .args_ct_str = { "re", "r" } };
2811 static const TCGTargetOpDef qi_r = { .args_ct_str = { "qi", "r" } };
2812 static const TCGTargetOpDef r_r = { .args_ct_str = { "r", "r" } };
2813 static const TCGTargetOpDef r_q = { .args_ct_str = { "r", "q" } };
2814 static const TCGTargetOpDef r_re = { .args_ct_str = { "r", "re" } };
2815 static const TCGTargetOpDef r_0 = { .args_ct_str = { "r", "0" } };
2816 static const TCGTargetOpDef r_r_ri = { .args_ct_str = { "r", "r", "ri" } };
2817 static const TCGTargetOpDef r_r_re = { .args_ct_str = { "r", "r", "re" } };
2818 static const TCGTargetOpDef r_0_re = { .args_ct_str = { "r", "0", "re" } };
2819 static const TCGTargetOpDef r_0_ci = { .args_ct_str = { "r", "0", "ci" } };
2820 static const TCGTargetOpDef r_L = { .args_ct_str = { "r", "L" } };
2821 static const TCGTargetOpDef L_L = { .args_ct_str = { "L", "L" } };
2822 static const TCGTargetOpDef r_L_L = { .args_ct_str = { "r", "L", "L" } };
2823 static const TCGTargetOpDef r_r_L = { .args_ct_str = { "r", "r", "L" } };
2824 static const TCGTargetOpDef L_L_L = { .args_ct_str = { "L", "L", "L" } };
2825 static const TCGTargetOpDef r_r_L_L
2826 = { .args_ct_str = { "r", "r", "L", "L" } };
2827 static const TCGTargetOpDef L_L_L_L
2828 = { .args_ct_str = { "L", "L", "L", "L" } };
2829 static const TCGTargetOpDef x_x = { .args_ct_str = { "x", "x" } };
2830 static const TCGTargetOpDef x_x_x = { .args_ct_str = { "x", "x", "x" } };
2831 static const TCGTargetOpDef x_x_x_x
2832 = { .args_ct_str = { "x", "x", "x", "x" } };
2833 static const TCGTargetOpDef x_r = { .args_ct_str = { "x", "r" } };
2834
2835 switch (op) {
2836 case INDEX_op_goto_ptr:
2837 return &r;
2838
2839 case INDEX_op_ld8u_i32:
2840 case INDEX_op_ld8u_i64:
2841 case INDEX_op_ld8s_i32:
2842 case INDEX_op_ld8s_i64:
2843 case INDEX_op_ld16u_i32:
2844 case INDEX_op_ld16u_i64:
2845 case INDEX_op_ld16s_i32:
2846 case INDEX_op_ld16s_i64:
2847 case INDEX_op_ld_i32:
2848 case INDEX_op_ld32u_i64:
2849 case INDEX_op_ld32s_i64:
2850 case INDEX_op_ld_i64:
2851 return &r_r;
2852
2853 case INDEX_op_st8_i32:
2854 case INDEX_op_st8_i64:
2855 return &qi_r;
2856 case INDEX_op_st16_i32:
2857 case INDEX_op_st16_i64:
2858 case INDEX_op_st_i32:
2859 case INDEX_op_st32_i64:
2860 return &ri_r;
2861 case INDEX_op_st_i64:
2862 return &re_r;
2863
2864 case INDEX_op_add_i32:
2865 case INDEX_op_add_i64:
2866 return &r_r_re;
2867 case INDEX_op_sub_i32:
2868 case INDEX_op_sub_i64:
2869 case INDEX_op_mul_i32:
2870 case INDEX_op_mul_i64:
2871 case INDEX_op_or_i32:
2872 case INDEX_op_or_i64:
2873 case INDEX_op_xor_i32:
2874 case INDEX_op_xor_i64:
2875 return &r_0_re;
2876
2877 case INDEX_op_and_i32:
2878 case INDEX_op_and_i64:
2879 {
2880 static const TCGTargetOpDef and
2881 = { .args_ct_str = { "r", "0", "reZ" } };
2882 return &and;
2883 }
2884 break;
2885 case INDEX_op_andc_i32:
2886 case INDEX_op_andc_i64:
2887 {
2888 static const TCGTargetOpDef andc
2889 = { .args_ct_str = { "r", "r", "rI" } };
2890 return &andc;
2891 }
2892 break;
2893
2894 case INDEX_op_shl_i32:
2895 case INDEX_op_shl_i64:
2896 case INDEX_op_shr_i32:
2897 case INDEX_op_shr_i64:
2898 case INDEX_op_sar_i32:
2899 case INDEX_op_sar_i64:
2900 return have_bmi2 ? &r_r_ri : &r_0_ci;
2901 case INDEX_op_rotl_i32:
2902 case INDEX_op_rotl_i64:
2903 case INDEX_op_rotr_i32:
2904 case INDEX_op_rotr_i64:
2905 return &r_0_ci;
2906
2907 case INDEX_op_brcond_i32:
2908 case INDEX_op_brcond_i64:
2909 return &r_re;
2910
2911 case INDEX_op_bswap16_i32:
2912 case INDEX_op_bswap16_i64:
2913 case INDEX_op_bswap32_i32:
2914 case INDEX_op_bswap32_i64:
2915 case INDEX_op_bswap64_i64:
2916 case INDEX_op_neg_i32:
2917 case INDEX_op_neg_i64:
2918 case INDEX_op_not_i32:
2919 case INDEX_op_not_i64:
2920 return &r_0;
2921
2922 case INDEX_op_ext8s_i32:
2923 case INDEX_op_ext8s_i64:
2924 case INDEX_op_ext8u_i32:
2925 case INDEX_op_ext8u_i64:
2926 return &r_q;
2927 case INDEX_op_ext16s_i32:
2928 case INDEX_op_ext16s_i64:
2929 case INDEX_op_ext16u_i32:
2930 case INDEX_op_ext16u_i64:
2931 case INDEX_op_ext32s_i64:
2932 case INDEX_op_ext32u_i64:
2933 case INDEX_op_ext_i32_i64:
2934 case INDEX_op_extu_i32_i64:
2935 case INDEX_op_extract_i32:
2936 case INDEX_op_extract_i64:
2937 case INDEX_op_sextract_i32:
2938 case INDEX_op_ctpop_i32:
2939 case INDEX_op_ctpop_i64:
2940 return &r_r;
2941
2942 case INDEX_op_deposit_i32:
2943 case INDEX_op_deposit_i64:
2944 {
2945 static const TCGTargetOpDef dep
2946 = { .args_ct_str = { "Q", "0", "Q" } };
2947 return &dep;
2948 }
2949 case INDEX_op_setcond_i32:
2950 case INDEX_op_setcond_i64:
2951 {
2952 static const TCGTargetOpDef setc
2953 = { .args_ct_str = { "q", "r", "re" } };
2954 return &setc;
2955 }
2956 case INDEX_op_movcond_i32:
2957 case INDEX_op_movcond_i64:
2958 {
2959 static const TCGTargetOpDef movc
2960 = { .args_ct_str = { "r", "r", "re", "r", "0" } };
2961 return &movc;
2962 }
2963 case INDEX_op_div2_i32:
2964 case INDEX_op_div2_i64:
2965 case INDEX_op_divu2_i32:
2966 case INDEX_op_divu2_i64:
2967 {
2968 static const TCGTargetOpDef div2
2969 = { .args_ct_str = { "a", "d", "0", "1", "r" } };
2970 return &div2;
2971 }
2972 case INDEX_op_mulu2_i32:
2973 case INDEX_op_mulu2_i64:
2974 case INDEX_op_muls2_i32:
2975 case INDEX_op_muls2_i64:
2976 {
2977 static const TCGTargetOpDef mul2
2978 = { .args_ct_str = { "a", "d", "a", "r" } };
2979 return &mul2;
2980 }
2981 case INDEX_op_add2_i32:
2982 case INDEX_op_add2_i64:
2983 case INDEX_op_sub2_i32:
2984 case INDEX_op_sub2_i64:
2985 {
2986 static const TCGTargetOpDef arith2
2987 = { .args_ct_str = { "r", "r", "0", "1", "re", "re" } };
2988 return &arith2;
2989 }
2990 case INDEX_op_ctz_i32:
2991 case INDEX_op_ctz_i64:
2992 {
2993 static const TCGTargetOpDef ctz[2] = {
2994 { .args_ct_str = { "&r", "r", "r" } },
2995 { .args_ct_str = { "&r", "r", "rW" } },
2996 };
2997 return &ctz[have_bmi1];
2998 }
2999 case INDEX_op_clz_i32:
3000 case INDEX_op_clz_i64:
3001 {
3002 static const TCGTargetOpDef clz[2] = {
3003 { .args_ct_str = { "&r", "r", "r" } },
3004 { .args_ct_str = { "&r", "r", "rW" } },
3005 };
3006 return &clz[have_lzcnt];
3007 }
3008
3009 case INDEX_op_qemu_ld_i32:
3010 return TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &r_L : &r_L_L;
3011 case INDEX_op_qemu_st_i32:
3012 return TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &L_L : &L_L_L;
3013 case INDEX_op_qemu_ld_i64:
3014 return (TCG_TARGET_REG_BITS == 64 ? &r_L
3015 : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &r_r_L
3016 : &r_r_L_L);
3017 case INDEX_op_qemu_st_i64:
3018 return (TCG_TARGET_REG_BITS == 64 ? &L_L
3019 : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &L_L_L
3020 : &L_L_L_L);
3021
3022 case INDEX_op_brcond2_i32:
3023 {
3024 static const TCGTargetOpDef b2
3025 = { .args_ct_str = { "r", "r", "ri", "ri" } };
3026 return &b2;
3027 }
3028 case INDEX_op_setcond2_i32:
3029 {
3030 static const TCGTargetOpDef s2
3031 = { .args_ct_str = { "r", "r", "r", "ri", "ri" } };
3032 return &s2;
3033 }
3034
3035 case INDEX_op_ld_vec:
3036 case INDEX_op_st_vec:
3037 return &x_r;
3038
3039 case INDEX_op_add_vec:
3040 case INDEX_op_sub_vec:
3041 case INDEX_op_mul_vec:
3042 case INDEX_op_and_vec:
3043 case INDEX_op_or_vec:
3044 case INDEX_op_xor_vec:
3045 case INDEX_op_andc_vec:
3046 case INDEX_op_cmp_vec:
3047 case INDEX_op_x86_shufps_vec:
3048 case INDEX_op_x86_blend_vec:
3049 case INDEX_op_x86_packss_vec:
3050 case INDEX_op_x86_packus_vec:
3051 case INDEX_op_x86_vperm2i128_vec:
3052 case INDEX_op_x86_punpckl_vec:
3053 case INDEX_op_x86_punpckh_vec:
3054 #if TCG_TARGET_REG_BITS == 32
3055 case INDEX_op_dup2_vec:
3056 #endif
3057 return &x_x_x;
3058 case INDEX_op_dup_vec:
3059 case INDEX_op_shli_vec:
3060 case INDEX_op_shri_vec:
3061 case INDEX_op_sari_vec:
3062 case INDEX_op_x86_psrldq_vec:
3063 return &x_x;
3064 case INDEX_op_x86_vpblendvb_vec:
3065 return &x_x_x_x;
3066
3067 default:
3068 break;
3069 }
3070 return NULL;
3071 }
3072
3073 int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
3074 {
3075 switch (opc) {
3076 case INDEX_op_add_vec:
3077 case INDEX_op_sub_vec:
3078 case INDEX_op_and_vec:
3079 case INDEX_op_or_vec:
3080 case INDEX_op_xor_vec:
3081 case INDEX_op_andc_vec:
3082 return 1;
3083 case INDEX_op_cmp_vec:
3084 return -1;
3085
3086 case INDEX_op_shli_vec:
3087 case INDEX_op_shri_vec:
3088 /* We must expand the operation for MO_8. */
3089 return vece == MO_8 ? -1 : 1;
3090
3091 case INDEX_op_sari_vec:
3092 /* We must expand the operation for MO_8. */
3093 if (vece == MO_8) {
3094 return -1;
3095 }
3096 /* We can emulate this for MO_64, but it does not pay off
3097 unless we're producing at least 4 values. */
3098 if (vece == MO_64) {
3099 return type >= TCG_TYPE_V256 ? -1 : 0;
3100 }
3101 return 1;
3102
3103 case INDEX_op_mul_vec:
3104 if (vece == MO_8) {
3105 /* We can expand the operation for MO_8. */
3106 return -1;
3107 }
3108 if (vece == MO_64) {
3109 return 0;
3110 }
3111 return 1;
3112
3113 default:
3114 return 0;
3115 }
3116 }
3117
3118 void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
3119 TCGArg a0, ...)
3120 {
3121 va_list va;
3122 TCGArg a1, a2;
3123 TCGv_vec v0, t1, t2, t3, t4;
3124
3125 va_start(va, a0);
3126 v0 = temp_tcgv_vec(arg_temp(a0));
3127
3128 switch (opc) {
3129 case INDEX_op_shli_vec:
3130 case INDEX_op_shri_vec:
3131 tcg_debug_assert(vece == MO_8);
3132 a1 = va_arg(va, TCGArg);
3133 a2 = va_arg(va, TCGArg);
3134 /* Unpack to W, shift, and repack. Tricky bits:
3135 (1) Use punpck*bw x,x to produce DDCCBBAA,
3136 i.e. duplicate in other half of the 16-bit lane.
3137 (2) For right-shift, add 8 so that the high half of
3138 the lane becomes zero. For left-shift, we must
3139 shift up and down again.
3140 (3) Step 2 leaves high half zero such that PACKUSWB
3141 (pack with unsigned saturation) does not modify
3142 the quantity. */
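        /* E.g. for a logical right shift by c of the byte a: after the
           unpack each 16-bit lane holds (a << 8) | a; shifting that lane
           right by c + 8 yields a >> c with a zero high byte, and
           PACKUSWB then simply keeps that low byte. */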
3143 t1 = tcg_temp_new_vec(type);
3144 t2 = tcg_temp_new_vec(type);
3145 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3146 tcgv_vec_arg(t1), a1, a1);
3147 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3148 tcgv_vec_arg(t2), a1, a1);
3149 if (opc == INDEX_op_shri_vec) {
3150 vec_gen_3(INDEX_op_shri_vec, type, MO_16,
3151 tcgv_vec_arg(t1), tcgv_vec_arg(t1), a2 + 8);
3152 vec_gen_3(INDEX_op_shri_vec, type, MO_16,
3153 tcgv_vec_arg(t2), tcgv_vec_arg(t2), a2 + 8);
3154 } else {
3155 vec_gen_3(INDEX_op_shli_vec, type, MO_16,
3156 tcgv_vec_arg(t1), tcgv_vec_arg(t1), a2 + 8);
3157 vec_gen_3(INDEX_op_shli_vec, type, MO_16,
3158 tcgv_vec_arg(t2), tcgv_vec_arg(t2), a2 + 8);
3159 vec_gen_3(INDEX_op_shri_vec, type, MO_16,
3160 tcgv_vec_arg(t1), tcgv_vec_arg(t1), 8);
3161 vec_gen_3(INDEX_op_shri_vec, type, MO_16,
3162 tcgv_vec_arg(t2), tcgv_vec_arg(t2), 8);
3163 }
3164 vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
3165 a0, tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3166 tcg_temp_free_vec(t1);
3167 tcg_temp_free_vec(t2);
3168 break;
3169
3170 case INDEX_op_sari_vec:
3171 a1 = va_arg(va, TCGArg);
3172 a2 = va_arg(va, TCGArg);
3173 if (vece == MO_8) {
3174 /* Unpack to W, shift, and repack, as above. */
3175 t1 = tcg_temp_new_vec(type);
3176 t2 = tcg_temp_new_vec(type);
3177 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3178 tcgv_vec_arg(t1), a1, a1);
3179 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3180 tcgv_vec_arg(t2), a1, a1);
3181 vec_gen_3(INDEX_op_sari_vec, type, MO_16,
3182 tcgv_vec_arg(t1), tcgv_vec_arg(t1), a2 + 8);
3183 vec_gen_3(INDEX_op_sari_vec, type, MO_16,
3184 tcgv_vec_arg(t2), tcgv_vec_arg(t2), a2 + 8);
3185 vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8,
3186 a0, tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3187 tcg_temp_free_vec(t1);
3188 tcg_temp_free_vec(t2);
3189 break;
3190 }
3191 tcg_debug_assert(vece == MO_64);
3192 /* MO_64: If the shift is <= 32, we can emulate the sign extend by
3193 performing an arithmetic 32-bit shift and overwriting the high
3194 half of the result (note that the ISA says shift of 32 is valid). */
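        /* E.g. for the element 0xffffffff00000010 shifted right by 4:
           the 32-bit arithmetic shift gives 0xffffffff in the high dword,
           the 64-bit logical shift gives 0x0ffffffff0000001, and the
           dword blend (mask 0xaa) merges them into 0xfffffffff0000001. */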
3195 if (a2 <= 32) {
3196 t1 = tcg_temp_new_vec(type);
3197 vec_gen_3(INDEX_op_sari_vec, type, MO_32, tcgv_vec_arg(t1), a1, a2);
3198 vec_gen_3(INDEX_op_shri_vec, type, MO_64, a0, a1, a2);
3199 vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32,
3200 a0, a0, tcgv_vec_arg(t1), 0xaa);
3201 tcg_temp_free_vec(t1);
3202 break;
3203 }
3204 /* Otherwise we will need to use a compare vs 0 to produce the
3205 sign-extend, shift and merge. */
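        /* I.e. a0 = (a1 >> a2) | (sign_mask << (64 - a2)), with an
           unsigned shift and sign_mask all-ones for negative elements. */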
3206 t1 = tcg_temp_new_vec(type);
3207 t2 = tcg_const_zeros_vec(type);
3208 vec_gen_4(INDEX_op_cmp_vec, type, MO_64,
3209 tcgv_vec_arg(t1), tcgv_vec_arg(t2), a1, TCG_COND_GT);
3210 tcg_temp_free_vec(t2);
3211 vec_gen_3(INDEX_op_shri_vec, type, MO_64, a0, a1, a2);
3212 vec_gen_3(INDEX_op_shli_vec, type, MO_64,
3213 tcgv_vec_arg(t1), tcgv_vec_arg(t1), 64 - a2);
3214 vec_gen_3(INDEX_op_or_vec, type, MO_64, a0, a0, tcgv_vec_arg(t1));
3215 tcg_temp_free_vec(t1);
3216 break;
3217
3218 case INDEX_op_mul_vec:
3219 tcg_debug_assert(vece == MO_8);
3220 a1 = va_arg(va, TCGArg);
3221 a2 = va_arg(va, TCGArg);
3222 switch (type) {
3223 case TCG_TYPE_V64:
3224 t1 = tcg_temp_new_vec(TCG_TYPE_V128);
3225 t2 = tcg_temp_new_vec(TCG_TYPE_V128);
3226 tcg_gen_dup16i_vec(t2, 0);
3227 vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3228 tcgv_vec_arg(t1), a1, tcgv_vec_arg(t2));
3229 vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3230 tcgv_vec_arg(t2), tcgv_vec_arg(t2), a2);
3231 tcg_gen_mul_vec(MO_16, t1, t1, t2);
3232 tcg_gen_shri_vec(MO_16, t1, t1, 8);
3233 vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
3234 a0, tcgv_vec_arg(t1), tcgv_vec_arg(t1));
3235 tcg_temp_free_vec(t1);
3236 tcg_temp_free_vec(t2);
3237 break;
3238
3239 case TCG_TYPE_V128:
3240 t1 = tcg_temp_new_vec(TCG_TYPE_V128);
3241 t2 = tcg_temp_new_vec(TCG_TYPE_V128);
3242 t3 = tcg_temp_new_vec(TCG_TYPE_V128);
3243 t4 = tcg_temp_new_vec(TCG_TYPE_V128);
3244 tcg_gen_dup16i_vec(t4, 0);
3245 vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3246 tcgv_vec_arg(t1), a1, tcgv_vec_arg(t4));
3247 vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3248 tcgv_vec_arg(t2), tcgv_vec_arg(t4), a2);
3249 vec_gen_3(INDEX_op_x86_punpckh_vec, TCG_TYPE_V128, MO_8,
3250 tcgv_vec_arg(t3), a1, tcgv_vec_arg(t4));
3251 vec_gen_3(INDEX_op_x86_punpckh_vec, TCG_TYPE_V128, MO_8,
3252 tcgv_vec_arg(t4), tcgv_vec_arg(t4), a2);
3253 tcg_gen_mul_vec(MO_16, t1, t1, t2);
3254 tcg_gen_mul_vec(MO_16, t3, t3, t4);
3255 tcg_gen_shri_vec(MO_16, t1, t1, 8);
3256 tcg_gen_shri_vec(MO_16, t3, t3, 8);
3257 vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
3258 a0, tcgv_vec_arg(t1), tcgv_vec_arg(t3));
3259 tcg_temp_free_vec(t1);
3260 tcg_temp_free_vec(t2);
3261 tcg_temp_free_vec(t3);
3262 tcg_temp_free_vec(t4);
3263 break;
3264
3265 case TCG_TYPE_V256:
3266 t1 = tcg_temp_new_vec(TCG_TYPE_V256);
3267 t2 = tcg_temp_new_vec(TCG_TYPE_V256);
3268 t3 = tcg_temp_new_vec(TCG_TYPE_V256);
3269 t4 = tcg_temp_new_vec(TCG_TYPE_V256);
3270 tcg_gen_dup16i_vec(t4, 0);
3271 /* a1: A[0-7] ... D[0-7]; a2: W[0-7] ... Z[0-7]
3272 t1: extends of B[0-7], D[0-7]
3273 t2: extends of X[0-7], Z[0-7]
3274 t3: extends of A[0-7], C[0-7]
3275 t4: extends of W[0-7], Y[0-7]. */
3276 vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V256, MO_8,
3277 tcgv_vec_arg(t1), a1, tcgv_vec_arg(t4));
3278 vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V256, MO_8,
3279 tcgv_vec_arg(t2), tcgv_vec_arg(t4), a2);
3280 vec_gen_3(INDEX_op_x86_punpckh_vec, TCG_TYPE_V256, MO_8,
3281 tcgv_vec_arg(t3), a1, tcgv_vec_arg(t4));
3282 vec_gen_3(INDEX_op_x86_punpckh_vec, TCG_TYPE_V256, MO_8,
3283 tcgv_vec_arg(t4), tcgv_vec_arg(t4), a2);
3284 /* t1: BX DZ; t2: AW CY. */
3285 tcg_gen_mul_vec(MO_16, t1, t1, t2);
3286 tcg_gen_mul_vec(MO_16, t3, t3, t4);
3287 tcg_gen_shri_vec(MO_16, t1, t1, 8);
3288 tcg_gen_shri_vec(MO_16, t3, t3, 8);
3289 /* a0: AW BX CY DZ. */
3290 vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V256, MO_8,
3291 a0, tcgv_vec_arg(t1), tcgv_vec_arg(t3));
3292 tcg_temp_free_vec(t1);
3293 tcg_temp_free_vec(t2);
3294 tcg_temp_free_vec(t3);
3295 tcg_temp_free_vec(t4);
3296 break;
3297
3298 default:
3299 g_assert_not_reached();
3300 }
3301 break;
3302
3303 case INDEX_op_cmp_vec:
3304 {
3305 enum {
3306 NEED_SWAP = 1,
3307 NEED_INV = 2,
3308 NEED_BIAS = 4
3309 };
3310 static const uint8_t fixups[16] = {
3311 [0 ... 15] = -1,
3312 [TCG_COND_EQ] = 0,
3313 [TCG_COND_NE] = NEED_INV,
3314 [TCG_COND_GT] = 0,
3315 [TCG_COND_LT] = NEED_SWAP,
3316 [TCG_COND_LE] = NEED_INV,
3317 [TCG_COND_GE] = NEED_SWAP | NEED_INV,
3318 [TCG_COND_GTU] = NEED_BIAS,
3319 [TCG_COND_LTU] = NEED_BIAS | NEED_SWAP,
3320 [TCG_COND_LEU] = NEED_BIAS | NEED_INV,
3321 [TCG_COND_GEU] = NEED_BIAS | NEED_SWAP | NEED_INV,
3322 };
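            /* E.g. LTU maps to NEED_BIAS | NEED_SWAP: x <u y is equivalent
               to (x - bias) <s (y - bias) with bias = 1 << (element_bits - 1),
               and swapping the operands turns the signed LT into the signed
               GT that the hardware provides. */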
3323
3324 TCGCond cond;
3325 uint8_t fixup;
3326
3327 a1 = va_arg(va, TCGArg);
3328 a2 = va_arg(va, TCGArg);
3329 cond = va_arg(va, TCGArg);
3330 fixup = fixups[cond & 15];
3331 tcg_debug_assert(fixup != 0xff);
3332
3333 if (fixup & NEED_INV) {
3334 cond = tcg_invert_cond(cond);
3335 }
3336 if (fixup & NEED_SWAP) {
3337 TCGArg t;
3338 t = a1, a1 = a2, a2 = t;
3339 cond = tcg_swap_cond(cond);
3340 }
3341
3342 t1 = t2 = NULL;
3343 if (fixup & NEED_BIAS) {
3344 t1 = tcg_temp_new_vec(type);
3345 t2 = tcg_temp_new_vec(type);
3346 tcg_gen_dupi_vec(vece, t2, 1ull << ((8 << vece) - 1));
3347 tcg_gen_sub_vec(vece, t1, temp_tcgv_vec(arg_temp(a1)), t2);
3348 tcg_gen_sub_vec(vece, t2, temp_tcgv_vec(arg_temp(a2)), t2);
3349 a1 = tcgv_vec_arg(t1);
3350 a2 = tcgv_vec_arg(t2);
3351 cond = tcg_signed_cond(cond);
3352 }
3353
3354 tcg_debug_assert(cond == TCG_COND_EQ || cond == TCG_COND_GT);
3355 vec_gen_4(INDEX_op_cmp_vec, type, vece, a0, a1, a2, cond);
3356
3357 if (fixup & NEED_BIAS) {
3358 tcg_temp_free_vec(t1);
3359 tcg_temp_free_vec(t2);
3360 }
3361 if (fixup & NEED_INV) {
3362 tcg_gen_not_vec(vece, v0, v0);
3363 }
3364 }
3365 break;
3366
3367 default:
3368 break;
3369 }
3370
3371 va_end(va);
3372 }
3373
3374 static const int tcg_target_callee_save_regs[] = {
3375 #if TCG_TARGET_REG_BITS == 64
3376 TCG_REG_RBP,
3377 TCG_REG_RBX,
3378 #if defined(_WIN64)
3379 TCG_REG_RDI,
3380 TCG_REG_RSI,
3381 #endif
3382 TCG_REG_R12,
3383 TCG_REG_R13,
3384 TCG_REG_R14, /* Currently used for the global env. */
3385 TCG_REG_R15,
3386 #else
3387 TCG_REG_EBP, /* Currently used for the global env. */
3388 TCG_REG_EBX,
3389 TCG_REG_ESI,
3390 TCG_REG_EDI,
3391 #endif
3392 };
3393
3394 /* Compute frame size via macros, to share between tcg_target_qemu_prologue
3395 and tcg_register_jit. */
3396
3397 #define PUSH_SIZE \
3398 ((1 + ARRAY_SIZE(tcg_target_callee_save_regs)) \
3399 * (TCG_TARGET_REG_BITS / 8))
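/* The leading 1 in PUSH_SIZE accounts for the return address that the
   caller's call instruction has already pushed. */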
3400
3401 #define FRAME_SIZE \
3402 ((PUSH_SIZE \
3403 + TCG_STATIC_CALL_ARGS_SIZE \
3404 + CPU_TEMP_BUF_NLONGS * sizeof(long) \
3405 + TCG_TARGET_STACK_ALIGN - 1) \
3406 & ~(TCG_TARGET_STACK_ALIGN - 1))
3407
3408 /* Generate global QEMU prologue and epilogue code */
3409 static void tcg_target_qemu_prologue(TCGContext *s)
3410 {
3411 int i, stack_addend;
3412
3413 /* TB prologue */
3414
3415 /* Reserve some stack space, also for TCG temps. */
3416 stack_addend = FRAME_SIZE - PUSH_SIZE;
3417 tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
3418 CPU_TEMP_BUF_NLONGS * sizeof(long));
3419
3420 /* Save all callee saved registers. */
3421 for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
3422 tcg_out_push(s, tcg_target_callee_save_regs[i]);
3423 }
3424
3425 #if TCG_TARGET_REG_BITS == 32
3426 tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP,
3427 (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4);
3428 tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
3429 /* jmp *tb. */
3430 tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP,
3431 (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
3432 + stack_addend);
3433 #else
3434 tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
3435 tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
3436 /* jmp *tb. */
3437 tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
3438 #endif
3439
3440 /*
3441 * Return path for goto_ptr. Set return value to 0, a-la exit_tb,
3442 * and fall through to the rest of the epilogue.
3443 */
3444 s->code_gen_epilogue = s->code_ptr;
3445 tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_EAX, 0);
3446
3447 /* TB epilogue */
3448 tb_ret_addr = s->code_ptr;
3449
3450 tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);
3451
3452 if (have_avx2) {
3453 tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0);
3454 }
3455 for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
3456 tcg_out_pop(s, tcg_target_callee_save_regs[i]);
3457 }
3458 tcg_out_opc(s, OPC_RET, 0, 0, 0);
3459
3460 #if !defined(CONFIG_SOFTMMU)
3461 /* Try to set up a segment register to point to guest_base. */
3462 if (guest_base) {
3463 setup_guest_base_seg();
3464 }
3465 #endif
3466 }
3467
3468 static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
3469 {
3470 memset(p, 0x90, count);
3471 }
3472
3473 static void tcg_target_init(TCGContext *s)
3474 {
3475 #ifdef CONFIG_CPUID_H
3476 unsigned a, b, c, d, b7 = 0;
3477 int max = __get_cpuid_max(0, 0);
3478
3479 if (max >= 7) {
3480 /* BMI1 is available on AMD Piledriver and Intel Haswell CPUs. */
3481 __cpuid_count(7, 0, a, b7, c, d);
3482 have_bmi1 = (b7 & bit_BMI) != 0;
3483 have_bmi2 = (b7 & bit_BMI2) != 0;
3484 }
3485
3486 if (max >= 1) {
3487 __cpuid(1, a, b, c, d);
3488 #ifndef have_cmov
3489         /* For 32-bit, it is 99% certain that we're running on hardware that
3490            supports cmov, but we still need to check.  If cmov is not
3491            available, we'll use a small forward branch. */
3492 have_cmov = (d & bit_CMOV) != 0;
3493 #endif
3494
3495 /* MOVBE is only available on Intel Atom and Haswell CPUs, so we
3496 need to probe for it. */
3497 have_movbe = (c & bit_MOVBE) != 0;
3498 have_popcnt = (c & bit_POPCNT) != 0;
3499
3500 /* There are a number of things we must check before we can be
3501            sure of not hitting an invalid opcode. */
3502 if (c & bit_OSXSAVE) {
3503 unsigned xcrl, xcrh;
3504 asm ("xgetbv" : "=a" (xcrl), "=d" (xcrh) : "c" (0));
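            /* XCR0 bit 1 is SSE state and bit 2 is AVX (YMM) state; both
               must be enabled by the OS before AVX instructions are safe. */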
3505 if ((xcrl & 6) == 6) {
3506 have_avx1 = (c & bit_AVX) != 0;
3507 have_avx2 = (b7 & bit_AVX2) != 0;
3508 }
3509 }
3510 }
3511
3512     max = __get_cpuid_max(0x80000000, 0);
3513 if (max >= 1) {
3514 __cpuid(0x80000001, a, b, c, d);
3515 /* LZCNT was introduced with AMD Barcelona and Intel Haswell CPUs. */
3516 have_lzcnt = (c & bit_LZCNT) != 0;
3517 }
3518 #endif /* CONFIG_CPUID_H */
3519
3520 tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS;
3521 if (TCG_TARGET_REG_BITS == 64) {
3522 tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS;
3523 }
3524 if (have_avx1) {
3525 tcg_target_available_regs[TCG_TYPE_V64] = ALL_VECTOR_REGS;
3526 tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS;
3527 }
3528 if (have_avx2) {
3529 tcg_target_available_regs[TCG_TYPE_V256] = ALL_VECTOR_REGS;
3530 }
3531
3532 tcg_target_call_clobber_regs = 0;
3533 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX);
3534 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX);
3535 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX);
3536 if (TCG_TARGET_REG_BITS == 64) {
3537 #if !defined(_WIN64)
3538 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RDI);
3539 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RSI);
3540 #endif
3541 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R8);
3542 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R9);
3543 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R10);
3544 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11);
3545 }
3546
3547 s->reserved_regs = 0;
3548 tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
3549 }
3550
3551 typedef struct {
3552 DebugFrameHeader h;
3553 uint8_t fde_def_cfa[4];
3554 uint8_t fde_reg_ofs[14];
3555 } DebugFrame;
3556
3557 /* We're expecting a 2-byte uleb128 encoded value. */
3558 QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14));
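/* The value is encoded below as (FRAME_SIZE & 0x7f) | 0x80 followed by
   FRAME_SIZE >> 7: the low 7 bits with the continuation bit set, then the
   remaining bits.  A value of 14 bits or more would need a third byte. */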
3559
3560 #if !defined(__ELF__)
3561 /* Host machine without ELF. */
3562 #elif TCG_TARGET_REG_BITS == 64
3563 #define ELF_HOST_MACHINE EM_X86_64
3564 static const DebugFrame debug_frame = {
3565 .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
3566 .h.cie.id = -1,
3567 .h.cie.version = 1,
3568 .h.cie.code_align = 1,
3569 .h.cie.data_align = 0x78, /* sleb128 -8 */
3570 .h.cie.return_column = 16,
3571
3572 /* Total FDE size does not include the "len" member. */
3573 .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
3574
3575 .fde_def_cfa = {
3576 12, 7, /* DW_CFA_def_cfa %rsp, ... */
3577 (FRAME_SIZE & 0x7f) | 0x80, /* ... uleb128 FRAME_SIZE */
3578 (FRAME_SIZE >> 7)
3579 },
3580 .fde_reg_ofs = {
3581 0x90, 1, /* DW_CFA_offset, %rip, -8 */
3582 /* The following ordering must match tcg_target_callee_save_regs. */
3583 0x86, 2, /* DW_CFA_offset, %rbp, -16 */
3584 0x83, 3, /* DW_CFA_offset, %rbx, -24 */
3585 0x8c, 4, /* DW_CFA_offset, %r12, -32 */
3586 0x8d, 5, /* DW_CFA_offset, %r13, -40 */
3587 0x8e, 6, /* DW_CFA_offset, %r14, -48 */
3588 0x8f, 7, /* DW_CFA_offset, %r15, -56 */
3589 }
3590 };
3591 #else
3592 #define ELF_HOST_MACHINE EM_386
3593 static const DebugFrame debug_frame = {
3594 .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
3595 .h.cie.id = -1,
3596 .h.cie.version = 1,
3597 .h.cie.code_align = 1,
3598 .h.cie.data_align = 0x7c, /* sleb128 -4 */
3599 .h.cie.return_column = 8,
3600
3601 /* Total FDE size does not include the "len" member. */
3602 .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
3603
3604 .fde_def_cfa = {
3605 12, 4, /* DW_CFA_def_cfa %esp, ... */
3606 (FRAME_SIZE & 0x7f) | 0x80, /* ... uleb128 FRAME_SIZE */
3607 (FRAME_SIZE >> 7)
3608 },
3609 .fde_reg_ofs = {
3610 0x88, 1, /* DW_CFA_offset, %eip, -4 */
3611 /* The following ordering must match tcg_target_callee_save_regs. */
3612 0x85, 2, /* DW_CFA_offset, %ebp, -8 */
3613 0x83, 3, /* DW_CFA_offset, %ebx, -12 */
3614 0x86, 4, /* DW_CFA_offset, %esi, -16 */
3615 0x87, 5, /* DW_CFA_offset, %edi, -20 */
3616 }
3617 };
3618 #endif
3619
3620 #if defined(ELF_HOST_MACHINE)
3621 void tcg_register_jit(void *buf, size_t buf_size)
3622 {
3623 tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));
3624 }
3625 #endif