#define TARGET_M0_LDS_LIMIT (TARGET_GCN3)
-#define TARGET_PACKED_WORK_ITEMS (TARGET_CDNA2_PLUS)
+#define TARGET_PACKED_WORK_ITEMS (TARGET_CDNA2_PLUS || TARGET_RDNA3)
#define TARGET_XNACK (flag_xnack != HSACO_ATTR_OFF)
;; }}}
;; {{{ Int/int conversions
+(define_code_iterator all_convert [truncate zero_extend sign_extend])
(define_code_iterator zero_convert [truncate zero_extend])
(define_code_attr convop [
(sign_extend "extend")
(zero_extend "zero_extend")
(truncate "trunc")])
-(define_insn "<convop><V_INT_1REG_ALT:mode><V_INT_1REG:mode>2<exec>"
+(define_expand "<convop><V_INT_1REG_ALT:mode><V_INT_1REG:mode>2<exec>"
+ [(set (match_operand:V_INT_1REG 0 "register_operand" "=v")
+ (all_convert:V_INT_1REG
+ (match_operand:V_INT_1REG_ALT 1 "gcn_alu_operand" " v")))]
+ "")
+
+(define_insn "*<convop><V_INT_1REG_ALT:mode><V_INT_1REG:mode>_sdwa<exec>"
[(set (match_operand:V_INT_1REG 0 "register_operand" "=v")
(zero_convert:V_INT_1REG
(match_operand:V_INT_1REG_ALT 1 "gcn_alu_operand" " v")))]
- ""
+ "!TARGET_RDNA3"
"v_mov_b32_sdwa\t%0, %1 dst_sel:<V_INT_1REG:sdwa> dst_unused:UNUSED_PAD src0_sel:<V_INT_1REG_ALT:sdwa>"
[(set_attr "type" "vop_sdwa")
(set_attr "length" "8")])
-(define_insn "extend<V_INT_1REG_ALT:mode><V_INT_1REG:mode>2<exec>"
+(define_insn "extend<V_INT_1REG_ALT:mode><V_INT_1REG:mode>_sdwa<exec>"
[(set (match_operand:V_INT_1REG 0 "register_operand" "=v")
(sign_extend:V_INT_1REG
(match_operand:V_INT_1REG_ALT 1 "gcn_alu_operand" " v")))]
- ""
+ "!TARGET_RDNA3"
"v_mov_b32_sdwa\t%0, sext(%1) src0_sel:<V_INT_1REG_ALT:sdwa>"
[(set_attr "type" "vop_sdwa")
(set_attr "length" "8")])
+(define_insn "*<convop><V_INT_1REG_ALT:mode><V_INT_1REG:mode>_shift<exec>"
+ [(set (match_operand:V_INT_1REG 0 "register_operand" "=v")
+ (all_convert:V_INT_1REG
+ (match_operand:V_INT_1REG_ALT 1 "gcn_alu_operand" " v")))]
+ "TARGET_RDNA3"
+ {
+ enum {extend, zero_extend, trunc};
+ rtx shiftwidth = (<V_INT_1REG_ALT:SCALAR_MODE>mode == QImode
+ || <V_INT_1REG:SCALAR_MODE>mode == QImode
+ ? GEN_INT (24)
+ : <V_INT_1REG_ALT:SCALAR_MODE>mode == HImode
+ || <V_INT_1REG:SCALAR_MODE>mode == HImode
+ ? GEN_INT (16)
+ : NULL);
+ operands[2] = shiftwidth;
+
+ if (!shiftwidth)
+ return "v_mov_b32 %0, %1";
+ else if (<convop> == extend || <convop> == trunc)
+ return "v_lshlrev_b32\t%0, %2, %1\;v_ashrrev_i32\t%0, %2, %0";
+ else
+ return "v_lshlrev_b32\t%0, %2, %1\;v_lshrrev_b32\t%0, %2, %0";
+ }
+ [(set_attr "type" "mult")
+ (set_attr "length" "8")])
+
;; GCC can already do these for scalar types, but not for vector types.
;; Unfortunately you can't just do SUBREG on a vector to select the low part,
;; so there must be a few tricks here.
rtx offset = XEXP (addr, 1);
int offsetbits = (TARGET_RDNA2_PLUS ? 11 : 12);
bool immediate_p = (CONST_INT_P (offset)
- && INTVAL (offset) >= -(1 << 12)
- && INTVAL (offset) < (1 << 12));
+ && INTVAL (offset) >= -(1 << offsetbits)
+ && INTVAL (offset) < (1 << offsetbits));
if ((gcn_address_register_p (base, DImode, false)
|| gcn_vec_address_register_p (base, DImode, false))
if (df_regs_ever_live_p (FIRST_AVGPR_REG + avgpr))
break;
avgpr++;
- vgpr = (vgpr + 3) & ~3;
- avgpr = (avgpr + 3) & ~3;
+
+ /* The main function epilogue uses v8, but df doesn't see that. */
+ if (vgpr < 9)
+ vgpr = 9;
if (!leaf_function_p ())
{
avgpr = MAX_NORMAL_AVGPR_COUNT;
}
- /* The gfx90a accum_offset field can't represent 0 registers. */
- if (gcn_arch == PROCESSOR_GFX90a && vgpr < 4)
- vgpr = 4;
+ /* SIMD32 devices count double in wavefront64 mode. */
+ if (TARGET_RDNA2_PLUS)
+ vgpr *= 2;
+
+ /* Round up to the allocation block size. */
+ int vgpr_block_size = (TARGET_RDNA3 ? 12
+ : TARGET_RDNA2_PLUS || TARGET_CDNA2_PLUS ? 8
+ : 4);
+ if (vgpr % vgpr_block_size)
+ vgpr += vgpr_block_size - (vgpr % vgpr_block_size);
+ if (avgpr % vgpr_block_size)
+ avgpr += vgpr_block_size - (avgpr % vgpr_block_size);
fputs ("\t.rodata\n"
"\t.p2align\t6\n"
" .private_segment_fixed_size: 0\n"
" .wavefront_size: 64\n"
" .sgpr_count: %i\n"
- " .vgpr_count: %i\n"
+ " .vgpr_count: %i%s\n"
" .max_flat_workgroup_size: 1024\n",
cfun->machine->kernarg_segment_byte_size,
cfun->machine->kernarg_segment_alignment,
LDS_SIZE,
- sgpr, next_free_vgpr);
+ sgpr, next_free_vgpr,
+ (TARGET_RDNA2_PLUS ? " ; wavefrontsize64 counts double on SIMD32"
+ : ""));
if (gcn_arch == PROCESSOR_GFX90a || gcn_arch == PROCESSOR_GFX908)
fprintf (file, " .agpr_count: %i\n", avgpr);
fputs (" .end_amdgpu_metadata\n", file);
(mult:SI
(any_extend:SI (match_operand:HI 1 "register_operand" "%v"))
(any_extend:SI (match_operand:HI 2 "register_operand" " v"))))]
- ""
+ "!TARGET_RDNA3"
"v_mul_<iu>32_<iu>24_sdwa\t%0, %<e>1, %<e>2 src0_sel:WORD_0 src1_sel:WORD_0"
[(set_attr "type" "vop_sdwa")
(set_attr "length" "8")])
(mult:HI
(any_extend:HI (match_operand:QI 1 "register_operand" "%v"))
(any_extend:HI (match_operand:QI 2 "register_operand" " v"))))]
- ""
+ "!TARGET_RDNA3"
"v_mul_<iu>32_<iu>24_sdwa\t%0, %<e>1, %<e>2 src0_sel:BYTE_0 src1_sel:BYTE_0"
[(set_attr "type" "vop_sdwa")
(set_attr "length" "8")])
#if defined (__GCN3__) || defined (__GCN5__) \
|| defined (__CDNA1__) || defined (__CDNA2__) \
- || defined (__RDNA2__)
+ || defined (__RDNA2__) || defined (__RDNA3__)
#define CDNA3_PLUS 0
#else
#define CDNA3_PLUS 1
/* According to AMD:
dGPU RTC is 27MHz
AGPU RTC is 100MHz
+ RDNA3 ISA manual states "typically 100MHz"
FIXME: DTRT on an APU. */
+#ifdef __RDNA3__
+#define RTC_TICKS (1.0 / 100000000.0) /* 100MHz */
+#else
#define RTC_TICKS (1.0 / 27000000.0) /* 27MHz */
+#endif
double
omp_get_wtime (void)
{
uint64_t clock;
+#ifdef __RDNA3__
+ asm ("s_sendmsg_rtn_b64 %0 0x83 ;Get REALTIME\n\t"
+ "s_waitcnt 0" : "=r" (clock));
+#else
asm ("s_memrealtime %0\n\t"
"s_waitcnt 0" : "=r" (clock));
+#endif
return clock * RTC_TICKS;
}
case EF_AMDGPU_MACH_AMDGCN_GFX900:
case EF_AMDGPU_MACH_AMDGCN_GFX906:
case EF_AMDGPU_MACH_AMDGCN_GFX908:
- case EF_AMDGPU_MACH_AMDGCN_GFX1030:
- case EF_AMDGPU_MACH_AMDGCN_GFX1100:
return 256;
case EF_AMDGPU_MACH_AMDGCN_GFX90a:
return 512;
+ case EF_AMDGPU_MACH_AMDGCN_GFX1030:
+ return 512; /* 512 SIMD32 = 256 wavefrontsize64. */
+ case EF_AMDGPU_MACH_AMDGCN_GFX1100:
+ return 1536; /* 1536 SIMD32 = 768 wavefrontsize64. */
}
GOMP_PLUGIN_fatal ("unhandled ISA in max_isa_vgprs");
}