From: Kwok Cheung Yeung Date: Mon, 18 Nov 2019 21:26:50 +0000 (-0800) Subject: [og9] Backport AMD GCN backend improvements from mainline X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=0ff59ef4f9a86348b99d80398e9c95386964c0a8;p=thirdparty%2Fgcc.git [og9] Backport AMD GCN backend improvements from mainline 2019-11-07 Kwok Cheung Yeung gcc/ * ira.c (setup_alloc_regs): Setup no_unit_alloc_regs for frame pointer in multiple registers. (ira_setup_eliminable_regset): Setup eliminable_regset, ira_no_alloc_regs and regs_ever_live for frame pointer in multiple registers. 2019-11-10 Kwok Cheung Yeung gcc/ * lra-spills.c (assign_spill_hard_regs): Do not spill into registers in eliminable_regset. 2019-11-14 Kwok Cheung Yeung gcc/ * lra-spills.c (assign_spill_hard_regs): Check that the spill register is suitable for the mode. 2019-11-15 Kwok Cheung Yeung gcc/ * config/gcn/gcn.c (gcn_regno_reg_class): Return VCC_CONDITIONAL_REG register class for VCC_LO and VCC_HI. (gcn_spill_class): Use SGPR_REGS to spill registers in VCC_CONDITIONAL_REG. 2019-11-15 Kwok Cheung Yeung gcc/ * config/gcn/gcn.c (gcn_expand_prologue): Remove initialization and prologue use of v0. (print_operand_address): Use v1 for zero vector offset. 2019-11-15 Kwok Cheung Yeung gcc/ * config/gcn/gcn.c (gcn_init_cumulative_args): Call reinit_regs. 2019-11-15 Kwok Cheung Yeung gcc/ * config/gcn/gcn.c (default_requested_args): New. (gcn_parse_amdgpu_hsa_kernel_attribute): Initialize requested args set with default_requested_args. (gcn_conditional_register_usage): Limit register usage of non-kernel functions. Reassign fixed registers if a non-standard set of args is requested. * config/gcn/gcn.h (FIXED_REGISTERS): Fix registers according to ABI. 2019-11-15 Kwok Cheung Yeung gcc/ * config/gcn/gcn.c (MAX_NORMAL_SGPR_COUNT, MAX_NORMAL_VGPR_COUNT): New. (gcn_conditional_register_usage): Use constants in place of hard-coded values. 
(gcn_hsa_declare_function_name): Set lower bound for number of SGPRs/VGPRs in non-leaf kernels to MAX_NORMAL_SGPR_COUNT and MAX_NORMAL_VGPR_COUNT. 2019-11-15 Kwok Cheung Yeung gcc/ * config/gcn/gcn.h (FIXED_REGISTERS): Unfix frame pointer. (CALL_USED_REGISTERS): Make frame pointer callee-saved. (cherry picked from openacc-gcc-9-branch commit 20245bf2e0b8ac258f86e6495b3be3e09edd0181, and commit e4005e490429a9595aee002528e0bf71281cb188) --- diff --git a/gcc/ChangeLog.omp b/gcc/ChangeLog.omp index a84569760d32..d024ee78a29b 100644 --- a/gcc/ChangeLog.omp +++ b/gcc/ChangeLog.omp @@ -1,3 +1,66 @@ +2019-11-18 Kwok Cheung Yeung + + Backport from mainline: + + 2019-11-07 Kwok Cheung Yeung + + * ira.c (setup_alloc_regs): Setup no_unit_alloc_regs for + frame pointer in multiple registers. + (ira_setup_eliminable_regset): Setup eliminable_regset, + ira_no_alloc_regs and regs_ever_live for frame pointer in + multiple registers. + + 2019-11-10 Kwok Cheung Yeung + + * lra-spills.c (assign_spill_hard_regs): Do not spill into + registers in eliminable_regset. + + 2019-11-14 Kwok Cheung Yeung + + * lra-spills.c (assign_spill_hard_regs): Check that the spill + register is suitable for the mode. + + 2019-11-15 Kwok Cheung Yeung + + * config/gcn/gcn.c (gcn_regno_reg_class): Return VCC_CONDITIONAL_REG + register class for VCC_LO and VCC_HI. + (gcn_spill_class): Use SGPR_REGS to spill registers in + VCC_CONDITIONAL_REG. + + 2019-11-15 Kwok Cheung Yeung + + * config/gcn/gcn.c (gcn_expand_prologue): Remove initialization and + prologue use of v0. + (print_operand_address): Use v1 for zero vector offset. + + 2019-11-15 Kwok Cheung Yeung + + * config/gcn/gcn.c (gcn_init_cumulative_args): Call reinit_regs. + + 2019-11-15 Kwok Cheung Yeung + + * config/gcn/gcn.c (default_requested_args): New. + (gcn_parse_amdgpu_hsa_kernel_attribute): Initialize requested args + set with default_requested_args. + (gcn_conditional_register_usage): Limit register usage of non-kernel + functions. 
Reassign fixed registers if a non-standard set of args is + requested. + * config/gcn/gcn.h (FIXED_REGISTERS): Fix registers according to ABI. + + 2019-11-15 Kwok Cheung Yeung + + * config/gcn/gcn.c (MAX_NORMAL_SGPR_COUNT, MAX_NORMAL_VGPR_COUNT): New. + (gcn_conditional_register_usage): Use constants in place of hard-coded + values. + (gcn_hsa_declare_function_name): Set lower bound for number of + SGPRs/VGPRs in non-leaf kernels to MAX_NORMAL_SGPR_COUNT and + MAX_NORMAL_VGPR_COUNT. + + 2019-11-15 Kwok Cheung Yeung + + * config/gcn/gcn.h (FIXED_REGISTERS): Unfix frame pointer. + (CALL_USED_REGISTERS): Make frame pointer callee-saved. + 2019-10-16 Julian Brown * config/gcn/gcn-protos.h (gcn_goacc_adjust_gangprivate_decl): Rename diff --git a/gcc/config/gcn/gcn.c b/gcc/config/gcn/gcn.c index 2835a3d71419..f556ffea1437 100644 --- a/gcc/config/gcn/gcn.c +++ b/gcc/config/gcn/gcn.c @@ -75,6 +75,12 @@ int gcn_isa = 3; /* Default to GCN3. */ #define LDS_SIZE 65536 +/* The number of registers usable by normal non-kernel functions. + The SGPR count includes any special extra registers such as VCC. */ + +#define MAX_NORMAL_SGPR_COUNT 64 +#define MAX_NORMAL_VGPR_COUNT 24 + /* }}} */ /* {{{ Initialization and options. */ @@ -191,6 +197,17 @@ static const struct gcn_kernel_arg_type {"work_item_id_Z", NULL, V64SImode, FIRST_VGPR_REG + 2} }; +static const long default_requested_args + = (1 << PRIVATE_SEGMENT_BUFFER_ARG) + | (1 << DISPATCH_PTR_ARG) + | (1 << QUEUE_PTR_ARG) + | (1 << KERNARG_SEGMENT_PTR_ARG) + | (1 << PRIVATE_SEGMENT_WAVE_OFFSET_ARG) + | (1 << WORKGROUP_ID_X_ARG) + | (1 << WORK_ITEM_ID_X_ARG) + | (1 << WORK_ITEM_ID_Y_ARG) + | (1 << WORK_ITEM_ID_Z_ARG); + /* Extract parameter settings from __attribute__((amdgpu_hsa_kernel ())). This function also sets the default values for some arguments. 
@@ -201,10 +218,7 @@ gcn_parse_amdgpu_hsa_kernel_attribute (struct gcn_kernel_args *args, tree list) { bool err = false; - args->requested = ((1 << PRIVATE_SEGMENT_BUFFER_ARG) - | (1 << QUEUE_PTR_ARG) - | (1 << KERNARG_SEGMENT_PTR_ARG) - | (1 << PRIVATE_SEGMENT_WAVE_OFFSET_ARG)); + args->requested = default_requested_args; args->nargs = 0; for (int a = 0; a < GCN_KERNEL_ARG_TYPES; a++) @@ -242,8 +256,6 @@ gcn_parse_amdgpu_hsa_kernel_attribute (struct gcn_kernel_args *args, args->requested |= (1 << a); args->order[args->nargs++] = a; } - args->requested |= (1 << WORKGROUP_ID_X_ARG); - args->requested |= (1 << WORK_ITEM_ID_Z_ARG); /* Requesting WORK_ITEM_ID_Z_ARG implies requesting WORK_ITEM_ID_X_ARG and WORK_ITEM_ID_Y_ARG. Similarly, requesting WORK_ITEM_ID_Y_ARG implies @@ -253,10 +265,6 @@ gcn_parse_amdgpu_hsa_kernel_attribute (struct gcn_kernel_args *args, if (args->requested & (1 << WORK_ITEM_ID_Y_ARG)) args->requested |= (1 << WORK_ITEM_ID_X_ARG); - /* Always enable this so that kernargs is in a predictable place for - gomp_print, etc. */ - args->requested |= (1 << DISPATCH_PTR_ARG); - int sgpr_regno = FIRST_SGPR_REG; args->nsgprs = 0; for (int a = 0; a < GCN_KERNEL_ARG_TYPES; a++) @@ -462,6 +470,9 @@ gcn_regno_reg_class (int regno) { case SCC_REG: return SCC_CONDITIONAL_REG; + case VCC_LO_REG: + case VCC_HI_REG: + return VCC_CONDITIONAL_REG; case VCCZ_REG: return VCCZ_CONDITIONAL_REG; case EXECZ_REG: @@ -629,7 +640,8 @@ gcn_can_split_p (machine_mode, rtx op) static reg_class_t gcn_spill_class (reg_class_t c, machine_mode /*mode */ ) { - if (reg_classes_intersect_p (ALL_CONDITIONAL_REGS, c)) + if (reg_classes_intersect_p (ALL_CONDITIONAL_REGS, c) + || c == VCC_CONDITIONAL_REG) return SGPR_REGS; else return NO_REGS; @@ -2040,27 +2052,36 @@ gcn_secondary_reload (bool in_p, rtx x, reg_class_t rclass, static void gcn_conditional_register_usage (void) { - int i; + if (!cfun || !cfun->machine) + return; - /* FIXME: Do we need to reset fixed_regs? 
*/ + if (cfun->machine->normal_function) + { + /* Restrict the set of SGPRs and VGPRs used by non-kernel functions. */ + for (int i = SGPR_REGNO (MAX_NORMAL_SGPR_COUNT - 2); + i <= LAST_SGPR_REG; i++) + fixed_regs[i] = 1, call_used_regs[i] = 1; -/* Limit ourselves to 1/16 the register file for maximimum sized workgroups. - There are enough SGPRs not to limit those. - TODO: Adjust this more dynamically. */ - for (i = FIRST_VGPR_REG + 64; i <= LAST_VGPR_REG; i++) - fixed_regs[i] = 1, call_used_regs[i] = 1; + for (int i = VGPR_REGNO (MAX_NORMAL_VGPR_COUNT); + i <= LAST_VGPR_REG; i++) + fixed_regs[i] = 1, call_used_regs[i] = 1; - if (!cfun || !cfun->machine || cfun->machine->normal_function) - { - /* Normal functions can't know what kernel argument registers are - live, so just fix the bottom 16 SGPRs, and bottom 3 VGPRs. */ - for (i = 0; i < 16; i++) - fixed_regs[FIRST_SGPR_REG + i] = 1; - for (i = 0; i < 3; i++) - fixed_regs[FIRST_VGPR_REG + i] = 1; return; } + /* If the set of requested args is the default set, nothing more needs to + be done. */ + if (cfun->machine->args.requested == default_requested_args) + return; + + /* Requesting a set of args different from the default violates the ABI. */ + if (!leaf_function_p ()) + warning (0, "A non-default set of initial values has been requested, " + "which violates the ABI!"); + + for (int i = SGPR_REGNO (0); i < SGPR_REGNO (14); i++) + fixed_regs[i] = 0; + /* Fix the runtime argument register containing values that may be needed later. DISPATCH_PTR_ARG and FLAT_SCRATCH_* should not be needed after the prologue so there's no need to fix them. */ @@ -2068,10 +2089,10 @@ gcn_conditional_register_usage (void) fixed_regs[cfun->machine->args.reg[PRIVATE_SEGMENT_WAVE_OFFSET_ARG]] = 1; if (cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG] >= 0) { + /* The upper 64 bits of the 128-bit descriptor are not used, so allow + the containing registers to be used for other purposes. 
*/ fixed_regs[cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG]] = 1; fixed_regs[cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG] + 1] = 1; - fixed_regs[cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG] + 2] = 1; - fixed_regs[cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG] + 3] = 1; } if (cfun->machine->args.reg[KERNARG_SEGMENT_PTR_ARG] >= 0) { @@ -2441,6 +2462,8 @@ gcn_init_cumulative_args (CUMULATIVE_ARGS *cum /* Argument info to init */ , cfun->machine->args = cum->args; if (!caller && cfun->machine->normal_function) gcn_detect_incoming_pointer_arg (fndecl); + + reinit_regs (); } static bool @@ -2776,15 +2799,6 @@ gcn_expand_prologue () cfun->machine->args. reg[PRIVATE_SEGMENT_WAVE_OFFSET_ARG]); - if (TARGET_GCN5_PLUS) - { - /* v0 is reserved for constant zero so that "global" - memory instructions can have a nul-offset without - causing reloads. */ - emit_insn (gen_vec_duplicatev64si - (gen_rtx_REG (V64SImode, VGPR_REGNO (0)), const0_rtx)); - } - if (cfun->machine->args.requested & (1 << FLAT_SCRATCH_INIT_ARG)) { rtx fs_init_lo = @@ -2843,8 +2857,6 @@ gcn_expand_prologue () gen_int_mode (LDS_SIZE, SImode)); emit_insn (gen_prologue_use (gen_rtx_REG (SImode, M0_REG))); - if (TARGET_GCN5_PLUS) - emit_insn (gen_prologue_use (gen_rtx_REG (SImode, VGPR_REGNO (0)))); if (cfun && cfun->machine && !cfun->machine->normal_function && flag_openmp) { @@ -4876,10 +4888,10 @@ gcn_hsa_declare_function_name (FILE *file, const char *name, tree) if (!leaf_function_p ()) { /* We can't know how many registers function calls might use. */ - if (vgpr < 64) - vgpr = 64; - if (sgpr + extra_regs < 102) - sgpr = 102 - extra_regs; + if (vgpr < MAX_NORMAL_VGPR_COUNT) + vgpr = MAX_NORMAL_VGPR_COUNT; + if (sgpr + extra_regs < MAX_NORMAL_SGPR_COUNT) + sgpr = MAX_NORMAL_SGPR_COUNT - extra_regs; } /* GFX8 allocates SGPRs in blocks of 8. 
@@ -5303,9 +5315,9 @@ print_operand_address (FILE *file, rtx mem) /* The assembler requires a 64-bit VGPR pair here, even though the offset should be only 32-bit. */ if (vgpr_offset == NULL_RTX) - /* In this case, the vector offset is zero, so we use v0, - which is initialized by the kernel prologue to zero. */ - fprintf (file, "v[0:1]"); + /* In this case, the vector offset is zero, so we use the first + lane of v1, which is initialized to zero. */ + fprintf (file, "v[1:2]"); else if (REG_P (vgpr_offset) && VGPR_REGNO_P (REGNO (vgpr_offset))) { diff --git a/gcc/config/gcn/gcn.h b/gcc/config/gcn/gcn.h index b3b2d1ad3f93..e60b43122d5b 100644 --- a/gcc/config/gcn/gcn.h +++ b/gcc/config/gcn/gcn.h @@ -160,9 +160,9 @@ #define FIXED_REGISTERS { \ /* Scalars. */ \ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \ + 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, \ /* fp sp lr. */ \ - 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, \ + 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, \ /* exec_save, cc_save */ \ 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, \ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \ @@ -180,7 +180,7 @@ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, \ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, \ /* VGRPs */ \ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \ + 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \ @@ -203,7 +203,7 @@ #define CALL_USED_REGISTERS { \ /* Scalars. */ \ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, \ - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, \ + 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, \ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, \ 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, \ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \ diff --git a/gcc/ira.c b/gcc/ira.c index fd481d6e0e2f..60e0b9bd29a3 100644 --- a/gcc/ira.c +++ b/gcc/ira.c @@ -516,7 +516,8 @@ setup_alloc_regs (bool use_hard_frame_p) #endif COPY_HARD_REG_SET (no_unit_alloc_regs, fixed_nonglobal_reg_set); if (! 
use_hard_frame_p) - SET_HARD_REG_BIT (no_unit_alloc_regs, HARD_FRAME_POINTER_REGNUM); + add_to_hard_reg_set (&no_unit_alloc_regs, Pmode, + HARD_FRAME_POINTER_REGNUM); setup_class_hard_regs (); } @@ -2275,6 +2276,7 @@ ira_setup_eliminable_regset (void) { int i; static const struct {const int from, to; } eliminables[] = ELIMINABLE_REGS; + int fp_reg_count = hard_regno_nregs (HARD_FRAME_POINTER_REGNUM, Pmode); /* Setup is_leaf as frame_pointer_required may use it. This function is called by sched_init before ira if scheduling is enabled. */ @@ -2303,7 +2305,8 @@ ira_setup_eliminable_regset (void) frame pointer in LRA. */ if (frame_pointer_needed) - df_set_regs_ever_live (HARD_FRAME_POINTER_REGNUM, true); + for (i = 0; i < fp_reg_count; i++) + df_set_regs_ever_live (HARD_FRAME_POINTER_REGNUM + i, true); COPY_HARD_REG_SET (ira_no_alloc_regs, no_unit_alloc_regs); CLEAR_HARD_REG_SET (eliminable_regset); @@ -2333,17 +2336,21 @@ ira_setup_eliminable_regset (void) } if (!HARD_FRAME_POINTER_IS_FRAME_POINTER) { - if (!TEST_HARD_REG_BIT (crtl->asm_clobbers, HARD_FRAME_POINTER_REGNUM)) - { - SET_HARD_REG_BIT (eliminable_regset, HARD_FRAME_POINTER_REGNUM); - if (frame_pointer_needed) - SET_HARD_REG_BIT (ira_no_alloc_regs, HARD_FRAME_POINTER_REGNUM); - } - else if (frame_pointer_needed) - error ("%s cannot be used in asm here", - reg_names[HARD_FRAME_POINTER_REGNUM]); - else - df_set_regs_ever_live (HARD_FRAME_POINTER_REGNUM, true); + for (i = 0; i < fp_reg_count; i++) + if (!TEST_HARD_REG_BIT (crtl->asm_clobbers, + HARD_FRAME_POINTER_REGNUM + i)) + { + SET_HARD_REG_BIT (eliminable_regset, + HARD_FRAME_POINTER_REGNUM + i); + if (frame_pointer_needed) + SET_HARD_REG_BIT (ira_no_alloc_regs, + HARD_FRAME_POINTER_REGNUM + i); + } + else if (frame_pointer_needed) + error ("%s cannot be used in asm here", + reg_names[HARD_FRAME_POINTER_REGNUM + i]); + else + df_set_regs_ever_live (HARD_FRAME_POINTER_REGNUM + i, true); } } diff --git a/gcc/lra-spills.c b/gcc/lra-spills.c index 
c19b76a579cf..417d68c15e3f 100644 --- a/gcc/lra-spills.c +++ b/gcc/lra-spills.c @@ -283,6 +283,9 @@ assign_spill_hard_regs (int *pseudo_regnos, int n) for (k = 0; k < spill_class_size; k++) { hard_regno = ira_class_hard_regs[spill_class][k]; + if (TEST_HARD_REG_BIT (eliminable_regset, hard_regno) + || !targetm.hard_regno_mode_ok (hard_regno, mode)) + continue; if (! overlaps_hard_reg_set_p (conflict_hard_regs, mode, hard_regno)) break; }