From: Andreas Arnez Date: Thu, 25 Apr 2024 12:38:45 +0000 (+0200) Subject: s390x: Add support for the `NNPA' instruction X-Git-Tag: VALGRIND_3_23_0~5 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=0d1e0c997b908bc488625d10cac392130334ff2a;p=thirdparty%2Fvalgrind.git s390x: Add support for the `NNPA' instruction Finalize the NNPA facility support by handling the NNPA instruction itself, using the new "extension" approach. Also reflect NNPA facility support in HWCAP and in the STFLE bits. --- diff --git a/VEX/priv/guest_s390_helpers.c b/VEX/priv/guest_s390_helpers.c index d60e4ce3c..bb5e9c939 100644 --- a/VEX/priv/guest_s390_helpers.c +++ b/VEX/priv/guest_s390_helpers.c @@ -391,7 +391,9 @@ s390x_dirtyhelper_STFLE(VexGuestS390XState *guest_state, ULong *addr) /* 154: unassigned */ /* 155: MSA9, not supported */ | s390_stfle_range(156, 156) - /* 157-167: unassigned */ + /* 157-164: unassigned */ + | s390_stfle_range(165, 165) + /* 166-167: unassigned */ | s390_stfle_range(168, 168) /* 168-191: unassigned */ ), diff --git a/VEX/priv/guest_s390_toIR.c b/VEX/priv/guest_s390_toIR.c index 44a0293e2..7e836faf3 100644 --- a/VEX/priv/guest_s390_toIR.c +++ b/VEX/priv/guest_s390_toIR.c @@ -19669,6 +19669,14 @@ s390_irgen_VCRNF(UChar v1, UChar v2, UChar v3, UChar m4, UChar m5) return "vcrnf"; } +static const HChar * +s390_irgen_NNPA(void) +{ + s390_insn_assert("nnpa", s390_host_has_nnpa); + extension(S390_EXT_NNPA, 0); + return "nnpa"; +} + /* New insns are added here. 
If an insn is contingent on a facility being installed also check whether the list of supported facilities in function @@ -20446,7 +20454,7 @@ s390_decode_4byte_and_irgen(const UChar *bytes) case 0xb938: /* SORTL */ goto unimplemented; case 0xb939: /* DFLTCC */ goto unimplemented; case 0xb93a: /* KDSA */ goto unimplemented; - case 0xb93b: /* NNPA */ goto unimplemented; + case 0xb93b: s390_format_E(s390_irgen_NNPA); goto ok; case 0xb93c: s390_format_RRE_RR(s390_irgen_PPNO, RRE_r1(ovl), RRE_r2(ovl)); goto ok; case 0xb93e: /* KIMD */ goto unimplemented; diff --git a/VEX/pub/libvex_s390x_common.h b/VEX/pub/libvex_s390x_common.h index 7217fdec3..4ed848c5c 100644 --- a/VEX/pub/libvex_s390x_common.h +++ b/VEX/pub/libvex_s390x_common.h @@ -117,6 +117,7 @@ #define S390_EXT_ID_NBITS 16 #define S390_EXT_PRNO 1 +#define S390_EXT_NNPA 2 /*--------------------------------------------------------------*/ /*--- Miscellaneous ---*/ diff --git a/coregrind/m_extension/extension-s390x.c b/coregrind/m_extension/extension-s390x.c index 55bcd84a6..fd45c7ee9 100644 --- a/coregrind/m_extension/extension-s390x.c +++ b/coregrind/m_extension/extension-s390x.c @@ -68,8 +68,7 @@ #define INSN_ERR(msg) \ ({ \ - VG_(umsg)("Illegal operation: "); \ - VG_(umsg)(msg); \ + VG_(umsg)("Illegal operation: %s", msg); \ ExtErr_Illop; \ }) @@ -138,7 +137,7 @@ static const ULong PRNO_functions[] = { | S390_SETBIT(114)), // TRNG }; -static UWord do_extension_PRNO(ThreadState* tst, ULong variant) +static enum ExtensionError do_extension_PRNO(ThreadState* tst, ULong variant) { UChar r1 = variant & 0xf; UChar r2 = (variant >> 4) & 0xf; @@ -190,10 +189,10 @@ static UWord do_extension_PRNO(ThreadState* tst, ULong variant) break; case 114: // TRNG addr1 = orig_addr1 = READ_GPR(tst, "PRNO(op1_addr)", r1); - len1 = orig_len1 = READ_GPR(tst, "PRNO(op1_len)", r1 + 1); + len1 = orig_len1 = READ_GPR(tst, "PRNO(op1_len)", r1 + 1); PRE_MEM_WRITE(tst, "PRNO(op1)", addr1, len1); addr2 = orig_addr2 = READ_GPR(tst, 
"PRNO(op2_addr)", r2); - len2 = orig_len2 = READ_GPR(tst, "PRNO(op2_len)", r2 + 1); + len2 = orig_len2 = READ_GPR(tst, "PRNO(op2_len)", r2 + 1); PRE_MEM_WRITE(tst, "PRNO(op2)", addr2, len2); cc = do_PRNO_insn(func, parms, &addr1, &len1, &addr2, &len2); WRITE_GPR(tst, r1, addr1); @@ -210,6 +209,386 @@ static UWord do_extension_PRNO(ThreadState* tst, ULong variant) return ExtErr_OK; } +/*---------------------------------------------------------------*/ +/*--- NNPA (neural network processing assist) ---*/ +/*---------------------------------------------------------------*/ + +static Int do_NNPA_insn(ULong* gpr0, ULong parms) +{ + register ULong reg0 asm("0") = *gpr0; + register void* reg1 asm("1") = (void*)parms; + Int cc; + + asm volatile(".insn rre, 0xb93b0000, 0, 0\n" + "ipm %[cc]\n" + "srl %[cc], 28\n" + : [cc] "=d"(cc), "+d"(reg0) + : "d"(reg1) + : "cc", "memory"); + *gpr0 = reg0; + return cc; +} + +/* NNPA function codes */ +typedef enum { + S390_NNPA_QAF = 0x00, + S390_NNPA_ADD = 0x10, + S390_NNPA_SUB = 0x11, + S390_NNPA_MUL = 0x12, + S390_NNPA_DIV = 0x13, + S390_NNPA_MIN = 0x14, + S390_NNPA_MAX = 0x15, + S390_NNPA_LOG = 0x20, + S390_NNPA_EXP = 0x21, + S390_NNPA_RELU = 0x31, + S390_NNPA_TANH = 0x32, + S390_NNPA_SIGMOID = 0x33, + S390_NNPA_SOFTMAX = 0x34, + S390_NNPA_BATCHNORM = 0x40, + S390_NNPA_MAXPOOL2D = 0x50, + S390_NNPA_AVGPOOL2D = 0x51, + S390_NNPA_LSTMACT = 0x60, + S390_NNPA_GRUACT = 0x61, + S390_NNPA_CONVOLUTION = 0x70, + S390_NNPA_MATMUL_OP = 0x71, + S390_NNPA_MATMUL_OP_BCAST23 = 0x72, +} s390_nnpa_function_t; + +/* Supported NNPA functions */ +static const ULong NNPA_functions[] = { + (S390_SETBIT(S390_NNPA_QAF) | S390_SETBIT(S390_NNPA_ADD) | + S390_SETBIT(S390_NNPA_SUB) | S390_SETBIT(S390_NNPA_MUL) | + S390_SETBIT(S390_NNPA_DIV) | S390_SETBIT(S390_NNPA_MIN) | + S390_SETBIT(S390_NNPA_MAX) | S390_SETBIT(S390_NNPA_LOG) | + S390_SETBIT(S390_NNPA_EXP) | S390_SETBIT(S390_NNPA_RELU) | + S390_SETBIT(S390_NNPA_TANH) | S390_SETBIT(S390_NNPA_SIGMOID) | + 
S390_SETBIT(S390_NNPA_SOFTMAX)), + (S390_SETBIT(S390_NNPA_BATCHNORM) | S390_SETBIT(S390_NNPA_MAXPOOL2D) | + S390_SETBIT(S390_NNPA_AVGPOOL2D) | S390_SETBIT(S390_NNPA_LSTMACT) | + S390_SETBIT(S390_NNPA_GRUACT) | S390_SETBIT(S390_NNPA_CONVOLUTION) | + S390_SETBIT(S390_NNPA_MATMUL_OP) | + S390_SETBIT(S390_NNPA_MATMUL_OP_BCAST23)), +}; + +/* Supported parameter block formats */ +static const ULong NNPA_ipbf[] = { + (S390_SETBIT(0)), +}; + +/* Supported data types and data layout formats */ +static const ULong NNPA_dtypes_layouts[] = { + /* Data types */ + (S390_SETBIT(0) | // data type 1 (16 bit) + + /* Data layout formats */ + S390_SETBIT(32 + 0) | // 4D-feature tensor + S390_SETBIT(32 + 1) // 4D-kernel tensor + ), +}; + +static const ULong NNPA_conversions[] = { + (S390_SETBIT(1) | // BFP tiny format + S390_SETBIT(2)), // BFP short format +}; + +struct s390_NNPA_parms_qaf { + ULong funcs[4]; + ULong ipbf[2]; + ULong dtypes_layouts; + UInt reserved1; + UInt mdis; + ULong mts; + ULong conversions; + ULong reserved2[22]; +}; + +struct s390_NNPA_tensor0 { + UChar layout; + UChar dtype; + UShort reserved1; + UInt reserved2; + UInt dim4, dim3, dim2, dim1; + ULong address; +}; + +struct s390_NNPA_parms0 { + ULong pbvn : 16; + ULong mvn : 8; + ULong ribm : 24; + ULong reserved0 : 15; + ULong cf : 1; + ULong reserved1[6]; + ULong save_area_address; + struct s390_NNPA_tensor0 out[2]; + struct s390_NNPA_tensor0 reserved2[2]; + struct s390_NNPA_tensor0 in[3]; + ULong reserved3[12]; + UInt param[5]; + UInt reserved4; + ULong reserved5[13]; +}; + +enum { + s390_NNPA_message_in = 0, + s390_NNPA_message_out = 3, + s390_NNPA_message_n = 5, +}; + +static const char* const s390_NNPA_errmsg_dtype[s390_NNPA_message_n] = { + "NNPA: unknown data type in input tensor 1", + "NNPA: unknown data type in input tensor 2", + "NNPA: unknown data type in input tensor 3", + "NNPA: unknown data type in output tensor 1", + "NNPA: unknown data type in output tensor 2", +}; + +static const char* const 
s390_NNPA_errmsg_layout[s390_NNPA_message_n] = { + "NNPA: unknown layout in input tensor 1", + "NNPA: unknown layout in input tensor 2", + "NNPA: unknown layout in input tensor 3", + "NNPA: unknown layout in output tensor 1", + "NNPA: unknown layout in output tensor 2", +}; + +static const char* const s390_NNPA_errmsg_access[s390_NNPA_message_n] = { + "NNPA(in_tensor_1)", "NNPA(in_tensor_2)", "NNPA(in_tensor_3)", + "NNPA(out_tensor_1)", "NNPA(out_tensor_2)", +}; + +struct s390_NNPA_mem_dimensions { + ULong dim[5]; // total dimensions + ULong used[5]; // used dimensions, without padding + ULong step[5]; +}; + +/* Determine the 5 dimensions used to represent the tensor data in memory */ +static enum ExtensionError +NNPA_tensor0_size(const struct s390_NNPA_tensor0* t, + UInt msg_idx, + struct s390_NNPA_mem_dimensions* out_md) +{ + struct s390_NNPA_mem_dimensions md; + ULong elem_size; + + if (t->dtype == 0) + elem_size = 2; + else + return INSN_ERR(s390_NNPA_errmsg_dtype[msg_idx]); + + switch (t->layout) { + case 0: // 4D-feature tensor + md.dim[0] = md.used[0] = t->dim4; + md.dim[1] = md.used[1] = (t->dim1 + 63) / 64; + md.dim[2] = md.used[2] = t->dim3; + md.dim[3] = (t->dim2 + 31) / 32 * 32; + md.used[3] = t->dim2; + md.dim[4] = 64; + md.used[4] = t->dim1; + break; + case 1: // 4D-kernel tensor + md.dim[0] = md.used[0] = (t->dim1 + 63) / 64; + md.dim[1] = md.used[1] = t->dim4; + md.dim[2] = md.used[2] = t->dim3; + md.dim[3] = (t->dim2 + 31) / 32 * 32; + md.used[3] = t->dim2; + md.dim[4] = 64; + md.used[4] = t->dim1; + break; + default: + return INSN_ERR(s390_NNPA_errmsg_layout[msg_idx]); + } + md.step[4] = elem_size * md.dim[4]; + md.step[3] = md.step[4] * md.dim[3]; + md.step[2] = md.step[3] * md.dim[2]; + md.step[1] = md.step[2] * md.dim[1]; + md.step[0] = md.step[1] * md.dim[0]; // total size + *out_md = md; + return ExtErr_OK; +} + +static enum ExtensionError NNPA_pre_read_tensor0( + ThreadState* tst, UInt msg_idx, const struct s390_NNPA_tensor0* t) +{ + struct 
s390_NNPA_mem_dimensions md; + enum ExtensionError ret; + + ret = NNPA_tensor0_size(t, msg_idx, &md); + if (ret != ExtErr_OK) + return ret; + + for (ULong d0 = 0; d0 < md.used[0]; d0++) { + for (ULong d1 = 0; d1 < md.used[1]; d1++) { + for (ULong d2 = 0; d2 < md.used[2]; d2++) { + for (ULong d3 = 0; d3 < md.used[3]; d3++) { + ULong addr = t->address + d0 * md.step[1] + d1 * md.step[2] + + d2 * md.step[3] + d3 * md.step[4]; + PRE_MEM_READ(tst, s390_NNPA_errmsg_access[msg_idx], addr, + md.dim[4]); + } + } + } + } + return ExtErr_OK; +} + +static UWord NNPA_pre_write_tensor0(ThreadState* tst, + UInt msg_idx, + const struct s390_NNPA_tensor0* t) +{ + struct s390_NNPA_mem_dimensions md; + enum ExtensionError ret; + + ret = NNPA_tensor0_size(t, msg_idx, &md); + if (ret != ExtErr_OK) + return ret; + + PRE_MEM_WRITE(tst, "NNPA(out_tensor)", t->address, md.step[0]); + return ExtErr_OK; +} + +static void NNPA_post_write_tensor0(ThreadState* tst, + UInt msg_idx, + const struct s390_NNPA_tensor0* t) +{ + struct s390_NNPA_mem_dimensions md; + enum ExtensionError ret; + + ret = NNPA_tensor0_size(t, msg_idx, &md); + if (ret != ExtErr_OK) + return; + + for (ULong d0 = 0; d0 < md.used[0]; d0++) { + for (ULong d1 = 0; d1 < md.used[1]; d1++) { + for (ULong d2 = 0; d2 < md.used[2]; d2++) { + for (ULong d3 = 0; d3 < md.used[3]; d3++) { + ULong addr = t->address + d0 * md.step[1] + d1 * md.step[2] + + d2 * md.step[3] + d3 * md.step[4]; + POST_MEM_WRITE(tst, addr, md.dim[4]); + } + } + } + } +} + +static enum ExtensionError do_extension_NNPA(ThreadState* tst, ULong variant) +{ + ULong gpr0 = READ_GPR(tst, "NNPA(r0)", 0); + UChar fc = gpr0 & 0x7f; + ULong parms_addr = READ_GPR(tst, "r1", 1); + Int cc = 0; + ULong parms_len; + + if (fc == S390_NNPA_QAF) { // Query + struct s390_NNPA_parms_qaf* parms = (void*)parms_addr; + + parms_len = sizeof(struct s390_NNPA_parms_qaf); + PRE_MEM_WRITE(tst, "NNPA(parms)", parms_addr, parms_len); + cc = do_NNPA_insn(&gpr0, parms_addr); + 
s390_filter_functions(parms->funcs, sizeof(parms->funcs), NNPA_functions, + sizeof(NNPA_functions)); + s390_filter_functions(parms->ipbf, sizeof(parms->ipbf), NNPA_ipbf, + sizeof(NNPA_ipbf)); + s390_filter_functions(&parms->dtypes_layouts, sizeof(ULong), + NNPA_dtypes_layouts, sizeof(NNPA_dtypes_layouts)); + s390_filter_functions(&parms->conversions, sizeof(ULong), + NNPA_conversions, sizeof(NNPA_conversions)); + } else { + struct s390_NNPA_parms0* parms = (void*)parms_addr; + const struct s390_NNPA_parms0 orig_parms = *parms; + ULong save_area_size = 0; + UInt in_tensors; + UInt out_tensors; + + parms_len = 4096; + PRE_MEM_READ(tst, "NNPA(parms)", parms_addr, + sizeof(struct s390_NNPA_parms0)); + if (parms->cf) { + PRE_MEM_READ(tst, "NNPA(parms.csb)", parms_addr + 512, + parms_len - 512); + } + PRE_MEM_WRITE(tst, "NNPA(parms)", parms_addr, parms_len); + + switch (fc) { + case S390_NNPA_ADD: + case S390_NNPA_SUB: + case S390_NNPA_MUL: + case S390_NNPA_DIV: + case S390_NNPA_MIN: + case S390_NNPA_MAX: + in_tensors = 2; + out_tensors = 1; + break; + case S390_NNPA_LOG: + case S390_NNPA_EXP: + case S390_NNPA_RELU: + case S390_NNPA_TANH: + case S390_NNPA_SIGMOID: + in_tensors = 1; + out_tensors = 1; + break; + case S390_NNPA_SOFTMAX: + in_tensors = 1; + out_tensors = 1; + save_area_size = 8192; + break; + case S390_NNPA_BATCHNORM: + in_tensors = 3; + out_tensors = 1; + break; + case S390_NNPA_MAXPOOL2D: + case S390_NNPA_AVGPOOL2D: + in_tensors = 1; + out_tensors = 1; + break; + case S390_NNPA_LSTMACT: + in_tensors = 3; + out_tensors = 2; + break; + case S390_NNPA_GRUACT: + case S390_NNPA_CONVOLUTION: + case S390_NNPA_MATMUL_OP: + case S390_NNPA_MATMUL_OP_BCAST23: + in_tensors = 3; + out_tensors = 1; + break; + default: + return INSN_ERR("NNPA: unknown function code\n"); + } + + for (UInt i = 0; i < in_tensors; i++) { + enum ExtensionError retval = + NNPA_pre_read_tensor0(tst, s390_NNPA_message_in + i, &parms->in[i]); + if (retval != ExtErr_OK) + return retval; + } + for 
(UInt i = 0; i < out_tensors; i++) { + enum ExtensionError retval = NNPA_pre_write_tensor0( + tst, s390_NNPA_message_out + i, &parms->out[i]); + if (retval != ExtErr_OK) + return retval; + } + if (save_area_size != 0) { + PRE_MEM_WRITE(tst, "NNPA(save_area)", parms->save_area_address, + save_area_size); + } + cc = do_NNPA_insn(&gpr0, parms_addr); + if (cc == 0) { + for (UInt i = 0; i < out_tensors; i++) { + NNPA_post_write_tensor0(tst, s390_NNPA_message_out + i, + &orig_parms.out[i]); + } + } + } + POST_MEM_WRITE(tst, parms_addr, parms_len); + WRITE_GPR(tst, 0, gpr0); + WRITE_CC(tst, cc); + return ExtErr_OK; +} + +/*---------------------------------------------------------------*/ +/*--- Main function: select and call appropriate extension ---*/ +/*---------------------------------------------------------------*/ + enum ExtensionError ML_(do_client_extension)(ThreadState* tst) { ULong code = REG_READ(tst, SYSNO); @@ -219,6 +598,8 @@ enum ExtensionError ML_(do_client_extension)(ThreadState* tst) switch (id) { case S390_EXT_PRNO: return do_extension_PRNO(tst, variant); + case S390_EXT_NNPA: + return do_extension_NNPA(tst, variant); default: VG_(core_panic)("unknown extension ID"); } diff --git a/coregrind/m_initimg/initimg-linux.c b/coregrind/m_initimg/initimg-linux.c index 7680baa8e..c92c3cfb2 100644 --- a/coregrind/m_initimg/initimg-linux.c +++ b/coregrind/m_initimg/initimg-linux.c @@ -721,7 +721,8 @@ Addr setup_client_stack( void* init_sp, auxv->u.a_val &= ((VKI_HWCAP_S390_TE - 1) | VKI_HWCAP_S390_VXRS | VKI_HWCAP_S390_VXRS_EXT - | VKI_HWCAP_S390_VXRS_EXT2); + | VKI_HWCAP_S390_VXRS_EXT2 + | VKI_HWCAP_S390_NNPA); } # elif defined(VGP_arm64_linux) { diff --git a/docs/internals/s390-opcodes.csv b/docs/internals/s390-opcodes.csv index b41d9bd2b..a65ceea31 100644 --- a/docs/internals/s390-opcodes.csv +++ b/docs/internals/s390-opcodes.csv @@ -1754,7 +1754,7 @@ vpkzr,"vector pack zoned register","not implemented",arch14 vsrpr,"vector shift and round decimal 
register","not implemented",arch14 vupkzh,"vector unpack zoned high","not implemented",arch14 vupkzl,"vector unpack zoned low","not implemented",arch14 -nnpa,"neural network processing assist","not implemented",arch14 +nnpa,"neural network processing assist",implemented,arch14 vclfnh,"vector fp convert and lengthen from nnp high","implemented",arch14 vclfnl,"vector fp convert and lengthen from nnp low","implemented",arch14 vcrnf,"vector fp convert and round to nnp","implemented",arch14