}
/* Initialise V128 temporaries en masse. */
+static
+void newTempsV128_2(IRTemp* t1, IRTemp* t2)
+{
+ vassert(t1 && *t1 == IRTemp_INVALID);
+ vassert(t2 && *t2 == IRTemp_INVALID);
+ *t1 = newTempV128();
+ *t2 = newTempV128();
+}
+
+/* Initialise V128 temporaries en masse. */
+static
+void newTempsV128_3(IRTemp* t1, IRTemp* t2, IRTemp* t3)
+{
+ vassert(t1 && *t1 == IRTemp_INVALID);
+ vassert(t2 && *t2 == IRTemp_INVALID);
+ vassert(t3 && *t3 == IRTemp_INVALID);
+ *t1 = newTempV128();
+ *t2 = newTempV128();
+ *t3 = newTempV128();
+}
+
static
void newTempsV128_7(IRTemp* t1, IRTemp* t2, IRTemp* t3,
IRTemp* t4, IRTemp* t5, IRTemp* t6, IRTemp* t7)
return ops[sizeNarrow];
}
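+/* Note: no byte-sized SQDMULL exists in the ISA, so only narrow sizes
+ 1 (h lanes) and 2 (s lanes) map to real IROps in the helper below. */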
+static IROp mkVecQDMULLS ( UInt sizeNarrow ) {
+ const IROp ops[4]
+ = { Iop_INVALID, Iop_QDMull16Sx4, Iop_QDMull32Sx2, Iop_INVALID };
+ vassert(sizeNarrow < 3);
+ return ops[sizeNarrow];
+}
+
static IROp mkVecCMPEQ ( UInt size ) {
const IROp ops[4]
= { Iop_CmpEQ8x16, Iop_CmpEQ16x8, Iop_CmpEQ32x4, Iop_CmpEQ64x2 };
}
-static IRTemp math_ZERO_ALL_EXCEPT_LOWEST_LANE ( IRTemp src, UInt size )
+/* Compute vector SQNEG at lane size |size| for |srcE|, returning
+ the q result in |*qneg| and the normal result in |*nneg|. */
+static
+void math_SQNEG ( /*OUT*/IRTemp* qneg, /*OUT*/IRTemp* nneg,
+ IRExpr* srcE, UInt size )
+{
+ IRTemp src = IRTemp_INVALID;
+ newTempsV128_3(&src, nneg, qneg);
+ assign(src, srcE);
+ assign(*nneg, binop(mkVecSUB(size), mkV128(0x0000), mkexpr(src)));
+ assign(*qneg, binop(mkVecQSUBS(size), mkV128(0x0000), mkexpr(src)));
+}
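+
+/* Note: *qneg and *nneg can differ only when a lane holds the most
+ negative representable value: e.g. for size==0, negating -128 wraps
+ to -128 in *nneg but saturates to +127 in *qneg. Callers pass that
+ difference to updateQCFLAGwithDifference to set the sticky QC flag. */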
+
+
+static IRTemp math_ZERO_ALL_EXCEPT_LOWEST_LANE ( IRExpr* srcE, UInt size )
{
vassert(size < 4);
IRTemp t = newTempV128();
- assign(t, unop(mkVecZEROHIxxOFV128(size), mkexpr(src)));
+ assign(t, unop(mkVecZEROHIxxOFV128(size), srcE));
return t;
}
+/* Generate IR to compute vector widening MULL from either the lower
+ (is2==False) or upper (is2==True) halves of vecN and vecM. The
+ widening multiplies are unsigned when isU==True and signed when
+ isU==False. |size| is the narrow lane size indication. Optionally,
+ the product may be added to or subtracted from vecD, at the wide lane
+ size. This happens when |mas| is 'a' (add) or 's' (sub). When |mas|
+ is 'm' (only multiply) then the accumulate part does not happen, and
+ |vecD| is expected to == IRTemp_INVALID.
+
+ Only size==0 (h_b_b), size==1 (s_h_h) and size==2 (d_s_s) variants
+ are allowed. The result is computed into a new IRTemp and returned
+ in *res. */
+static
+void math_MULL_ACC ( /*OUT*/IRTemp* res,
+ Bool is2, Bool isU, UInt size, HChar mas,
+ IRTemp vecN, IRTemp vecM, IRTemp vecD )
+{
+ vassert(res && *res == IRTemp_INVALID);
+ vassert(size <= 2);
+ vassert(mas == 'm' || mas == 'a' || mas == 's');
+ if (mas == 'm') vassert(vecD == IRTemp_INVALID);
+ IROp mulOp = isU ? mkVecMULLU(size) : mkVecMULLS(size);
+ IROp accOp = (mas == 'a') ? mkVecADD(size+1)
+ : (mas == 's' ? mkVecSUB(size+1)
+ : Iop_INVALID);
+ IRTemp mul = math_BINARY_WIDENING_V128(is2, mulOp,
+ mkexpr(vecN), mkexpr(vecM));
+ *res = newTempV128();
+ assign(*res, mas == 'm' ? mkexpr(mul)
+ : binop(accOp, mkexpr(vecD), mkexpr(mul)));
+}
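+
+/* Example: is2==False, isU==False, size==1, mas=='a' computes the
+ SMLAL-style value
+ vecD.4s + widen(vecN.4h[0..3]) *s widen(vecM.4h[0..3])
+ i.e. a signed 16->32 widening multiply of the lower halves, added to
+ the accumulator at the wide (32-bit) lane size. */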
+
+
+/* Same as math_MULL_ACC, except that the multiply is signed and widening,
+ and its result is doubled before being added to or subtracted from the
+ accumulating value, with saturation at each stage. In all cases the
+ saturation residuals are returned via (sat1q, sat1n), and in the
+ accumulate cases via (sat2q, sat2n) too. All results are returned in
+ new temporaries. In the no-accumulate case, *sat2q and *sat2n are left
+ as IRTemp_INVALID, so the caller can tell that no accumulation happened. */
+static
+void math_SQDMULL_ACC ( /*OUT*/IRTemp* res,
+ /*OUT*/IRTemp* sat1q, /*OUT*/IRTemp* sat1n,
+ /*OUT*/IRTemp* sat2q, /*OUT*/IRTemp* sat2n,
+ Bool is2, UInt size, HChar mas,
+ IRTemp vecN, IRTemp vecM, IRTemp vecD )
+{
+ vassert(size <= 2);
+ vassert(mas == 'm' || mas == 'a' || mas == 's');
+ /* Compute
+ sat1q = vecN.D[is2] *sq vecM.D[is2] *q 2
+ sat1n = vecN.D[is2] *s vecM.D[is2] * 2
+ IOW take either the low or high halves of vecN and vecM, signed widen,
+ multiply, double that, and signedly saturate. Also compute the same
+ but without saturation.
+ */
+ vassert(sat2q && *sat2q == IRTemp_INVALID);
+ vassert(sat2n && *sat2n == IRTemp_INVALID);
+ newTempsV128_3(sat1q, sat1n, res);
+ IRTemp tq = math_BINARY_WIDENING_V128(is2, mkVecQDMULLS(size),
+ mkexpr(vecN), mkexpr(vecM));
+ IRTemp tn = math_BINARY_WIDENING_V128(is2, mkVecMULLS(size),
+ mkexpr(vecN), mkexpr(vecM));
+ assign(*sat1q, mkexpr(tq));
+ assign(*sat1n, binop(mkVecADD(size+1), mkexpr(tn), mkexpr(tn)));
+
+ /* If there is no accumulation, the final result is sat1q,
+ and there's no assignment to sat2q or sat2n. */
+ if (mas == 'm') {
+ assign(*res, mkexpr(*sat1q));
+ return;
+ }
+
+ /* Compute
+ sat2q = vecD +sq/-sq sat1q
+ sat2n = vecD +/- sat1n
+ result = sat2q
+ */
+ newTempsV128_2(sat2q, sat2n);
+ assign(*sat2q, binop(mas == 'a' ? mkVecQADDS(size+1) : mkVecQSUBS(size+1),
+ mkexpr(vecD), mkexpr(*sat1q)));
+ assign(*sat2n, binop(mas == 'a' ? mkVecADD(size+1) : mkVecSUB(size+1),
+ mkexpr(vecD), mkexpr(*sat1n)));
+ assign(*res, mkexpr(*sat2q));
+}
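+
+/* Worked corner case (size==1): lanes 0x8000 * 0x8000 give the widened
+ product 0x40000000; doubling it yields 0x80000000, which does not fit
+ in a signed 32-bit lane, so sat1q saturates to 0x7FFFFFFF while sat1n
+ wraps to 0x80000000. That is the only input pair for which the
+ multiply stage can saturate, and the (sat1q, sat1n) difference is what
+ updateQCFLAGwithDifference detects. */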
+
+
/* QCFLAG tracks the SIMD sticky saturation status. Update the status
thusly: if |qres| and |nres| hold the same value, leave QCFLAG
unchanged. Otherwise, set it (implicitly) to 1. */
{
/* 31 29 28 23 21 20 15 10 9 4
01 U 11110 size 1 m opcode 1 n d
+ Decode fields: u,size,opcode
*/
# define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
if (INSN(31,30) != BITS2(0,1)
UInt dd = INSN(4,0);
vassert(size < 4);
+ if (opcode == BITS5(0,0,0,0,1) || opcode == BITS5(0,0,1,0,1)) {
+ /* -------- 0,xx,00001 SQADD std4_std4_std4 -------- */
+ /* -------- 1,xx,00001 UQADD std4_std4_std4 -------- */
+ /* -------- 0,xx,00101 SQSUB std4_std4_std4 -------- */
+ /* -------- 1,xx,00101 UQSUB std4_std4_std4 -------- */
+ Bool isADD = opcode == BITS5(0,0,0,0,1);
+ Bool isU = bitU == 1;
+ IROp qop = Iop_INVALID;
+ IROp nop = Iop_INVALID;
+ if (isADD) {
+ qop = isU ? mkVecQADDU(size) : mkVecQADDS(size);
+ nop = mkVecADD(size);
+ } else {
+ qop = isU ? mkVecQSUBU(size) : mkVecQSUBS(size);
+ nop = mkVecSUB(size);
+ }
+ IRTemp argL = newTempV128();
+ IRTemp argR = newTempV128();
+ IRTemp qres = newTempV128();
+ IRTemp nres = newTempV128();
+ assign(argL, getQReg128(nn));
+ assign(argR, getQReg128(mm));
+ assign(qres, mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(
+ binop(qop, mkexpr(argL), mkexpr(argR)), size)));
+ assign(nres, mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(
+ binop(nop, mkexpr(argL), mkexpr(argR)), size)));
+ putQReg128(dd, mkexpr(qres));
+ updateQCFLAGwithDifference(qres, nres);
+ const HChar* nm = isADD ? (isU ? "uqadd" : "sqadd")
+ : (isU ? "uqsub" : "sqsub");
+ const HChar arr = "bhsd"[size];
+ DIP("%s %s.%c, %s.%c, %s.%c\n", nm,
+ nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
+ return True;
+ }
+
if (size == X11 && opcode == BITS5(0,0,1,1,0)) {
/* -------- 0,11,00110 CMGT d_d_d -------- */ // >s
/* -------- 1,11,00110 CMHI d_d_d -------- */ // >u
UInt dd = INSN(4,0);
vassert(size < 4);
- if (bitU == 0 && opcode == BITS5(0,0,1,1,1)) {
+ if (opcode == BITS5(0,0,1,1,1)) {
/* -------- 0,xx,00111 SQABS std4_std4 -------- */
- IRTemp qabs = IRTemp_INVALID, nabs = IRTemp_INVALID;
- math_SQABS(&qabs, &nabs, getQReg128(nn), size);
- IRTemp qres = math_ZERO_ALL_EXCEPT_LOWEST_LANE(qabs, size);
- IRTemp nres = math_ZERO_ALL_EXCEPT_LOWEST_LANE(nabs, size);
+ /* -------- 1,xx,00111 SQNEG std4_std4 -------- */
+ Bool isNEG = bitU == 1;
+ IRTemp qresFW = IRTemp_INVALID, nresFW = IRTemp_INVALID;
+ (isNEG ? math_SQNEG : math_SQABS)( &qresFW, &nresFW,
+ getQReg128(nn), size );
+ IRTemp qres = math_ZERO_ALL_EXCEPT_LOWEST_LANE(mkexpr(qresFW), size);
+ IRTemp nres = math_ZERO_ALL_EXCEPT_LOWEST_LANE(mkexpr(nresFW), size);
putQReg128(dd, mkexpr(qres));
updateQCFLAGwithDifference(qres, nres);
const HChar arr = "bhsd"[size];
- DIP("%s %c%u, %c%u\n", "sqabs", arr, dd, arr, nn);
+ DIP("%s %c%u, %c%u\n", isNEG ? "sqneg" : "sqabs", arr, dd, arr, nn);
return True;
}
vassert(ks >= 0 && ks <= 2);
if (size == X11) return False;
vassert(size <= 2);
- Bool isU = bitU == 1;
- IROp mulOp = isU ? mkVecMULLU(size) : mkVecMULLS(size);
- IROp accOp = (ks == 1) ? mkVecADD(size+1)
- : (ks == 2 ? mkVecSUB(size+1) : Iop_INVALID);
- IRTemp mul = math_BINARY_WIDENING_V128(is2, mulOp,
- getQReg128(nn), getQReg128(mm));
- IRTemp res = newTempV128();
- assign(res, ks == 0 ? mkexpr(mul)
- : binop(accOp, getQReg128(dd), mkexpr(mul)));
+ Bool isU = bitU == 1;
+ IRTemp vecN = newTempV128();
+ IRTemp vecM = newTempV128();
+ IRTemp vecD = newTempV128();
+ assign(vecN, getQReg128(nn));
+ assign(vecM, getQReg128(mm));
+ assign(vecD, getQReg128(dd));
+ IRTemp res = IRTemp_INVALID;
+ math_MULL_ACC(&res, is2, isU, size, "mas"[ks],
+ vecN, vecM, ks == 0 ? IRTemp_INVALID : vecD);
putQReg128(dd, mkexpr(res));
const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
const HChar* arrWide = nameArr_Q_SZ(1, size+1);
return True;
}
- if (bitU == 0 && opcode == BITS5(0,0,1,1,1)) {
+ if (opcode == BITS5(0,0,1,1,1)) {
/* -------- 0,xx,00111 SQABS std7_std7 -------- */
+ /* -------- 1,xx,00111 SQNEG std7_std7 -------- */
if (bitQ == 0 && size == X11) return False; // implied 1d case
- IRTemp qabs = IRTemp_INVALID, nabs = IRTemp_INVALID;
- math_SQABS(&qabs, &nabs, getQReg128(nn), size);
+ Bool isNEG = bitU == 1;
+ IRTemp qresFW = IRTemp_INVALID, nresFW = IRTemp_INVALID;
+ (isNEG ? math_SQNEG : math_SQABS)( &qresFW, &nresFW,
+ getQReg128(nn), size );
IRTemp qres = newTempV128(), nres = newTempV128();
- assign(qres, math_MAYBE_ZERO_HI64(bitQ, qabs));
- assign(nres, math_MAYBE_ZERO_HI64(bitQ, nabs));
+ assign(qres, math_MAYBE_ZERO_HI64(bitQ, qresFW));
+ assign(nres, math_MAYBE_ZERO_HI64(bitQ, nresFW));
putQReg128(dd, mkexpr(qres));
updateQCFLAGwithDifference(qres, nres);
const HChar* arr = nameArr_Q_SZ(bitQ, size);
- DIP("%s %s.%s, %s.%s\n", "sqabs",
+ DIP("%s %s.%s, %s.%s\n", isNEG ? "sqneg" : "sqabs",
nameQReg128(dd), arr, nameQReg128(nn), arr);
return True;
}
vassert(0);
}
vassert(mm < 32 && ix < 16);
- IROp mulOp = isU ? mkVecMULLU(size) : mkVecMULLS(size);
- IROp accOp = (ks == 1) ? mkVecADD(size+1)
- : (ks == 2 ? mkVecSUB(size+1) : Iop_INVALID);
+ IRTemp vecN = newTempV128();
IRTemp vecM = math_DUP_VEC_ELEM(getQReg128(mm), size, ix);
IRTemp vecD = newTempV128();
- IRTemp vecN = newTempV128();
- assign(vecD, getQReg128(dd));
assign(vecN, getQReg128(nn));
- IRTemp mul = math_BINARY_WIDENING_V128(is2, mulOp,
- mkexpr(vecN), mkexpr(vecM));
- IRTemp res = newTempV128();
- assign(res, ks == 0 ? mkexpr(mul)
- : binop(accOp, getQReg128(dd), mkexpr(mul)));
+ assign(vecD, getQReg128(dd));
+ IRTemp res = IRTemp_INVALID;
+ math_MULL_ACC(&res, is2, isU, size, "mas"[ks],
+ vecN, vecM, ks == 0 ? IRTemp_INVALID : vecD);
putQReg128(dd, mkexpr(res));
const HChar* nm = ks == 0 ? "mull" : (ks == 1 ? "mlal" : "mlsl");
const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
return True;
}
+ if (bitU == 0
+ && (opcode == BITS4(1,0,1,1)
+ || opcode == BITS4(0,0,1,1) || opcode == BITS4(0,1,1,1))) {
+ /* -------- 0,xx,1011 SQDMULL s/h variants only -------- */ // 0 (ks)
+ /* -------- 0,xx,0011 SQDMLAL s/h variants only -------- */ // 1
+ /* -------- 0,xx,0111 SQDMLSL s/h variants only -------- */ // 2
+ /* Widens, and size refers to the narrowed lanes. */
+ UInt ks = 3;
+ switch (opcode) {
+ case BITS4(1,0,1,1): ks = 0; break;
+ case BITS4(0,0,1,1): ks = 1; break;
+ case BITS4(0,1,1,1): ks = 2; break;
+ default: vassert(0);
+ }
+ vassert(ks >= 0 && ks <= 2);
+ Bool is2 = bitQ == 1;
+ UInt mm = 32; // invalid
+ UInt ix = 16; // invalid
+ switch (size) {
+ case X00:
+ return False; // h_b_b[] case is not allowed
+ case X01:
+ mm = mmLO4; ix = (bitH << 2) | (bitL << 1) | (bitM << 0); break;
+ case X10:
+ mm = (bitM << 4) | mmLO4; ix = (bitH << 1) | (bitL << 0); break;
+ case X11:
+ return False; // q_d_d[] case is not allowed
+ default:
+ vassert(0);
+ }
+ vassert(mm < 32 && ix < 16);
+ IRTemp vecN, vecD, res, sat1q, sat1n, sat2q, sat2n;
+ vecN = vecD = res = sat1q = sat1n = sat2q = sat2n = IRTemp_INVALID;
+ newTempsV128_2(&vecN, &vecD);
+ assign(vecN, getQReg128(nn));
+ IRTemp vecM = math_DUP_VEC_ELEM(getQReg128(mm), size, ix);
+ assign(vecD, getQReg128(dd));
+ math_SQDMULL_ACC(&res, &sat1q, &sat1n, &sat2q, &sat2n,
+ is2, size, "mas"[ks],
+ vecN, vecM, ks == 0 ? IRTemp_INVALID : vecD);
+ putQReg128(dd, mkexpr(res));
+ vassert(sat1q != IRTemp_INVALID && sat1n != IRTemp_INVALID);
+ updateQCFLAGwithDifference(sat1q, sat1n);
+ if (sat2q != IRTemp_INVALID || sat2n != IRTemp_INVALID) {
+ updateQCFLAGwithDifference(sat2q, sat2n);
+ }
+ const HChar* nm = ks == 0 ? "sqmull"
+ : (ks == 1 ? "sqdmlal" : "sqdmlsl");
+ const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
+ const HChar* arrWide = nameArr_Q_SZ(1, size+1);
+ HChar ch = size == X01 ? 'h' : 's';
+ DIP("%s%s %s.%s, %s.%s, %s.%c[%u]\n",
+ nm, is2 ? "2" : "",
+ nameQReg128(dd), arrWide,
+ nameQReg128(nn), arrNarrow, nameQReg128(mm), ch, ix);
+ return True;
+ }
+
return False;
# undef INSN
}
static void showARM64VecBinOp(/*OUT*/const HChar** nm,
/*OUT*/const HChar** ar, ARM64VecBinOp op ) {
switch (op) {
- case ARM64vecb_ADD64x2: *nm = "add "; *ar = "2d"; return;
- case ARM64vecb_ADD32x4: *nm = "add "; *ar = "4s"; return;
- case ARM64vecb_ADD16x8: *nm = "add "; *ar = "8h"; return;
- case ARM64vecb_ADD8x16: *nm = "add "; *ar = "16b"; return;
- case ARM64vecb_SUB64x2: *nm = "sub "; *ar = "2d"; return;
- case ARM64vecb_SUB32x4: *nm = "sub "; *ar = "4s"; return;
- case ARM64vecb_SUB16x8: *nm = "sub "; *ar = "8h"; return;
- case ARM64vecb_SUB8x16: *nm = "sub "; *ar = "16b"; return;
- case ARM64vecb_MUL32x4: *nm = "mul "; *ar = "4s"; return;
- case ARM64vecb_MUL16x8: *nm = "mul "; *ar = "8h"; return;
- case ARM64vecb_MUL8x16: *nm = "mul "; *ar = "16b"; return;
- case ARM64vecb_FADD64x2: *nm = "fadd "; *ar = "2d"; return;
- case ARM64vecb_FSUB64x2: *nm = "fsub "; *ar = "2d"; return;
- case ARM64vecb_FMUL64x2: *nm = "fmul "; *ar = "2d"; return;
- case ARM64vecb_FDIV64x2: *nm = "fdiv "; *ar = "2d"; return;
- case ARM64vecb_FADD32x4: *nm = "fadd "; *ar = "4s"; return;
- case ARM64vecb_FSUB32x4: *nm = "fsub "; *ar = "4s"; return;
- case ARM64vecb_FMUL32x4: *nm = "fmul "; *ar = "4s"; return;
- case ARM64vecb_FDIV32x4: *nm = "fdiv "; *ar = "4s"; return;
- case ARM64vecb_UMAX32x4: *nm = "umax "; *ar = "4s"; return;
- case ARM64vecb_UMAX16x8: *nm = "umax "; *ar = "8h"; return;
- case ARM64vecb_UMAX8x16: *nm = "umax "; *ar = "16b"; return;
- case ARM64vecb_UMIN32x4: *nm = "umin "; *ar = "4s"; return;
- case ARM64vecb_UMIN16x8: *nm = "umin "; *ar = "8h"; return;
- case ARM64vecb_UMIN8x16: *nm = "umin "; *ar = "16b"; return;
- case ARM64vecb_SMAX32x4: *nm = "smax "; *ar = "4s"; return;
- case ARM64vecb_SMAX16x8: *nm = "smax "; *ar = "8h"; return;
- case ARM64vecb_SMAX8x16: *nm = "smax "; *ar = "16b"; return;
- case ARM64vecb_SMIN32x4: *nm = "smin "; *ar = "4s"; return;
- case ARM64vecb_SMIN16x8: *nm = "smin "; *ar = "8h"; return;
- case ARM64vecb_SMIN8x16: *nm = "smin "; *ar = "16b"; return;
- case ARM64vecb_AND: *nm = "and "; *ar = "16b"; return;
- case ARM64vecb_ORR: *nm = "orr "; *ar = "16b"; return;
- case ARM64vecb_XOR: *nm = "eor "; *ar = "16b"; return;
- case ARM64vecb_CMEQ64x2: *nm = "cmeq "; *ar = "2d"; return;
- case ARM64vecb_CMEQ32x4: *nm = "cmeq "; *ar = "4s"; return;
- case ARM64vecb_CMEQ16x8: *nm = "cmeq "; *ar = "8h"; return;
- case ARM64vecb_CMEQ8x16: *nm = "cmeq "; *ar = "16b"; return;
- case ARM64vecb_CMHI64x2: *nm = "cmhi "; *ar = "2d"; return;
- case ARM64vecb_CMHI32x4: *nm = "cmhi "; *ar = "4s"; return;
- case ARM64vecb_CMHI16x8: *nm = "cmhi "; *ar = "8h"; return;
- case ARM64vecb_CMHI8x16: *nm = "cmhi "; *ar = "16b"; return;
- case ARM64vecb_CMGT64x2: *nm = "cmgt "; *ar = "2d"; return;
- case ARM64vecb_CMGT32x4: *nm = "cmgt "; *ar = "4s"; return;
- case ARM64vecb_CMGT16x8: *nm = "cmgt "; *ar = "8h"; return;
- case ARM64vecb_CMGT8x16: *nm = "cmgt "; *ar = "16b"; return;
- case ARM64vecb_FCMEQ64x2: *nm = "fcmeq"; *ar = "2d"; return;
- case ARM64vecb_FCMEQ32x4: *nm = "fcmeq"; *ar = "4s"; return;
- case ARM64vecb_FCMGE64x2: *nm = "fcmge"; *ar = "2d"; return;
- case ARM64vecb_FCMGE32x4: *nm = "fcmge"; *ar = "4s"; return;
- case ARM64vecb_FCMGT64x2: *nm = "fcmgt"; *ar = "2d"; return;
- case ARM64vecb_FCMGT32x4: *nm = "fcmgt"; *ar = "4s"; return;
- case ARM64vecb_TBL1: *nm = "tbl "; *ar = "16b"; return;
- case ARM64vecb_UZP164x2: *nm = "uzp1 "; *ar = "2d"; return;
- case ARM64vecb_UZP132x4: *nm = "uzp1 "; *ar = "4s"; return;
- case ARM64vecb_UZP116x8: *nm = "uzp1 "; *ar = "8h"; return;
- case ARM64vecb_UZP18x16: *nm = "uzp1 "; *ar = "16b"; return;
- case ARM64vecb_UZP264x2: *nm = "uzp2 "; *ar = "2d"; return;
- case ARM64vecb_UZP232x4: *nm = "uzp2 "; *ar = "4s"; return;
- case ARM64vecb_UZP216x8: *nm = "uzp2 "; *ar = "8h"; return;
- case ARM64vecb_UZP28x16: *nm = "uzp2 "; *ar = "16b"; return;
- case ARM64vecb_ZIP132x4: *nm = "zip1 "; *ar = "4s"; return;
- case ARM64vecb_ZIP116x8: *nm = "zip1 "; *ar = "8h"; return;
- case ARM64vecb_ZIP18x16: *nm = "zip1 "; *ar = "16b"; return;
- case ARM64vecb_ZIP232x4: *nm = "zip2 "; *ar = "4s"; return;
- case ARM64vecb_ZIP216x8: *nm = "zip2 "; *ar = "8h"; return;
- case ARM64vecb_ZIP28x16: *nm = "zip2 "; *ar = "16b"; return;
- case ARM64vecb_PMUL8x16: *nm = "pmul "; *ar = "16b"; return;
- case ARM64vecb_PMULL8x8: *nm = "pmull"; *ar = "8hbb"; return;
- case ARM64vecb_UMULL2DSS: *nm = "umull"; *ar = "2dss"; return;
- case ARM64vecb_UMULL4SHH: *nm = "umull"; *ar = "4shh"; return;
- case ARM64vecb_UMULL8HBB: *nm = "umull"; *ar = "8hbb"; return;
- case ARM64vecb_SMULL2DSS: *nm = "smull"; *ar = "2dss"; return;
- case ARM64vecb_SMULL4SHH: *nm = "smull"; *ar = "4shh"; return;
- case ARM64vecb_SMULL8HBB: *nm = "smull"; *ar = "8hbb"; return;
- case ARM64vecb_SQADD64x2: *nm = "sqadd"; *ar = "2d"; return;
- case ARM64vecb_SQADD32x4: *nm = "sqadd"; *ar = "4s"; return;
- case ARM64vecb_SQADD16x8: *nm = "sqadd"; *ar = "8h"; return;
- case ARM64vecb_SQADD8x16: *nm = "sqadd"; *ar = "16b"; return;
- case ARM64vecb_UQADD64x2: *nm = "uqadd"; *ar = "2d"; return;
- case ARM64vecb_UQADD32x4: *nm = "uqadd"; *ar = "4s"; return;
- case ARM64vecb_UQADD16x8: *nm = "uqadd"; *ar = "8h"; return;
- case ARM64vecb_UQADD8x16: *nm = "uqadd"; *ar = "16b"; return;
- case ARM64vecb_SQSUB64x2: *nm = "sqsub"; *ar = "2d"; return;
- case ARM64vecb_SQSUB32x4: *nm = "sqsub"; *ar = "4s"; return;
- case ARM64vecb_SQSUB16x8: *nm = "sqsub"; *ar = "8h"; return;
- case ARM64vecb_SQSUB8x16: *nm = "sqsub"; *ar = "16b"; return;
- case ARM64vecb_UQSUB64x2: *nm = "uqsub"; *ar = "2d"; return;
- case ARM64vecb_UQSUB32x4: *nm = "uqsub"; *ar = "4s"; return;
- case ARM64vecb_UQSUB16x8: *nm = "uqsub"; *ar = "8h"; return;
- case ARM64vecb_UQSUB8x16: *nm = "uqsub"; *ar = "16b"; return;
+ case ARM64vecb_ADD64x2: *nm = "add "; *ar = "2d"; return;
+ case ARM64vecb_ADD32x4: *nm = "add "; *ar = "4s"; return;
+ case ARM64vecb_ADD16x8: *nm = "add "; *ar = "8h"; return;
+ case ARM64vecb_ADD8x16: *nm = "add "; *ar = "16b"; return;
+ case ARM64vecb_SUB64x2: *nm = "sub "; *ar = "2d"; return;
+ case ARM64vecb_SUB32x4: *nm = "sub "; *ar = "4s"; return;
+ case ARM64vecb_SUB16x8: *nm = "sub "; *ar = "8h"; return;
+ case ARM64vecb_SUB8x16: *nm = "sub "; *ar = "16b"; return;
+ case ARM64vecb_MUL32x4: *nm = "mul "; *ar = "4s"; return;
+ case ARM64vecb_MUL16x8: *nm = "mul "; *ar = "8h"; return;
+ case ARM64vecb_MUL8x16: *nm = "mul "; *ar = "16b"; return;
+ case ARM64vecb_FADD64x2: *nm = "fadd "; *ar = "2d"; return;
+ case ARM64vecb_FSUB64x2: *nm = "fsub "; *ar = "2d"; return;
+ case ARM64vecb_FMUL64x2: *nm = "fmul "; *ar = "2d"; return;
+ case ARM64vecb_FDIV64x2: *nm = "fdiv "; *ar = "2d"; return;
+ case ARM64vecb_FADD32x4: *nm = "fadd "; *ar = "4s"; return;
+ case ARM64vecb_FSUB32x4: *nm = "fsub "; *ar = "4s"; return;
+ case ARM64vecb_FMUL32x4: *nm = "fmul "; *ar = "4s"; return;
+ case ARM64vecb_FDIV32x4: *nm = "fdiv "; *ar = "4s"; return;
+ case ARM64vecb_UMAX32x4: *nm = "umax "; *ar = "4s"; return;
+ case ARM64vecb_UMAX16x8: *nm = "umax "; *ar = "8h"; return;
+ case ARM64vecb_UMAX8x16: *nm = "umax "; *ar = "16b"; return;
+ case ARM64vecb_UMIN32x4: *nm = "umin "; *ar = "4s"; return;
+ case ARM64vecb_UMIN16x8: *nm = "umin "; *ar = "8h"; return;
+ case ARM64vecb_UMIN8x16: *nm = "umin "; *ar = "16b"; return;
+ case ARM64vecb_SMAX32x4: *nm = "smax "; *ar = "4s"; return;
+ case ARM64vecb_SMAX16x8: *nm = "smax "; *ar = "8h"; return;
+ case ARM64vecb_SMAX8x16: *nm = "smax "; *ar = "16b"; return;
+ case ARM64vecb_SMIN32x4: *nm = "smin "; *ar = "4s"; return;
+ case ARM64vecb_SMIN16x8: *nm = "smin "; *ar = "8h"; return;
+ case ARM64vecb_SMIN8x16: *nm = "smin "; *ar = "16b"; return;
+ case ARM64vecb_AND: *nm = "and "; *ar = "16b"; return;
+ case ARM64vecb_ORR: *nm = "orr "; *ar = "16b"; return;
+ case ARM64vecb_XOR: *nm = "eor "; *ar = "16b"; return;
+ case ARM64vecb_CMEQ64x2: *nm = "cmeq "; *ar = "2d"; return;
+ case ARM64vecb_CMEQ32x4: *nm = "cmeq "; *ar = "4s"; return;
+ case ARM64vecb_CMEQ16x8: *nm = "cmeq "; *ar = "8h"; return;
+ case ARM64vecb_CMEQ8x16: *nm = "cmeq "; *ar = "16b"; return;
+ case ARM64vecb_CMHI64x2: *nm = "cmhi "; *ar = "2d"; return;
+ case ARM64vecb_CMHI32x4: *nm = "cmhi "; *ar = "4s"; return;
+ case ARM64vecb_CMHI16x8: *nm = "cmhi "; *ar = "8h"; return;
+ case ARM64vecb_CMHI8x16: *nm = "cmhi "; *ar = "16b"; return;
+ case ARM64vecb_CMGT64x2: *nm = "cmgt "; *ar = "2d"; return;
+ case ARM64vecb_CMGT32x4: *nm = "cmgt "; *ar = "4s"; return;
+ case ARM64vecb_CMGT16x8: *nm = "cmgt "; *ar = "8h"; return;
+ case ARM64vecb_CMGT8x16: *nm = "cmgt "; *ar = "16b"; return;
+ case ARM64vecb_FCMEQ64x2: *nm = "fcmeq"; *ar = "2d"; return;
+ case ARM64vecb_FCMEQ32x4: *nm = "fcmeq"; *ar = "4s"; return;
+ case ARM64vecb_FCMGE64x2: *nm = "fcmge"; *ar = "2d"; return;
+ case ARM64vecb_FCMGE32x4: *nm = "fcmge"; *ar = "4s"; return;
+ case ARM64vecb_FCMGT64x2: *nm = "fcmgt"; *ar = "2d"; return;
+ case ARM64vecb_FCMGT32x4: *nm = "fcmgt"; *ar = "4s"; return;
+ case ARM64vecb_TBL1: *nm = "tbl "; *ar = "16b"; return;
+ case ARM64vecb_UZP164x2: *nm = "uzp1 "; *ar = "2d"; return;
+ case ARM64vecb_UZP132x4: *nm = "uzp1 "; *ar = "4s"; return;
+ case ARM64vecb_UZP116x8: *nm = "uzp1 "; *ar = "8h"; return;
+ case ARM64vecb_UZP18x16: *nm = "uzp1 "; *ar = "16b"; return;
+ case ARM64vecb_UZP264x2: *nm = "uzp2 "; *ar = "2d"; return;
+ case ARM64vecb_UZP232x4: *nm = "uzp2 "; *ar = "4s"; return;
+ case ARM64vecb_UZP216x8: *nm = "uzp2 "; *ar = "8h"; return;
+ case ARM64vecb_UZP28x16: *nm = "uzp2 "; *ar = "16b"; return;
+ case ARM64vecb_ZIP132x4: *nm = "zip1 "; *ar = "4s"; return;
+ case ARM64vecb_ZIP116x8: *nm = "zip1 "; *ar = "8h"; return;
+ case ARM64vecb_ZIP18x16: *nm = "zip1 "; *ar = "16b"; return;
+ case ARM64vecb_ZIP232x4: *nm = "zip2 "; *ar = "4s"; return;
+ case ARM64vecb_ZIP216x8: *nm = "zip2 "; *ar = "8h"; return;
+ case ARM64vecb_ZIP28x16: *nm = "zip2 "; *ar = "16b"; return;
+ case ARM64vecb_PMUL8x16: *nm = "pmul "; *ar = "16b"; return;
+ case ARM64vecb_PMULL8x8: *nm = "pmull"; *ar = "8hbb"; return;
+ case ARM64vecb_UMULL2DSS: *nm = "umull"; *ar = "2dss"; return;
+ case ARM64vecb_UMULL4SHH: *nm = "umull"; *ar = "4shh"; return;
+ case ARM64vecb_UMULL8HBB: *nm = "umull"; *ar = "8hbb"; return;
+ case ARM64vecb_SMULL2DSS: *nm = "smull"; *ar = "2dss"; return;
+ case ARM64vecb_SMULL4SHH: *nm = "smull"; *ar = "4shh"; return;
+ case ARM64vecb_SMULL8HBB: *nm = "smull"; *ar = "8hbb"; return;
+ case ARM64vecb_SQADD64x2: *nm = "sqadd"; *ar = "2d"; return;
+ case ARM64vecb_SQADD32x4: *nm = "sqadd"; *ar = "4s"; return;
+ case ARM64vecb_SQADD16x8: *nm = "sqadd"; *ar = "8h"; return;
+ case ARM64vecb_SQADD8x16: *nm = "sqadd"; *ar = "16b"; return;
+ case ARM64vecb_UQADD64x2: *nm = "uqadd"; *ar = "2d"; return;
+ case ARM64vecb_UQADD32x4: *nm = "uqadd"; *ar = "4s"; return;
+ case ARM64vecb_UQADD16x8: *nm = "uqadd"; *ar = "8h"; return;
+ case ARM64vecb_UQADD8x16: *nm = "uqadd"; *ar = "16b"; return;
+ case ARM64vecb_SQSUB64x2: *nm = "sqsub"; *ar = "2d"; return;
+ case ARM64vecb_SQSUB32x4: *nm = "sqsub"; *ar = "4s"; return;
+ case ARM64vecb_SQSUB16x8: *nm = "sqsub"; *ar = "8h"; return;
+ case ARM64vecb_SQSUB8x16: *nm = "sqsub"; *ar = "16b"; return;
+ case ARM64vecb_UQSUB64x2: *nm = "uqsub"; *ar = "2d"; return;
+ case ARM64vecb_UQSUB32x4: *nm = "uqsub"; *ar = "4s"; return;
+ case ARM64vecb_UQSUB16x8: *nm = "uqsub"; *ar = "8h"; return;
+ case ARM64vecb_UQSUB8x16: *nm = "uqsub"; *ar = "16b"; return;
+ case ARM64vecb_SQDMULL2DSS: *nm = "sqdmull"; *ar = "2dss"; return;
+ case ARM64vecb_SQDMULL4SHH: *nm = "sqdmull"; *ar = "4shh"; return;
default: vpanic("showARM64VecBinOp");
}
}
#define X101110 BITS8(0,0, 1,0,1,1,1,0)
#define X110000 BITS8(0,0, 1,1,0,0,0,0)
#define X110001 BITS8(0,0, 1,1,0,0,0,1)
+#define X110100 BITS8(0,0, 1,1,0,1,0,0)
#define X110101 BITS8(0,0, 1,1,0,1,0,1)
#define X110111 BITS8(0,0, 1,1,0,1,1,1)
#define X111000 BITS8(0,0, 1,1,1,0,0,0)
011 01110 10 1 m 001011 n d UQSUB Vd.4s, Vn.4s, Vm.4s
011 01110 01 1 m 001011 n d UQSUB Vd.8h, Vn.8h, Vm.8h
011 01110 00 1 m 001011 n d UQSUB Vd.16b, Vn.16b, Vm.16b
+
+ 000 01110 10 1 m 110100 n d SQDMULL Vd.2d, Vn.2s, Vm.2s
+ 000 01110 01 1 m 110100 n d SQDMULL Vd.4s, Vn.4h, Vm.4h
*/
UInt vD = qregNo(i->ARM64in.VBinV.dst);
UInt vN = qregNo(i->ARM64in.VBinV.argL);
*p++ = X_3_8_5_6_5_5(X011, X01110001, vM, X001011, vN, vD);
break;
+ case ARM64vecb_SQDMULL2DSS:
+ *p++ = X_3_8_5_6_5_5(X000, X01110101, vM, X110100, vN, vD);
+ break;
+ case ARM64vecb_SQDMULL4SHH:
+ *p++ = X_3_8_5_6_5_5(X000, X01110011, vM, X110100, vN, vD);
+ break;
+
default:
goto bad;
}