From: Andreas Arnez
Date: Mon, 24 Sep 2018 16:56:07 +0000 (+0200)
Subject: s390x: Vector integer and string instruction support
X-Git-Tag: VALGRIND_3_14_0~9
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=1cc1d564f4e9b33daa381c598dfc464c83080c15;p=thirdparty%2Fvalgrind.git

s390x: Vector integer and string instruction support

This adds z/Architecture vector integer and string instruction support.
The main author of this patch is Vadim Barkov.  Some fixes were provided
by Andreas Arnez.
---

diff --git a/NEWS b/NEWS
index 384cc9d357..def0b4d29f 100644
--- a/NEWS
+++ b/NEWS
@@ -121,6 +121,8 @@ where XXXXXX is the bug number as listed below.
 == 387045  Valgrind crashing on High Sierra when testing any newly [..]
 385334  PPC64, fix vpermr, xxperm, xxpermr mask value.
 385408  s390x: z13 vector "support" instructions not implemented
+385409  s390x: z13 vector integer instructions not implemented
+385410  s390x: z13 vector string instructions not implemented
 385412  s390x: new non-vector z13 instructions not implemented
 385868  glibc ld.so _dl_runtime_resolve_avx_slow conditional jump warning.
 385912  none/tests/rlimit_nofile fails on newer glibc/kernel.

diff --git a/VEX/priv/guest_s390_defs.h b/VEX/priv/guest_s390_defs.h
index 4f9e962d3a..3bfecbe316 100644
--- a/VEX/priv/guest_s390_defs.h
+++ b/VEX/priv/guest_s390_defs.h
@@ -80,8 +80,8 @@ ULong s390x_dirtyhelper_STCKF(ULong *addr);
 ULong s390x_dirtyhelper_STCKE(ULong *addr);
 ULong s390x_dirtyhelper_STFLE(VexGuestS390XState *guest_state, ULong *addr);
 void  s390x_dirtyhelper_CUxy(UChar *addr, ULong data, ULong num_bytes);
-ULong s390x_dirtyhelper_vec_binop(VexGuestS390XState *guest_state, ULong opcode,
-                                  ULong v1, ULong v2);
+ULong s390x_dirtyhelper_vec_op(VexGuestS390XState *guest_state,
+                               ULong details);
 ULong s390_do_cu12_cu14_helper1(UInt byte1, UInt etf3_and_m3_is_1);
 ULong s390_do_cu12_helper2(UInt byte1, UInt byte2, UInt byte3, UInt byte4,
                            ULong stuff);
@@ -261,25 +261,52 @@ extern ULong last_execute_target;
 /*--- Vector helpers.                                      ---*/
 /*------------------------------------------------------------*/
 
-/* Vector operatons which can change condition code */
+/* Vector operations passed to the s390x_dirtyhelper_vec_op(...) helper.
+   Please don't change the ordering of elements; append new items
+   before S390_VEC_OP_LAST. */
 enum {
-   S390_CC_VEC_INVALID = 0,
-   S390_CC_VEC_VPKS = 1,
-   S390_CC_VEC_VPKLS = 2,
-   S390_CC_VEC_LAST = 3 // supposed to be the last element in enum
-} s390x_cc_vec_binop;
-
-/* Create an "object" which contain information about vector operation
-   and it's element size. Used for passing data to dirtyhelper with one argument.
-*/
-#define s390x_cc_vec_opcode(op, elem_size) ( ((op) << 3) | ((elem_size) & 0x07))
-
-/* Extract operation from opcode created with "s390x_cc_vec_opcode" macro */
-#define s390x_cc_vec_get_op(opcode) ((opcode) >> 3)
-
-/* Extract operation from opcode created with "s390x_cc_vec_opcode" macro */
-#define s390x_cc_vec_get_elem_size(opcode) ((opcode) & 0x07)
-
+   S390_VEC_OP_INVALID = 0,
+   S390_VEC_OP_VPKS = 1,
+   S390_VEC_OP_VPKLS = 2,
+   S390_VEC_OP_VFAE = 3,
+   S390_VEC_OP_VFEE = 4,
+   S390_VEC_OP_VFENE = 5,
+   S390_VEC_OP_VISTR = 6,
+   S390_VEC_OP_VSTRC = 7,
+   S390_VEC_OP_VCEQ = 8,
+   S390_VEC_OP_VTM = 9,
+   S390_VEC_OP_VGFM = 10,
+   S390_VEC_OP_VGFMA = 11,
+   S390_VEC_OP_VMAH = 12,
+   S390_VEC_OP_VMALH = 13,
+   S390_VEC_OP_VCH = 14,
+   S390_VEC_OP_VCHL = 15,
+   S390_VEC_OP_LAST = 16 // must remain the last element in the enum
+} s390x_vec_op_t;
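[A minimal sketch, not part of the patch: s390x_dirtyhelper_vec_op() in
guest_s390_helpers.c indexes a per-operation opcode table by these enum
values, which is why the ordering must stay stable.  With the STATIC_ASSERT
macro already used in this header, such a table could guard itself at compile
time; 'opcodes' here is a hypothetical stand-in for the helper's real table:

   static const UChar opcodes[S390_VEC_OP_LAST][2] = {
      {0x00, 0x00},   /* S390_VEC_OP_INVALID */
      {0xe7, 0x97},   /* S390_VEC_OP_VPKS    */
      /* ... one {op1, op2} pair per operation, in enum order ... */
   };
   STATIC_ASSERT(sizeof(opcodes) / sizeof(opcodes[0]) == S390_VEC_OP_LAST);
]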
+
+/* Arguments of s390x_dirtyhelper_vec_op(...) which are packed into one
+   ULong variable.
+ */
+typedef union {
+   struct {
+      unsigned int op : 8;        // should be an element of s390x_vec_op_t
+      unsigned int v1 : 5;        // result of operation
+      unsigned int v2 : 5;        // argument one of operation
+      unsigned int v3 : 5;        // argument two of operation or
+                                  // zero for unary operations
+      unsigned int v4 : 5;        // argument three of operation or
+                                  // zero for unary and binary operations
+      unsigned int m4 : 4;        // field m4 of insn or zero if it's missing
+      unsigned int m5 : 4;        // field m5 of insn or zero if it's missing
+      unsigned int read_only: 1;  // don't write result to guest state
+      unsigned int reserved : 27; // reserved for future use
+   };
+   ULong serialized;
+} s390x_vec_op_details_t;
+
+STATIC_ASSERT(sizeof(s390x_vec_op_details_t) == sizeof(ULong));
 
 /* Macro definitions for opcodes that are not generally available.
@@ -293,6 +320,7 @@ enum {
    ".short 0x" #op1 #v1 #v2 "\n\t .int 0x" #v3 "0" #m5 "0" #m4 #rxb #op2 "\n\t"
 
 #define VL(v1, x2, b2, d2, rxb) VRX_VXBD(e7, v1, x2, b2, d2, rxb, 06)
+#define VST(v1, x2, b2, d2, rxb) VRX_VXBD(e7, v1, x2, b2, d2, rxb, 0e)
 #define VPKS(v1, v2, v3, m4, m5, rxb) VRR_VVVMM(e7, v1, v2, v3, m5, m4, rxb, 97)
 #define VPKLS(v1, v2, v3, m4, m5, rxb) VRR_VVVMM(e7, v1, v2, v3, m5, m4, rxb, 95)

diff --git a/VEX/priv/guest_s390_helpers.c b/VEX/priv/guest_s390_helpers.c
index aeda677044..3aec1f8949 100644
--- a/VEX/priv/guest_s390_helpers.c
+++ b/VEX/priv/guest_s390_helpers.c
@@ -1210,23 +1210,6 @@ decode_bfp_rounding_mode(UInt irrm)
       psw >> 28;   /* cc */ \
    })
 
-/* This macro believes that arguments' addresses are in GPR1 and GPR2.
-   We use %%v16, %%v17 and %%v18 to avoid side effects in FPRs.
-*/
-#define S390_CC_FOR_V128_BINOP(insn) \
-({ \
-   /* VL(v1, x2, b2, d2, rxb) */ \
-   __asm__ volatile ( \
-      VL(1, 0, 1, 000, 8) \
-      VL(2, 0, 2, 000, 8) \
-      insn \
-      "ipm %[psw]\n\t" \
-      : [psw] "=d"(psw) \
-      : "d"(arg1), "d"(arg2) \
-      : "cc", "v16", "v17", "v18"); \
-   psw >> 28;   /* cc */ \
-})
-
 /* Convert an IRRoundingMode value to s390_dfp_round_t */
 #if defined(VGA_s390x)
 static s390_dfp_round_t
@@ -2488,48 +2471,156 @@ missed:
 
 #if defined(VGA_s390x)
 ULong
-s390x_dirtyhelper_vec_binop(VexGuestS390XState *guest_state, ULong opcode,
-                            ULong v1, ULong v2)
+s390x_dirtyhelper_vec_op(VexGuestS390XState *guest_state,
+                         const ULong serialized)
 {
    UInt psw;
-   UInt elem_size = s390x_cc_vec_get_elem_size(opcode);
-   UInt op = s390x_cc_vec_get_op(opcode);
-   /* S390_CC_FOR_V128_BINOP relies on exatly this GPRs numbers and names.
*/ - register ULong arg1 asm("1") = (ULong) &((&guest_state->guest_v0)[v1]); - register ULong arg2 asm("2") = (ULong) &((&guest_state->guest_v0)[v2]); - - switch(op) { - case S390_CC_VEC_VPKS: - /* VPKS(v1, v2, v3, m4, m5, rxb) */ - switch(elem_size) { - case 1: return S390_CC_FOR_V128_BINOP(VPKS(3, 1, 2, 1, 1, e)); - case 2: return S390_CC_FOR_V128_BINOP(VPKS(3, 1, 2, 2, 1, e)); - case 3: return S390_CC_FOR_V128_BINOP(VPKS(3, 1, 2, 3, 1, e)); - default: vassert(0); - } + s390x_vec_op_details_t details; + const s390x_vec_op_details_t* d = (const s390x_vec_op_details_t*) &details; + + details.serialized = serialized; + + vassert(d->op > S390_VEC_OP_INVALID && d->op < S390_VEC_OP_LAST); + static const UChar opcodes[][2] = { + {0x00, 0x00}, /* invalid */ + {0xe7, 0x97}, /* VPKS */ + {0xe7, 0x95}, /* VPKLS */ + {0xe7, 0x82}, /* VFAE */ + {0xe7, 0x80}, /* VFEE */ + {0xe7, 0x81}, /* VFENE */ + {0xe7, 0x5c}, /* VISTR */ + {0xe7, 0x8a}, /* VSTRC */ + {0xe7, 0xf8}, /* VCEQ */ + {0xe7, 0xd8}, /* VTM */ + {0xe7, 0xb4}, /* VGFM */ + {0xe7, 0xbc}, /* VGFMA */ + {0xe7, 0xab}, /* VMAH */ + {0xe7, 0xa9}, /* VMALH */ + {0xe7, 0xfb}, /* VCH */ + {0xe7, 0xf9}, /* VCHL */ + }; + + union { + struct { + unsigned int op1 : 8; + unsigned int v1 : 4; + unsigned int v2 : 4; + unsigned int v3 : 4; + unsigned int : 4; + unsigned int m5 : 4; + unsigned int : 4; + unsigned int m4 : 4; + unsigned int rxb : 4; + unsigned int op2 : 8; + } VRR; + struct { + unsigned int op1 : 8; + unsigned int v1 : 4; + unsigned int v2 : 4; + unsigned int v3 : 4; + unsigned int m5 : 4; + unsigned int m6 : 4; + unsigned int : 4; + unsigned int v4 : 4; + unsigned int rxb : 4; + unsigned int op2 : 8; + } VRRd; + UChar bytes[6]; + } the_insn; + + the_insn.VRR.op1 = opcodes[d->op][0]; + the_insn.bytes[1] = the_insn.bytes[2] + = the_insn.bytes[3] = the_insn.bytes[4] = 0; + the_insn.VRR.op2 = opcodes[d->op][1]; + + switch(d->op) { + case S390_VEC_OP_VISTR: + the_insn.VRR.v1 = 1; + the_insn.VRR.v2 = 2; + the_insn.VRR.rxb = 0b1100; + the_insn.VRR.m4 = d->m4; + the_insn.VRR.m5 = d->m5; + break; - case S390_CC_VEC_VPKLS: - /* VPKLS(v1, v2, v3, m4, m5, rxb) */ - switch(elem_size) { - case 1: return S390_CC_FOR_V128_BINOP(VPKLS(3, 1, 2, 1, 1, e)); - case 2: return S390_CC_FOR_V128_BINOP(VPKLS(3, 1, 2, 2, 1, e)); - case 3: return S390_CC_FOR_V128_BINOP(VPKLS(3, 1, 2, 3, 1, e)); - default: vassert(0); - } + case S390_VEC_OP_VTM: + the_insn.VRR.v1 = 2; + the_insn.VRR.v2 = 3; + the_insn.VRR.rxb = 0b1100; + break; + + case S390_VEC_OP_VPKS: + case S390_VEC_OP_VPKLS: + case S390_VEC_OP_VFAE: + case S390_VEC_OP_VFEE: + case S390_VEC_OP_VFENE: + case S390_VEC_OP_VCEQ: + case S390_VEC_OP_VGFM: + case S390_VEC_OP_VCH: + case S390_VEC_OP_VCHL: + the_insn.VRR.v1 = 1; + the_insn.VRR.v2 = 2; + the_insn.VRR.v3 = 3; + the_insn.VRR.rxb = 0b1110; + the_insn.VRR.m4 = d->m4; + the_insn.VRR.m5 = d->m5; + break; + + case S390_VEC_OP_VSTRC: + case S390_VEC_OP_VGFMA: + case S390_VEC_OP_VMAH: + case S390_VEC_OP_VMALH: + the_insn.VRRd.v1 = 1; + the_insn.VRRd.v2 = 2; + the_insn.VRRd.v3 = 3; + the_insn.VRRd.v4 = 4; + the_insn.VRRd.rxb = 0b1111; + the_insn.VRRd.m5 = d->m4; + the_insn.VRRd.m6 = d->m5; + break; default: - vex_printf("operation = %d\n", op); - vpanic("s390x_dirtyhelper_vec_binop: unknown operation"); + vex_printf("operation = %d\n", d->op); + vpanic("s390x_dirtyhelper_vec_op: unknown operation"); } - return 0; + const V128* guest_v = &(guest_state->guest_v0); + __asm__ volatile ( + "lgr %%r10, %[arg1]\n" + VL(2, 0, a, 000, 8) + "lgr %%r10, %[arg2]\n" + VL(3, 0, a, 
000, 8)
+      "lgr %%r10, %[arg3]\n"
+      VL(4, 0, a, 000, 8)
+      "ex %[zero], %[insn]\n"
+
+      "cijne %[read_only], 0, return_cc\n"
+      "lgr %%r10, %[res]\n"
+      VST(1, 0, a, 000, 8)
+
+      "return_cc: "
+      "ipm %[psw]\n\t"
+      : [psw] "=d" (psw)
+
+      : [res]  "r" (&guest_v[d->v1]),
+        [arg1] "r" (&guest_v[d->v2]),
+        [arg2] "r" (&guest_v[d->v3]),
+        [arg3] "r" (&guest_v[d->v4]),
+
+        [zero] "r" (0ULL),
+        [insn] "m" (the_insn),
+        [read_only] "r" (d->read_only)
+
+      : "cc", "r10", "v16", "v17", "v18", "v19"
+   );
+
+   return psw >> 28;   /* cc */
 }
 
 #else
 
 ULong
-s390x_dirtyhelper_vec_binop(VexGuestS390XState *guest_state, ULong opcode,
-                            ULong v1, ULong v2)
+s390x_dirtyhelper_vec_op(VexGuestS390XState *guest_state,
+                         const ULong serialized)
 { return 0; }
 
 #endif

diff --git a/VEX/priv/guest_s390_toIR.c b/VEX/priv/guest_s390_toIR.c
index 8f3fb6d3d4..c594ad51bf 100644
--- a/VEX/priv/guest_s390_toIR.c
+++ b/VEX/priv/guest_s390_toIR.c
@@ -748,12 +748,18 @@ s390_cc_thunk_put1d128Z(UInt opc, IRTemp d1, IRTemp nd)
    s390_cc_thunk_fill(op, hi, lox, ndep);
 }
 
+static void
+s390_cc_set(IRTemp cc)
+{
+   vassert(typeOfIRTemp(irsb->tyenv, cc) == Ity_I64);
+
+   s390_cc_thunk_fill(mkU64(S390_CC_OP_SET), mkexpr(cc), mkU64(0), mkU64(0));
+}
 
 static void
-s390_cc_set(UInt val)
+s390_cc_set_val(UInt val)
 {
-   s390_cc_thunk_fill(mkU64(S390_CC_OP_SET),
-                      mkU64(val), mkU64(0), mkU64(0));
+   s390_cc_thunk_fill(mkU64(S390_CC_OP_SET), mkU64(val), mkU64(0), mkU64(0));
 }
 
 /* Build IR to calculate the condition code from flags thunk.
@@ -1536,7 +1542,7 @@ get_fpc_w0(void)
 /* Return the guest state offset of a vr register.
  */
 static UInt
-vr_offset(UInt archreg)
+vr_offset(const UInt archreg)
 {
    static const UInt offset[32] = {
       S390X_GUEST_OFFSET(guest_v0),
@@ -1580,14 +1586,14 @@ vr_offset(UInt archreg)
 /* Return the guest state offset of quadword of a vr register.
  */
 static UInt
-vr_qw_offset(UInt archreg)
+vr_qw_offset(const UInt archreg)
 {
    return vr_offset(archreg) + 0;
 }
 
 /* Write quadword of a vr to the guest state.
  */
 static void
-put_vr_qw(UInt archreg, IRExpr *expr)
+put_vr_qw(const UInt archreg, IRExpr *expr)
 {
    vassert(typeOfIRExpr(irsb->tyenv, expr) == Ity_V128);
 
@@ -1596,7 +1602,7 @@ put_vr_qw(UInt archreg, IRExpr *expr)
 /* Read quadword of a vr register.
  */
 static IRExpr *
-get_vr_qw(UInt archreg)
+get_vr_qw(const UInt archreg)
 {
    return IRExpr_Get(vr_qw_offset(archreg), Ity_V128);
 }
@@ -1661,6 +1667,13 @@ vr_w3_offset(UInt archreg)
    return vr_offset(archreg) + 12;
 }
 
+/* Read word #0 of a vr register. */
+static IRExpr *
+get_vr_w0(UInt archreg)
+{
+   return IRExpr_Get(vr_dw0_offset(archreg), Ity_I32);
+}
+
 /* Read word #1 of a vr register. */
 static IRExpr *
 get_vr_w1(UInt archreg)
@@ -1668,6 +1681,13 @@ get_vr_w1(UInt archreg)
    return IRExpr_Get(vr_w1_offset(archreg), Ity_I32);
 }
 
+/* Read word #2 of a vr register. */
+static IRExpr *
+get_vr_w2(UInt archreg)
+{
+   return IRExpr_Get(vr_dw1_offset(archreg), Ity_I32);
+}
+
 /* Read word #3 of a vr register. */
 static IRExpr *
 get_vr_w3(UInt archreg)
@@ -1744,6 +1764,223 @@ s390_vr_get_type(const UChar m)
    return results[m];
 }
 
+/* Determine if Condition Code Set (CS) flag is set in m field */
+#define s390_vr_is_cs_set(m) (((m) & 0x1) != 0)
+
+/* Determine if Zero Search (ZS) flag is set in m field */
+#define s390_vr_is_zs_set(m) (((m) & 0b0010) != 0)
+
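[A plain-C cross-check, not part of the patch, of the technique used by
s390_V128_compareLT128x1() below: a 128-bit unsigned compare composed from
two 64-bit halves.  The u128 struct here is hypothetical, not a VEX type:

   #include <stdbool.h>
   #include <stdint.h>

   typedef struct { uint64_t hi, lo; } u128;

   /* a < b (or a <= b if allow_equal), treating both as unsigned 128-bit. */
   static bool u128_lt(u128 a, u128 b, bool allow_equal)
   {
      if (a.hi == b.hi)      /* high halves equal: the low halves decide */
         return allow_equal ? a.lo <= b.lo : a.lo < b.lo;
      return a.hi < b.hi;    /* otherwise the high halves decide */
   }
]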
+*/ +static IRExpr* +s390_V128_compareLT128x1(IRExpr* arg1, IRExpr* arg2, Bool allow_equal) +{ + /* If high halves are equal + then we compare lower ones + otherwise we compare high halves. + */ + IRExpr* result; + result = mkite(binop(Iop_CmpEQ64, + unop(Iop_V128HIto64, arg1), + unop(Iop_V128HIto64, arg2) + ), + unop(Iop_1Uto64, + binop(allow_equal ? Iop_CmpLE64U : Iop_CmpLT64U, + unop(Iop_V128to64, arg1), + unop(Iop_V128to64, arg2) + ) + ), + unop(Iop_1Uto64, + binop(Iop_CmpLT64U, + unop(Iop_V128HIto64, arg1), + unop(Iop_V128HIto64, arg2) + ) + ) + ); + + return result; +} + +/* Generates arg1 == 0 expression. + Argument must have V128 type and is treated as unsigned 128-bit number. +*/ +static IRExpr* +s390_V128_isZero(IRExpr* arg) +{ + IRExpr* high_or_low = binop(Iop_Or64, + unop(Iop_V128to64, arg), + unop(Iop_V128HIto64, arg) + ); + + return unop(Iop_1Uto64, binop(Iop_CmpEQ64, high_or_low, mkU64(0ULL))); +} + +/* Generate the two's complement for arg. + Arg should be V128. +*/ +static IRExpr* +s390_V128_get_complement(IRExpr* arg, IRType type) +{ + IRExpr* notArg = unop(Iop_NotV128, arg); + IRExpr* ones; + IRExpr* result; + switch(type) { + case Ity_I8: + ones = unop(Iop_Dup8x16, mkU8(0x01)); + result = binop(Iop_Add8x16, notArg, ones); + break; + case Ity_I16: + ones = unop(Iop_Dup16x8, mkU16(0x0001)); + result = binop(Iop_Add16x8, notArg, ones); + break; + case Ity_I32: + ones = unop(Iop_Dup32x4, mkU32(0x00000001)); + result = binop(Iop_Add32x4, notArg, ones); + break; + case Ity_I64: + ones = binop(Iop_64HLtoV128, mkU64(0x1ULL), mkU64(0x1ULL)); + result = binop(Iop_Add64x2, notArg, ones); + break; + case Ity_V128: + ones = binop(Iop_64HLtoV128, mkU64(0x0ULL), mkU64(0x1ULL)); + result = binop(Iop_Add128x1, notArg, ones); + break; + default: + vpanic("s390_V128_get_complement: unknown type"); + } + + return result; +} + +/* # Elements are treated as 128-bit unsigned integers + For i = 0; i < elemCount; i++ do: + sum = arg1[i] + arg2[i] + result[i] = carry_out_bit(sum) + end + return result + */ +static IRExpr* +s390_V128_calculate_carry_out(IRExpr* arg1, IRExpr* arg2, IRType type, + Bool allow_equal) +{ + IRTemp sum = newTemp(Ity_V128); + IRExpr* mask; + IRExpr* comparison; + IRExpr* result; + switch(type){ + case Ity_I8: + assign(sum, binop(Iop_Add8x16, arg1, arg2)); + mask = unop(Iop_Dup8x16, mkU8(0x1)); + comparison = binop(Iop_CmpGT8Ux16, arg1, mkexpr(sum)); + if(allow_equal) { + comparison = binop(Iop_OrV128, binop(Iop_CmpEQ8x16, arg1, mkexpr(sum)), + comparison); + } + result = binop(Iop_AndV128, comparison, mask); + break; + case Ity_I16: + assign(sum, binop(Iop_Add16x8, arg1, arg2)); + mask = unop(Iop_Dup16x8, mkU16(0x1)); + comparison = binop(Iop_CmpGT16Ux8, arg1, mkexpr(sum)); + if(allow_equal) { + comparison = binop(Iop_OrV128, binop(Iop_CmpEQ16x8, arg1, mkexpr(sum)), + comparison); + } + result = binop(Iop_AndV128, comparison, mask); + break; + case Ity_I32: + assign(sum, binop(Iop_Add32x4, arg1, arg2)); + mask = unop(Iop_Dup32x4, mkU32(0x1)); + comparison = binop(Iop_CmpGT32Ux4, arg1, mkexpr(sum)); + if(allow_equal) { + comparison = binop(Iop_OrV128, binop(Iop_CmpEQ32x4, arg1, mkexpr(sum)), + comparison); + } + result = binop(Iop_AndV128, comparison, mask); + break; + case Ity_I64: + assign(sum, binop(Iop_Add64x2, arg1, arg2)); + mask = binop(Iop_64HLtoV128, mkU64(0x1), mkU64(0x1)); + comparison = binop(Iop_CmpGT64Ux2, arg1, mkexpr(sum)); + if(allow_equal) { + comparison = binop(Iop_OrV128, binop(Iop_CmpEQ64x2, arg1, mkexpr(sum)), + comparison); + } + result = 
+
+/* elemCount = 1 for now (elements are 128-bit unsigned integers)
+   For i = 0; i < elemCount; i++ do:
+      sum = arg1[i] + arg2[i] + (arg3[i] & 0x1)
+      result[i] = carry_out_bit(sum)
+   end
+   return result
+ */
+static IRExpr*
+s390_V128_calculate_carry_out_with_carry(IRExpr* arg1, IRExpr* arg2, IRExpr* arg3)
+{
+   IRTemp sum = newTemp(Ity_V128);
+   assign(sum, binop(Iop_Add128x1, arg1, arg2));
+
+   IRTemp overflow_before = newTemp(Ity_I64);
+   assign(overflow_before, s390_V128_compareLT128x1(mkexpr(sum), arg1, False));
+
+   IRExpr* mask = binop(Iop_64HLtoV128, mkU64(0), mkU64(1));
+   IRTemp carry_in = newTemp(Ity_V128);
+   assign(carry_in, binop(Iop_AndV128, arg3, mask));
+
+   IRExpr* carry_is_not_zero = unop(Iop_1Uto64,
+                                    binop(Iop_CmpNE64,
+                                          unop(Iop_V128to64, mkexpr(carry_in)),
+                                          mkU64(0ULL)
+                                          )
+                                    );
+
+   IRTemp sum_plus_carry = newTemp(Ity_V128);
+   assign(sum_plus_carry, binop(Iop_Add128x1, mkexpr(sum), mkexpr(carry_in)));
+
+   IRExpr* overflow_after = binop(Iop_And64,
+                                  carry_is_not_zero,
+                                  s390_V128_isZero(mkexpr(sum_plus_carry))
+                                  );
+
+   IRExpr* result = binop(Iop_Or64, mkexpr(overflow_before), overflow_after);
+   result = binop(Iop_64HLtoV128, mkU64(0ULL), result);
+   return result;
+}
+
+/* Performs "arg1 + arg2 + carry_out_bit(arg1 + arg2)".
+   Arguments and result are Ity_I32.
+*/
+static IRTemp
+s390_checksum_add(IRExpr* arg1, IRExpr* arg2)
+{
+   IRTemp sum = newTemp(Ity_I32);
+   IRTemp res = newTemp(Ity_I32);
+
+   assign(sum, binop(Iop_Add32, arg1, arg2));
+   assign(res,
+          mkite(binop(Iop_CmpLT32U, mkexpr(sum), arg1),
+                binop(Iop_Add32, mkexpr(sum), mkU32(1)),
+                mkexpr(sum))
+          );
+
+   return res;
+}
+
 /* Return the guest state offset of element with type's size and given index
    of a vr register.
 */
@@ -1816,7 +2053,7 @@ s390_vr_getVRindex(UChar v,UChar argNumber, UChar rxb)
 {
    vassert(argNumber > 0 && argNumber <= 4);
    vassert(rxb < 16);
-   return v | (((rxb) << (++argNumber)) & 0b00010000);
+   return v | (((rxb) << argNumber) & 0b00010000);
 }
 
 static void
@@ -1834,8 +2071,7 @@ s390_vr_fill(UChar v1, IRExpr *o2)
       put_vr_qw(v1, unop(Iop_Dup32x4, o2));
       break;
    case Ity_I64:
-      put_vr_dw0(v1, o2);
-      put_vr_dw1(v1, o2);
+      put_vr_qw(v1, binop(Iop_64HLtoV128, o2, o2));
       break;
    default:
       ppIRType(o2type);
@@ -1881,43 +2117,65 @@ s390_getCountToBlockBoundary(IRTemp op2addr, UChar m)
    return mkexpr(output);
 }
 
-/* Helper macro for s390_vr_loadWithLength */
-#define s390_vr_loadWithLength_process(elem) \
-      put_vr_qw(v1, triop(Iop_SetElem8x16,\
-                get_vr_qw(v1), mkU8(elem),\
-                mkite(binop(Iop_CmpLE32U, mkU32(elem), mkexpr(maxIndexToLoad)),\
-                      load(Ity_I8, binop(Iop_Add64, mkexpr(addr), mkU64(elem))),\
-                      mkU8(0x00)\
-                     )\
-                 )\
-             )
-
 /* Load bytes into v1.
    maxIndex specifies max index to load and must be Ity_I32.
-   If maxIndex > 16, all 16 bytes are loaded.
+   If maxIndex >= 15, all 16 bytes are loaded.
    All bytes after maxIndex are zeroed.
*/ static void s390_vr_loadWithLength(UChar v1, IRTemp addr, IRExpr *maxIndex) { - IRTemp maxIndexToLoad = newTemp(Ity_I32); - - assign(maxIndexToLoad, maxIndex); - - s390_vr_loadWithLength_process(0); - s390_vr_loadWithLength_process(1); - s390_vr_loadWithLength_process(2); - s390_vr_loadWithLength_process(3); - s390_vr_loadWithLength_process(4); - s390_vr_loadWithLength_process(5); - s390_vr_loadWithLength_process(6); - s390_vr_loadWithLength_process(7); - s390_vr_loadWithLength_process(8); - s390_vr_loadWithLength_process(9); - s390_vr_loadWithLength_process(10); - s390_vr_loadWithLength_process(11); - s390_vr_loadWithLength_process(12); - s390_vr_loadWithLength_process(13); - s390_vr_loadWithLength_process(14); - s390_vr_loadWithLength_process(15); + IRTemp maxIdx = newTemp(Ity_I32); + IRTemp cappedMax = newTemp(Ity_I64); + IRTemp offset = newTemp(Ity_I64); + IRTemp zeroed = newTemp(Ity_I64); + IRTemp back = newTemp(Ity_I64); + + /* Implement the insn with a single 16-byte load, to allow memcheck's + "partial-loads-OK" heuristic to apply. Ensure that a page boundary is + crossed if and only if the real insn would have crossed it as well. + Thus, if the bytes to load are fully contained in an aligned 16-byte + chunk, load the whole 16-byte aligned chunk, and otherwise load 16 bytes + from the unaligned address. Then shift the loaded data left-aligned + into the target vector register. */ + + assign(maxIdx, maxIndex); + assign(cappedMax, mkite(binop(Iop_CmpLT32U, mkexpr(maxIdx), mkU32(15)), + unop(Iop_32Uto64, mkexpr(maxIdx)), mkU64(15))); + /* 'offset': addr's offset from last 16-byte aligned address + 'zeroed': number of bytes to be zeroed in the target vector + 'back': how much to subtract from addr before loading 16 bytes */ + assign(offset, binop(Iop_And64, mkexpr(addr), mkU64(15))); + assign(zeroed, binop(Iop_Sub64, mkU64(15), mkexpr(cappedMax))); + assign(back, mkite(binop(Iop_CmpLE64U, mkexpr(offset), mkexpr(zeroed)), + mkexpr(offset), mkU64(0))); + + /* How much to shift the loaded 16-byte vector to the right, and then to + the left. Since both 'zeroed' and 'back' range from 0 to 15, the shift + amounts range from 0 to 120. */ + IRExpr *shrAmount = binop(Iop_Shl64, + binop(Iop_Sub64, mkexpr(zeroed), mkexpr(back)), + mkU8(3)); + IRExpr *shlAmount = binop(Iop_Shl64, mkexpr(zeroed), mkU8(3)); + + put_vr_qw(v1, binop(Iop_ShlV128, + binop(Iop_ShrV128, + load(Ity_V128, + binop(Iop_Sub64, mkexpr(addr), mkexpr(back))), + unop(Iop_64to8, shrAmount)), + unop(Iop_64to8, shlAmount))); +} + +/* Bitwise vCond ? v1 : v2 + All args are V128. + */ +static IRExpr* +s390_V128_bitwiseITE(IRExpr* vCond, IRExpr* v1, IRExpr* v2) +{ + IRTemp vc = newTemp(Ity_V128); + assign(vc, vCond); + /* result = (v1 & vCond) | (v2 & ~vCond) */ + return binop(Iop_OrV128, + binop(Iop_AndV128, v1, mkexpr(vc)), + binop(Iop_AndV128, v2, unop(Iop_NotV128, mkexpr(vc)))); } /*------------------------------------------------------------*/ @@ -3291,6 +3549,31 @@ s390_format_VRS_RRDVM(const HChar *(*irgen)(UChar r1, IRTemp op2addr, UChar v3, } +static void +s390_format_VRS_VRDVM(const HChar *(*irgen)(UChar v1, IRTemp op2addr, UChar v3, + UChar m4), UChar v1, UChar b2, UShort d2, UChar v3, + UChar m4, UChar rxb) +{ + const HChar *mnm; + IRTemp op2addr = newTemp(Ity_I64); + + if (! s390_host_has_vx) { + emulation_failure(EmFail_S390X_vx); + return; + } + + assign(op2addr, binop(Iop_Add64, mkU64(d2), b2 != 0 ? 
get_gpr_dw0(b2) : + mkU64(0))); + + v1 = s390_vr_getVRindex(v1, 1, rxb); + v3 = s390_vr_getVRindex(v3, 2, rxb); + mnm = irgen(v1, op2addr, v3, m4); + + if (UNLIKELY(vex_traceflags & VEX_TRACE_FE)) + s390_disasm(ENC5(MNM, VR, UDXB, VR, UINT), mnm, v1, d2, 0, b2, v3, m4); +} + + static void s390_format_VRS_VRDV(const HChar *(*irgen)(UChar v1, IRTemp op2addr, UChar v3), UChar v1, UChar b2, UShort d2, UChar v3, UChar rxb) @@ -3396,6 +3679,121 @@ s390_format_VRV_VVRDMT(const HChar *(*irgen)(UChar v1, IRTemp op2addr, UChar m3) } +static void +s390_format_VRRd_VVVVMM(const HChar *(*irgen)(UChar v1, UChar v2, UChar v3, + UChar v4, UChar m5, UChar m6), + UChar v1, UChar v2, UChar v3, UChar v4, UChar m5, + UChar m6, UChar rxb) +{ + const HChar *mnm; + + if (! s390_host_has_vx) { + emulation_failure(EmFail_S390X_vx); + return; + } + + v1 = s390_vr_getVRindex(v1, 1, rxb); + v2 = s390_vr_getVRindex(v2, 2, rxb); + v3 = s390_vr_getVRindex(v3, 3, rxb); + v4 = s390_vr_getVRindex(v4, 4, rxb); + mnm = irgen(v1, v2, v3, v4, m5, m6); + + if (UNLIKELY(vex_traceflags & VEX_TRACE_FE)) + s390_disasm(ENC7(MNM, VR, VR, VR, VR, UINT, UINT), + mnm, v1, v2, v3, v4, m5, m6); +} + + +static void +s390_format_VRR_VVMM(const HChar *(*irgen)(UChar v1, UChar v2, UChar m3, + UChar m5), + UChar v1, UChar v2, UChar m3, UChar m5, UChar rxb) +{ + const HChar *mnm; + + if (! s390_host_has_vx) { + emulation_failure(EmFail_S390X_vx); + return; + } + + v1 = s390_vr_getVRindex(v1, 1, rxb); + v2 = s390_vr_getVRindex(v2, 2, rxb); + mnm = irgen(v1, v2, m3, m5); + + if (UNLIKELY(vex_traceflags & VEX_TRACE_FE)) + s390_disasm(ENC5(MNM, VR, VR, UINT, UINT), mnm, v1, v2, m3, m5); +} + + +static void +s390_format_VRId_VVVIM(const HChar *(*irgen)(UChar v1, UChar v2, UChar v3, + UChar i4, UChar m5), + UChar v1, UChar v2, UChar v3, UChar i4, UChar m5, + UChar rxb) +{ + const HChar *mnm; + + if (! s390_host_has_vx) { + emulation_failure(EmFail_S390X_vx); + return; + } + + v1 = s390_vr_getVRindex(v1, 1, rxb); + v2 = s390_vr_getVRindex(v2, 2, rxb); + v3 = s390_vr_getVRindex(v3, 3, rxb); + mnm = irgen(v1, v2, v3, i4, m5); + + if (UNLIKELY(vex_traceflags & VEX_TRACE_FE)) + s390_disasm(ENC6(MNM, VR, VR, VR, UINT, UINT), mnm, v1, v2, v3, i4, m5); +} + + +static void +s390_format_VRId_VVVI(const HChar *(*irgen)(UChar v1, UChar v2, UChar v3, + UChar i4), + UChar v1, UChar v2, UChar v3, UChar i4, UChar rxb) +{ + const HChar *mnm; + + if (! s390_host_has_vx) { + emulation_failure(EmFail_S390X_vx); + return; + } + + v1 = s390_vr_getVRindex(v1, 1, rxb); + v2 = s390_vr_getVRindex(v2, 2, rxb); + v3 = s390_vr_getVRindex(v3, 3, rxb); + mnm = irgen(v1, v2, v3, i4); + + if (UNLIKELY(vex_traceflags & VEX_TRACE_FE)) + s390_disasm(ENC5(MNM, VR, VR, VR, UINT), mnm, v1, v2, v3, i4); +} + + +static void +s390_format_VRRd_VVVVM(const HChar *(*irgen)(UChar v1, UChar v2, UChar v3, + UChar v4, UChar m5), + UChar v1, UChar v2, UChar v3, UChar v4, UChar m5, + UChar rxb) +{ + const HChar *mnm; + + if (! 
s390_host_has_vx) { + emulation_failure(EmFail_S390X_vx); + return; + } + + v1 = s390_vr_getVRindex(v1, 1, rxb); + v2 = s390_vr_getVRindex(v2, 2, rxb); + v3 = s390_vr_getVRindex(v3, 3, rxb); + v4 = s390_vr_getVRindex(v4, 4, rxb); + mnm = irgen(v1, v2, v3, v4, m5); + + if (UNLIKELY(vex_traceflags & VEX_TRACE_FE)) + s390_disasm(ENC6(MNM, VR, VR, VR, VR, UINT), mnm, v1, v2, v3, v4, m5); +} + + /*------------------------------------------------------------*/ /*--- Build IR for opcodes ---*/ /*------------------------------------------------------------*/ @@ -11817,7 +12215,7 @@ s390_irgen_CLCL(UChar r1, UChar r2) assign(pad, get_gpr_b4(r2 + 1)); /* len1 == 0 and len2 == 0? Exit */ - s390_cc_set(0); + s390_cc_set_val(0); next_insn_if(binop(Iop_CmpEQ32, binop(Iop_Or32, mkexpr(len1), mkexpr(len2)), mkU32(0))); @@ -11893,7 +12291,7 @@ s390_irgen_CLCLE(UChar r1, UChar r3, IRTemp pad2) assign(len3, get_gpr_dw0(r3 + 1)); /* len1 == 0 and len3 == 0? Exit */ - s390_cc_set(0); + s390_cc_set_val(0); next_insn_if(binop(Iop_CmpEQ64,binop(Iop_Or64, mkexpr(len1), mkexpr(len3)), mkU64(0))); @@ -12255,7 +12653,7 @@ s390_irgen_SRST(UChar r1, UChar r2) put_counter_dw0(mkU64(0)); // start = next? CC=2 and out r1 and r2 unchanged - s390_cc_set(2); + s390_cc_set_val(2); put_gpr_dw0(r2, binop(Iop_Sub64, mkexpr(address), mkexpr(counter))); next_insn_if(binop(Iop_CmpEQ64, mkexpr(address), mkexpr(next))); @@ -12263,7 +12661,7 @@ s390_irgen_SRST(UChar r1, UChar r2) assign(delim, get_gpr_b7(0)); // byte = delim? CC=1, R1=address - s390_cc_set(1); + s390_cc_set_val(1); put_gpr_dw0(r1, mkexpr(address)); next_insn_if(binop(Iop_CmpEQ8, mkexpr(delim), mkexpr(byte))); @@ -12296,7 +12694,7 @@ s390_irgen_CLST(UChar r1, UChar r2) assign(byte2, load(Ity_I8, mkexpr(address2))); // end in both? 
all equal, reset r1 and r2 to start values - s390_cc_set(0); + s390_cc_set_val(0); put_gpr_dw0(r1, binop(Iop_Sub64, mkexpr(address1), mkexpr(counter))); put_gpr_dw0(r2, binop(Iop_Sub64, mkexpr(address2), mkexpr(counter))); next_insn_if(binop(Iop_CmpEQ8, mkU8(0), @@ -12308,20 +12706,20 @@ s390_irgen_CLST(UChar r1, UChar r2) put_gpr_dw0(r2, mkexpr(address2)); // End found in string1 - s390_cc_set(1); + s390_cc_set_val(1); next_insn_if(binop(Iop_CmpEQ8, mkexpr(end), mkexpr(byte1))); // End found in string2 - s390_cc_set(2); + s390_cc_set_val(2); next_insn_if(binop(Iop_CmpEQ8, mkexpr(end), mkexpr(byte2))); // string1 < string2 - s390_cc_set(1); + s390_cc_set_val(1); next_insn_if(binop(Iop_CmpLT32U, unop(Iop_8Uto32, mkexpr(byte1)), unop(Iop_8Uto32, mkexpr(byte2)))); // string2 < string1 - s390_cc_set(2); + s390_cc_set_val(2); next_insn_if(binop(Iop_CmpLT32U, unop(Iop_8Uto32, mkexpr(byte2)), unop(Iop_8Uto32, mkexpr(byte1)))); @@ -12647,7 +13045,7 @@ s390_irgen_MVCL(UChar r1, UChar r2) /* Check for destructive overlap: addr1 > addr2 && addr2 + len1 > addr1 && (addr2 + len2) > addr1 */ - s390_cc_set(3); + s390_cc_set_val(3); IRTemp cond1 = newTemp(Ity_I32); assign(cond1, unop(Iop_1Uto32, binop(Iop_CmpLT64U, mkexpr(addr2), mkexpr(addr1)))); @@ -12777,7 +13175,7 @@ s390_irgen_MVST(UChar r1, UChar r2) iterate_if(binop(Iop_CmpNE8, mkexpr(end), mkexpr(byte))); // and always set cc=1 at the end + update r1 - s390_cc_set(1); + s390_cc_set_val(1); put_gpr_dw0(r1, binop(Iop_Add64, mkexpr(addr1), mkexpr(counter))); put_counter_dw0(mkU64(0)); @@ -14132,8 +14530,7 @@ s390_irgen_STCK(IRTemp op2addr) d->mAddr = mkexpr(op2addr); d->mSize = 8; stmt(IRStmt_Dirty(d)); - s390_cc_thunk_fill(mkU64(S390_CC_OP_SET), - mkexpr(cc), mkU64(0), mkU64(0)); + s390_cc_set(cc); return "stck"; } @@ -14152,8 +14549,7 @@ s390_irgen_STCKF(IRTemp op2addr) d->mAddr = mkexpr(op2addr); d->mSize = 8; stmt(IRStmt_Dirty(d)); - s390_cc_thunk_fill(mkU64(S390_CC_OP_SET), - mkexpr(cc), mkU64(0), mkU64(0)); + s390_cc_set(cc); } return "stckf"; } @@ -14171,8 +14567,7 @@ s390_irgen_STCKE(IRTemp op2addr) d->mAddr = mkexpr(op2addr); d->mSize = 16; stmt(IRStmt_Dirty(d)); - s390_cc_thunk_fill(mkU64(S390_CC_OP_SET), - mkexpr(cc), mkU64(0), mkU64(0)); + s390_cc_set(cc); return "stcke"; } @@ -14206,7 +14601,7 @@ s390_irgen_STFLE(IRTemp op2addr) stmt(IRStmt_Dirty(d)); - s390_cc_thunk_fill(mkU64(S390_CC_OP_SET), mkexpr(cc), mkU64(0), mkU64(0)); + s390_cc_set(cc); return "stfle"; } @@ -14229,7 +14624,7 @@ s390_irgen_CKSM(UChar r1,UChar r2) assign(len, get_gpr_dw0(r2+1)); /* Condition code is always zero. */ - s390_cc_set(0); + s390_cc_set_val(0); /* If length is zero, there is no need to calculate the checksum */ next_insn_if(binop(Iop_CmpEQ64, mkexpr(len), mkU64(0))); @@ -14296,7 +14691,7 @@ s390_irgen_TROO(UChar m3, UChar r1, UChar r2) IRTemp result = newTemp(Ity_I64); /* End of source string? We're done; proceed to next insn */ - s390_cc_set(0); + s390_cc_set_val(0); next_insn_if(binop(Iop_CmpEQ64, mkexpr(src_len), mkU64(0))); /* Load character from source string, index translation table and @@ -14308,7 +14703,7 @@ s390_irgen_TROO(UChar m3, UChar r1, UChar r2) assign(op1, load(Ity_I8, mkexpr(result))); if (! s390_host_has_etf2 || (m3 & 0x1) == 0) { - s390_cc_set(1); + s390_cc_set_val(1); next_insn_if(binop(Iop_CmpEQ8, mkexpr(op1), mkexpr(test_byte))); } store(get_gpr_dw0(r1), mkexpr(op1)); @@ -14343,7 +14738,7 @@ s390_irgen_TRTO(UChar m3, UChar r1, UChar r2) IRTemp result = newTemp(Ity_I64); /* End of source string? 
We're done; proceed to next insn */ - s390_cc_set(0); + s390_cc_set_val(0); next_insn_if(binop(Iop_CmpEQ64, mkexpr(src_len), mkU64(0))); /* Load character from source string, index translation table and @@ -14356,7 +14751,7 @@ s390_irgen_TRTO(UChar m3, UChar r1, UChar r2) assign(op1, load(Ity_I8, mkexpr(result))); if (! s390_host_has_etf2 || (m3 & 0x1) == 0) { - s390_cc_set(1); + s390_cc_set_val(1); next_insn_if(binop(Iop_CmpEQ8, mkexpr(op1), mkexpr(test_byte))); } store(get_gpr_dw0(r1), mkexpr(op1)); @@ -14391,7 +14786,7 @@ s390_irgen_TROT(UChar m3, UChar r1, UChar r2) IRTemp result = newTemp(Ity_I64); /* End of source string? We're done; proceed to next insn */ - s390_cc_set(0); + s390_cc_set_val(0); next_insn_if(binop(Iop_CmpEQ64, mkexpr(src_len), mkU64(0))); /* Load character from source string, index translation table and @@ -14403,7 +14798,7 @@ s390_irgen_TROT(UChar m3, UChar r1, UChar r2) assign(op1, load(Ity_I16, mkexpr(result))); if (! s390_host_has_etf2 || (m3 & 0x1) == 0) { - s390_cc_set(1); + s390_cc_set_val(1); next_insn_if(binop(Iop_CmpEQ16, mkexpr(op1), mkexpr(test_byte))); } store(get_gpr_dw0(r1), mkexpr(op1)); @@ -14438,7 +14833,7 @@ s390_irgen_TRTT(UChar m3, UChar r1, UChar r2) IRTemp result = newTemp(Ity_I64); /* End of source string? We're done; proceed to next insn */ - s390_cc_set(0); + s390_cc_set_val(0); next_insn_if(binop(Iop_CmpEQ64, mkexpr(src_len), mkU64(0))); /* Load character from source string, index translation table and @@ -14450,7 +14845,7 @@ s390_irgen_TRTT(UChar m3, UChar r1, UChar r2) assign(op1, load(Ity_I16, mkexpr(result))); if (! s390_host_has_etf2 || (m3 & 0x1) == 0) { - s390_cc_set(1); + s390_cc_set_val(1); next_insn_if(binop(Iop_CmpEQ16, mkexpr(op1), mkexpr(test_byte))); } @@ -14495,13 +14890,13 @@ s390_irgen_TRE(UChar r1,UChar r2) IRTemp result = newTemp(Ity_I64); /* End of source string? We're done; proceed to next insn */ - s390_cc_set(0); + s390_cc_set_val(0); next_insn_if(binop(Iop_CmpEQ64, mkexpr(src_len), mkU64(0))); /* Load character from source string and compare with test byte */ assign(op, load(Ity_I8, mkexpr(src_addr))); - s390_cc_set(1); + s390_cc_set_val(1); next_insn_if(binop(Iop_CmpEQ8, mkexpr(op), mkexpr(test_byte))); assign(result, binop(Iop_Add64, unop(Iop_8Uto64, mkexpr(op)), @@ -14548,7 +14943,7 @@ s390_irgen_CU21(UChar m3, UChar r1, UChar r2) /* We're processing the 2nd operand 2 bytes at a time. Therefore, if there are less than 2 bytes left, then the 2nd operand is exhausted and we're done here. cc = 0 */ - s390_cc_set(0); + s390_cc_set_val(0); next_insn_if(binop(Iop_CmpLT64U, mkexpr(len2), mkU64(2))); /* There are at least two bytes there. Read them. */ @@ -14594,7 +14989,7 @@ s390_irgen_CU21(UChar m3, UChar r1, UChar r2) IRExpr *invalid_low_surrogate = binop(Iop_And64, mkexpr(retval), mkU64(0xff)); - s390_cc_set(2); + s390_cc_set_val(2); next_insn_if(binop(Iop_CmpEQ64, invalid_low_surrogate, mkU64(1))); } @@ -14603,7 +14998,7 @@ s390_irgen_CU21(UChar m3, UChar r1, UChar r2) assign(num_bytes, binop(Iop_And64, binop(Iop_Shr64, mkexpr(retval), mkU8(8)), mkU64(0xff))); - s390_cc_set(1); + s390_cc_set_val(1); next_insn_if(binop(Iop_CmpLT64U, mkexpr(len1), mkexpr(num_bytes))); /* Extract the bytes to be stored at addr1 */ @@ -14675,7 +15070,7 @@ s390_irgen_CU24(UChar m3, UChar r1, UChar r2) /* We're processing the 2nd operand 2 bytes at a time. Therefore, if there are less than 2 bytes left, then the 2nd operand is exhausted and we're done here. 
cc = 0 */ - s390_cc_set(0); + s390_cc_set_val(0); next_insn_if(binop(Iop_CmpLT64U, mkexpr(len2), mkU64(2))); /* There are at least two bytes there. Read them. */ @@ -14722,12 +15117,12 @@ s390_irgen_CU24(UChar m3, UChar r1, UChar r2) IRExpr *invalid_low_surrogate = binop(Iop_And64, mkexpr(retval), mkU64(0xff)); - s390_cc_set(2); + s390_cc_set_val(2); next_insn_if(binop(Iop_CmpEQ64, invalid_low_surrogate, mkU64(1))); } /* Now test whether the 1st operand is exhausted */ - s390_cc_set(1); + s390_cc_set_val(1); next_insn_if(binop(Iop_CmpLT64U, mkexpr(len1), mkU64(4))); /* Extract the bytes to be stored at addr1 */ @@ -14782,7 +15177,7 @@ s390_irgen_CU42(UChar r1, UChar r2) /* We're processing the 2nd operand 4 bytes at a time. Therefore, if there are less than 4 bytes left, then the 2nd operand is exhausted and we're done here. cc = 0 */ - s390_cc_set(0); + s390_cc_set_val(0); next_insn_if(binop(Iop_CmpLT64U, mkexpr(len2), mkU64(4))); /* Read the 2nd operand. */ @@ -14797,7 +15192,7 @@ s390_irgen_CU42(UChar r1, UChar r2) cc=2 outranks cc=1 (1st operand exhausted) */ IRExpr *invalid_character = binop(Iop_And64, mkexpr(retval), mkU64(0xff)); - s390_cc_set(2); + s390_cc_set_val(2); next_insn_if(binop(Iop_CmpEQ64, invalid_character, mkU64(1))); /* Now test whether the 1st operand is exhausted */ @@ -14805,7 +15200,7 @@ s390_irgen_CU42(UChar r1, UChar r2) assign(num_bytes, binop(Iop_And64, binop(Iop_Shr64, mkexpr(retval), mkU8(8)), mkU64(0xff))); - s390_cc_set(1); + s390_cc_set_val(1); next_insn_if(binop(Iop_CmpLT64U, mkexpr(len1), mkexpr(num_bytes))); /* Extract the bytes to be stored at addr1 */ @@ -14876,7 +15271,7 @@ s390_irgen_CU41(UChar r1, UChar r2) /* We're processing the 2nd operand 4 bytes at a time. Therefore, if there are less than 4 bytes left, then the 2nd operand is exhausted and we're done here. cc = 0 */ - s390_cc_set(0); + s390_cc_set_val(0); next_insn_if(binop(Iop_CmpLT64U, mkexpr(len2), mkU64(4))); /* Read the 2nd operand. */ @@ -14891,7 +15286,7 @@ s390_irgen_CU41(UChar r1, UChar r2) cc=2 outranks cc=1 (1st operand exhausted) */ IRExpr *invalid_character = binop(Iop_And64, mkexpr(retval), mkU64(0xff)); - s390_cc_set(2); + s390_cc_set_val(2); next_insn_if(binop(Iop_CmpEQ64, invalid_character, mkU64(1))); /* Now test whether the 1st operand is exhausted */ @@ -14899,7 +15294,7 @@ s390_irgen_CU41(UChar r1, UChar r2) assign(num_bytes, binop(Iop_And64, binop(Iop_Shr64, mkexpr(retval), mkU8(8)), mkU64(0xff))); - s390_cc_set(1); + s390_cc_set_val(1); next_insn_if(binop(Iop_CmpLT64U, mkexpr(len1), mkexpr(num_bytes))); /* Extract the bytes to be stored at addr1 */ @@ -14999,7 +15394,7 @@ s390_irgen_cu12_cu14(UChar m3, UChar r1, UChar r2, Bool is_cu12) /* We're processing the 2nd operand 1 byte at a time. Therefore, if there is less than 1 byte left, then the 2nd operand is exhausted and we're done here. cc = 0 */ - s390_cc_set(0); + s390_cc_set_val(0); next_insn_if(binop(Iop_CmpLT64U, mkexpr(len2), mkU64(1))); /* There is at least one byte there. Read it. */ @@ -15013,7 +15408,7 @@ s390_irgen_cu12_cu14(UChar m3, UChar r1, UChar r2, Bool is_cu12) /* Check for invalid 1st byte */ IRExpr *is_invalid = unop(Iop_64to1, mkexpr(retval1)); - s390_cc_set(2); + s390_cc_set_val(2); next_insn_if(is_invalid); /* How many bytes do we have to read? 
*/ @@ -15021,7 +15416,7 @@ s390_irgen_cu12_cu14(UChar m3, UChar r1, UChar r2, Bool is_cu12) assign(num_src_bytes, binop(Iop_Shr64, mkexpr(retval1), mkU8(8))); /* Now test whether the 2nd operand is exhausted */ - s390_cc_set(0); + s390_cc_set_val(0); next_insn_if(binop(Iop_CmpLT64U, mkexpr(len2), mkexpr(num_src_bytes))); /* Read the remaining bytes */ @@ -15054,7 +15449,7 @@ s390_irgen_cu12_cu14(UChar m3, UChar r1, UChar r2, Bool is_cu12) } /* Check for invalid character */ - s390_cc_set(2); + s390_cc_set_val(2); is_invalid = unop(Iop_64to1, mkexpr(retval2)); next_insn_if(is_invalid); @@ -15063,7 +15458,7 @@ s390_irgen_cu12_cu14(UChar m3, UChar r1, UChar r2, Bool is_cu12) assign(num_bytes, binop(Iop_And64, binop(Iop_Shr64, mkexpr(retval2), mkU8(8)), mkU64(0xff))); - s390_cc_set(1); + s390_cc_set_val(1); next_insn_if(binop(Iop_CmpLT64U, mkexpr(len1), mkexpr(num_bytes))); /* Extract the bytes to be stored at addr1 */ @@ -15409,8 +15804,7 @@ s390_irgen_VLM(UChar v1, IRTemp op2addr, UChar v3) static const HChar * s390_irgen_VLVGP(UChar v1, UChar r2, UChar r3) { - put_vr_dw0(v1, get_gpr_dw0(r2)); - put_vr_dw1(v1, get_gpr_dw0(r3)); + put_vr_qw(v1, binop(Iop_64HLtoV128, get_gpr_dw0(r2), get_gpr_dw0(r3))); return "vlvgp"; } @@ -15449,28 +15843,10 @@ s390_irgen_VLVG(UChar v1, IRTemp op2addr, UChar r3, UChar m4) static const HChar * s390_irgen_VMRH(UChar v1, UChar v2, UChar v3, UChar m4) { - IRType type = s390_vr_get_type(m4); - switch (type) { - case Ity_I8: - put_vr_qw(v1, binop(Iop_InterleaveLO8x16, get_vr_qw(v2), get_vr_qw(v3))); - break; - - case Ity_I16: - put_vr_qw(v1, binop(Iop_InterleaveLO16x8, get_vr_qw(v2), get_vr_qw(v3))); - break; - - case Ity_I32: - put_vr_qw(v1, binop(Iop_InterleaveLO32x4, get_vr_qw(v2), get_vr_qw(v3))); - break; - - case Ity_I64: - put_vr_qw(v1, binop(Iop_InterleaveLO64x2, get_vr_qw(v2), get_vr_qw(v3))); - break; - - default: - ppIRType(type); - vpanic("s390_irgen_VMRH: unknown type"); - } + const IROp ops[] = { Iop_InterleaveHI8x16, Iop_InterleaveHI16x8, + Iop_InterleaveHI32x4, Iop_InterleaveHI64x2 }; + vassert(m4 < sizeof(ops) / sizeof(ops[0])); + put_vr_qw(v1, binop(ops[m4], get_vr_qw(v2), get_vr_qw(v3))); return "vmrh"; } @@ -15478,28 +15854,10 @@ s390_irgen_VMRH(UChar v1, UChar v2, UChar v3, UChar m4) static const HChar * s390_irgen_VMRL(UChar v1, UChar v2, UChar v3, UChar m4) { - IRType type = s390_vr_get_type(m4); - switch (type) { - case Ity_I8: - put_vr_qw(v1, binop(Iop_InterleaveHI8x16, get_vr_qw(v2), get_vr_qw(v3))); - break; - - case Ity_I16: - put_vr_qw(v1, binop(Iop_InterleaveHI16x8, get_vr_qw(v2), get_vr_qw(v3))); - break; - - case Ity_I32: - put_vr_qw(v1, binop(Iop_InterleaveHI32x4, get_vr_qw(v2), get_vr_qw(v3))); - break; - - case Ity_I64: - put_vr_qw(v1, binop(Iop_InterleaveHI64x2, get_vr_qw(v2), get_vr_qw(v3))); - break; - - default: - ppIRType(type); - vpanic("s390_irgen_VMRL: unknown type"); - } + const IROp ops[] = { Iop_InterleaveLO8x16, Iop_InterleaveLO16x8, + Iop_InterleaveLO32x4, Iop_InterleaveLO64x2 }; + vassert(m4 < sizeof(ops) / sizeof(ops[0])); + put_vr_qw(v1, binop(ops[m4], get_vr_qw(v2), get_vr_qw(v3))); return "vmrl"; } @@ -15507,25 +15865,11 @@ s390_irgen_VMRL(UChar v1, UChar v2, UChar v3, UChar m4) static const HChar * s390_irgen_VPK(UChar v1, UChar v2, UChar v3, UChar m4) { - IRType type = s390_vr_get_type(m4); - IRExpr* result = NULL; - switch(type) { - case Ity_I16: - result = binop(Iop_NarrowBin16to8x16, get_vr_qw(v2), get_vr_qw(v3)); - break; - case Ity_I32: - result = binop(Iop_NarrowBin32to16x8, get_vr_qw(v2), get_vr_qw(v3)); 
- break; - case Ity_I64: - result = binop(Iop_NarrowBin64to32x4, get_vr_qw(v2), get_vr_qw(v3)); - break; - default: - ppIRType(type); - vpanic("s390_irgen_VPK: unknown type"); - } - - put_vr_qw(v1, result); - + const IROp ops[] = { Iop_NarrowBin16to8x16, Iop_NarrowBin32to16x8, + Iop_NarrowBin64to32x4 }; + Char index = m4 - 1; + vassert((index >= 0) && (index < sizeof(ops) / sizeof(ops[0]))); + put_vr_qw(v1, binop(ops[index], get_vr_qw(v2), get_vr_qw(v3))); return "vpk"; } @@ -15648,21 +15992,9 @@ s390_irgen_VSTM(UChar v1, IRTemp op2addr, UChar v3) static const HChar * s390_irgen_VUPH(UChar v1, UChar v2, UChar m3) { - IRType type = s390_vr_get_type(m3); - switch (type) { - case Ity_I8: - put_vr_qw(v1, unop(Iop_Widen8Sto16x8, get_vr_dw0(v2))); - break; - case Ity_I16: - put_vr_qw(v1, unop(Iop_Widen16Sto32x4, get_vr_dw0(v2))); - break; - case Ity_I32: - put_vr_qw(v1, unop(Iop_Widen32Sto64x2, get_vr_dw0(v2))); - break; - default: - ppIRType(type); - vpanic("s390_irgen_VUPH: unknown type"); - } + const IROp ops[] = { Iop_Widen8Sto16x8, Iop_Widen16Sto32x4, Iop_Widen32Sto64x2 }; + vassert(m3 < sizeof(ops) / sizeof(ops[0])); + put_vr_qw(v1, unop(ops[m3], get_vr_dw0(v2))); return "vuph"; } @@ -15670,43 +16002,18 @@ s390_irgen_VUPH(UChar v1, UChar v2, UChar m3) static const HChar * s390_irgen_VUPLH(UChar v1, UChar v2, UChar m3) { - IRType type = s390_vr_get_type(m3); - switch (type) { - case Ity_I8: - put_vr_qw(v1, unop(Iop_Widen8Uto16x8, get_vr_dw0(v2))); - break; - case Ity_I16: - put_vr_qw(v1, unop(Iop_Widen16Uto32x4, get_vr_dw0(v2))); - break; - case Ity_I32: - put_vr_qw(v1, unop(Iop_Widen32Uto64x2, get_vr_dw0(v2))); - break; - default: - ppIRType(type); - vpanic("s390_irgen_VUPLH: unknown type"); - } - + const IROp ops[] = { Iop_Widen8Uto16x8, Iop_Widen16Uto32x4, Iop_Widen32Uto64x2 }; + vassert(m3 < sizeof(ops) / sizeof(ops[0])); + put_vr_qw(v1, unop(ops[m3], get_vr_dw0(v2))); return "vuplh"; } static const HChar * s390_irgen_VUPL(UChar v1, UChar v2, UChar m3) { - IRType type = s390_vr_get_type(m3); - switch (type) { - case Ity_I8: - put_vr_qw(v1, unop(Iop_Widen8Sto16x8, get_vr_dw1(v2))); - break; - case Ity_I16: - put_vr_qw(v1, unop(Iop_Widen16Sto32x4, get_vr_dw1(v2))); - break; - case Ity_I32: - put_vr_qw(v1, unop(Iop_Widen32Sto64x2, get_vr_dw1(v2))); - break; - default: - ppIRType(type); - vpanic("s390_irgen_VUPL: unknown type"); - } + const IROp ops[] = { Iop_Widen8Sto16x8, Iop_Widen16Sto32x4, Iop_Widen32Sto64x2 }; + vassert(m3 < sizeof(ops) / sizeof(ops[0])); + put_vr_qw(v1, unop(ops[m3], get_vr_dw1(v2))); return "vupl"; } @@ -15714,21 +16021,9 @@ s390_irgen_VUPL(UChar v1, UChar v2, UChar m3) static const HChar * s390_irgen_VUPLL(UChar v1, UChar v2, UChar m3) { - IRType type = s390_vr_get_type(m3); - switch (type) { - case Ity_I8: - put_vr_qw(v1, unop(Iop_Widen8Uto16x8, get_vr_dw1(v2))); - break; - case Ity_I16: - put_vr_qw(v1, unop(Iop_Widen16Uto32x4, get_vr_dw1(v2))); - break; - case Ity_I32: - put_vr_qw(v1, unop(Iop_Widen32Uto64x2, get_vr_dw1(v2))); - break; - default: - ppIRType(type); - vpanic("s390_irgen_VUPLL: unknown type"); - } + const IROp ops[] = { Iop_Widen8Uto16x8, Iop_Widen16Uto32x4, Iop_Widen32Uto64x2 }; + vassert(m3 < sizeof(ops) / sizeof(ops[0])); + put_vr_qw(v1, unop(ops[m3], get_vr_dw1(v2))); return "vupll"; } @@ -15773,33 +16068,31 @@ s390_irgen_VREPI(UChar v1, UShort i2, UChar m3) static const HChar * s390_irgen_VPKS(UChar v1, UChar v2, UChar v3, UChar m4, UChar m5) { - IRType type = s390_vr_get_type(m4); - IRExpr* result = NULL; - - switch(type) { - case Ity_I16: - 
result = binop(Iop_QNarrowBin16Sto8Sx16, get_vr_qw(v2), get_vr_qw(v3)); - break; - case Ity_I32: - result = binop(Iop_QNarrowBin32Sto16Sx8, get_vr_qw(v2), get_vr_qw(v3)); - break; - case Ity_I64: - result = binop(Iop_QNarrowBin64Sto32Sx4, get_vr_qw(v2), get_vr_qw(v3)); - break; - default: - ppIRType(type); - vpanic("s390_irgen_VPKS: unknown type"); - } + if (!s390_vr_is_cs_set(m5)) { + const IROp ops[] = { Iop_QNarrowBin16Sto8Sx16, Iop_QNarrowBin32Sto16Sx8, + Iop_QNarrowBin64Sto32Sx4 }; + Char index = m4 - 1; + vassert((index >= 0) && (index < sizeof(ops) / sizeof(ops[0]))); + put_vr_qw(v1, binop(ops[index], get_vr_qw(v2), get_vr_qw(v3))); - if((m5 & 0x1) != 0) { + } else { IRDirty* d; IRTemp cc = newTemp(Ity_I64); - ULong opcode = s390x_cc_vec_opcode(S390_CC_VEC_VPKS, m4); - d = unsafeIRDirty_1_N(cc, 0, "s390x_dirtyhelper_vec_binop", - &s390x_dirtyhelper_vec_binop, - mkIRExprVec_4(IRExpr_GSPTR(), mkU64(opcode), - mkU64(v2), mkU64(v3))); - d->nFxState = 2; + + s390x_vec_op_details_t details = { .serialized = 0ULL }; + details.op = S390_VEC_OP_VPKS; + details.v1 = v1; + details.v2 = v2; + details.v3 = v3; + details.m4 = m4; + details.m5 = m5; + + d = unsafeIRDirty_1_N(cc, 0, "s390x_dirtyhelper_vec_op", + &s390x_dirtyhelper_vec_op, + mkIRExprVec_2(IRExpr_GSPTR(), + mkU64(details.serialized))); + + d->nFxState = 3; vex_bzero(&d->fxState, sizeof(d->fxState)); d->fxState[0].fx = Ifx_Read; d->fxState[0].offset = S390X_GUEST_OFFSET(guest_v0) + v2 * sizeof(V128); @@ -15807,45 +16100,45 @@ s390_irgen_VPKS(UChar v1, UChar v2, UChar v3, UChar m4, UChar m5) d->fxState[1].fx = Ifx_Read; d->fxState[1].offset = S390X_GUEST_OFFSET(guest_v0) + v3 * sizeof(V128); d->fxState[1].size = sizeof(V128); + d->fxState[2].fx = Ifx_Write; + d->fxState[2].offset = S390X_GUEST_OFFSET(guest_v0) + v1 * sizeof(V128); + d->fxState[2].size = sizeof(V128); stmt(IRStmt_Dirty(d)); - s390_cc_thunk_fill(mkU64(S390_CC_OP_SET), - mkexpr(cc), mkU64(0), mkU64(0)); + s390_cc_set(cc); } - put_vr_qw(v1, result); return "vpks"; } static const HChar * s390_irgen_VPKLS(UChar v1, UChar v2, UChar v3, UChar m4, UChar m5) { - IRType type = s390_vr_get_type(m4); - IRExpr* result = NULL; - switch(type) { - case Ity_I16: - result = binop(Iop_QNarrowBin16Uto8Ux16, get_vr_qw(v2), get_vr_qw(v3)); - break; - case Ity_I32: - result = binop(Iop_QNarrowBin32Uto16Ux8, get_vr_qw(v2), get_vr_qw(v3)); - break; - case Ity_I64: - result = binop(Iop_QNarrowBin64Uto32Ux4, get_vr_qw(v2), get_vr_qw(v3)); - break; - default: - ppIRType(type); - vpanic("s390_irgen_VPKLS: unknown type"); - } + if (!s390_vr_is_cs_set(m5)) { + const IROp ops[] = { Iop_QNarrowBin16Uto8Ux16, Iop_QNarrowBin32Uto16Ux8, + Iop_QNarrowBin64Uto32Ux4 }; + Char index = m4 - 1; + vassert((index >= 0) && (index < sizeof(ops) / sizeof(ops[0]))); + put_vr_qw(v1, binop(ops[index], get_vr_qw(v2), get_vr_qw(v3))); - if((m5 & 0x1) != 0) { + } else { IRDirty* d; IRTemp cc = newTemp(Ity_I64); - ULong opcode = s390x_cc_vec_opcode(S390_CC_VEC_VPKLS, m4); - d = unsafeIRDirty_1_N(cc, 0, "s390x_dirtyhelper_vec_binop", - &s390x_dirtyhelper_vec_binop, - mkIRExprVec_4(IRExpr_GSPTR(), mkU64(opcode), - mkU64(v2), mkU64(v3))); - d->nFxState = 2; + + s390x_vec_op_details_t details = { .serialized = 0ULL }; + details.op = S390_VEC_OP_VPKLS; + details.v1 = v1; + details.v2 = v2; + details.v3 = v3; + details.m4 = m4; + details.m5 = m5; + + d = unsafeIRDirty_1_N(cc, 0, "s390x_dirtyhelper_vec_op", + &s390x_dirtyhelper_vec_op, + mkIRExprVec_2(IRExpr_GSPTR(), + mkU64(details.serialized))); + + d->nFxState = 3; 
vex_bzero(&d->fxState, sizeof(d->fxState));
    d->fxState[0].fx     = Ifx_Read;
    d->fxState[0].offset = S390X_GUEST_OFFSET(guest_v0) + v2 * sizeof(V128);
    d->fxState[0].size   = sizeof(V128);
    d->fxState[1].fx     = Ifx_Read;
    d->fxState[1].offset = S390X_GUEST_OFFSET(guest_v0) + v3 * sizeof(V128);
    d->fxState[1].size   = sizeof(V128);
+   d->fxState[2].fx     = Ifx_Write;
+   d->fxState[2].offset = S390X_GUEST_OFFSET(guest_v0) + v1 * sizeof(V128);
+   d->fxState[2].size   = sizeof(V128);
 
    stmt(IRStmt_Dirty(d));
 
-      s390_cc_thunk_fill(mkU64(S390_CC_OP_SET),
-                         mkexpr(cc), mkU64(0), mkU64(0));
+      s390_cc_set(cc);
    }
 
-   put_vr_qw(v1, result);
    return "vpkls";
 }
 
 static const HChar *
 s390_irgen_VSEL(UChar v1, UChar v2, UChar v3, UChar v4)
 {
-   IRExpr* vA = get_vr_qw(v3);
-   IRExpr* vB = get_vr_qw(v2);
-   IRExpr* vC = get_vr_qw(v4);
-
-   /* result = (vA & ~vC) | (vB & vC) */
-   put_vr_qw(v1,
-             binop(Iop_OrV128,
-                   binop(Iop_AndV128, vA, unop(Iop_NotV128, vC)),
-                   binop(Iop_AndV128, vB, vC)
-                   )
-             );
+   IRExpr* vIfTrue  = get_vr_qw(v2);
+   IRExpr* vIfFalse = get_vr_qw(v3);
+   IRExpr* vCond    = get_vr_qw(v4);
+
+   put_vr_qw(v1, s390_V128_bitwiseITE(vCond, vIfTrue, vIfFalse));
    return "vsel";
 }
@@ -16021,229 +16309,1595 @@ s390_irgen_LOCHHI(UChar r1, UChar m3, UShort i2, UChar unused)
    next_insn_if(binop(Iop_CmpEQ32, s390_call_calculate_cond(m3), mkU32(0)));
    put_gpr_w0(r1, mkU32(i2));
 
-   return "lochhi";
+   return "lochhi";
+}
+
+static const HChar *
+s390_irgen_LOCHI(UChar r1, UChar m3, UShort i2, UChar unused)
+{
+   next_insn_if(binop(Iop_CmpEQ32, s390_call_calculate_cond(m3), mkU32(0)));
+   put_gpr_w1(r1, mkU32(i2));
+
+   return "lochi";
+}
+
+static const HChar *
+s390_irgen_LOCGHI(UChar r1, UChar m3, UShort i2, UChar unused)
+{
+   next_insn_if(binop(Iop_CmpEQ32, s390_call_calculate_cond(m3), mkU32(0)));
+   put_gpr_dw0(r1, mkU64(i2));
+
+   return "locghi";
+}
+
+static const HChar *
+s390_irgen_STOCFH(UChar r1, IRTemp op2addr)
+{
+   /* condition is checked in format handler */
+   store(mkexpr(op2addr), get_gpr_w1(r1));
+
+   return "stocfh";
+}
+
+static const HChar *
+s390_irgen_LCBB(UChar r1, IRTemp op2addr, UChar m3)
+{
+   IRTemp op2 = newTemp(Ity_I32);
+   assign(op2, s390_getCountToBlockBoundary(op2addr, m3));
+   put_gpr_w1(r1, mkexpr(op2));
+
+   IRExpr* cc = mkite(binop(Iop_CmpEQ32, mkexpr(op2), mkU32(16)),
+                      mkU64(0), mkU64(3));
+   s390_cc_thunk_fill(mkU64(S390_CC_OP_SET), cc, mkU64(0), mkU64(0));
+
+   return "lcbb";
+}
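[A scalar model, not part of the patch, of what LCBB computes above: the
number of bytes from the given address up to the next block boundary, capped
at 16.  The mapping of m3 to a block size of 64 << m3 bytes is an assumption
based on the z/Architecture description; in the patch it is hidden inside
s390_getCountToBlockBoundary():

   #include <stdint.h>

   static unsigned lcbb_count(uint64_t addr, unsigned m3)
   {
      uint64_t block = 64u << m3;                   /* assumed block size */
      uint64_t to_boundary = block - (addr & (block - 1));
      return to_boundary < 16 ? (unsigned) to_boundary : 16;
   }

The cc logic above matches this model: cc = 0 when a full 16 bytes fit before
the boundary, cc = 3 otherwise.]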
+
+/* Regarding the use of
+
+   // Dummy helper which is used to signal VEX library that memory was loaded
+   sha512_loadparam
+     = unsafeIRDirty_0_N(0, "s390x_dirtyhelper_PPNO_sha512_load_param_block",
+                         &s390x_dirtyhelper_PPNO_sha512_load_param_block,
+                         mkIRExprVec_0());
+
+   in the following function (s390_irgen_PPNO).  This is a workaround to get
+   around the fact that IRDirty annotations cannot indicate two memory side
+   effects, which are unfortunately necessary here.  It will possibly lead to
+   losing undefinedness (undefinedness in some inputs might not be propagated
+   to the outputs as it should, in Memcheck).  The correct fix would be to
+   extend IRDirty to represent two memory side effects, but that's quite a
+   bit of work.
+
+   Here's a summary of what this insn does.
+
+   // getReg(RegisterNumber n) returns the value of GPR number 'n'
+
+   // reg1 and reg2 are even
+   void ppno(RegisterNumber reg1, RegisterNumber reg2) {
+
+       switch(getReg(0)) {
+       case 0x0:
+           // Query mode, ignore reg1 and reg2
+           // Write 16 bytes at  getReg(1)
+           break;
+
+       case 0x3:
+           // SHA-512 generate mode, ignore reg2
+
+           // Read 240 bytes at  getReg(1)
+           // Write getReg(reg1 + 1) bytes at  getReg(reg1)
+           // Write some of 240 bytes starting at  getReg(1)
+           break;
+
+       case 0x83:
+           // SHA-512 seed mode, ignore reg1
+
+           // Read some of 240 bytes starting at  getReg(1)
+           // Read getReg(reg2 + 1) bytes at  getReg(reg2)
+           // Write 240 bytes at  getReg(1)
+           break;
+
+       default:
+           // Specification exception, abort execution.
+       }
+   }
+*/
+/* Also known as "prno".
+   If you implement new function codes, please don't forget to update the
+   "s390x_dirtyhelper_PPNO_query" function.
+ */
+static const HChar *
+s390_irgen_PPNO(UChar r1, UChar r2)
+{
+   if (!s390_host_has_msa5) {
+      emulation_failure(EmFail_S390X_ppno);
+      return "ppno";
+   }
+
+   /* These conditions lead to a specification exception */
+   vassert(r1 % 2 == 0);
+   vassert(r2 % 2 == 0);
+   vassert((r1 != 0) && (r2 != 0));
+
+   IRDirty *query, *sha512_gen, *sha512_seed, *sha512_loadparam;
+   IRTemp gpr1num = newTemp(Ity_I64);
+   IRTemp gpr2num = newTemp(Ity_I64);
+
+   IRTemp funcCode = newTemp(Ity_I8);
+   IRTemp is_query = newTemp(Ity_I1);
+   IRTemp is_sha512_gen = newTemp(Ity_I1);
+   IRTemp is_sha512_seed = newTemp(Ity_I1);
+   IRTemp is_sha512 = newTemp(Ity_I1);
+
+   assign(funcCode, unop(Iop_64to8, binop(Iop_And64, get_gpr_dw0(0),
+                                          mkU64(0xffULL))));
+   assign(gpr1num, mkU64(r1));
+   assign(gpr2num, mkU64(r2));
+
+   assign(is_query, binop(Iop_CmpEQ8, mkexpr(funcCode), mkU8(S390_PPNO_QUERY)));
+   assign(is_sha512_gen, binop(Iop_CmpEQ8, mkexpr(funcCode),
+                               mkU8(S390_PPNO_SHA512_GEN)));
+   assign(is_sha512_seed, binop(Iop_CmpEQ8, mkexpr(funcCode),
+                                mkU8(S390_PPNO_SHA512_SEED)));
+   assign(is_sha512, binop(Iop_CmpEQ8,
+                           mkU8(S390_PPNO_SHA512_GEN),
+                           binop(Iop_And8,
+                                 mkexpr(funcCode),
+                                 mkU8(S390_PPNO_SHA512_GEN)
+                                 )
+                           ));
+
+   query = unsafeIRDirty_0_N(0, "s390x_dirtyhelper_PPNO_query",
+                             &s390x_dirtyhelper_PPNO_query,
+                             mkIRExprVec_3(IRExpr_GSPTR(), mkexpr(gpr1num),
+                                           mkexpr(gpr2num)));
+   query->guard = mkexpr(is_query);
+   query->nFxState = 1;
+   vex_bzero(&query->fxState, sizeof(query->fxState));
+   query->fxState[0].fx     = Ifx_Read;
+   query->fxState[0].offset = S390X_GUEST_OFFSET(guest_r0);
+   query->fxState[0].size   = 2 * sizeof(ULong); /* gpr0 and gpr1 are read */
+   query->mAddr = get_gpr_dw0(1);
+   query->mSize = S390_PPNO_PARAM_BLOCK_SIZE_QUERY;
+   query->mFx   = Ifx_Write;
+
+   IRTemp gen_cc = newTemp(Ity_I64);
+   sha512_gen = unsafeIRDirty_1_N(gen_cc, 0, "s390x_dirtyhelper_PPNO_sha512",
+                                  &s390x_dirtyhelper_PPNO_sha512,
+                                  mkIRExprVec_3(IRExpr_GSPTR(), mkexpr(gpr1num),
+                                                mkexpr(gpr2num)));
+   sha512_gen->guard = mkexpr(is_sha512_gen);
+   sha512_gen->nFxState = 3;
+   vex_bzero(&sha512_gen->fxState, sizeof(sha512_gen->fxState));
+   sha512_gen->fxState[0].fx     = Ifx_Read;
+   sha512_gen->fxState[0].offset = S390X_GUEST_OFFSET(guest_r0);
+   sha512_gen->fxState[0].size   = 2 * sizeof(ULong); /* gpr0 and gpr1 are read */
+   sha512_gen->fxState[1].fx     = Ifx_Read;
+   sha512_gen->fxState[1].offset = S390X_GUEST_OFFSET(guest_r0) + r1 * sizeof(ULong);
+   sha512_gen->fxState[1].size   = sizeof(ULong);
+   sha512_gen->fxState[2].fx     = Ifx_Modify;
+   sha512_gen->fxState[2].offset = S390X_GUEST_OFFSET(guest_r0) + (r1 + 1) * sizeof(ULong);
+   sha512_gen->fxState[2].size   = sizeof(ULong);
+   sha512_gen->mAddr =
get_gpr_dw0(r1); + sha512_gen->mSize = S390_PPNO_MAX_SIZE_SHA512_GEN; + sha512_gen->mFx = Ifx_Write; + + IRTemp unused = newTemp(Ity_I64); + sha512_seed = unsafeIRDirty_1_N(unused, 0, "s390x_dirtyhelper_PPNO_sha512", + &s390x_dirtyhelper_PPNO_sha512, + mkIRExprVec_3(IRExpr_GSPTR(), mkexpr(gpr1num), + mkexpr(gpr2num))); + sha512_seed->guard = mkexpr(is_sha512_seed); + sha512_seed->nFxState = 2; + vex_bzero(&sha512_seed->fxState, sizeof(sha512_seed->fxState)); + sha512_seed->fxState[0].fx = Ifx_Read; + sha512_seed->fxState[0].offset = S390X_GUEST_OFFSET(guest_r0); + sha512_seed->fxState[0].size = 2 * sizeof(ULong); /* gpr0 and gpr1 are read */ + sha512_seed->fxState[1].fx = Ifx_Read; + sha512_seed->fxState[1].offset = S390X_GUEST_OFFSET(guest_r0) + r2 * sizeof(ULong); + sha512_seed->fxState[1].size = 2 * sizeof(ULong); /* r2 and r2 + 1 are read */ + sha512_seed->mAddr = get_gpr_dw0(r2); + sha512_seed->mSize = S390_PPNO_MAX_SIZE_SHA512_SEED; + sha512_seed->mFx = Ifx_Write; + + /* Dummy helper which is used to signal VEX library that memory was loaded */ + sha512_loadparam = + unsafeIRDirty_0_N(0, "s390x_dirtyhelper_PPNO_sha512_load_param_block", + &s390x_dirtyhelper_PPNO_sha512_load_param_block, + mkIRExprVec_0()); + sha512_loadparam->guard = mkexpr(is_sha512); + sha512_loadparam->nFxState = 0; + vex_bzero(&sha512_loadparam->fxState, sizeof(sha512_loadparam->fxState)); + sha512_loadparam->mAddr = get_gpr_dw0(1); + sha512_loadparam->mSize = S390_PPNO_PARAM_BLOCK_SIZE_SHA512; + sha512_loadparam->mFx = Ifx_Read; + + IRDirty* sha512_saveparam = + unsafeIRDirty_0_N(0, "s390x_dirtyhelper_PPNO_sha512_save_param_block", + &s390x_dirtyhelper_PPNO_sha512_load_param_block, + mkIRExprVec_0()); + sha512_saveparam->guard = mkexpr(is_sha512); + sha512_saveparam->nFxState = 0; + vex_bzero(&sha512_saveparam->fxState, sizeof(sha512_saveparam->fxState)); + sha512_saveparam->mAddr = get_gpr_dw0(1); + sha512_saveparam->mSize = S390_PPNO_PARAM_BLOCK_SIZE_SHA512; + sha512_saveparam->mFx = Ifx_Write; + + stmt(IRStmt_Dirty(query)); + stmt(IRStmt_Dirty(sha512_loadparam)); + stmt(IRStmt_Dirty(sha512_gen)); + stmt(IRStmt_Dirty(sha512_seed)); + stmt(IRStmt_Dirty(sha512_saveparam)); + + IRTemp cc = newTemp(Ity_I64); + assign(cc, + mkite(mkexpr(is_sha512_gen), + mkexpr(gen_cc), + mkU64(0) + ) + ); + + s390_cc_thunk_fill(mkU64(S390_CC_OP_SET), mkexpr(cc), mkU64(0), mkU64(0)); + + return "ppno"; +} + +static const HChar * +s390_irgen_VFAE(UChar v1, UChar v2, UChar v3, UChar m4, UChar m5) +{ + IRDirty* d; + IRTemp cc = newTemp(Ity_I64); + + /* Check for specification exception */ + vassert(m4 < 3); + + s390x_vec_op_details_t details = { .serialized = 0ULL }; + details.op = S390_VEC_OP_VFAE; + details.v1 = v1; + details.v2 = v2; + details.v3 = v3; + details.m4 = m4; + details.m5 = m5; + + d = unsafeIRDirty_1_N(cc, 0, "s390x_dirtyhelper_vec_op", + &s390x_dirtyhelper_vec_op, + mkIRExprVec_2(IRExpr_GSPTR(), + mkU64(details.serialized))); + + d->nFxState = 3; + vex_bzero(&d->fxState, sizeof(d->fxState)); + d->fxState[0].fx = Ifx_Read; + d->fxState[0].offset = S390X_GUEST_OFFSET(guest_v0) + v2 * sizeof(V128); + d->fxState[0].size = sizeof(V128); + d->fxState[1].fx = Ifx_Read; + d->fxState[1].offset = S390X_GUEST_OFFSET(guest_v0) + v3 * sizeof(V128); + d->fxState[1].size = sizeof(V128); + d->fxState[2].fx = Ifx_Write; + d->fxState[2].offset = S390X_GUEST_OFFSET(guest_v0) + v1 * sizeof(V128); + d->fxState[2].size = sizeof(V128); + + stmt(IRStmt_Dirty(d)); + + if (s390_vr_is_cs_set(m5)) { + s390_cc_set(cc); + } + + return "vfae"; +} + 
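[All of these string irgens hand the dirty helper a single ULong built via
s390x_vec_op_details_t.  A self-contained round-trip of that convention, not
part of the patch; 'details_t' redeclares the union locally and relies, like
the original, on both sides being compiled with the same bit-field layout:

   #include <assert.h>
   #include <stdint.h>

   typedef union {
      struct {
         unsigned int op : 8;
         unsigned int v1 : 5, v2 : 5, v3 : 5, v4 : 5;
         unsigned int m4 : 4, m5 : 4;
         unsigned int read_only : 1;
         unsigned int reserved : 27;
      };
      uint64_t serialized;
   } details_t;

   int main(void)
   {
      details_t d = { .serialized = 0 };
      d.op = 3;   /* S390_VEC_OP_VFAE */
      d.v1 = 1; d.v2 = 2; d.v3 = 3; d.m5 = 4;
      uint64_t wire = d.serialized;  /* what mkU64(details.serialized) passes */
      details_t u = { .serialized = wire };
      assert(u.op == 3 && u.v1 == 1 && u.m5 == 4);  /* the helper's view */
      return 0;
   }
]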
+static const HChar *
+s390_irgen_VFEE(UChar v1, UChar v2, UChar v3, UChar m4, UChar m5)
+{
+   IRDirty* d;
+   IRTemp cc = newTemp(Ity_I64);
+
+   /* Check for specification exception */
+   vassert(m4 < 3);
+   vassert((m5 & 0b1100) == 0);
+
+   s390x_vec_op_details_t details = { .serialized = 0ULL };
+   details.op = S390_VEC_OP_VFEE;
+   details.v1 = v1;
+   details.v2 = v2;
+   details.v3 = v3;
+   details.m4 = m4;
+   details.m5 = m5;
+
+   d = unsafeIRDirty_1_N(cc, 0, "s390x_dirtyhelper_vec_op",
+                         &s390x_dirtyhelper_vec_op,
+                         mkIRExprVec_2(IRExpr_GSPTR(),
+                                       mkU64(details.serialized)));
+
+   d->nFxState = 3;
+   vex_bzero(&d->fxState, sizeof(d->fxState));
+   d->fxState[0].fx     = Ifx_Read;
+   d->fxState[0].offset = S390X_GUEST_OFFSET(guest_v0) + v2 * sizeof(V128);
+   d->fxState[0].size   = sizeof(V128);
+   d->fxState[1].fx     = Ifx_Read;
+   d->fxState[1].offset = S390X_GUEST_OFFSET(guest_v0) + v3 * sizeof(V128);
+   d->fxState[1].size   = sizeof(V128);
+   d->fxState[2].fx     = Ifx_Write;
+   d->fxState[2].offset = S390X_GUEST_OFFSET(guest_v0) + v1 * sizeof(V128);
+   d->fxState[2].size   = sizeof(V128);
+
+   stmt(IRStmt_Dirty(d));
+
+   if (s390_vr_is_cs_set(m5)) {
+      s390_cc_set(cc);
+   }
+
+   return "vfee";
+}
+
+static const HChar *
+s390_irgen_VFENE(UChar v1, UChar v2, UChar v3, UChar m4, UChar m5)
+{
+   const Bool negateComparison = True;
+   const IRType type = s390_vr_get_type(m4);
+
+   /* Check for specification exception */
+   vassert(m4 < 3);
+   vassert((m5 & 0b1100) == 0);
+
+   static const IROp elementGetters[] = {
+      Iop_GetElem8x16, Iop_GetElem16x8, Iop_GetElem32x4
+   };
+   IROp getter = elementGetters[m4];
+
+   static const IROp elementComparators[] = {
+      Iop_CmpEQ8, Iop_CmpEQ16, Iop_CmpEQ32
+   };
+   IROp comparator = elementComparators[m4];
+
+   static const IROp resultConverter[] = {Iop_64to8, Iop_64to16, Iop_64to32};
+   IROp converter = resultConverter[m4];
+
+   IRTemp isZeroElem;
+
+   IRTemp counter = newTemp(Ity_I64);
+   assign(counter, get_counter_dw0());
+
+   IRTemp arg1 = newTemp(type);
+   assign(arg1, binop(getter, get_vr_qw(v2), unop(Iop_64to8, mkexpr(counter))));
+   IRTemp arg2 = newTemp(type);
+   assign(arg2, binop(getter, get_vr_qw(v3), unop(Iop_64to8, mkexpr(counter))));
+
+   IRTemp isGoodPair = newTemp(Ity_I1);
+   if(negateComparison) {
+      assign(isGoodPair, unop(Iop_Not1, binop(comparator, mkexpr(arg1),
+                                              mkexpr(arg2))));
+   } else {
+      assign(isGoodPair, binop(comparator, mkexpr(arg1), mkexpr(arg2)));
+   }
+
+   if(s390_vr_is_zs_set(m5)) {
+      isZeroElem = newTemp(Ity_I1);
+      assign(isZeroElem, binop(comparator, mkexpr(arg1),
+                               unop(converter, mkU64(0))));
+   }
+
+   static const UChar invalidIndices[] = {16, 8, 4};
+   const UChar invalidIndex = invalidIndices[m4];
+   IRTemp endOfVectorIsReached = newTemp(Ity_I1);
+   assign(endOfVectorIsReached, binop(Iop_CmpEQ64, mkexpr(counter),
+                                      mkU64(invalidIndex)));
+
+   put_counter_dw0(binop(Iop_Add64, mkexpr(counter), mkU64(1)));
+   IRExpr* shouldBreak = binop(Iop_Or32,
+                               unop(Iop_1Uto32, mkexpr(isGoodPair)),
+                               unop(Iop_1Uto32, mkexpr(endOfVectorIsReached))
+                               );
+   if(s390_vr_is_zs_set(m5)) {
+      shouldBreak = binop(Iop_Or32,
+                          shouldBreak,
+                          unop(Iop_1Uto32, mkexpr(isZeroElem)));
+   }
+   iterate_if(binop(Iop_CmpEQ32, shouldBreak, mkU32(0)));
+
+   IRExpr* foundIndex = binop(Iop_Sub64, get_counter_dw0(), mkU64(1));
+   if(m4 > 0) {
+      /* We have to return the index of a byte, but the loop above found
+         the index of an element; in general the two differ:
+ if byte elem (m4 == 0) then indexOfByte = indexOfElement + if halfword elem (m4 == 1) then indexOfByte = 2 * indexOfElement + = indexOfElement << 1 + if word elem (m4 == 2) then indexOfByte = 4 * indexOfElement + = indexOfElement << 2 + */ + foundIndex = binop(Iop_Shl64, foundIndex, mkU8(m4)); + } + + IRTemp result = newTemp(Ity_I64); + assign(result, mkite(mkexpr(endOfVectorIsReached), + mkU64(16), + foundIndex)); + put_vr_qw(v1, binop(Iop_64HLtoV128, mkexpr(result), mkU64(0))); + + + if (s390_vr_is_cs_set(m5)) { + static const IROp to64Converters[] = {Iop_8Uto64, Iop_16Uto64, Iop_32Uto64}; + IROp to64Converter = to64Converters[m4]; + + IRExpr* arg1IsLessThanArg2 = binop(Iop_CmpLT64U, + unop(to64Converter, mkexpr(arg1)), + unop(to64Converter, mkexpr(arg2))); + + IRExpr* ccexp = mkite(binop(Iop_CmpEQ32, + unop(Iop_1Uto32, mkexpr(isGoodPair)), + mkU32(1)), + mkite(arg1IsLessThanArg2, mkU64(1), mkU64(2)), + mkU64(3)); + + if(s390_vr_is_zs_set(m5)) { + IRExpr* arg2IsZero = binop(comparator, mkexpr(arg2), + unop(converter, mkU64(0))); + IRExpr* bothArgsAreZero = binop(Iop_And32, + unop(Iop_1Uto32, mkexpr(isZeroElem)), + unop(Iop_1Uto32, arg2IsZero)); + ccexp = mkite(binop(Iop_CmpEQ32, bothArgsAreZero, mkU32(1)), + mkU64(0), + ccexp); + } + IRTemp cc = newTemp(Ity_I64); + assign(cc, ccexp); + + s390_cc_set(cc); + } + + + put_counter_dw0(mkU64(0)); + return "vfene"; +} + +static const HChar * +s390_irgen_VISTR(UChar v1, UChar v2, UChar m3, UChar m5) +{ + IRDirty* d; + IRTemp cc = newTemp(Ity_I64); + + /* Check for specification exception */ + vassert(m3 < 3); + vassert((m5 & 0b1110) == 0); + + s390x_vec_op_details_t details = { .serialized = 0ULL }; + details.op = S390_VEC_OP_VISTR; + details.v1 = v1; + details.v2 = v2; + details.m4 = m3; + details.m5 = m5; + + d = unsafeIRDirty_1_N(cc, 0, "s390x_dirtyhelper_vec_op", + &s390x_dirtyhelper_vec_op, + mkIRExprVec_2(IRExpr_GSPTR(), + mkU64(details.serialized))); + + d->nFxState = 2; + vex_bzero(&d->fxState, sizeof(d->fxState)); + d->fxState[0].fx = Ifx_Read; + d->fxState[0].offset = S390X_GUEST_OFFSET(guest_v0) + v2 * sizeof(V128); + d->fxState[0].size = sizeof(V128); + d->fxState[1].fx = Ifx_Write; + d->fxState[1].offset = S390X_GUEST_OFFSET(guest_v0) + v1 * sizeof(V128); + d->fxState[1].size = sizeof(V128); + + stmt(IRStmt_Dirty(d)); + + if (s390_vr_is_cs_set(m5)) { + s390_cc_set(cc); + } + + return "vistr"; +} + +static const HChar * +s390_irgen_VSTRC(UChar v1, UChar v2, UChar v3, UChar v4, UChar m5, UChar m6) +{ + IRDirty* d; + IRTemp cc = newTemp(Ity_I64); + + /* Check for specification exception */ + vassert(m5 < 3); + + s390x_vec_op_details_t details = { .serialized = 0ULL }; + details.op = S390_VEC_OP_VSTRC; + details.v1 = v1; + details.v2 = v2; + details.v3 = v3; + details.v4 = v4; + details.m4 = m5; + details.m5 = m6; + + d = unsafeIRDirty_1_N(cc, 0, "s390x_dirtyhelper_vec_op", + &s390x_dirtyhelper_vec_op, + mkIRExprVec_2(IRExpr_GSPTR(), + mkU64(details.serialized))); + + d->nFxState = 4; + vex_bzero(&d->fxState, sizeof(d->fxState)); + d->fxState[0].fx = Ifx_Read; + d->fxState[0].offset = S390X_GUEST_OFFSET(guest_v0) + v2 * sizeof(V128); + d->fxState[0].size = sizeof(V128); + d->fxState[1].fx = Ifx_Read; + d->fxState[1].offset = S390X_GUEST_OFFSET(guest_v0) + v3 * sizeof(V128); + d->fxState[1].size = sizeof(V128); + d->fxState[2].fx = Ifx_Read; + d->fxState[2].offset = S390X_GUEST_OFFSET(guest_v0) + v4 * sizeof(V128); + d->fxState[2].size = sizeof(V128); + d->fxState[3].fx = Ifx_Write; + d->fxState[3].offset = S390X_GUEST_OFFSET(guest_v0) 
+ v1 * sizeof(V128); + d->fxState[3].size = sizeof(V128); + + stmt(IRStmt_Dirty(d)); + + if (s390_vr_is_cs_set(m6)) { + s390_cc_set(cc); + } + + return "vstrc"; +} + +static const HChar * +s390_irgen_VNC(UChar v1, UChar v2, UChar v3) +{ + put_vr_qw(v1, binop(Iop_AndV128, + get_vr_qw(v2), unop(Iop_NotV128, get_vr_qw(v3))) + ); + + return "vnc"; +} + +static const HChar * +s390_irgen_VA(UChar v1, UChar v2, UChar v3, UChar m4) +{ + const IROp ops[] = { Iop_Add8x16, Iop_Add16x8, Iop_Add32x4, + Iop_Add64x2, Iop_Add128x1 }; + vassert(m4 < sizeof(ops) / sizeof(ops[0])); + put_vr_qw(v1, binop(ops[m4], get_vr_qw(v2), get_vr_qw(v3))); + + return "va"; +} + +static const HChar * +s390_irgen_VS(UChar v1, UChar v2, UChar v3, UChar m4) +{ + const IROp ops[] = { Iop_Sub8x16, Iop_Sub16x8, Iop_Sub32x4, + Iop_Sub64x2, Iop_Sub128x1 }; + vassert(m4 < sizeof(ops) / sizeof(ops[0])); + put_vr_qw(v1, binop(ops[m4], get_vr_qw(v2), get_vr_qw(v3))); + + return "vs"; +} + +static const HChar * +s390_irgen_VMX(UChar v1, UChar v2, UChar v3, UChar m4) +{ + const IROp ops[] = { Iop_Max8Sx16, Iop_Max16Sx8, Iop_Max32Sx4, Iop_Max64Sx2 }; + vassert(m4 < sizeof(ops) / sizeof(ops[0])); + put_vr_qw(v1, binop(ops[m4], get_vr_qw(v2), get_vr_qw(v3))); + + return "vmx"; +} + +static const HChar * +s390_irgen_VMXL(UChar v1, UChar v2, UChar v3, UChar m4) +{ + const IROp ops[] = { Iop_Max8Ux16, Iop_Max16Ux8, Iop_Max32Ux4, Iop_Max64Ux2 }; + vassert(m4 < sizeof(ops) / sizeof(ops[0])); + put_vr_qw(v1, binop(ops[m4], get_vr_qw(v2), get_vr_qw(v3))); + + return "vmxl"; +} + +static const HChar * +s390_irgen_VMN(UChar v1, UChar v2, UChar v3, UChar m4) +{ + const IROp ops[] = { Iop_Min8Sx16, Iop_Min16Sx8, Iop_Min32Sx4, Iop_Min64Sx2 }; + vassert(m4 < sizeof(ops) / sizeof(ops[0])); + put_vr_qw(v1, binop(ops[m4], get_vr_qw(v2), get_vr_qw(v3))); + + return "vmn"; +} + +static const HChar * +s390_irgen_VMNL(UChar v1, UChar v2, UChar v3, UChar m4) +{ + const IROp ops[] = { Iop_Min8Ux16, Iop_Min16Ux8, Iop_Min32Ux4, Iop_Min64Ux2 }; + vassert(m4 < sizeof(ops) / sizeof(ops[0])); + put_vr_qw(v1, binop(ops[m4], get_vr_qw(v2), get_vr_qw(v3))); + + return "vmnl"; +} + +static const HChar * +s390_irgen_VAVG(UChar v1, UChar v2, UChar v3, UChar m4) +{ + const IROp ops[] = { Iop_Avg8Sx16, Iop_Avg16Sx8, Iop_Avg32Sx4, Iop_Avg64Sx2 }; + vassert(m4 < sizeof(ops) / sizeof(ops[0])); + put_vr_qw(v1, binop(ops[m4], get_vr_qw(v2), get_vr_qw(v3))); + + return "vavg"; +} + +static const HChar * +s390_irgen_VAVGL(UChar v1, UChar v2, UChar v3, UChar m4) +{ + const IROp ops[] = { Iop_Avg8Ux16, Iop_Avg16Ux8, Iop_Avg32Ux4, Iop_Avg64Ux2 }; + vassert(m4 < sizeof(ops) / sizeof(ops[0])); + put_vr_qw(v1, binop(ops[m4], get_vr_qw(v2), get_vr_qw(v3))); + + return "vavgl"; +} + +static const HChar * +s390_irgen_VLC(UChar v1, UChar v2, UChar m3) +{ + vassert(m3 < 4); + IRType type = s390_vr_get_type(m3); + put_vr_qw(v1, s390_V128_get_complement(get_vr_qw(v2), type)); + return "vlc"; +} + +static const HChar * +s390_irgen_VLP(UChar v1, UChar v2, UChar m3) +{ + const IROp ops[] = { Iop_Abs8x16, Iop_Abs16x8, Iop_Abs32x4, Iop_Abs64x2 }; + vassert(m3 < sizeof(ops) / sizeof(ops[0])); + put_vr_qw(v1, unop(ops[m3], get_vr_qw(v2))); + + return "vlp"; +} + +static const HChar * +s390_irgen_VCH(UChar v1, UChar v2, UChar v3, UChar m4, UChar m5) +{ + if (!s390_vr_is_cs_set(m5)) { + const IROp ops[] = { Iop_CmpGT8Sx16, Iop_CmpGT16Sx8, Iop_CmpGT32Sx4, + Iop_CmpGT64Sx2 }; + vassert(m4 < sizeof(ops) / sizeof(ops[0])); + put_vr_qw(v1, binop(ops[m4], get_vr_qw(v2), get_vr_qw(v3))); + + } else { + 
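/* When the CS bit of m5 is set, VCH also has to deliver a
+         condition code, which the pure IR comparison above does not
+         produce, so the work is delegated to s390x_dirtyhelper_vec_op
+         and the cc it returns is committed below. */
+      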
IRDirty* d; + IRTemp cc = newTemp(Ity_I64); + + s390x_vec_op_details_t details = { .serialized = 0ULL }; + details.op = S390_VEC_OP_VCH; + details.v1 = v1; + details.v2 = v2; + details.v3 = v3; + details.m4 = m4; + details.m5 = m5; + + d = unsafeIRDirty_1_N(cc, 0, "s390x_dirtyhelper_vec_op", + &s390x_dirtyhelper_vec_op, + mkIRExprVec_2(IRExpr_GSPTR(), + mkU64(details.serialized))); + + d->nFxState = 3; + vex_bzero(&d->fxState, sizeof(d->fxState)); + d->fxState[0].fx = Ifx_Read; + d->fxState[0].offset = S390X_GUEST_OFFSET(guest_v0) + v2 * sizeof(V128); + d->fxState[0].size = sizeof(V128); + d->fxState[1].fx = Ifx_Read; + d->fxState[1].offset = S390X_GUEST_OFFSET(guest_v0) + v3 * sizeof(V128); + d->fxState[1].size = sizeof(V128); + d->fxState[2].fx = Ifx_Write; + d->fxState[2].offset = S390X_GUEST_OFFSET(guest_v0) + v1 * sizeof(V128); + d->fxState[2].size = sizeof(V128); + + stmt(IRStmt_Dirty(d)); + s390_cc_set(cc); + } + + return "vch"; +} + +static const HChar * +s390_irgen_VCHL(UChar v1, UChar v2, UChar v3, UChar m4, UChar m5) +{ + if (!s390_vr_is_cs_set(m5)) { + const IROp ops[] = { Iop_CmpGT8Ux16, Iop_CmpGT16Ux8, Iop_CmpGT32Ux4, + Iop_CmpGT64Ux2 }; + vassert(m4 < sizeof(ops) / sizeof(ops[0])); + put_vr_qw(v1, binop(ops[m4], get_vr_qw(v2), get_vr_qw(v3))); + + } else { + IRDirty* d; + IRTemp cc = newTemp(Ity_I64); + + s390x_vec_op_details_t details = { .serialized = 0ULL }; + details.op = S390_VEC_OP_VCHL; + details.v1 = v1; + details.v2 = v2; + details.v3 = v3; + details.m4 = m4; + details.m5 = m5; + + d = unsafeIRDirty_1_N(cc, 0, "s390x_dirtyhelper_vec_op", + &s390x_dirtyhelper_vec_op, + mkIRExprVec_2(IRExpr_GSPTR(), + mkU64(details.serialized))); + + d->nFxState = 3; + vex_bzero(&d->fxState, sizeof(d->fxState)); + d->fxState[0].fx = Ifx_Read; + d->fxState[0].offset = S390X_GUEST_OFFSET(guest_v0) + v2 * sizeof(V128); + d->fxState[0].size = sizeof(V128); + d->fxState[1].fx = Ifx_Read; + d->fxState[1].offset = S390X_GUEST_OFFSET(guest_v0) + v3 * sizeof(V128); + d->fxState[1].size = sizeof(V128); + d->fxState[2].fx = Ifx_Write; + d->fxState[2].offset = S390X_GUEST_OFFSET(guest_v0) + v1 * sizeof(V128); + d->fxState[2].size = sizeof(V128); + + stmt(IRStmt_Dirty(d)); + s390_cc_set(cc); + } + + return "vchl"; +} + +static const HChar * +s390_irgen_VCLZ(UChar v1, UChar v2, UChar m3) +{ + const IROp ops[] = { Iop_Clz8x16, Iop_Clz16x8, Iop_Clz32x4, Iop_Clz64x2 }; + vassert(m3 < sizeof(ops) / sizeof(ops[0])); + put_vr_qw(v1, unop(ops[m3], get_vr_qw(v2))); + + return "vclz"; +} + +static const HChar * +s390_irgen_VCTZ(UChar v1, UChar v2, UChar m3) +{ + const IROp ops[] = { Iop_Ctz8x16, Iop_Ctz16x8, Iop_Ctz32x4, Iop_Ctz64x2 }; + vassert(m3 < sizeof(ops) / sizeof(ops[0])); + put_vr_qw(v1, unop(ops[m3], get_vr_qw(v2))); + + return "vctz"; +} + +static const HChar * +s390_irgen_VPOPCT(UChar v1, UChar v2, UChar m3) +{ + vassert(m3 == 0); + + put_vr_qw(v1, unop(Iop_Cnt8x16, get_vr_qw(v2))); + + return "vpopct"; +} + +static const HChar * +s390_irgen_VML(UChar v1, UChar v2, UChar v3, UChar m4) +{ + const IROp ops[] = { Iop_Mul8x16, Iop_Mul16x8, Iop_Mul32x4 }; + vassert(m4 < sizeof(ops) / sizeof(ops[0])); + put_vr_qw(v1, binop(ops[m4], get_vr_qw(v2), get_vr_qw(v3))); + + return "vml"; +} + +static const HChar * +s390_irgen_VMLH(UChar v1, UChar v2, UChar v3, UChar m4) +{ + const IROp ops[] = { Iop_MulHi8Ux16, Iop_MulHi16Ux8, Iop_MulHi32Ux4 }; + vassert(m4 < sizeof(ops) / sizeof(ops[0])); + put_vr_qw(v1, binop(ops[m4], get_vr_qw(v2), get_vr_qw(v3))); + + return "vmlh"; +} + +static const HChar * 
+s390_irgen_VMH(UChar v1, UChar v2, UChar v3, UChar m4)
+{
+   const IROp ops[] = { Iop_MulHi8Sx16, Iop_MulHi16Sx8, Iop_MulHi32Sx4 };
+   vassert(m4 < sizeof(ops) / sizeof(ops[0]));
+   put_vr_qw(v1, binop(ops[m4], get_vr_qw(v2), get_vr_qw(v3)));
+
+   return "vmh";
+}
+
+static const HChar *
+s390_irgen_VME(UChar v1, UChar v2, UChar v3, UChar m4)
+{
+   const IROp ops[] = { Iop_MullEven8Sx16, Iop_MullEven16Sx8, Iop_MullEven32Sx4 };
+   vassert(m4 < sizeof(ops) / sizeof(ops[0]));
+   put_vr_qw(v1, binop(ops[m4], get_vr_qw(v2), get_vr_qw(v3)));
+
+   return "vme";
+}
+
+static const HChar *
+s390_irgen_VMLE(UChar v1, UChar v2, UChar v3, UChar m4)
+{
+   const IROp ops[] = { Iop_MullEven8Ux16, Iop_MullEven16Ux8, Iop_MullEven32Ux4 };
+   vassert(m4 < sizeof(ops) / sizeof(ops[0]));
+   put_vr_qw(v1, binop(ops[m4], get_vr_qw(v2), get_vr_qw(v3)));
+
+   return "vmle";
+}
+
+static const HChar *
+s390_irgen_VESLV(UChar v1, UChar v2, UChar v3, UChar m4)
+{
+   const IROp ops[] = { Iop_Shl8x16, Iop_Shl16x8, Iop_Shl32x4, Iop_Shl64x2 };
+   vassert(m4 < sizeof(ops) / sizeof(ops[0]));
+   put_vr_qw(v1, binop(ops[m4], get_vr_qw(v2), get_vr_qw(v3)));
+
+   return "veslv";
+}
+
+static const HChar *
+s390_irgen_VESL(UChar v1, IRTemp op2addr, UChar v3, UChar m4)
+{
+   IRExpr* shift_amount = unop(Iop_64to8, mkexpr(op2addr));
+   const IROp ops[] = { Iop_ShlN8x16, Iop_ShlN16x8, Iop_ShlN32x4, Iop_ShlN64x2 };
+   vassert(m4 < sizeof(ops) / sizeof(ops[0]));
+   put_vr_qw(v1, binop(ops[m4], get_vr_qw(v3), shift_amount));
+
+   return "vesl";
+}
+
+static const HChar *
+s390_irgen_VESRAV(UChar v1, UChar v2, UChar v3, UChar m4)
+{
+   const IROp ops[] = { Iop_Sar8x16, Iop_Sar16x8, Iop_Sar32x4, Iop_Sar64x2 };
+   vassert(m4 < sizeof(ops) / sizeof(ops[0]));
+   put_vr_qw(v1, binop(ops[m4], get_vr_qw(v2), get_vr_qw(v3)));
+
+   return "vesrav";
+}
+
+static const HChar *
+s390_irgen_VESRA(UChar v1, IRTemp op2addr, UChar v3, UChar m4)
+{
+   IRExpr* shift_amount = unop(Iop_64to8, mkexpr(op2addr));
+   const IROp ops[] = { Iop_SarN8x16, Iop_SarN16x8, Iop_SarN32x4, Iop_SarN64x2 };
+   vassert(m4 < sizeof(ops) / sizeof(ops[0]));
+   put_vr_qw(v1, binop(ops[m4], get_vr_qw(v3), shift_amount));
+
+   return "vesra";
+}
+
+static const HChar *
+s390_irgen_VESRLV(UChar v1, UChar v2, UChar v3, UChar m4)
+{
+   const IROp ops[] = { Iop_Shr8x16, Iop_Shr16x8, Iop_Shr32x4, Iop_Shr64x2 };
+   vassert(m4 < sizeof(ops) / sizeof(ops[0]));
+   put_vr_qw(v1, binop(ops[m4], get_vr_qw(v2), get_vr_qw(v3)));
+
+   return "vesrlv";
+}
+
+static const HChar *
+s390_irgen_VESRL(UChar v1, IRTemp op2addr, UChar v3, UChar m4)
+{
+   IRExpr* shift_amount = unop(Iop_64to8, mkexpr(op2addr));
+   const IROp ops[] = { Iop_ShrN8x16, Iop_ShrN16x8, Iop_ShrN32x4, Iop_ShrN64x2 };
+   vassert(m4 < sizeof(ops) / sizeof(ops[0]));
+   put_vr_qw(v1, binop(ops[m4], get_vr_qw(v3), shift_amount));
+
+   return "vesrl";
+}
+
+static const HChar *
+s390_irgen_VERLLV(UChar v1, UChar v2, UChar v3, UChar m4)
+{
+   const IROp ops[] = { Iop_Rol8x16, Iop_Rol16x8, Iop_Rol32x4, Iop_Rol64x2 };
+   vassert(m4 < sizeof(ops) / sizeof(ops[0]));
+   put_vr_qw(v1, binop(ops[m4], get_vr_qw(v2), get_vr_qw(v3)));
+
+   return "verllv";
+}
+
+static const HChar *
+s390_irgen_VERLL(UChar v1, IRTemp op2addr, UChar v3, UChar m4)
+{
+   /*
+      There are no Iop_RolN?x?? operations,
+      so we have to use the VECTOR x VECTOR variant.
+   */
+   IRExpr* shift_vector = unop(Iop_Dup8x16, unop(Iop_64to8, mkexpr(op2addr)));
+   const IROp ops[] = { Iop_Rol8x16, Iop_Rol16x8, Iop_Rol32x4, Iop_Rol64x2 };
+   vassert(m4 < sizeof(ops) / sizeof(ops[0]));
+   put_vr_qw(v1, binop(ops[m4], get_vr_qw(v3), shift_vector));
+
+   return "verll";
+}
+
+static const HChar *
+s390_irgen_VSL(UChar v1, UChar v2, UChar v3)
+{
+   IRTemp shift_amount = newTemp(Ity_I8);
+   assign(shift_amount, binop(Iop_And8, get_vr_b7(v3), mkU8(0b00000111)));
+
+   put_vr_qw(v1, binop(Iop_ShlV128, get_vr_qw(v2), mkexpr(shift_amount)));
+   return "vsl";
+}
+
+static const HChar *
+s390_irgen_VSRL(UChar v1, UChar v2, UChar v3)
+{
+   IRTemp shift_amount = newTemp(Ity_I8);
+   assign(shift_amount, binop(Iop_And8, get_vr_b7(v3), mkU8(0b00000111)));
+
+   put_vr_qw(v1, binop(Iop_ShrV128, get_vr_qw(v2), mkexpr(shift_amount)));
+   return "vsrl";
+}
+
+static const HChar *
+s390_irgen_VSRA(UChar v1, UChar v2, UChar v3)
+{
+   IRTemp shift_amount = newTemp(Ity_I8);
+   assign(shift_amount, binop(Iop_And8, get_vr_b7(v3), mkU8(0b00000111)));
+
+   put_vr_qw(v1, binop(Iop_SarV128, get_vr_qw(v2), mkexpr(shift_amount)));
+   return "vsra";
+}
+
+static const HChar *
+s390_irgen_VERIM(UChar v1, UChar v2, UChar v3, UChar i4, UChar m5)
+{
+   /*
+      There are no Iop_RolN?x?? operations,
+      so we have to use the VECTOR x VECTOR variant.
+   */
+   const IROp ops[] = { Iop_Rol8x16, Iop_Rol16x8, Iop_Rol32x4, Iop_Rol64x2 };
+   vassert(m5 < sizeof(ops) / sizeof(ops[0]));
+   IRExpr* shift_vector = unop(Iop_Dup8x16, mkU8(i4));
+   IRExpr* rotated_vector = binop(ops[m5], get_vr_qw(v2), shift_vector);
+
+   /* result = (result & ~mask) | (rotated_vector & mask) */
+   IRExpr* mask = get_vr_qw(v3);
+   IRExpr* result = get_vr_qw(v1);
+   put_vr_qw(v1, s390_V128_bitwiseITE(mask, rotated_vector, result));
+
+   return "verim";
+}
+
+static const HChar *
+s390_irgen_VEC(UChar v1, UChar v2, UChar m3)
+{
+   IRType type = s390_vr_get_type(m3);
+   IRTemp op1 = newTemp(type);
+   IRTemp op2 = newTemp(type);
+
+   switch(type) {
+   case Ity_I8:
+      assign(op1, get_vr_b7(v1));
+      assign(op2, get_vr_b7(v2));
+      break;
+   case Ity_I16:
+      assign(op1, get_vr_hw3(v1));
+      assign(op2, get_vr_hw3(v2));
+      break;
+   case Ity_I32:
+      assign(op1, get_vr_w1(v1));
+      assign(op2, get_vr_w1(v2));
+      break;
+   case Ity_I64:
+      assign(op1, get_vr_dw0(v1));
+      assign(op2, get_vr_dw0(v2));
+      break;
+   default:
+      vpanic("s390_irgen_VEC: unknown type");
+   }
+
+   s390_cc_thunk_putSS(S390_CC_OP_SIGNED_COMPARE, op1, op2);
+
+   return "vec";
+}
+
+static const HChar *
+s390_irgen_VECL(UChar v1, UChar v2, UChar m3)
+{
+   IRType type = s390_vr_get_type(m3);
+   IRTemp op1 = newTemp(type);
+   IRTemp op2 = newTemp(type);
+
+   switch(type) {
+   case Ity_I8:
+      assign(op1, get_vr_b7(v1));
+      assign(op2, get_vr_b7(v2));
+      break;
+   case Ity_I16:
+      assign(op1, get_vr_hw3(v1));
+      assign(op2, get_vr_hw3(v2));
+      break;
+   case Ity_I32:
+      assign(op1, get_vr_w1(v1));
+      assign(op2, get_vr_w1(v2));
+      break;
+   case Ity_I64:
+      assign(op1, get_vr_dw0(v1));
+      assign(op2, get_vr_dw0(v2));
+      break;
+   default:
+      vpanic("s390_irgen_VECL: unknown type");
+   }
+
+   s390_cc_thunk_putZZ(S390_CC_OP_UNSIGNED_COMPARE, op1, op2);
+
+   return "vecl";
+}
+
+static const HChar *
+s390_irgen_VCEQ(UChar v1, UChar v2, UChar v3, UChar m4, UChar m5)
+{
+   if (!s390_vr_is_cs_set(m5)) {
+      const IROp ops[] = { Iop_CmpEQ8x16, Iop_CmpEQ16x8, Iop_CmpEQ32x4,
+                           Iop_CmpEQ64x2 };
+      vassert(m4 < sizeof(ops) / sizeof(ops[0]));
+      put_vr_qw(v1, binop(ops[m4], get_vr_qw(v2), get_vr_qw(v3)));
+
+   } else {
+      IRDirty* d;
+      IRTemp cc = newTemp(Ity_I64);
+
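+      /* Same scheme as VCH and VCHL: with the CS bit set, delegate to
+         the dirty helper, which also yields the condition code. */
+      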
s390x_vec_op_details_t details = { .serialized = 0ULL }; + details.op = S390_VEC_OP_VCEQ; + details.v1 = v1; + details.v2 = v2; + details.v3 = v3; + details.m4 = m4; + details.m5 = m5; + + d = unsafeIRDirty_1_N(cc, 0, "s390x_dirtyhelper_vec_op", + &s390x_dirtyhelper_vec_op, + mkIRExprVec_2(IRExpr_GSPTR(), + mkU64(details.serialized))); + + d->nFxState = 3; + vex_bzero(&d->fxState, sizeof(d->fxState)); + d->fxState[0].fx = Ifx_Read; + d->fxState[0].offset = S390X_GUEST_OFFSET(guest_v0) + v2 * sizeof(V128); + d->fxState[0].size = sizeof(V128); + d->fxState[1].fx = Ifx_Read; + d->fxState[1].offset = S390X_GUEST_OFFSET(guest_v0) + v3 * sizeof(V128); + d->fxState[1].size = sizeof(V128); + d->fxState[2].fx = Ifx_Write; + d->fxState[2].offset = S390X_GUEST_OFFSET(guest_v0) + v1 * sizeof(V128); + d->fxState[2].size = sizeof(V128); + + stmt(IRStmt_Dirty(d)); + s390_cc_set(cc); + } + + return "vceq"; +} + +static const HChar * +s390_irgen_VSLB(UChar v1, UChar v2, UChar v3) +{ + IRTemp shift_amount = newTemp(Ity_I8); + assign(shift_amount, binop(Iop_And8, get_vr_b7(v3), mkU8(0b01111000))); + + put_vr_qw(v1, binop(Iop_ShlV128, get_vr_qw(v2), mkexpr(shift_amount))); + return "vslb"; +} + +static const HChar * +s390_irgen_VSRLB(UChar v1, UChar v2, UChar v3) +{ + IRTemp shift_amount = newTemp(Ity_I8); + assign(shift_amount, binop(Iop_And8, get_vr_b7(v3), mkU8(0b01111000))); + + put_vr_qw(v1, binop(Iop_ShrV128, get_vr_qw(v2), mkexpr(shift_amount))); + return "vsrlb"; +} + +static const HChar * +s390_irgen_VSRAB(UChar v1, UChar v2, UChar v3) +{ + IRTemp shift_amount = newTemp(Ity_I8); + assign(shift_amount, binop(Iop_And8, get_vr_b7(v3), mkU8(0b01111000))); + + put_vr_qw(v1, binop(Iop_SarV128, get_vr_qw(v2), mkexpr(shift_amount))); + return "vsrab"; +} + +static const HChar * +s390_irgen_VSLDB(UChar v1, UChar v2, UChar v3, UChar i4) +{ + UChar imm = i4 & 0b00001111; + + if (imm == 0) + { + put_vr_qw(v1, get_vr_qw(v2)); + } + else if (imm == 16) + { + put_vr_qw(v1, get_vr_qw(v3)); + } + else + { + put_vr_qw(v1, + binop(Iop_OrV128, + binop(Iop_ShlV128, get_vr_qw(v2), mkU8(imm * 8)), + binop(Iop_ShrV128, get_vr_qw(v3), mkU8((16 - imm) * 8)) + ) + ); + } + + return "vsldb"; +} + +static const HChar * +s390_irgen_VMO(UChar v1, UChar v2, UChar v3, UChar m4) +{ + const IROp ops[] = { Iop_MullEven8Sx16, Iop_MullEven16Sx8, + Iop_MullEven32Sx4 }; + UChar shifts[] = { 8, 16, 32 }; + vassert(m4 < sizeof(ops) / sizeof(ops[0])); + IRExpr* result = binop(ops[m4], + binop(Iop_ShlV128, get_vr_qw(v2), mkU8(shifts[m4])), + binop(Iop_ShlV128, get_vr_qw(v3), mkU8(shifts[m4])) + ); + put_vr_qw(v1, result); + + return "vmo"; +} + +static const HChar * +s390_irgen_VMLO(UChar v1, UChar v2, UChar v3, UChar m4) +{ + const IROp ops[] = { Iop_MullEven8Ux16, Iop_MullEven16Ux8, + Iop_MullEven32Ux4 }; + UChar shifts[] = { 8, 16, 32 }; + vassert(m4 < sizeof(ops) / sizeof(ops[0])); + IRExpr* result = binop(ops[m4], + binop(Iop_ShlV128, get_vr_qw(v2), mkU8(shifts[m4])), + binop(Iop_ShlV128, get_vr_qw(v3), mkU8(shifts[m4])) + ); + put_vr_qw(v1, result); + + return "vmlo"; +} + +static const HChar * +s390_irgen_VMAE(UChar v1, UChar v2, UChar v3, UChar v4, UChar m5) +{ + const IROp mul_ops[] = { Iop_MullEven8Sx16, Iop_MullEven16Sx8, + Iop_MullEven32Sx4 }; + const IROp add_ops[] = { Iop_Add16x8, Iop_Add32x4, Iop_Add64x2}; + vassert(m5 < sizeof(mul_ops) / sizeof(mul_ops[0])); + + IRExpr* mul_result = binop(mul_ops[m5], get_vr_qw(v2), get_vr_qw(v3)); + IRExpr* result = binop(add_ops[m5], mul_result, get_vr_qw(v4)); + put_vr_qw(v1, result); + + 
return "vmae"; +} + +static const HChar * +s390_irgen_VMALE(UChar v1, UChar v2, UChar v3, UChar v4, UChar m5) +{ + const IROp mul_ops[] = { Iop_MullEven8Ux16, Iop_MullEven16Ux8, + Iop_MullEven32Ux4 }; + const IROp add_ops[] = { Iop_Add16x8, Iop_Add32x4, Iop_Add64x2 }; + vassert(m5 < sizeof(mul_ops) / sizeof(mul_ops[0])); + + IRExpr* mul_result = binop(mul_ops[m5], get_vr_qw(v2), get_vr_qw(v3)); + IRExpr* result = binop(add_ops[m5], mul_result, get_vr_qw(v4)); + put_vr_qw(v1, result); + + return "vmale"; +} + +static const HChar * +s390_irgen_VMAO(UChar v1, UChar v2, UChar v3, UChar v4, UChar m5) +{ + const IROp mul_ops[] = { Iop_MullEven8Sx16, Iop_MullEven16Sx8, + Iop_MullEven32Sx4 }; + const IROp add_ops[] = { Iop_Add16x8, Iop_Add32x4, Iop_Add64x2 }; + UChar shifts[] = { 8, 16, 32 }; + vassert(m5 < sizeof(mul_ops) / sizeof(mul_ops[0])); + + IRExpr* mul_result = + binop(mul_ops[m5], + binop(Iop_ShlV128, get_vr_qw(v2), mkU8(shifts[m5])), + binop(Iop_ShlV128, get_vr_qw(v3), mkU8(shifts[m5]))); + IRExpr* result = binop(add_ops[m5], mul_result, get_vr_qw(v4)); + put_vr_qw(v1, result); + + return "vmao"; +} + +static const HChar * +s390_irgen_VMALO(UChar v1, UChar v2, UChar v3, UChar v4, UChar m5) +{ + const IROp mul_ops[] = { Iop_MullEven8Ux16, Iop_MullEven16Ux8, + Iop_MullEven32Ux4 }; + const IROp add_ops[] = { Iop_Add16x8, Iop_Add32x4, Iop_Add64x2 }; + UChar shifts[] = { 8, 16, 32 }; + vassert(m5 < sizeof(mul_ops) / sizeof(mul_ops[0])); + + IRExpr* mul_result = binop(mul_ops[m5], + binop(Iop_ShlV128, + get_vr_qw(v2), mkU8(shifts[m5])), + binop(Iop_ShlV128, + get_vr_qw(v3), mkU8(shifts[m5])) + ); + + IRExpr* result = binop(add_ops[m5], mul_result, get_vr_qw(v4)); + put_vr_qw(v1, result); + + return "vmalo"; +} + +static const HChar * +s390_irgen_VMAL(UChar v1, UChar v2, UChar v3, UChar v4, UChar m5) +{ + const IROp mul_ops[] = { Iop_Mul8x16, Iop_Mul16x8, Iop_Mul32x4 }; + const IROp add_ops[] = { Iop_Add8x16, Iop_Add16x8, Iop_Add32x4 }; + vassert(m5 < sizeof(mul_ops) / sizeof(mul_ops[0])); + + IRExpr* mul_result = binop(mul_ops[m5], get_vr_qw(v2), get_vr_qw(v3)); + IRExpr* result = binop(add_ops[m5], mul_result, get_vr_qw(v4)); + put_vr_qw(v1, result); + + return "vmal"; +} + +static const HChar * +s390_irgen_VSUM(UChar v1, UChar v2, UChar v3, UChar m4) +{ + IRType type = s390_vr_get_type(m4); + IRExpr* mask; + IRExpr* sum; + switch(type) { + case Ity_I8: + sum = unop(Iop_PwAddL16Ux8, unop(Iop_PwAddL8Ux16, get_vr_qw(v2))); + mask = IRExpr_Const(IRConst_V128(0b0001000100010001)); + break; + case Ity_I16: + sum = unop(Iop_PwAddL16Ux8, get_vr_qw(v2)); + mask = IRExpr_Const(IRConst_V128(0b0011001100110011)); + break; + default: + vpanic("s390_irgen_VSUM: invalid type "); + } + + IRExpr* addition = binop(Iop_AndV128, get_vr_qw(v3), mask); + put_vr_qw(v1, binop(Iop_Add32x4, sum, addition)); + + return "vsum"; +} + +static const HChar * +s390_irgen_VSUMG(UChar v1, UChar v2, UChar v3, UChar m4) +{ + IRType type = s390_vr_get_type(m4); + IRExpr* mask; + IRExpr* sum; + switch(type) { + case Ity_I16: + sum = unop(Iop_PwAddL32Ux4, unop(Iop_PwAddL16Ux8, get_vr_qw(v2))); + mask = IRExpr_Const(IRConst_V128(0b0000001100000011)); + break; + case Ity_I32: + sum = unop(Iop_PwAddL32Ux4, get_vr_qw(v2)); + mask = IRExpr_Const(IRConst_V128(0b0000111100001111)); + break; + default: + vpanic("s390_irgen_VSUMG: invalid type "); + } + + IRExpr* addition = binop(Iop_AndV128, get_vr_qw(v3), mask); + put_vr_qw(v1, binop(Iop_Add64x2, sum, addition)); + + return "vsumg"; +} + +static const HChar * +s390_irgen_VSUMQ(UChar v1, 
UChar v2, UChar v3, UChar m4) +{ + IRType type = s390_vr_get_type(m4); + IRExpr* mask; + IRExpr* sum; + switch(type) { + case Ity_I32: + sum = unop(Iop_PwAddL64Ux2, unop(Iop_PwAddL32Ux4, get_vr_qw(v2))); + mask = IRExpr_Const(IRConst_V128(0b0000000000001111)); + break; + case Ity_I64: + sum = unop(Iop_PwAddL64Ux2, get_vr_qw(v2)); + mask = IRExpr_Const(IRConst_V128(0b0000000011111111)); + break; + default: + vpanic("s390_irgen_VSUMQ: invalid type "); + } + + IRExpr* addition = binop(Iop_AndV128, get_vr_qw(v3), mask); + put_vr_qw(v1, binop(Iop_Add128x1, sum, addition)); + + return "vsumq"; +} + +static const HChar * +s390_irgen_VTM(UChar v1, UChar v2) +{ + IRDirty* d; + IRTemp cc = newTemp(Ity_I64); + + s390x_vec_op_details_t details = { .serialized = 0ULL }; + details.op = S390_VEC_OP_VTM; + details.v2 = v1; + details.v3 = v2; + details.read_only = 1; + + d = unsafeIRDirty_1_N(cc, 0, "s390x_dirtyhelper_vec_op", + &s390x_dirtyhelper_vec_op, + mkIRExprVec_2(IRExpr_GSPTR(), + mkU64(details.serialized))); + + d->nFxState = 2; + vex_bzero(&d->fxState, sizeof(d->fxState)); + d->fxState[0].fx = Ifx_Read; + d->fxState[0].offset = S390X_GUEST_OFFSET(guest_v0) + v1 * sizeof(V128); + d->fxState[0].size = sizeof(V128); + d->fxState[1].fx = Ifx_Read; + d->fxState[1].offset = S390X_GUEST_OFFSET(guest_v0) + v2 * sizeof(V128); + d->fxState[1].size = sizeof(V128); + + stmt(IRStmt_Dirty(d)); + s390_cc_set(cc); + + return "vtm"; +} + +static const HChar * +s390_irgen_VAC(UChar v1, UChar v2, UChar v3, UChar v4, UChar m5) +{ + vassert(m5 == 4); /* specification exception otherwise */ + + IRTemp sum = newTemp(Ity_V128); + assign(sum, binop(Iop_Add128x1, get_vr_qw(v2), get_vr_qw(v3))); + + IRExpr* mask = binop(Iop_64HLtoV128, mkU64(0), mkU64(1)); + IRExpr* carry_in = binop(Iop_AndV128, get_vr_qw(v4), mask); + put_vr_qw(v1, binop(Iop_Add128x1, mkexpr(sum), carry_in)); + + return "vac"; +} + +static const HChar * +s390_irgen_VACC(UChar v1, UChar v2, UChar v3, UChar m4) +{ + IRType type = s390_vr_get_type(m4); + IRExpr* arg1 = get_vr_qw(v2); + IRExpr* arg2 = get_vr_qw(v3); + + put_vr_qw(v1, s390_V128_calculate_carry_out(arg1, arg2, type, False)); + return "vacc"; } static const HChar * -s390_irgen_LOCHI(UChar r1, UChar m3, UShort i2, UChar unused) +s390_irgen_VACCC(UChar v1, UChar v2, UChar v3, UChar v4, UChar m5) { - next_insn_if(binop(Iop_CmpEQ32, s390_call_calculate_cond(m3), mkU32(0))); - put_gpr_w1(r1, mkU32(i2)); + vassert(m5 == 4); /* specification exception otherwise */ + IRExpr* result = + s390_V128_calculate_carry_out_with_carry(get_vr_qw(v2), + get_vr_qw(v3), + get_vr_qw(v4) + ); - return "lochi"; + put_vr_qw(v1, result); + return "vaccc"; } -static const HChar * -s390_irgen_LOCGHI(UChar r1, UChar m3, UShort i2, UChar unused) +static const HChar* +s390_irgen_VCKSM(UChar v1, UChar v2, UChar v3) { - next_insn_if(binop(Iop_CmpEQ32, s390_call_calculate_cond(m3), mkU32(0))); - put_gpr_dw0(r1, mkU64(i2)); - return "locghi"; + IRTemp sum1 = s390_checksum_add(get_vr_w1(v3), get_vr_w0(v2)); + IRTemp sum2 = s390_checksum_add(mkexpr(sum1), get_vr_w1(v2)); + IRTemp sum3 = s390_checksum_add(mkexpr(sum2), get_vr_w2(v2)); + IRTemp result = s390_checksum_add(mkexpr(sum3), get_vr_w3(v2)); + + put_vr_qw(v1, binop(Iop_64HLtoV128, + unop(Iop_32Uto64, mkexpr(result)), mkU64(0ULL))); + + return "vcksm"; } static const HChar * -s390_irgen_STOCFH(UChar r1, IRTemp op2addr) +s390_irgen_VGFM(UChar v1, UChar v2, UChar v3, UChar m4) { - /* condition is checked in format handler */ - store(mkexpr(op2addr), get_gpr_w1(r1)); + IRDirty* 
d; + IRTemp cc = newTemp(Ity_I64); - return "stocfh"; + s390x_vec_op_details_t details = { .serialized = 0ULL }; + details.op = S390_VEC_OP_VGFM; + details.v1 = v1; + details.v2 = v2; + details.v3 = v3; + details.m4 = m4; + + d = unsafeIRDirty_1_N(cc, 0, "s390x_dirtyhelper_vec_op", + &s390x_dirtyhelper_vec_op, + mkIRExprVec_2(IRExpr_GSPTR(), + mkU64(details.serialized))); + + d->nFxState = 3; + vex_bzero(&d->fxState, sizeof(d->fxState)); + d->fxState[0].fx = Ifx_Read; + d->fxState[0].offset = S390X_GUEST_OFFSET(guest_v0) + v2 * sizeof(V128); + d->fxState[0].size = sizeof(V128); + d->fxState[1].fx = Ifx_Read; + d->fxState[1].offset = S390X_GUEST_OFFSET(guest_v0) + v3 * sizeof(V128); + d->fxState[1].size = sizeof(V128); + d->fxState[2].fx = Ifx_Write; + d->fxState[2].offset = S390X_GUEST_OFFSET(guest_v0) + v1 * sizeof(V128); + d->fxState[2].size = sizeof(V128); + + stmt(IRStmt_Dirty(d)); + return "vgfm"; } static const HChar * -s390_irgen_LCBB(UChar r1, IRTemp op2addr, UChar m3) +s390_irgen_VGFMA(UChar v1, UChar v2, UChar v3, UChar v4, UChar m5) { - IRTemp op2 = newTemp(Ity_I32); - assign(op2, s390_getCountToBlockBoundary(op2addr, m3)); - put_gpr_w1(r1, mkexpr(op2)); + IRDirty* d; + IRTemp cc = newTemp(Ity_I64); - IRExpr* cc = mkite(binop(Iop_CmpEQ32, mkexpr(op2), mkU32(16)), mkU64(0), mkU64(3)); - s390_cc_thunk_fill(mkU64(S390_CC_OP_SET), cc, mkU64(0), mkU64(0)); + s390x_vec_op_details_t details = { .serialized = 0ULL }; + details.op = S390_VEC_OP_VGFMA; + details.v1 = v1; + details.v2 = v2; + details.v3 = v3; + details.v4 = v4; + details.m4 = m5; - return "lcbb"; -} + d = unsafeIRDirty_1_N(cc, 0, "s390x_dirtyhelper_vec_op", + &s390x_dirtyhelper_vec_op, + mkIRExprVec_2(IRExpr_GSPTR(), + mkU64(details.serialized))); -/* Regarding the use of - // Dummy helper which is used to signal VEX library that memory was loaded - sha512_loadparam - = unsafeIRDirty_0_N(0, "s390x_dirtyhelper_PPNO_sha512_load_param_block", - &s390x_dirtyhelper_PPNO_sha512_load_param_block, - mkIRExprVec_0()); + d->nFxState = 4; + vex_bzero(&d->fxState, sizeof(d->fxState)); + d->fxState[0].fx = Ifx_Read; + d->fxState[0].offset = S390X_GUEST_OFFSET(guest_v0) + v2 * sizeof(V128); + d->fxState[0].size = sizeof(V128); + d->fxState[1].fx = Ifx_Read; + d->fxState[1].offset = S390X_GUEST_OFFSET(guest_v0) + v3 * sizeof(V128); + d->fxState[1].size = sizeof(V128); + d->fxState[2].fx = Ifx_Read; + d->fxState[2].offset = S390X_GUEST_OFFSET(guest_v0) + v4 * sizeof(V128); + d->fxState[2].size = sizeof(V128); + d->fxState[3].fx = Ifx_Write; + d->fxState[3].offset = S390X_GUEST_OFFSET(guest_v0) + v1 * sizeof(V128); + d->fxState[3].size = sizeof(V128); - in the following function (s390_irgen_PPNO). This is a workaround to get - around the fact that IRDirty annotations cannot indicate two memory side - effects, which are unfortunately necessary here. It will possibly lead to - losing undefinedness (undefinedness in some inputs might not be propagated - to the outputs as it shouod, in Memcheck). The correct fix would be to - extend IRDirty to represent two memory side effects, but that's quite a bit - of work. + stmt(IRStmt_Dirty(d)); + return "vgfma"; +} - Here's a summary of what this insn does. 
+static const HChar * +s390_irgen_VSBI(UChar v1, UChar v2, UChar v3, UChar v4, UChar m5) +{ + vassert(m5 == 4); /* specification exception otherwise */ - // getReg(RegisterNumber n) returns the value of GPR number 'n' + IRExpr* mask = binop(Iop_64HLtoV128, mkU64(0ULL), mkU64(1ULL)); + IRExpr* carry_in = binop(Iop_AndV128, get_vr_qw(v4), mask); - // reg1 and reg2 are even - void ppno(RegisterNumber reg1, RegisterNumber reg2) { + IRTemp sum = newTemp(Ity_V128); + assign(sum, binop(Iop_Add128x1, + get_vr_qw(v2), + unop(Iop_NotV128, get_vr_qw(v3)) + ) + ); - switch(getReg(0)) { - case 0x0: - // Query mode, ignore reg1 and reg2 - // Write 16 bytes at getReg(1) - break; + put_vr_qw(v1, binop(Iop_Add128x1, mkexpr(sum), carry_in)); + return "vsbi"; +} - case 0x3: - // SHA-512 generate mode, ignore reg2 +static const HChar * +s390_irgen_VSCBI(UChar v1, UChar v2, UChar v3, UChar m4) +{ + IRType type = s390_vr_get_type(m4); + IRExpr* arg1 = get_vr_qw(v2); + IRExpr* arg2 = s390_V128_get_complement(get_vr_qw(v3), type); + IRExpr* result = s390_V128_calculate_carry_out(arg1, arg2, type, True); - // Read 240 bytes at getReg(1) - // Write getReg(reg1 + 1) bytes at getReg(reg1) - // Write some of 240 bytes starting at getReg(1) - break; + put_vr_qw(v1, result); + return "vscbi"; +} - case 0x83: - // SHA-512 seed mode, ignore reg1 +static const HChar * +s390_irgen_VSBCBI(UChar v1, UChar v2, UChar v3, UChar v4, UChar m5) +{ + vassert(m5 == 4); /* specification exception otherwise */ + IRExpr* result = + s390_V128_calculate_carry_out_with_carry(get_vr_qw(v2), + unop(Iop_NotV128, get_vr_qw(v3)), + get_vr_qw(v4)); - // Read some of 240 bytes starting at getReg(1) - // Read getReg(reg2 + 1) bytes at getReg(reg2) - // Write 240 bytes at getReg(1) - break; + put_vr_qw(v1, result); + return "vsbcbi"; +} - default: - // Specification exception, abort execution. - } - } -*/ -/* Also known as "prno" - If you implement new functions please don't forget to update - "s390x_dirtyhelper_PPNO_query" function. 
- */ static const HChar * -s390_irgen_PPNO(UChar r1, UChar r2) +s390_irgen_VMAH(UChar v1, UChar v2, UChar v3, UChar v4, UChar m5) { - if (!s390_host_has_msa5) { - emulation_failure(EmFail_S390X_ppno); - return "ppno"; - } - - /* Theese conditions lead to specification exception */ - vassert(r1 % 2 == 0); - vassert(r2 % 2 == 0); - vassert((r1 != 0) && (r2 != 0)); + IRDirty* d; + IRTemp cc = newTemp(Ity_I64); - IRDirty *query, *sha512_gen, *sha512_seed, *sha512_loadparam; - IRTemp gpr1num = newTemp(Ity_I64); - IRTemp gpr2num = newTemp(Ity_I64); + /* Check for specification exception */ + vassert(m5 < 3); - IRTemp funcCode = newTemp(Ity_I8); - IRTemp is_query = newTemp(Ity_I1); - IRTemp is_sha512_gen = newTemp(Ity_I1); - IRTemp is_sha512_seed = newTemp(Ity_I1); - IRTemp is_sha512 = newTemp(Ity_I1); + s390x_vec_op_details_t details = { .serialized = 0ULL }; + details.op = S390_VEC_OP_VMAH; + details.v1 = v1; + details.v2 = v2; + details.v3 = v3; + details.v4 = v4; + details.m4 = m5; - assign(funcCode, unop(Iop_64to8, binop(Iop_And64, get_gpr_dw0(0), mkU64(0xffULL)))); - assign(gpr1num, mkU64(r1)); - assign(gpr2num, mkU64(r2)); + d = unsafeIRDirty_1_N(cc, 0, "s390x_dirtyhelper_vec_op", + &s390x_dirtyhelper_vec_op, + mkIRExprVec_2(IRExpr_GSPTR(), + mkU64(details.serialized))); - assign(is_query, binop(Iop_CmpEQ8, mkexpr(funcCode), mkU8(S390_PPNO_QUERY))); - assign(is_sha512_gen, binop(Iop_CmpEQ8, mkexpr(funcCode), mkU8(S390_PPNO_SHA512_GEN))); - assign(is_sha512_seed, binop(Iop_CmpEQ8, mkexpr(funcCode), mkU8(S390_PPNO_SHA512_SEED))); - assign(is_sha512, binop(Iop_CmpEQ8, - mkU8(S390_PPNO_SHA512_GEN), - binop(Iop_And8, - mkexpr(funcCode), - mkU8(S390_PPNO_SHA512_GEN) - ) - )); + d->nFxState = 4; + vex_bzero(&d->fxState, sizeof(d->fxState)); + d->fxState[0].fx = Ifx_Read; + d->fxState[0].offset = S390X_GUEST_OFFSET(guest_v0) + v2 * sizeof(V128); + d->fxState[0].size = sizeof(V128); + d->fxState[1].fx = Ifx_Read; + d->fxState[1].offset = S390X_GUEST_OFFSET(guest_v0) + v3 * sizeof(V128); + d->fxState[1].size = sizeof(V128); + d->fxState[2].fx = Ifx_Read; + d->fxState[2].offset = S390X_GUEST_OFFSET(guest_v0) + v4 * sizeof(V128); + d->fxState[2].size = sizeof(V128); + d->fxState[3].fx = Ifx_Write; + d->fxState[3].offset = S390X_GUEST_OFFSET(guest_v0) + v1 * sizeof(V128); + d->fxState[3].size = sizeof(V128); - query = unsafeIRDirty_0_N(0, "s390x_dirtyhelper_PPNO_query", - &s390x_dirtyhelper_PPNO_query, - mkIRExprVec_3(IRExpr_GSPTR(), mkexpr(gpr1num), mkexpr(gpr2num))); - query->guard = mkexpr(is_query); - query->nFxState = 1; - vex_bzero(&query->fxState, sizeof(query->fxState)); - query->fxState[0].fx = Ifx_Read; - query->fxState[0].offset = S390X_GUEST_OFFSET(guest_r0); - query->fxState[0].size = 2 * sizeof(ULong); /* gpr0 and gpr1 are read */ - query->mAddr = get_gpr_dw0(1); - query->mSize = S390_PPNO_PARAM_BLOCK_SIZE_QUERY; - query->mFx = Ifx_Write; + stmt(IRStmt_Dirty(d)); - IRTemp gen_cc = newTemp(Ity_I64); - sha512_gen = unsafeIRDirty_1_N(gen_cc, 0, "s390x_dirtyhelper_PPNO_sha512", - &s390x_dirtyhelper_PPNO_sha512, - mkIRExprVec_3(IRExpr_GSPTR(), mkexpr(gpr1num), mkexpr(gpr2num))); - sha512_gen->guard = mkexpr(is_sha512_gen); - sha512_gen->nFxState = 3; - vex_bzero(&sha512_gen->fxState, sizeof(sha512_gen->fxState)); - sha512_gen->fxState[0].fx = Ifx_Read; - sha512_gen->fxState[0].offset = S390X_GUEST_OFFSET(guest_r0); - sha512_gen->fxState[0].size = 2 * sizeof(ULong); /* gpr0 and gpr1 are read */ - sha512_gen->fxState[1].fx = Ifx_Read; - sha512_gen->fxState[1].offset = 
S390X_GUEST_OFFSET(guest_r0) + r1 * sizeof(ULong); - sha512_gen->fxState[1].size = sizeof(ULong); - sha512_gen->fxState[2].fx = Ifx_Modify; - sha512_gen->fxState[2].offset = S390X_GUEST_OFFSET(guest_r0) + (r1 + 1) * sizeof(ULong); - sha512_gen->fxState[2].size = sizeof(ULong); - sha512_gen->mAddr = get_gpr_dw0(r1); - sha512_gen->mSize = S390_PPNO_MAX_SIZE_SHA512_GEN; - sha512_gen->mFx = Ifx_Write; + return "vmah"; +} - IRTemp unused = newTemp(Ity_I64); - sha512_seed = unsafeIRDirty_1_N(unused, 0, "s390x_dirtyhelper_PPNO_sha512", - &s390x_dirtyhelper_PPNO_sha512, - mkIRExprVec_3(IRExpr_GSPTR(), mkexpr(gpr1num), mkexpr(gpr2num))); - sha512_seed->guard = mkexpr(is_sha512_seed); - sha512_seed->nFxState = 2; - vex_bzero(&sha512_seed->fxState, sizeof(sha512_seed->fxState)); - sha512_seed->fxState[0].fx = Ifx_Read; - sha512_seed->fxState[0].offset = S390X_GUEST_OFFSET(guest_r0); - sha512_seed->fxState[0].size = 2 * sizeof(ULong); /* gpr0 and gpr1 are read */ - sha512_seed->fxState[1].fx = Ifx_Read; - sha512_seed->fxState[1].offset = S390X_GUEST_OFFSET(guest_r0) + r2 * sizeof(ULong); - sha512_seed->fxState[1].size = 2 * sizeof(ULong); /* r2 and r2 + 1 are read */ - sha512_seed->mAddr = get_gpr_dw0(r2); - sha512_seed->mSize = S390_PPNO_MAX_SIZE_SHA512_SEED; - sha512_seed->mFx = Ifx_Write; +static const HChar * +s390_irgen_VMALH(UChar v1, UChar v2, UChar v3, UChar v4, UChar m5) +{ + IRDirty* d; + IRTemp cc = newTemp(Ity_I64); - /* Dummy helper which is used to signal VEX library that memory was loaded */ - sha512_loadparam = unsafeIRDirty_0_N(0, "s390x_dirtyhelper_PPNO_sha512_load_param_block", - &s390x_dirtyhelper_PPNO_sha512_load_param_block, - mkIRExprVec_0()); - sha512_loadparam->guard = mkexpr(is_sha512); - sha512_loadparam->nFxState = 0; - vex_bzero(&sha512_loadparam->fxState, sizeof(sha512_loadparam->fxState)); - sha512_loadparam->mAddr = get_gpr_dw0(1); - sha512_loadparam->mSize = S390_PPNO_PARAM_BLOCK_SIZE_SHA512; - sha512_loadparam->mFx = Ifx_Read; + /* Check for specification exception */ + vassert(m5 < 3); - IRDirty* - sha512_saveparam = unsafeIRDirty_0_N(0, "s390x_dirtyhelper_PPNO_sha512_save_param_block", - &s390x_dirtyhelper_PPNO_sha512_load_param_block, - mkIRExprVec_0()); - sha512_saveparam->guard = mkexpr(is_sha512); - sha512_saveparam->nFxState = 0; - vex_bzero(&sha512_saveparam->fxState, sizeof(sha512_saveparam->fxState)); - sha512_saveparam->mAddr = get_gpr_dw0(1); - sha512_saveparam->mSize = S390_PPNO_PARAM_BLOCK_SIZE_SHA512; - sha512_saveparam->mFx = Ifx_Write; + s390x_vec_op_details_t details = { .serialized = 0ULL }; + details.op = S390_VEC_OP_VMALH; + details.v1 = v1; + details.v2 = v2; + details.v3 = v3; + details.v4 = v4; + details.m4 = m5; - stmt(IRStmt_Dirty(query)); - stmt(IRStmt_Dirty(sha512_loadparam)); - stmt(IRStmt_Dirty(sha512_gen)); - stmt(IRStmt_Dirty(sha512_seed)); - stmt(IRStmt_Dirty(sha512_saveparam)); + d = unsafeIRDirty_1_N(cc, 0, "s390x_dirtyhelper_vec_op", + &s390x_dirtyhelper_vec_op, + mkIRExprVec_2(IRExpr_GSPTR(), + mkU64(details.serialized))); - IRTemp cc = newTemp(Ity_I64); - assign(cc, - mkite(mkexpr(is_sha512_gen), - mkexpr(gen_cc), - mkU64(0) - ) - ); + d->nFxState = 4; + vex_bzero(&d->fxState, sizeof(d->fxState)); + d->fxState[0].fx = Ifx_Read; + d->fxState[0].offset = S390X_GUEST_OFFSET(guest_v0) + v2 * sizeof(V128); + d->fxState[0].size = sizeof(V128); + d->fxState[1].fx = Ifx_Read; + d->fxState[1].offset = S390X_GUEST_OFFSET(guest_v0) + v3 * sizeof(V128); + d->fxState[1].size = sizeof(V128); + d->fxState[2].fx = Ifx_Read; + d->fxState[2].offset 
= S390X_GUEST_OFFSET(guest_v0) + v4 * sizeof(V128); + d->fxState[2].size = sizeof(V128); + d->fxState[3].fx = Ifx_Write; + d->fxState[3].offset = S390X_GUEST_OFFSET(guest_v0) + v1 * sizeof(V128); + d->fxState[3].size = sizeof(V128); - s390_cc_thunk_fill(mkU64(S390_CC_OP_SET), mkexpr(cc), mkU64(0), mkU64(0)); + stmt(IRStmt_Dirty(d)); - return "ppno"; + return "vmalh"; } /* New insns are added here. @@ -17708,6 +19362,18 @@ s390_decode_6byte_and_irgen(const UChar *bytes) unsigned int rxb : 4; unsigned int op2 : 8; } VRR; + struct { + unsigned int op1 : 8; + unsigned int v1 : 4; + unsigned int v2 : 4; + unsigned int v3 : 4; + unsigned int m5 : 4; + unsigned int m6 : 4; + unsigned int : 4; + unsigned int v4 : 4; + unsigned int rxb : 4; + unsigned int op2 : 8; + } VRRd; struct { unsigned int op1 : 8; unsigned int v1 : 4; @@ -17717,6 +19383,17 @@ s390_decode_6byte_and_irgen(const UChar *bytes) unsigned int rxb : 4; unsigned int op2 : 8; } VRI; + struct { + unsigned int op1 : 8; + unsigned int v1 : 4; + unsigned int v2 : 4; + unsigned int v3 : 4; + unsigned int : 4; + unsigned int i4 : 8; + unsigned int m5 : 4; + unsigned int rxb : 4; + unsigned int op2 : 8; + } VRId; struct { unsigned int op1 : 8; unsigned int v1 : 4; @@ -18253,16 +19930,28 @@ s390_decode_6byte_and_irgen(const UChar *bytes) case 0xe70000000027ULL: s390_format_RXE_RRRDR(s390_irgen_LCBB, ovl.fmt.RXE.r1, ovl.fmt.RXE.x2, ovl.fmt.RXE.b2, ovl.fmt.RXE.d2, ovl.fmt.RXE.m3); goto ok; - case 0xe70000000030ULL: /* VESL */ goto unimplemented; - case 0xe70000000033ULL: /* VERLL */ goto unimplemented; + case 0xe70000000030ULL: s390_format_VRS_VRDVM(s390_irgen_VESL, ovl.fmt.VRS.v1, + ovl.fmt.VRS.b2, ovl.fmt.VRS.d2, + ovl.fmt.VRS.v3, ovl.fmt.VRS.m4, + ovl.fmt.VRS.rxb); goto ok; + case 0xe70000000033ULL: s390_format_VRS_VRDVM(s390_irgen_VERLL, ovl.fmt.VRS.v1, + ovl.fmt.VRS.b2, ovl.fmt.VRS.d2, + ovl.fmt.VRS.v3, ovl.fmt.VRS.m4, + ovl.fmt.VRS.rxb); goto ok; case 0xe70000000036ULL: s390_format_VRS_VRDV(s390_irgen_VLM, ovl.fmt.VRS.v1, ovl.fmt.VRS.b2, ovl.fmt.VRS.d2, ovl.fmt.VRS.v3, ovl.fmt.VRS.rxb); goto ok; case 0xe70000000037ULL: s390_format_VRS_VRRD(s390_irgen_VLL, ovl.fmt.VRS.v1, ovl.fmt.VRS.b2, ovl.fmt.VRS.d2, ovl.fmt.VRS.v3, ovl.fmt.VRS.rxb); goto ok; - case 0xe70000000038ULL: /* VESRL */ goto unimplemented; - case 0xe7000000003aULL: /* VESRA */ goto unimplemented; + case 0xe70000000038ULL: s390_format_VRS_VRDVM(s390_irgen_VESRL, ovl.fmt.VRS.v1, + ovl.fmt.VRS.b2, ovl.fmt.VRS.d2, + ovl.fmt.VRS.v3, ovl.fmt.VRS.m4, + ovl.fmt.VRS.rxb); goto ok; + case 0xe7000000003aULL: s390_format_VRS_VRDVM(s390_irgen_VESRA, ovl.fmt.VRS.v1, + ovl.fmt.VRS.b2, ovl.fmt.VRS.d2, + ovl.fmt.VRS.v3, ovl.fmt.VRS.m4, + ovl.fmt.VRS.rxb); goto ok; case 0xe7000000003eULL: s390_format_VRS_VRDV(s390_irgen_VSTM, ovl.fmt.VRS.v1, ovl.fmt.VRS.b2, ovl.fmt.VRS.d2, ovl.fmt.VRS.v3, ovl.fmt.VRS.rxb); goto ok; @@ -18294,12 +19983,20 @@ s390_decode_6byte_and_irgen(const UChar *bytes) case 0xe7000000004dULL: s390_format_VRI_VVIM(s390_irgen_VREP, ovl.fmt.VRI.v1, ovl.fmt.VRI.v3, ovl.fmt.VRI.i2, ovl.fmt.VRI.m3, ovl.fmt.VRI.rxb); goto ok; - case 0xe70000000050ULL: /* VPOPCT */ goto unimplemented; - case 0xe70000000052ULL: /* VCTZ */ goto unimplemented; - case 0xe70000000053ULL: /* VCLZ */ goto unimplemented; + case 0xe70000000050ULL: s390_format_VRR_VVM(s390_irgen_VPOPCT, ovl.fmt.VRR.v1, + ovl.fmt.VRR.v2, ovl.fmt.VRR.m4, + ovl.fmt.VRR.rxb); goto ok; + case 0xe70000000052ULL: s390_format_VRR_VVM(s390_irgen_VCTZ, ovl.fmt.VRR.v1, + ovl.fmt.VRR.v2, ovl.fmt.VRR.m4, + ovl.fmt.VRR.rxb); goto ok; + case 
0xe70000000053ULL: s390_format_VRR_VVM(s390_irgen_VCLZ, ovl.fmt.VRR.v1, + ovl.fmt.VRR.v2, ovl.fmt.VRR.m4, + ovl.fmt.VRR.rxb); goto ok; case 0xe70000000056ULL: s390_format_VRR_VV(s390_irgen_VLR, ovl.fmt.VRR.v1, ovl.fmt.VRR.v2, ovl.fmt.VRR.rxb); goto ok; - case 0xe7000000005cULL: /* VISTR */ goto unimplemented; + case 0xe7000000005cULL: s390_format_VRR_VVMM(s390_irgen_VISTR, ovl.fmt.VRR.v1, + ovl.fmt.VRR.v2, ovl.fmt.VRR.m4, + ovl.fmt.VRR.m5, ovl.fmt.VRR.rxb); goto ok; case 0xe7000000005fULL: s390_format_VRR_VVM(s390_irgen_VSEG, ovl.fmt.VRR.v1, ovl.fmt.VRR.v2, ovl.fmt.VRR.m4, ovl.fmt.VRR.rxb); goto ok; @@ -18312,14 +20009,24 @@ s390_decode_6byte_and_irgen(const UChar *bytes) case 0xe70000000062ULL: s390_format_VRR_VRR(s390_irgen_VLVGP, ovl.fmt.VRR.v1, ovl.fmt.VRR.v2, ovl.fmt.VRR.r3, ovl.fmt.VRR.rxb); goto ok; - case 0xe70000000064ULL: /* VSUM */ goto unimplemented; - case 0xe70000000065ULL: /* VSUMG */ goto unimplemented; - case 0xe70000000066ULL: /* VCKSM */ goto unimplemented; - case 0xe70000000067ULL: /* VSUMQ */ goto unimplemented; + case 0xe70000000064ULL: s390_format_VRR_VVVM(s390_irgen_VSUM, ovl.fmt.VRR.v1, + ovl.fmt.VRR.v2, ovl.fmt.VRR.r3, + ovl.fmt.VRR.m4, ovl.fmt.VRR.rxb); goto ok; + case 0xe70000000065ULL: s390_format_VRR_VVVM(s390_irgen_VSUMG, ovl.fmt.VRR.v1, + ovl.fmt.VRR.v2, ovl.fmt.VRR.r3, + ovl.fmt.VRR.m4, ovl.fmt.VRR.rxb); goto ok; + case 0xe70000000066ULL: s390_format_VRR_VVV(s390_irgen_VCKSM, ovl.fmt.VRR.v1, + ovl.fmt.VRR.v2, ovl.fmt.VRR.r3, + ovl.fmt.VRR.rxb); goto ok; + case 0xe70000000067ULL: s390_format_VRR_VVVM(s390_irgen_VSUMQ, ovl.fmt.VRR.v1, + ovl.fmt.VRR.v2, ovl.fmt.VRR.r3, + ovl.fmt.VRR.m4, ovl.fmt.VRR.rxb); goto ok; case 0xe70000000068ULL: s390_format_VRR_VVV(s390_irgen_VN, ovl.fmt.VRR.v1, ovl.fmt.VRR.v2, ovl.fmt.VRR.r3, ovl.fmt.VRR.rxb); goto ok; - case 0xe70000000069ULL: /* VNC */ goto unimplemented; + case 0xe70000000069ULL: s390_format_VRR_VVV(s390_irgen_VNC, ovl.fmt.VRR.v1, + ovl.fmt.VRR.v2, ovl.fmt.VRR.r3, + ovl.fmt.VRR.rxb); goto ok; case 0xe7000000006aULL: s390_format_VRR_VVV(s390_irgen_VO, ovl.fmt.VRR.v1, ovl.fmt.VRR.v2, ovl.fmt.VRR.r3, ovl.fmt.VRR.rxb); goto ok; @@ -18332,26 +20039,64 @@ s390_decode_6byte_and_irgen(const UChar *bytes) ovl.fmt.VRR.rxb); goto ok; case 0xe7000000006eULL: /* VNN */ goto unimplemented; case 0xe7000000006fULL: /* VOC */ goto unimplemented; - case 0xe70000000070ULL: /* VESLV */ goto unimplemented; - case 0xe70000000072ULL: /* VERIM */ goto unimplemented; - case 0xe70000000073ULL: /* VERLLV */ goto unimplemented; - case 0xe70000000074ULL: /* VSL */ goto unimplemented; - case 0xe70000000075ULL: /* VSLB */ goto unimplemented; - case 0xe70000000077ULL: /* VSLDB */ goto unimplemented; - case 0xe70000000078ULL: /* VESRLV */ goto unimplemented; - case 0xe7000000007aULL: /* VESRAV */ goto unimplemented; - case 0xe7000000007cULL: /* VSRL */ goto unimplemented; - case 0xe7000000007dULL: /* VSRLB */ goto unimplemented; - case 0xe7000000007eULL: /* VSRA */ goto unimplemented; - case 0xe7000000007fULL: /* VSRAB */ goto unimplemented; - case 0xe70000000080ULL: /* VFEE */ goto unimplemented; - case 0xe70000000081ULL: /* VFENE */ goto unimplemented; - case 0xe70000000082ULL: /* VFAE */ goto unimplemented; + case 0xe70000000070ULL: s390_format_VRR_VVVM(s390_irgen_VESLV, ovl.fmt.VRR.v1, + ovl.fmt.VRR.v2, ovl.fmt.VRR.r3, + ovl.fmt.VRR.m4, ovl.fmt.VRR.rxb); goto ok; + case 0xe70000000072ULL: s390_format_VRId_VVVIM(s390_irgen_VERIM, ovl.fmt.VRId.v1, + ovl.fmt.VRId.v2, ovl.fmt.VRId.v3, + ovl.fmt.VRId.i4, ovl.fmt.VRId.m5, + ovl.fmt.VRId.rxb); goto 
ok; + case 0xe70000000073ULL: s390_format_VRR_VVVM(s390_irgen_VERLLV, ovl.fmt.VRR.v1, + ovl.fmt.VRR.v2, ovl.fmt.VRR.r3, + ovl.fmt.VRR.m4, ovl.fmt.VRR.rxb); goto ok; + case 0xe70000000074ULL: s390_format_VRR_VVV(s390_irgen_VSL, ovl.fmt.VRR.v1, + ovl.fmt.VRR.v2, ovl.fmt.VRR.r3, + ovl.fmt.VRR.rxb); goto ok; + case 0xe70000000075ULL: s390_format_VRR_VVV(s390_irgen_VSLB, ovl.fmt.VRR.v1, + ovl.fmt.VRR.v2, ovl.fmt.VRR.r3, + ovl.fmt.VRR.rxb); goto ok; + case 0xe70000000077ULL: s390_format_VRId_VVVI(s390_irgen_VSLDB, ovl.fmt.VRId.v1, + ovl.fmt.VRId.v2, ovl.fmt.VRId.v3, + ovl.fmt.VRId.i4, ovl.fmt.VRId.rxb); goto ok; + case 0xe70000000078ULL: s390_format_VRR_VVVM(s390_irgen_VESRLV, ovl.fmt.VRR.v1, + ovl.fmt.VRR.v2, ovl.fmt.VRR.r3, + ovl.fmt.VRR.m4, ovl.fmt.VRR.rxb); goto ok; + case 0xe7000000007aULL: s390_format_VRR_VVVM(s390_irgen_VESRAV, ovl.fmt.VRR.v1, + ovl.fmt.VRR.v2, ovl.fmt.VRR.r3, + ovl.fmt.VRR.m4, ovl.fmt.VRR.rxb); goto ok; + case 0xe7000000007cULL: s390_format_VRR_VVV(s390_irgen_VSRL, ovl.fmt.VRR.v1, + ovl.fmt.VRR.v2, ovl.fmt.VRR.r3, + ovl.fmt.VRR.rxb); goto ok; + case 0xe7000000007dULL: s390_format_VRR_VVV(s390_irgen_VSRLB, ovl.fmt.VRR.v1, + ovl.fmt.VRR.v2, ovl.fmt.VRR.r3, + ovl.fmt.VRR.rxb); goto ok; + case 0xe7000000007eULL: s390_format_VRR_VVV(s390_irgen_VSRA, ovl.fmt.VRR.v1, + ovl.fmt.VRR.v2, ovl.fmt.VRR.r3, + ovl.fmt.VRR.rxb); goto ok; + case 0xe7000000007fULL: s390_format_VRR_VVV(s390_irgen_VSRAB, ovl.fmt.VRR.v1, + ovl.fmt.VRR.v2, ovl.fmt.VRR.r3, + ovl.fmt.VRR.rxb); goto ok; + case 0xe70000000080ULL: s390_format_VRR_VVVMM(s390_irgen_VFEE, ovl.fmt.VRR.v1, + ovl.fmt.VRR.v2, ovl.fmt.VRR.r3, + ovl.fmt.VRR.m4, ovl.fmt.VRR.m5, + ovl.fmt.VRR.rxb); goto ok; + case 0xe70000000081ULL: s390_format_VRR_VVVMM(s390_irgen_VFENE, ovl.fmt.VRR.v1, + ovl.fmt.VRR.v2, ovl.fmt.VRR.r3, + ovl.fmt.VRR.m4, ovl.fmt.VRR.m5, + ovl.fmt.VRR.rxb); goto ok; + case 0xe70000000082ULL: s390_format_VRR_VVVMM(s390_irgen_VFAE, ovl.fmt.VRR.v1, + ovl.fmt.VRR.v2, ovl.fmt.VRR.r3, + ovl.fmt.VRR.m4, ovl.fmt.VRR.m5, + ovl.fmt.VRR.rxb); goto ok; case 0xe70000000084ULL: s390_format_VRR_VVVM(s390_irgen_VPDI, ovl.fmt.VRR.v1, ovl.fmt.VRR.v2, ovl.fmt.VRR.r3, ovl.fmt.VRR.m4, ovl.fmt.VRR.rxb); goto ok; case 0xe70000000085ULL: /* VBPERM */ goto unimplemented; - case 0xe7000000008aULL: /* VSTRC */ goto unimplemented; + case 0xe7000000008aULL: s390_format_VRRd_VVVVMM(s390_irgen_VSTRC, ovl.fmt.VRRd.v1, + ovl.fmt.VRRd.v2, ovl.fmt.VRRd.v3, + ovl.fmt.VRRd.v4, ovl.fmt.VRRd.m5, + ovl.fmt.VRRd.m6, + ovl.fmt.VRRd.rxb); goto ok; case 0xe7000000008cULL: s390_format_VRR_VVVV(s390_irgen_VPERM, ovl.fmt.VRR.v1, ovl.fmt.VRR.v2, ovl.fmt.VRR.r3, ovl.fmt.VRR.m4, ovl.fmt.VRR.rxb); goto ok; @@ -18371,27 +20116,79 @@ s390_decode_6byte_and_irgen(const UChar *bytes) ovl.fmt.VRR.m4, ovl.fmt.VRR.m5, ovl.fmt.VRR.rxb); goto ok; case 0xe7000000009eULL: /* VFNMS */ goto unimplemented; case 0xe7000000009fULL: /* VFNMA */ goto unimplemented; - case 0xe700000000a1ULL: /* VMLH */ goto unimplemented; - case 0xe700000000a2ULL: /* VML */ goto unimplemented; - case 0xe700000000a3ULL: /* VMH */ goto unimplemented; - case 0xe700000000a4ULL: /* VMLE */ goto unimplemented; - case 0xe700000000a5ULL: /* VMLO */ goto unimplemented; - case 0xe700000000a6ULL: /* VME */ goto unimplemented; - case 0xe700000000a7ULL: /* VMO */ goto unimplemented; - case 0xe700000000a9ULL: /* VMALH */ goto unimplemented; - case 0xe700000000aaULL: /* VMAL */ goto unimplemented; - case 0xe700000000abULL: /* VMAH */ goto unimplemented; - case 0xe700000000acULL: /* VMALE */ goto unimplemented; - case 
0xe700000000adULL: /* VMALO */ goto unimplemented; - case 0xe700000000aeULL: /* VMAE */ goto unimplemented; - case 0xe700000000afULL: /* VMAO */ goto unimplemented; - case 0xe700000000b4ULL: /* VGFM */ goto unimplemented; + case 0xe700000000a1ULL: s390_format_VRR_VVVM(s390_irgen_VMLH, ovl.fmt.VRR.v1, + ovl.fmt.VRR.v2, ovl.fmt.VRR.r3, + ovl.fmt.VRR.m4, ovl.fmt.VRR.rxb); goto ok; + case 0xe700000000a2ULL: s390_format_VRR_VVVM(s390_irgen_VML, ovl.fmt.VRR.v1, + ovl.fmt.VRR.v2, ovl.fmt.VRR.r3, + ovl.fmt.VRR.m4, ovl.fmt.VRR.rxb); goto ok; + case 0xe700000000a3ULL: s390_format_VRR_VVVM(s390_irgen_VMH, ovl.fmt.VRR.v1, + ovl.fmt.VRR.v2, ovl.fmt.VRR.r3, + ovl.fmt.VRR.m4, ovl.fmt.VRR.rxb); goto ok; + case 0xe700000000a4ULL: s390_format_VRR_VVVM(s390_irgen_VMLE, ovl.fmt.VRR.v1, + ovl.fmt.VRR.v2, ovl.fmt.VRR.r3, + ovl.fmt.VRR.m4, ovl.fmt.VRR.rxb); goto ok; + case 0xe700000000a5ULL: s390_format_VRR_VVVM(s390_irgen_VMLO, ovl.fmt.VRR.v1, + ovl.fmt.VRR.v2, ovl.fmt.VRR.r3, + ovl.fmt.VRR.m4, ovl.fmt.VRR.rxb); goto ok; + case 0xe700000000a6ULL: s390_format_VRR_VVVM(s390_irgen_VME, ovl.fmt.VRR.v1, + ovl.fmt.VRR.v2, ovl.fmt.VRR.r3, + ovl.fmt.VRR.m4, ovl.fmt.VRR.rxb); goto ok; + case 0xe700000000a7ULL: s390_format_VRR_VVVM(s390_irgen_VMO, ovl.fmt.VRR.v1, + ovl.fmt.VRR.v2, ovl.fmt.VRR.r3, + ovl.fmt.VRR.m4, ovl.fmt.VRR.rxb); goto ok; + case 0xe700000000a9ULL: s390_format_VRRd_VVVVM(s390_irgen_VMALH, ovl.fmt.VRRd.v1, + ovl.fmt.VRRd.v2, ovl.fmt.VRRd.v3, + ovl.fmt.VRRd.v4, ovl.fmt.VRRd.m5, + ovl.fmt.VRRd.rxb); goto ok; + case 0xe700000000aaULL: s390_format_VRRd_VVVVM(s390_irgen_VMAL, ovl.fmt.VRRd.v1, + ovl.fmt.VRRd.v2, ovl.fmt.VRRd.v3, + ovl.fmt.VRRd.v4, ovl.fmt.VRRd.m5, + ovl.fmt.VRRd.rxb); goto ok; + case 0xe700000000abULL: s390_format_VRRd_VVVVM(s390_irgen_VMAH, ovl.fmt.VRRd.v1, + ovl.fmt.VRRd.v2, ovl.fmt.VRRd.v3, + ovl.fmt.VRRd.v4, ovl.fmt.VRRd.m5, + ovl.fmt.VRRd.rxb); goto ok; + case 0xe700000000acULL: s390_format_VRRd_VVVVM(s390_irgen_VMALE, ovl.fmt.VRRd.v1, + ovl.fmt.VRRd.v2, ovl.fmt.VRRd.v3, + ovl.fmt.VRRd.v4, ovl.fmt.VRRd.m5, + ovl.fmt.VRRd.rxb); goto ok; + case 0xe700000000adULL: s390_format_VRRd_VVVVM(s390_irgen_VMALO, ovl.fmt.VRRd.v1, + ovl.fmt.VRRd.v2, ovl.fmt.VRRd.v3, + ovl.fmt.VRRd.v4, ovl.fmt.VRRd.m5, + ovl.fmt.VRRd.rxb); goto ok; + case 0xe700000000aeULL: s390_format_VRRd_VVVVM(s390_irgen_VMAE, ovl.fmt.VRRd.v1, + ovl.fmt.VRRd.v2, ovl.fmt.VRRd.v3, + ovl.fmt.VRRd.v4, ovl.fmt.VRRd.m5, + ovl.fmt.VRRd.rxb); goto ok; + case 0xe700000000afULL: s390_format_VRRd_VVVVM(s390_irgen_VMAO, ovl.fmt.VRRd.v1, + ovl.fmt.VRRd.v2, ovl.fmt.VRRd.v3, + ovl.fmt.VRRd.v4, ovl.fmt.VRRd.m5, + ovl.fmt.VRRd.rxb); goto ok; + case 0xe700000000b4ULL: s390_format_VRR_VVVM(s390_irgen_VGFM, ovl.fmt.VRR.v1, + ovl.fmt.VRR.v2, ovl.fmt.VRR.r3, + ovl.fmt.VRR.m4, ovl.fmt.VRR.rxb); goto ok; case 0xe700000000b8ULL: /* VMSL */ goto unimplemented; - case 0xe700000000b9ULL: /* VACCC */ goto unimplemented; - case 0xe700000000bbULL: /* VAC */ goto unimplemented; - case 0xe700000000bcULL: /* VGFMA */ goto unimplemented; - case 0xe700000000bdULL: /* VSBCBI */ goto unimplemented; - case 0xe700000000bfULL: /* VSBI */ goto unimplemented; + case 0xe700000000b9ULL: s390_format_VRRd_VVVVM(s390_irgen_VACCC, ovl.fmt.VRRd.v1, + ovl.fmt.VRRd.v2, ovl.fmt.VRRd.v3, + ovl.fmt.VRRd.v4, ovl.fmt.VRRd.m5, + ovl.fmt.VRRd.rxb); goto ok; + case 0xe700000000bbULL: s390_format_VRRd_VVVVM(s390_irgen_VAC, ovl.fmt.VRRd.v1, + ovl.fmt.VRRd.v2, ovl.fmt.VRRd.v3, + ovl.fmt.VRRd.v4, ovl.fmt.VRRd.m5, + ovl.fmt.VRRd.rxb); goto ok; + case 0xe700000000bcULL: 
s390_format_VRRd_VVVVM(s390_irgen_VGFMA, ovl.fmt.VRRd.v1, + ovl.fmt.VRRd.v2, ovl.fmt.VRRd.v3, + ovl.fmt.VRRd.v4, ovl.fmt.VRRd.m5, + ovl.fmt.VRRd.rxb); goto ok; + case 0xe700000000bdULL: s390_format_VRRd_VVVVM(s390_irgen_VSBCBI, ovl.fmt.VRRd.v1, + ovl.fmt.VRRd.v2, ovl.fmt.VRRd.v3, + ovl.fmt.VRRd.v4, ovl.fmt.VRRd.m5, + ovl.fmt.VRRd.rxb); goto ok; + case 0xe700000000bfULL: s390_format_VRRd_VVVVM(s390_irgen_VSBI, ovl.fmt.VRRd.v1, + ovl.fmt.VRRd.v2, ovl.fmt.VRRd.v3, + ovl.fmt.VRRd.v4, ovl.fmt.VRRd.m5, + ovl.fmt.VRRd.rxb); goto ok; case 0xe700000000c0ULL: /* VCLGD */ goto unimplemented; case 0xe700000000c1ULL: /* VCDLG */ goto unimplemented; case 0xe700000000c2ULL: /* VCGD */ goto unimplemented; @@ -18415,11 +20212,20 @@ s390_decode_6byte_and_irgen(const UChar *bytes) case 0xe700000000d7ULL: s390_format_VRR_VVM(s390_irgen_VUPH, ovl.fmt.VRR.v1, ovl.fmt.VRR.v2, ovl.fmt.VRR.m4, ovl.fmt.VRR.rxb); goto ok; - case 0xe700000000d8ULL: /* VTM */ goto unimplemented; - case 0xe700000000d9ULL: /* VECL */ goto unimplemented; - case 0xe700000000dbULL: /* VEC */ goto unimplemented; - case 0xe700000000deULL: /* VLC */ goto unimplemented; - case 0xe700000000dfULL: /* VLP */ goto unimplemented; + case 0xe700000000d8ULL: s390_format_VRR_VV(s390_irgen_VTM, ovl.fmt.VRR.v1, + ovl.fmt.VRR.v2, ovl.fmt.VRR.rxb); goto ok; + case 0xe700000000d9ULL: s390_format_VRR_VVM(s390_irgen_VECL, ovl.fmt.VRR.v1, + ovl.fmt.VRR.v2, ovl.fmt.VRR.m4, + ovl.fmt.VRR.rxb); goto ok; + case 0xe700000000dbULL: s390_format_VRR_VVM(s390_irgen_VEC, ovl.fmt.VRR.v1, + ovl.fmt.VRR.v2, ovl.fmt.VRR.m4, + ovl.fmt.VRR.rxb); goto ok; + case 0xe700000000deULL: s390_format_VRR_VVM(s390_irgen_VLC, ovl.fmt.VRR.v1, + ovl.fmt.VRR.v2, ovl.fmt.VRR.m4, + ovl.fmt.VRR.rxb); goto ok; + case 0xe700000000dfULL: s390_format_VRR_VVM(s390_irgen_VLP, ovl.fmt.VRR.v1, + ovl.fmt.VRR.v2, ovl.fmt.VRR.m4, + ovl.fmt.VRR.rxb); goto ok; case 0xe700000000e2ULL: /* VFS */ goto unimplemented; case 0xe700000000e3ULL: /* VFA */ goto unimplemented; case 0xe700000000e5ULL: /* VFD */ goto unimplemented; @@ -18429,19 +20235,48 @@ s390_decode_6byte_and_irgen(const UChar *bytes) case 0xe700000000ebULL: /* VFCH */ goto unimplemented; case 0xe700000000eeULL: /* VFMIN */ goto unimplemented; case 0xe700000000efULL: /* VFMAX */ goto unimplemented; - case 0xe700000000f0ULL: /* VAVGL */ goto unimplemented; - case 0xe700000000f1ULL: /* VACC */ goto unimplemented; - case 0xe700000000f2ULL: /* VAVG */ goto unimplemented; - case 0xe700000000f3ULL: /* VA */ goto unimplemented; - case 0xe700000000f5ULL: /* VSCBI */ goto unimplemented; - case 0xe700000000f7ULL: /* VS */ goto unimplemented; - case 0xe700000000f8ULL: /* VCEQ */ goto unimplemented; - case 0xe700000000f9ULL: /* VCHL */ goto unimplemented; - case 0xe700000000fbULL: /* VCH */ goto unimplemented; - case 0xe700000000fcULL: /* VMNL */ goto unimplemented; - case 0xe700000000fdULL: /* VMXL */ goto unimplemented; - case 0xe700000000feULL: /* VMN */ goto unimplemented; - case 0xe700000000ffULL: /* VMX */ goto unimplemented; + case 0xe700000000f0ULL: s390_format_VRR_VVVM(s390_irgen_VAVGL, ovl.fmt.VRR.v1, + ovl.fmt.VRR.v2, ovl.fmt.VRR.r3, + ovl.fmt.VRR.m4, ovl.fmt.VRR.rxb); goto ok; + case 0xe700000000f1ULL: s390_format_VRR_VVVM(s390_irgen_VACC, ovl.fmt.VRR.v1, + ovl.fmt.VRR.v2, ovl.fmt.VRR.r3, + ovl.fmt.VRR.m4, ovl.fmt.VRR.rxb); goto ok; + case 0xe700000000f2ULL: s390_format_VRR_VVVM(s390_irgen_VAVG, ovl.fmt.VRR.v1, + ovl.fmt.VRR.v2, ovl.fmt.VRR.r3, + ovl.fmt.VRR.m4, ovl.fmt.VRR.rxb); goto ok; + case 0xe700000000f3ULL: 
s390_format_VRR_VVVM(s390_irgen_VA, ovl.fmt.VRR.v1, + ovl.fmt.VRR.v2, ovl.fmt.VRR.r3, + ovl.fmt.VRR.m4, ovl.fmt.VRR.rxb); goto ok; + case 0xe700000000f5ULL: s390_format_VRR_VVVM(s390_irgen_VSCBI, ovl.fmt.VRR.v1, + ovl.fmt.VRR.v2, ovl.fmt.VRR.r3, + ovl.fmt.VRR.m4, ovl.fmt.VRR.rxb); goto ok; + case 0xe700000000f7ULL: s390_format_VRR_VVVM(s390_irgen_VS, ovl.fmt.VRR.v1, + ovl.fmt.VRR.v2, ovl.fmt.VRR.r3, + ovl.fmt.VRR.m4, ovl.fmt.VRR.rxb); goto ok; + case 0xe700000000f8ULL: s390_format_VRR_VVVMM(s390_irgen_VCEQ, ovl.fmt.VRR.v1, + ovl.fmt.VRR.v2, ovl.fmt.VRR.r3, + ovl.fmt.VRR.m4, ovl.fmt.VRR.m5, + ovl.fmt.VRR.rxb); goto ok; + case 0xe700000000f9ULL: s390_format_VRR_VVVMM(s390_irgen_VCHL, ovl.fmt.VRR.v1, + ovl.fmt.VRR.v2, ovl.fmt.VRR.r3, + ovl.fmt.VRR.m4, ovl.fmt.VRR.m5, + ovl.fmt.VRR.rxb); goto ok; + case 0xe700000000fbULL: s390_format_VRR_VVVMM(s390_irgen_VCH, ovl.fmt.VRR.v1, + ovl.fmt.VRR.v2, ovl.fmt.VRR.r3, + ovl.fmt.VRR.m4, ovl.fmt.VRR.m5, + ovl.fmt.VRR.rxb); goto ok; + case 0xe700000000fcULL: s390_format_VRR_VVVM(s390_irgen_VMNL, ovl.fmt.VRR.v1, + ovl.fmt.VRR.v2, ovl.fmt.VRR.r3, + ovl.fmt.VRR.m4, ovl.fmt.VRR.rxb); goto ok; + case 0xe700000000fdULL: s390_format_VRR_VVVM(s390_irgen_VMXL, ovl.fmt.VRR.v1, + ovl.fmt.VRR.v2, ovl.fmt.VRR.r3, + ovl.fmt.VRR.m4, ovl.fmt.VRR.rxb); goto ok; + case 0xe700000000feULL: s390_format_VRR_VVVM(s390_irgen_VMN, ovl.fmt.VRR.v1, + ovl.fmt.VRR.v2, ovl.fmt.VRR.r3, + ovl.fmt.VRR.m4, ovl.fmt.VRR.rxb); goto ok; + case 0xe700000000ffULL: s390_format_VRR_VVVM(s390_irgen_VMX, ovl.fmt.VRR.v1, + ovl.fmt.VRR.v2, ovl.fmt.VRR.r3, + ovl.fmt.VRR.m4, ovl.fmt.VRR.rxb); goto ok; case 0xeb0000000004ULL: s390_format_RSY_RRRD(s390_irgen_LMG, ovl.fmt.RSY.r1, ovl.fmt.RSY.r3, ovl.fmt.RSY.b2, ovl.fmt.RSY.dl2, diff --git a/VEX/priv/host_s390_defs.c b/VEX/priv/host_s390_defs.c index 6c35c67246..6c22ac8430 100644 --- a/VEX/priv/host_s390_defs.c +++ b/VEX/priv/host_s390_defs.c @@ -1766,6 +1766,20 @@ emit_VRR_VVVV(UChar *p, ULong op, UChar v1, UChar v2, UChar v3, UChar v4) } +static UChar * +emit_VRR_VRR(UChar *p, ULong op, UChar v1, UChar r2, UChar r3) +{ + ULong the_insn = op; + ULong rxb = s390_update_rxb(0, 1, &v1); + + the_insn |= ((ULong)v1) << 36; + the_insn |= ((ULong)r2) << 32; + the_insn |= ((ULong)r3) << 28; + the_insn |= ((ULong)rxb)<< 8; + + return emit_6bytes(p, the_insn); +} + /*------------------------------------------------------------*/ /*--- Functions to emit particular instructions ---*/ /*------------------------------------------------------------*/ @@ -5713,6 +5727,338 @@ s390_emit_VMRL(UChar *p, UChar v1, UChar v2, UChar v3, UChar m4) return emit_VRR_VVVM(p, 0xE70000000060ULL, v1, v2, v3, m4); } +static UChar * +s390_emit_VA(UChar *p, UChar v1, UChar v2, UChar v3, UChar m4) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC5(MNM, VR, VR, VR, UINT), "va", v1, v2, v3, m4); + + return emit_VRR_VVVM(p, 0xE700000000f3ULL, v1, v2, v3, m4); +} + +static UChar * +s390_emit_VS(UChar *p, UChar v1, UChar v2, UChar v3, UChar m4) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC5(MNM, VR, VR, VR, UINT), "vs", v1, v2, v3, m4); + + return emit_VRR_VVVM(p, 0xE700000000f7ULL, v1, v2, v3, m4); +} + +static UChar * +s390_emit_VNO(UChar *p, UChar v1, UChar v2, UChar v3) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC4(MNM, VR, VR, VR), "vno", v1, v2, v3); + + return emit_VRR_VVV(p, 0xE7000000006bULL, v1, v2, v3); +} + +static UChar * +s390_emit_VCH(UChar *p, UChar v1, UChar v2, UChar v3, UChar m4) +{ + if (UNLIKELY(vex_traceflags 
& VEX_TRACE_ASM)) + s390_disasm(ENC5(MNM, VR, VR, VR, UINT), "vch", v1, v2, v3, m4); + + return emit_VRR_VVVM(p, 0xE700000000fbULL, v1, v2, v3, m4); +} + +static UChar * +s390_emit_VCHL(UChar *p, UChar v1, UChar v2, UChar v3, UChar m4) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC5(MNM, VR, VR, VR, UINT), "vchl", v1, v2, v3, m4); + + return emit_VRR_VVVM(p, 0xE700000000f9ULL, v1, v2, v3, m4); +} + +static UChar * +s390_emit_VCLZ(UChar *p, UChar v1, UChar v2, UChar m4) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC4(MNM, VR, VR, UINT), "vclz", v1, v2, m4); + + return emit_VRR_VVM(p, 0xE70000000053ULL, v1, v2, m4); +} + +static UChar * +s390_emit_VCTZ(UChar *p, UChar v1, UChar v2, UChar m4) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC4(MNM, VR, VR, UINT), "vctz", v1, v2, m4); + + return emit_VRR_VVM(p, 0xE70000000052ULL, v1, v2, m4); +} + +static UChar * +s390_emit_VPOPCT(UChar *p, UChar v1, UChar v2, UChar m4) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC4(MNM, VR, VR, UINT), "vpopct", v1, v2, m4); + + return emit_VRR_VVM(p, 0xE70000000050ULL, v1, v2, m4); +} + +static UChar * +s390_emit_VMX(UChar *p, UChar v1, UChar v2, UChar v3, UChar m4) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC5(MNM, VR, VR, VR, UINT), "vmx", v1, v2, v3, m4); + + return emit_VRR_VVVM(p, 0xE700000000ffULL, v1, v2, v3, m4); +} + +static UChar * +s390_emit_VMXL(UChar *p, UChar v1, UChar v2, UChar v3, UChar m4) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC5(MNM, VR, VR, VR, UINT), "vmxl", v1, v2, v3, m4); + + return emit_VRR_VVVM(p, 0xE700000000fdULL, v1, v2, v3, m4); +} + +static UChar * +s390_emit_VMN(UChar *p, UChar v1, UChar v2, UChar v3, UChar m4) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC5(MNM, VR, VR, VR, UINT), "vmn", v1, v2, v3, m4); + + return emit_VRR_VVVM(p, 0xE700000000feULL, v1, v2, v3, m4); +} + +static UChar * +s390_emit_VMNL(UChar *p, UChar v1, UChar v2, UChar v3, UChar m4) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC5(MNM, VR, VR, VR, UINT), "vmnl", v1, v2, v3, m4); + + return emit_VRR_VVVM(p, 0xE700000000fcULL, v1, v2, v3, m4); +} + +static UChar * +s390_emit_VAVG(UChar *p, UChar v1, UChar v2, UChar v3, UChar m4) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC5(MNM, VR, VR, VR, UINT), "vavg", v1, v2, v3, m4); + + return emit_VRR_VVVM(p, 0xE700000000f2ULL, v1, v2, v3, m4); +} + +static UChar * +s390_emit_VAVGL(UChar *p, UChar v1, UChar v2, UChar v3, UChar m4) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC5(MNM, VR, VR, VR, UINT), "vavgl", v1, v2, v3, m4); + + return emit_VRR_VVVM(p, 0xE700000000f0ULL, v1, v2, v3, m4); +} + +static UChar * +s390_emit_VLP(UChar *p, UChar v1, UChar v2, UChar m3) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC4(MNM, VR, VR, UINT), "vlp", v1, v2, m3); + + return emit_VRR_VVM(p, 0xE700000000DFULL, v1, v2, m3); +} + +static UChar * +s390_emit_VMH(UChar *p, UChar v1, UChar v2, UChar v3, UChar m4) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC5(MNM, VR, VR, VR, UINT), "vmh", v1, v2, v3, m4); + + return emit_VRR_VVVM(p, 0xE700000000a3ULL, v1, v2, v3, m4); +} + +static UChar * +s390_emit_VMLH(UChar *p, UChar v1, UChar v2, UChar v3, UChar m4) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC5(MNM, VR, VR, VR, UINT), "vmlh", v1, v2, v3, m4); + + return emit_VRR_VVVM(p, 0xE700000000a1ULL, v1, 
v2, v3, m4); +} + +static UChar * +s390_emit_VML(UChar *p, UChar v1, UChar v2, UChar v3, UChar m4) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC5(MNM, VR, VR, VR, UINT), "vml", v1, v2, v3, m4); + + return emit_VRR_VVVM(p, 0xE700000000a2ULL, v1, v2, v3, m4); +} + +static UChar * +s390_emit_VME(UChar *p, UChar v1, UChar v2, UChar v3, UChar m4) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC5(MNM, VR, VR, VR, UINT), "vme", v1, v2, v3, m4); + + return emit_VRR_VVVM(p, 0xE700000000a6ULL, v1, v2, v3, m4); +} + +static UChar * +s390_emit_VMLE(UChar *p, UChar v1, UChar v2, UChar v3, UChar m4) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC5(MNM, VR, VR, VR, UINT), "vmle", v1, v2, v3, m4); + + return emit_VRR_VVVM(p, 0xE700000000a4ULL, v1, v2, v3, m4); +} + +static UChar * +s390_emit_VESLV(UChar *p, UChar v1, UChar v2, UChar v3, UChar m4) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC5(MNM, VR, VR, VR, UINT), "veslv", v1, v2, v3, m4); + + return emit_VRR_VVVM(p, 0xE70000000070ULL, v1, v2, v3, m4); +} + +static UChar * +s390_emit_VESRAV(UChar *p, UChar v1, UChar v2, UChar v3, UChar m4) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC5(MNM, VR, VR, VR, UINT), "vesrav", v1, v2, v3, m4); + + return emit_VRR_VVVM(p, 0xE7000000007aULL, v1, v2, v3, m4); +} + +static UChar * +s390_emit_VESRLV(UChar *p, UChar v1, UChar v2, UChar v3, UChar m4) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC5(MNM, VR, VR, VR, UINT), "vesrlv", v1, v2, v3, m4); + + return emit_VRR_VVVM(p, 0xE70000000078ULL, v1, v2, v3, m4); +} + +static UChar * +s390_emit_VESL(UChar *p, UChar v1, UChar b2, UShort d2, UChar v3, UChar m4) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC5(MNM, VR, UDXB, VR, UINT), "vesl", v1, d2, 0, b2, v3, m4); + + return emit_VRS(p, 0xE70000000030ULL, v1, b2, d2, v3, m4); +} + +static UChar * +s390_emit_VESRA(UChar *p, UChar v1, UChar b2, UShort d2, UChar v3, UChar m4) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC5(MNM, VR, UDXB, VR, UINT), "vesra", v1, d2, 0, b2, v3, m4); + + return emit_VRS(p, 0xE7000000003aULL, v1, b2, d2, v3, m4); +} + +static UChar * +s390_emit_VESRL(UChar *p, UChar v1, UChar b2, UShort d2, UChar v3, UChar m4) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC5(MNM, VR, UDXB, VR, UINT), "vesrl", v1, d2, 0, b2, v3, m4); + + return emit_VRS(p, 0xE70000000038ULL, v1, b2, d2, v3, m4); +} + +static UChar * +s390_emit_VERLLV(UChar *p, UChar v1, UChar v2, UChar v3, UChar m4) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC5(MNM, VR, VR, VR, UINT), "verllv", v1, v2, v3, m4); + + return emit_VRR_VVVM(p, 0xE70000000073ULL, v1, v2, v3, m4); +} + +static UChar * +s390_emit_VSL(UChar *p, UChar v1, UChar v2, UChar v3) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC4(MNM, VR, VR, VR), "vsl", v1, v2, v3); + + return emit_VRR_VVV(p, 0xE70000000074ULL, v1, v2, v3); +} + +static UChar * +s390_emit_VSRL(UChar *p, UChar v1, UChar v2, UChar v3) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC4(MNM, VR, VR, VR), "vsrl", v1, v2, v3); + + return emit_VRR_VVV(p, 0xE7000000007cULL, v1, v2, v3); +} + +static UChar * +s390_emit_VSRA(UChar *p, UChar v1, UChar v2, UChar v3) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC4(MNM, VR, VR, VR), "vsra", v1, v2, v3); + + return emit_VRR_VVV(p, 0xE7000000007eULL, v1, v2, v3); +} + +static UChar * 
+s390_emit_VSLB(UChar *p, UChar v1, UChar v2, UChar v3) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC4(MNM, VR, VR, VR), "vslb", v1, v2, v3); + + return emit_VRR_VVV(p, 0xE70000000075ULL, v1, v2, v3); +} + +static UChar * +s390_emit_VSRLB(UChar *p, UChar v1, UChar v2, UChar v3) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC4(MNM, VR, VR, VR), "vsrlb", v1, v2, v3); + + return emit_VRR_VVV(p, 0xE7000000007dULL, v1, v2, v3); +} + +static UChar * +s390_emit_VSRAB(UChar *p, UChar v1, UChar v2, UChar v3) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC4(MNM, VR, VR, VR), "vsrab", v1, v2, v3); + + return emit_VRR_VVV(p, 0xE7000000007fULL, v1, v2, v3); +} + +static UChar * +s390_emit_VSUM(UChar *p, UChar v1, UChar v2, UChar v3, UChar m4) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC5(MNM, VR, VR, VR, UINT), "vsum", v1, v2, v3, m4); + + return emit_VRR_VVVM(p, 0xE70000000064ULL, v1, v2, v3, m4); +} + +static UChar * +s390_emit_VSUMG(UChar *p, UChar v1, UChar v2, UChar v3, UChar m4) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC5(MNM, VR, VR, VR, UINT), "vsumg", v1, v2, v3, m4); + + return emit_VRR_VVVM(p, 0xE70000000065ULL, v1, v2, v3, m4); +} + +static UChar * +s390_emit_VSUMQ(UChar *p, UChar v1, UChar v2, UChar v3, UChar m4) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC5(MNM, VR, VR, VR, UINT), "vsumq", v1, v2, v3, m4); + + return emit_VRR_VVVM(p, 0xE70000000067ULL, v1, v2, v3, m4); +} + +static UChar * +s390_emit_VLVGP(UChar *p, UChar v1, UChar r2, UChar r3) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC4(MNM, VR, GPR, GPR), "vlvgp", v1, r2, r3); + + return emit_VRR_VRR(p, 0xE70000000062ULL, v1, r2, r3); +} /*---------------------------------------------------------------*/ /*--- Constructors for the various s390_insn kinds ---*/ @@ -7476,6 +7822,9 @@ s390_insn_as_string(const s390_insn *insn) case S390_INSN_VEC_AMODEOP: switch (insn->variant.vec_amodeop.tag) { case S390_VEC_GET_ELEM: op = "v-vgetelem"; break; + case S390_VEC_ELEM_SHL_INT: op = "v-veshl"; break; + case S390_VEC_ELEM_SHRA_INT: op = "v-veshra"; break; + case S390_VEC_ELEM_SHRL_INT: op = "v-veshrl"; break; default: goto fail; } s390_sprintf(buf, "%M %R, %R, %A", op, insn->variant.vec_amodeop.dst, @@ -7504,6 +7853,36 @@ s390_insn_as_string(const s390_insn *insn) case S390_VEC_AND: op = "v-vand"; break; case S390_VEC_MERGEL: op = "v-vmergel"; break; case S390_VEC_MERGEH: op = "v-vmergeh"; break; + case S390_VEC_NOR: op = "v-vnor"; break; + case S390_VEC_INT_ADD: op = "v-vintadd"; break; + case S390_VEC_INT_SUB: op = "v-vintsub"; break; + case S390_VEC_MAXU: op = "v-vmaxu"; break; + case S390_VEC_MAXS: op = "v-vmaxs"; break; + case S390_VEC_MINU: op = "v-vminu"; break; + case S390_VEC_MINS: op = "v-vmins"; break; + case S390_VEC_AVGU: op = "v-vavgu"; break; + case S390_VEC_AVGS: op = "v-vavgs"; break; + case S390_VEC_COMPARE_GREATERS: op = "v-vcmpgts"; break; + case S390_VEC_COMPARE_GREATERU: op = "v-vcmpgtu"; break; + case S390_VEC_INT_MUL_HIGHS: op = "v-vintmulhis"; break; + case S390_VEC_INT_MUL_HIGHU: op = "v-vintmulhiu"; break; + case S390_VEC_INT_MUL_LOW: op = "v-vintmullo"; break; + case S390_VEC_INT_MUL_EVENS: op = "v-vintmulevens"; break; + case S390_VEC_INT_MUL_EVENU: op = "v-vintmulevenu"; break; + case S390_VEC_ELEM_SHL_V: op = "v-velemshl"; break; + case S390_VEC_ELEM_SHRA_V: op = "v-vshrav"; break; + case S390_VEC_ELEM_SHRL_V: op = "v-vshrlv"; break; + case 
S390_VEC_ELEM_ROLL_V: op = "v-vrollv"; break; + case S390_VEC_SHL_BITS: op = "v-vshlbits"; break; + case S390_VEC_SHRL_BITS: op = "v-vshrlbits"; break; + case S390_VEC_SHRA_BITS: op = "v-vshrabits"; break; + case S390_VEC_SHL_BYTES: op = "v-vshlbytes"; break; + case S390_VEC_SHRL_BYTES: op = "v-vshrlbytes"; break; + case S390_VEC_SHRA_BYTES: op = "v-vshrabytes"; break; + case S390_VEC_PWSUM_W: op = "v-vpwsumw"; break; + case S390_VEC_PWSUM_DW: op = "v-vpwsumdw"; break; + case S390_VEC_PWSUM_QW: op = "v-vpwsumqw"; break; + case S390_VEC_INIT_FROM_GPRS: op = "v-vinitfromgprs"; break; default: goto fail; } s390_sprintf(buf, "%M %R, %R, %R", op, insn->variant.vec_binop.dst, @@ -7884,6 +8263,9 @@ s390_insn_move_emit(UChar *buf, const s390_insn *insn) return s390_emit_LGR(buf, dst, src); if (dst_class == HRcFlt64) return s390_emit_LDR(buf, dst, src); + if (dst_class == HRcVec128) { + return s390_emit_VLR(buf, dst, src); + } } else { if (dst_class == HRcFlt64 && src_class == HRcInt64) { if (insn->size == 4) { @@ -7901,12 +8283,6 @@ s390_insn_move_emit(UChar *buf, const s390_insn *insn) return s390_emit_LGDRw(buf, dst, src); } } - - if (dst_class == HRcVec128 && src_class == HRcVec128) { - if(insn->size == 16) { - return s390_emit_VLR(buf, dst, src); - } - } /* A move between floating point registers and general purpose registers of different size should never occur and indicates an error elsewhere. */ @@ -8634,9 +9010,38 @@ s390_insn_unop_emit(UChar *buf, const s390_insn *insn) UChar v2 = hregNumber(insn->variant.unop.src.variant.reg); return s390_emit_VUPLH(buf, v1, v2, s390_getM_from_size(insn->size)); } + + case S390_VEC_ABS:{ + vassert(insn->variant.unop.src.tag == S390_OPND_REG); + UChar v1 = hregNumber(insn->variant.unop.dst); + UChar v2 = hregNumber(insn->variant.unop.src.variant.reg); + return s390_emit_VLP(buf, v1, v2, s390_getM_from_size(insn->size)); + } + + case S390_VEC_COUNT_LEADING_ZEROES:{ + vassert(insn->variant.unop.src.tag == S390_OPND_REG); + UChar v1 = hregNumber(insn->variant.unop.dst); + UChar v2 = hregNumber(insn->variant.unop.src.variant.reg); + return s390_emit_VCLZ(buf, v1, v2, s390_getM_from_size(insn->size)); + } + + case S390_VEC_COUNT_TRAILING_ZEROES:{ + vassert(insn->variant.unop.src.tag == S390_OPND_REG); + UChar v1 = hregNumber(insn->variant.unop.dst); + UChar v2 = hregNumber(insn->variant.unop.src.variant.reg); + return s390_emit_VCTZ(buf, v1, v2, s390_getM_from_size(insn->size)); } - vpanic("s390_insn_unop_emit"); + case S390_VEC_COUNT_ONES:{ + vassert(insn->variant.unop.src.tag == S390_OPND_REG); + UChar v1 = hregNumber(insn->variant.unop.dst); + UChar v2 = hregNumber(insn->variant.unop.src.variant.reg); + return s390_emit_VPOPCT(buf, v1, v2, s390_getM_from_size(insn->size)); + } + + default: + vpanic("s390_insn_unop_emit"); + } } @@ -10502,18 +10907,30 @@ s390_insn_profinc_emit(UChar *buf, static UChar * s390_insn_vec_amodeop_emit(UChar *buf, const s390_insn *insn) { - UChar r1 = hregNumber(insn->variant.vec_amodeop.dst); - UChar v1 = hregNumber(insn->variant.vec_amodeop.op1); + UChar v1 = hregNumber(insn->variant.vec_amodeop.dst); + UChar v2 = hregNumber(insn->variant.vec_amodeop.op1); s390_amode* op2 = insn->variant.vec_amodeop.op2; vassert(hregNumber(op2->x) == 0); + vassert(fits_unsigned_12bit(op2->d)); + UChar b = hregNumber(op2->b); UShort d = op2->d; switch (insn->variant.vec_amodeop.tag) { case S390_VEC_GET_ELEM: - return s390_emit_VLGV(buf, r1, b, d, v1, s390_getM_from_size(insn->size)); + return s390_emit_VLGV(buf, v1, b, d, v2, 
s390_getM_from_size(insn->size)); + + case S390_VEC_ELEM_SHL_INT: + return s390_emit_VESL(buf, v1, b, d, v2, s390_getM_from_size(insn->size)); + + case S390_VEC_ELEM_SHRA_INT: + return s390_emit_VESRA(buf, v1, b, d, v2, s390_getM_from_size(insn->size)); + + case S390_VEC_ELEM_SHRL_INT: + return s390_emit_VESRL(buf, v1, b, d, v2, s390_getM_from_size(insn->size)); + default: goto fail; } @@ -10569,9 +10986,72 @@ s390_insn_vec_binop_emit(UChar *buf, const s390_insn *insn) case S390_VEC_AND: return s390_emit_VN(buf, v1, v2, v3); case S390_VEC_MERGEL: - return s390_emit_VMRH(buf, v1, v2, v3, s390_getM_from_size(size)); - case S390_VEC_MERGEH: return s390_emit_VMRL(buf, v1, v2, v3, s390_getM_from_size(size)); + case S390_VEC_MERGEH: + return s390_emit_VMRH(buf, v1, v2, v3, s390_getM_from_size(size)); + case S390_VEC_NOR: + return s390_emit_VNO(buf, v1, v2, v3); + case S390_VEC_INT_ADD: + return s390_emit_VA(buf, v1, v2, v3, s390_getM_from_size(size)); + case S390_VEC_INT_SUB: + return s390_emit_VS(buf, v1, v2, v3, s390_getM_from_size(size)); + case S390_VEC_MAXU: + return s390_emit_VMXL(buf, v1, v2, v3, s390_getM_from_size(size)); + case S390_VEC_MAXS: + return s390_emit_VMX(buf, v1, v2, v3, s390_getM_from_size(size)); + case S390_VEC_MINU: + return s390_emit_VMNL(buf, v1, v2, v3, s390_getM_from_size(size)); + case S390_VEC_MINS: + return s390_emit_VMN(buf, v1, v2, v3, s390_getM_from_size(size)); + case S390_VEC_AVGU: + return s390_emit_VAVGL(buf, v1, v2, v3, s390_getM_from_size(size)); + case S390_VEC_AVGS: + return s390_emit_VAVG(buf, v1, v2, v3, s390_getM_from_size(size)); + case S390_VEC_COMPARE_GREATERS: + return s390_emit_VCH(buf, v1, v2, v3, s390_getM_from_size(size)); + case S390_VEC_COMPARE_GREATERU: + return s390_emit_VCHL(buf, v1, v2, v3, s390_getM_from_size(size)); + case S390_VEC_INT_MUL_HIGHS: + return s390_emit_VMH(buf, v1, v2, v3, s390_getM_from_size(size)); + case S390_VEC_INT_MUL_HIGHU: + return s390_emit_VMLH(buf, v1, v2, v3, s390_getM_from_size(size)); + case S390_VEC_INT_MUL_LOW: + return s390_emit_VML(buf, v1, v2, v3, s390_getM_from_size(size)); + case S390_VEC_INT_MUL_EVENS: + return s390_emit_VME(buf, v1, v2, v3, s390_getM_from_size(size)); + case S390_VEC_INT_MUL_EVENU: + return s390_emit_VMLE(buf, v1, v2, v3, s390_getM_from_size(size)); + case S390_VEC_ELEM_SHL_V: + return s390_emit_VESLV(buf, v1, v2, v3, s390_getM_from_size(size)); + case S390_VEC_ELEM_SHRA_V: + return s390_emit_VESRAV(buf, v1, v2, v3, s390_getM_from_size(size)); + case S390_VEC_ELEM_SHRL_V: + return s390_emit_VESRLV(buf, v1, v2, v3, s390_getM_from_size(size)); + case S390_VEC_ELEM_ROLL_V: + return s390_emit_VERLLV(buf, v1, v2, v3, s390_getM_from_size(size)); + case S390_VEC_SHL_BITS: + return s390_emit_VSL(buf, v1, v2, v3); + case S390_VEC_SHRL_BITS: + return s390_emit_VSRL(buf, v1, v2, v3); + case S390_VEC_SHRA_BITS: + return s390_emit_VSRA(buf, v1, v2, v3); + case S390_VEC_SHL_BYTES: + return s390_emit_VSLB(buf, v1, v2, v3); + case S390_VEC_SHRL_BYTES: + return s390_emit_VSRLB(buf, v1, v2, v3); + case S390_VEC_SHRA_BYTES: + return s390_emit_VSRAB(buf, v1, v2, v3); + case S390_VEC_PWSUM_W: + vassert((size == 1) || (size == 2)); + return s390_emit_VSUM(buf, v1, v2, v3, s390_getM_from_size(size)); + case S390_VEC_PWSUM_DW: + vassert((size == 2) || (size == 4)); + return s390_emit_VSUMG(buf, v1, v2, v3, s390_getM_from_size(size)); + case S390_VEC_PWSUM_QW: + vassert((size == 4) || (size == 8)); + return s390_emit_VSUMQ(buf, v1, v2, v3, s390_getM_from_size(size)); + case S390_VEC_INIT_FROM_GPRS: + return 
s390_emit_VLVGP(buf, v1, v2, v3); default: goto fail; } diff --git a/VEX/priv/host_s390_defs.h b/VEX/priv/host_s390_defs.h index c88075d5f0..7ea01010e2 100644 --- a/VEX/priv/host_s390_defs.h +++ b/VEX/priv/host_s390_defs.h @@ -198,7 +198,11 @@ typedef enum { S390_VEC_FILL, S390_VEC_DUPLICATE, S390_VEC_UNPACKLOWS, - S390_VEC_UNPACKLOWU + S390_VEC_UNPACKLOWU, + S390_VEC_ABS, + S390_VEC_COUNT_LEADING_ZEROES, + S390_VEC_COUNT_TRAILING_ZEROES, + S390_VEC_COUNT_ONES } s390_unop_t; /* The kind of ternary BFP operations */ @@ -337,7 +341,10 @@ typedef enum { /* The vector operations with 2 operands one of them being amode */ typedef enum { - S390_VEC_GET_ELEM + S390_VEC_GET_ELEM, + S390_VEC_ELEM_SHL_INT, + S390_VEC_ELEM_SHRA_INT, + S390_VEC_ELEM_SHRL_INT } s390_vec_amodeop_t; /* The vector operations with three (vector, amode and integer) operands */ @@ -355,7 +362,38 @@ typedef enum { S390_VEC_XOR, S390_VEC_AND, S390_VEC_MERGEL, - S390_VEC_MERGEH + S390_VEC_MERGEH, + S390_VEC_NOR, + S390_VEC_INT_ADD, + S390_VEC_INT_SUB, + S390_VEC_MAXU, + S390_VEC_MAXS, + S390_VEC_MINU, + S390_VEC_MINS, + S390_VEC_AVGU, + S390_VEC_AVGS, + S390_VEC_COMPARE_GREATERS, + S390_VEC_COMPARE_GREATERU, + S390_VEC_INT_MUL_HIGHS, + S390_VEC_INT_MUL_HIGHU, + S390_VEC_INT_MUL_LOW, + S390_VEC_INT_MUL_EVENS, + S390_VEC_INT_MUL_EVENU, + S390_VEC_ELEM_SHL_V, + S390_VEC_ELEM_SHRA_V, + S390_VEC_ELEM_SHRL_V, + S390_VEC_ELEM_ROLL_V, + + /* host_s390_isel depends on this order. */ + S390_VEC_SHL_BITS, S390_VEC_SHL_BYTES, + S390_VEC_SHRL_BITS, S390_VEC_SHRL_BYTES, + S390_VEC_SHRA_BITS, S390_VEC_SHRA_BYTES, + + S390_VEC_PWSUM_W, + S390_VEC_PWSUM_DW, + S390_VEC_PWSUM_QW, + + S390_VEC_INIT_FROM_GPRS, } s390_vec_binop_t; /* The vector operations with three operands */ diff --git a/VEX/priv/host_s390_isel.c b/VEX/priv/host_s390_isel.c index dec1259f6e..bc34f90ff3 100644 --- a/VEX/priv/host_s390_isel.c +++ b/VEX/priv/host_s390_isel.c @@ -526,21 +526,36 @@ vec_generate_zeroes(ISelEnv* env) } static HReg -vec_generate_ones(ISelEnv* env) +vec_do_notV128(ISelEnv* env, HReg arg) { HReg dst = newVRegV(env); - addInstr(env, s390_insn_unop(16, S390_VEC_FILL, dst, s390_opnd_imm(0xffff))); + addInstr(env, s390_insn_vec_binop(16, S390_VEC_NOR, dst, arg, arg)); return dst; } -static HReg -vec_do_notV128(ISelEnv* env, HReg arg) +#define IRCONST_IS_EQUAL_U8(arg, val) \ + ( ((arg)->tag == Iex_Const) \ + && ((arg)->Iex.Const.con->tag == Ico_U8) \ + && ((arg)->Iex.Const.con->Ico.U8 == (val)) ) + +/* Returns true if (expr & 0x7 == 0) */ +static Bool +vec_is_bytes_only_shift(const IRExpr* expr) { - HReg dst = newVRegV(env); - addInstr(env, s390_insn_vec_binop(16, S390_VEC_XOR,dst, - arg, vec_generate_ones(env))); - return dst; + const Bool is_good_const = + (expr->tag == Iex_Const) && + ((expr->Iex.Const.con->Ico.U8 & 0b00000111) == 0); + + const Bool good_mask_applied = + (expr->tag == Iex_Binop) && (expr->Iex.Binop.op == Iop_And8) && + (IRCONST_IS_EQUAL_U8(expr->Iex.Binop.arg1, 0b01111000) + || + IRCONST_IS_EQUAL_U8(expr->Iex.Binop.arg2, 0b01111000) + ); + + return is_good_const || good_mask_applied; } +#undef IRCONST_IS_EQUAL_U8 /* Call a helper (clean or dirty) Arguments must satisfy the following conditions: @@ -3687,11 +3702,25 @@ s390_isel_vec_expr_wrk(ISelEnv *env, IRExpr *expr) UChar size_for_int_arg = 0; HReg dst; HReg reg1; - s390_unop_t vec_op = 0; + s390_unop_t vec_op; IROp op = expr->Iex.Unop.op; IRExpr* arg = expr->Iex.Unop.arg; switch(op) { case Iop_NotV128: + /* Not(Or(arg1, arg2)) -> Nor(arg1, arg2) */ + if(UNLIKELY((arg->tag == Iex_Binop ) 
&& (arg->Iex.Binop.op == Iop_OrV128))) + { + dst = newVRegV(env); + addInstr(env, + s390_insn_vec_binop(16, + S390_VEC_NOR, + dst, + s390_isel_vec_expr(env, arg->Iex.Binop.arg1), + s390_isel_vec_expr(env, arg->Iex.Binop.arg2) + ) + ); + return dst; + } reg1 = s390_isel_vec_expr(env, arg); return vec_do_notV128(env, reg1); @@ -3715,6 +3744,20 @@ s390_isel_vec_expr_wrk(ISelEnv *env, IRExpr *expr) return vec_do_notV128(env, dst); } + case Iop_CmpNEZ128x1: { + IRExpr* low64 = IRExpr_Unop(Iop_V128to64, arg); + IRExpr* high64 = IRExpr_Unop(Iop_V128HIto64, arg); + IRExpr* both = IRExpr_Binop(Iop_Or64, low64, high64); + IRExpr* anyNonZ = IRExpr_Unop(Iop_CmpNEZ64, both); + IRExpr* anyNonZ64 = IRExpr_Unop(Iop_1Sto64, anyNonZ); + reg1 = s390_isel_int_expr(env, anyNonZ64); + + dst = newVRegV(env); + addInstr(env, s390_insn_vec_binop(size, S390_VEC_INIT_FROM_GPRS, + dst, reg1, reg1)); + return dst; + } + case Iop_Dup8x16: size = size_for_int_arg = 1; vec_op = S390_VEC_DUPLICATE; @@ -3773,6 +3816,122 @@ s390_isel_vec_expr_wrk(ISelEnv *env, IRExpr *expr) return dst; } + case Iop_Abs8x16: + size = 1; + vec_op = S390_VEC_ABS; + goto Iop_V_wrk; + case Iop_Abs16x8: + size = 2; + vec_op = S390_VEC_ABS; + goto Iop_V_wrk; + case Iop_Abs32x4: + size = 4; + vec_op = S390_VEC_ABS; + goto Iop_V_wrk; + case Iop_Abs64x2: + size = 8; + vec_op = S390_VEC_ABS; + goto Iop_V_wrk; + + case Iop_Clz8x16: + size = 1; + vec_op = S390_VEC_COUNT_LEADING_ZEROES; + goto Iop_V_wrk; + case Iop_Ctz8x16: + size = 1; + vec_op = S390_VEC_COUNT_TRAILING_ZEROES; + goto Iop_V_wrk; + case Iop_Clz16x8: + size = 2; + vec_op = S390_VEC_COUNT_LEADING_ZEROES; + goto Iop_V_wrk; + case Iop_Ctz16x8: + size = 2; + vec_op = S390_VEC_COUNT_TRAILING_ZEROES; + goto Iop_V_wrk; + case Iop_Clz32x4: + size = 4; + vec_op = S390_VEC_COUNT_LEADING_ZEROES; + goto Iop_V_wrk; + case Iop_Ctz32x4: + size = 4; + vec_op = S390_VEC_COUNT_TRAILING_ZEROES; + goto Iop_V_wrk; + case Iop_Clz64x2: + size = 8; + vec_op = S390_VEC_COUNT_LEADING_ZEROES; + goto Iop_V_wrk; + case Iop_Ctz64x2: + size = 8; + vec_op = S390_VEC_COUNT_TRAILING_ZEROES; + goto Iop_V_wrk; + + case Iop_Cnt8x16: + size = 1; + vec_op = S390_VEC_COUNT_ONES; + goto Iop_V_wrk; + + Iop_V_wrk: { + dst = newVRegV(env); + reg1 = s390_isel_vec_expr(env, arg); + + addInstr(env, + s390_insn_unop(size, vec_op, dst, s390_opnd_reg(reg1))); + return dst; + } + + case Iop_PwAddL8Ux16: { + /* There is no such instruction. We have to emulate it. 
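+ Interleaving the argument with a zero vector widens each byte lane to 16 bits; adding the even-lane and odd-lane results then yields the pairwise sums.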
*/ + IRExpr *even = IRExpr_Binop(Iop_InterleaveEvenLanes8x16, + IRExpr_Const(IRConst_V128(0x0000)), + arg); + IRExpr *odd = IRExpr_Binop(Iop_InterleaveOddLanes8x16, + IRExpr_Const(IRConst_V128(0x0000)), + arg); + dst = s390_isel_vec_expr(env, IRExpr_Binop(Iop_Add16x8, even, odd)); + return dst; + } + + case Iop_PwAddL16Ux8: + if (arg->tag == Iex_Unop && arg->Iex.Unop.op == Iop_PwAddL8Ux16) { + size = 1; + arg = arg->Iex.Unop.arg; + } else { + size = 2; + } + vec_op = S390_VEC_PWSUM_W; + goto Iop_Pairwise_wrk; + + case Iop_PwAddL32Ux4: + if (arg->tag == Iex_Unop && arg->Iex.Unop.op == Iop_PwAddL16Ux8) { + size = 2; + arg = arg->Iex.Unop.arg; + } else { + size = 4; + } + vec_op = S390_VEC_PWSUM_DW; + goto Iop_Pairwise_wrk; + + case Iop_PwAddL64Ux2: + if (arg->tag == Iex_Unop && arg->Iex.Unop.op == Iop_PwAddL32Ux4) { + size = 4; + arg = arg->Iex.Unop.arg; + } else { + size = 8; + } + vec_op = S390_VEC_PWSUM_QW; + goto Iop_Pairwise_wrk; + + Iop_Pairwise_wrk: { + dst = newVRegV(env); + reg1 = s390_isel_vec_expr(env, arg); + + addInstr(env, + s390_insn_vec_binop(size, vec_op, dst, reg1, + vec_generate_zeroes(env))); + return dst; + } + default: goto irreducible; } @@ -3784,6 +3943,7 @@ s390_isel_vec_expr_wrk(ISelEnv *env, IRExpr *expr) HReg reg1, reg2; IROp op = expr->Iex.Binop.op; s390_vec_binop_t vec_op = 0; + s390_vec_amodeop_t shift_op = 0; IRExpr* arg1 = expr->Iex.Binop.arg1; IRExpr* arg2 = expr->Iex.Binop.arg2; switch(op) { @@ -3874,6 +4034,455 @@ s390_isel_vec_expr_wrk(ISelEnv *env, IRExpr *expr) vec_op = S390_VEC_MERGEH; goto Iop_VV_wrk; + case Iop_InterleaveEvenLanes8x16: { + /* There is no such instruction. We have to emulate it. */ + IRExpr* mask = IRExpr_Binop(Iop_64HLtoV128, + mkU64(0x0010021204140616ULL), + mkU64(0x08180a1a0c1c0e1eULL)); + HReg reg_mask = s390_isel_vec_expr(env, mask); + reg1 = s390_isel_vec_expr(env, arg1); + reg2 = s390_isel_vec_expr(env, arg2); + + addInstr(env, + s390_insn_vec_triop(16, S390_VEC_PERM, dst, reg1, reg2, + reg_mask) + ); + + return dst; + } + case Iop_InterleaveOddLanes8x16: { + /* There is no such instruction. We have to emulate it. 
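+ A single vector permute (VPERM) with a constant byte-index mask selects the odd-numbered bytes of the two source vectors alternately.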
*/ + IRExpr* mask = IRExpr_Binop(Iop_64HLtoV128, + mkU64(0x0111031305150717ULL), + mkU64(0x09190b1b0d1d0f1fULL)); + HReg reg_mask = s390_isel_vec_expr(env, mask); + reg1 = s390_isel_vec_expr(env, arg1); + reg2 = s390_isel_vec_expr(env, arg2); + + addInstr(env, + s390_insn_vec_triop(16, S390_VEC_PERM, dst, reg1, reg2, reg_mask) + ); + + return dst; + } + + case Iop_CmpEQ8x16: + size = 1; + vec_op = S390_VEC_COMPARE_EQUAL; + goto Iop_VV_wrk; + case Iop_CmpEQ16x8: + size = 2; + vec_op = S390_VEC_COMPARE_EQUAL; + goto Iop_VV_wrk; + case Iop_CmpEQ32x4: + size = 4; + vec_op = S390_VEC_COMPARE_EQUAL; + goto Iop_VV_wrk; + case Iop_CmpEQ64x2: + size = 8; + vec_op = S390_VEC_COMPARE_EQUAL; + goto Iop_VV_wrk; + + case Iop_Add8x16: + size = 1; + vec_op = S390_VEC_INT_ADD; + goto Iop_VV_wrk; + case Iop_Add16x8: + size = 2; + vec_op = S390_VEC_INT_ADD; + goto Iop_VV_wrk; + case Iop_Add32x4: + size = 4; + vec_op = S390_VEC_INT_ADD; + goto Iop_VV_wrk; + case Iop_Add64x2: + size = 8; + vec_op = S390_VEC_INT_ADD; + goto Iop_VV_wrk; + case Iop_Add128x1: + size = 16; + vec_op = S390_VEC_INT_ADD; + goto Iop_VV_wrk; + + case Iop_Sub8x16: + size = 1; + vec_op = S390_VEC_INT_SUB; + goto Iop_VV_wrk; + case Iop_Sub16x8: + size = 2; + vec_op = S390_VEC_INT_SUB; + goto Iop_VV_wrk; + case Iop_Sub32x4: + size = 4; + vec_op = S390_VEC_INT_SUB; + goto Iop_VV_wrk; + case Iop_Sub64x2: + size = 8; + vec_op = S390_VEC_INT_SUB; + goto Iop_VV_wrk; + case Iop_Sub128x1: + size = 16; + vec_op = S390_VEC_INT_SUB; + goto Iop_VV_wrk; + + case Iop_Max8Ux16: + size = 1; + vec_op = S390_VEC_MAXU; + goto Iop_VV_wrk; + case Iop_Max8Sx16: + size = 1; + vec_op = S390_VEC_MAXS; + goto Iop_VV_wrk; + case Iop_Max16Ux8: + size = 2; + vec_op = S390_VEC_MAXU; + goto Iop_VV_wrk; + case Iop_Max16Sx8: + size = 2; + vec_op = S390_VEC_MAXS; + goto Iop_VV_wrk; + case Iop_Max32Ux4: + size = 4; + vec_op = S390_VEC_MAXU; + goto Iop_VV_wrk; + case Iop_Max32Sx4: + size = 4; + vec_op = S390_VEC_MAXS; + goto Iop_VV_wrk; + case Iop_Max64Ux2: + size = 8; + vec_op = S390_VEC_MAXU; + goto Iop_VV_wrk; + case Iop_Max64Sx2: + size = 8; + vec_op = S390_VEC_MAXS; + goto Iop_VV_wrk; + + case Iop_Min8Ux16: + size = 1; + vec_op = S390_VEC_MINU; + goto Iop_VV_wrk; + case Iop_Min8Sx16: + size = 1; + vec_op = S390_VEC_MINS; + goto Iop_VV_wrk; + case Iop_Min16Ux8: + size = 2; + vec_op = S390_VEC_MINU; + goto Iop_VV_wrk; + case Iop_Min16Sx8: + size = 2; + vec_op = S390_VEC_MINS; + goto Iop_VV_wrk; + case Iop_Min32Ux4: + size = 4; + vec_op = S390_VEC_MINU; + goto Iop_VV_wrk; + case Iop_Min32Sx4: + size = 4; + vec_op = S390_VEC_MINS; + goto Iop_VV_wrk; + case Iop_Min64Ux2: + size = 8; + vec_op = S390_VEC_MINU; + goto Iop_VV_wrk; + case Iop_Min64Sx2: + size = 8; + vec_op = S390_VEC_MINS; + goto Iop_VV_wrk; + + case Iop_Avg8Ux16: + size = 1; + vec_op = S390_VEC_AVGU; + goto Iop_VV_wrk; + case Iop_Avg8Sx16: + size = 1; + vec_op = S390_VEC_AVGS; + goto Iop_VV_wrk; + case Iop_Avg16Ux8: + size = 2; + vec_op = S390_VEC_AVGU; + goto Iop_VV_wrk; + case Iop_Avg16Sx8: + size = 2; + vec_op = S390_VEC_AVGS; + goto Iop_VV_wrk; + case Iop_Avg32Ux4: + size = 4; + vec_op = S390_VEC_AVGU; + goto Iop_VV_wrk; + case Iop_Avg32Sx4: + size = 4; + vec_op = S390_VEC_AVGS; + goto Iop_VV_wrk; + case Iop_Avg64Ux2: + size = 8; + vec_op = S390_VEC_AVGU; + goto Iop_VV_wrk; + case Iop_Avg64Sx2: + size = 8; + vec_op = S390_VEC_AVGS; + goto Iop_VV_wrk; + + case Iop_CmpGT8Ux16: + size = 1; + vec_op = S390_VEC_COMPARE_GREATERU; + goto Iop_VV_wrk; + case Iop_CmpGT8Sx16: + size = 1; + vec_op = 
S390_VEC_COMPARE_GREATERS; + goto Iop_VV_wrk; + case Iop_CmpGT16Ux8: + size = 2; + vec_op = S390_VEC_COMPARE_GREATERU; + goto Iop_VV_wrk; + case Iop_CmpGT16Sx8: + size = 2; + vec_op = S390_VEC_COMPARE_GREATERS; + goto Iop_VV_wrk; + case Iop_CmpGT32Ux4: + size = 4; + vec_op = S390_VEC_COMPARE_GREATERU; + goto Iop_VV_wrk; + case Iop_CmpGT32Sx4: + size = 4; + vec_op = S390_VEC_COMPARE_GREATERS; + goto Iop_VV_wrk; + case Iop_CmpGT64Ux2: + size = 8; + vec_op = S390_VEC_COMPARE_GREATERU; + goto Iop_VV_wrk; + case Iop_CmpGT64Sx2: + size = 8; + vec_op = S390_VEC_COMPARE_GREATERS; + goto Iop_VV_wrk; + + case Iop_MulHi8Ux16: + size = 1; + vec_op = S390_VEC_INT_MUL_HIGHU; + goto Iop_VV_wrk; + case Iop_MulHi8Sx16: + size = 1; + vec_op = S390_VEC_INT_MUL_HIGHS; + goto Iop_VV_wrk; + case Iop_MulHi16Ux8: + size = 2; + vec_op = S390_VEC_INT_MUL_HIGHU; + goto Iop_VV_wrk; + case Iop_MulHi16Sx8: + size = 2; + vec_op = S390_VEC_INT_MUL_HIGHS; + goto Iop_VV_wrk; + case Iop_MulHi32Ux4: + size = 4; + vec_op = S390_VEC_INT_MUL_HIGHU; + goto Iop_VV_wrk; + case Iop_MulHi32Sx4: + size = 4; + vec_op = S390_VEC_INT_MUL_HIGHS; + goto Iop_VV_wrk; + + case Iop_Mul8x16: + size = 1; + vec_op = S390_VEC_INT_MUL_LOW; + goto Iop_VV_wrk; + case Iop_Mul16x8: + size = 2; + vec_op = S390_VEC_INT_MUL_LOW; + goto Iop_VV_wrk; + case Iop_Mul32x4: + size = 4; + vec_op = S390_VEC_INT_MUL_LOW; + goto Iop_VV_wrk; + + case Iop_MullEven8Sx16: + size = 1; + vec_op = S390_VEC_INT_MUL_EVENS; + goto Iop_VV_wrk; + case Iop_MullEven8Ux16: + size = 1; + vec_op = S390_VEC_INT_MUL_EVENU; + goto Iop_VV_wrk; + case Iop_MullEven16Sx8: + size = 2; + vec_op = S390_VEC_INT_MUL_EVENS; + goto Iop_VV_wrk; + case Iop_MullEven16Ux8: + size = 2; + vec_op = S390_VEC_INT_MUL_EVENU; + goto Iop_VV_wrk; + case Iop_MullEven32Sx4: + size = 4; + vec_op = S390_VEC_INT_MUL_EVENS; + goto Iop_VV_wrk; + case Iop_MullEven32Ux4: + size = 4; + vec_op = S390_VEC_INT_MUL_EVENU; + goto Iop_VV_wrk; + + case Iop_Shl8x16: + size = 1; + vec_op = S390_VEC_ELEM_SHL_V; + goto Iop_VV_wrk; + case Iop_Shl16x8: + size = 2; + vec_op = S390_VEC_ELEM_SHL_V; + goto Iop_VV_wrk; + case Iop_Shl32x4: + size = 4; + vec_op = S390_VEC_ELEM_SHL_V; + goto Iop_VV_wrk; + case Iop_Shl64x2: + size = 8; + vec_op = S390_VEC_ELEM_SHL_V; + goto Iop_VV_wrk; + + case Iop_Shr8x16: + size = 1; + vec_op = S390_VEC_ELEM_SHRL_V; + goto Iop_VV_wrk; + case Iop_Shr16x8: + size = 2; + vec_op = S390_VEC_ELEM_SHRL_V; + goto Iop_VV_wrk; + case Iop_Shr32x4: + size = 4; + vec_op = S390_VEC_ELEM_SHRL_V; + goto Iop_VV_wrk; + case Iop_Shr64x2: + size = 8; + vec_op = S390_VEC_ELEM_SHRL_V; + goto Iop_VV_wrk; + + case Iop_Sar8x16: + size = 1; + vec_op = S390_VEC_ELEM_SHRA_V; + goto Iop_VV_wrk; + case Iop_Sar16x8: + size = 2; + vec_op = S390_VEC_ELEM_SHRA_V; + goto Iop_VV_wrk; + case Iop_Sar32x4: + size = 4; + vec_op = S390_VEC_ELEM_SHRA_V; + goto Iop_VV_wrk; + case Iop_Sar64x2: + size = 8; + vec_op = S390_VEC_ELEM_SHRA_V; + goto Iop_VV_wrk; + + case Iop_Rol8x16: + size = 1; + vec_op = S390_VEC_ELEM_ROLL_V; + goto Iop_VV_wrk; + case Iop_Rol16x8: + size = 2; + vec_op = S390_VEC_ELEM_ROLL_V; + goto Iop_VV_wrk; + case Iop_Rol32x4: + size = 4; + vec_op = S390_VEC_ELEM_ROLL_V; + goto Iop_VV_wrk; + case Iop_Rol64x2: + size = 8; + vec_op = S390_VEC_ELEM_ROLL_V; + goto Iop_VV_wrk; + + case Iop_ShlN8x16: + size = 1; + shift_op = S390_VEC_ELEM_SHL_INT; + goto Iop_ShiftN_wrk; + case Iop_ShlN16x8: + size = 2; + shift_op = S390_VEC_ELEM_SHL_INT; + goto Iop_ShiftN_wrk; + case Iop_ShlN32x4: + size = 4; + shift_op = S390_VEC_ELEM_SHL_INT; + goto 
Iop_ShiftN_wrk; + case Iop_ShlN64x2: + size = 8; + shift_op = S390_VEC_ELEM_SHL_INT; + goto Iop_ShiftN_wrk; + + case Iop_ShrN8x16: + size = 1; + shift_op = S390_VEC_ELEM_SHRL_INT; + goto Iop_ShiftN_wrk; + case Iop_ShrN16x8: + size = 2; + shift_op = S390_VEC_ELEM_SHRL_INT; + goto Iop_ShiftN_wrk; + case Iop_ShrN32x4: + size = 4; + shift_op = S390_VEC_ELEM_SHRL_INT; + goto Iop_ShiftN_wrk; + case Iop_ShrN64x2: + size = 8; + shift_op = S390_VEC_ELEM_SHRL_INT; + goto Iop_ShiftN_wrk; + + case Iop_SarN8x16: + size = 1; + shift_op = S390_VEC_ELEM_SHRA_INT; + goto Iop_ShiftN_wrk; + case Iop_SarN16x8: + size = 2; + shift_op = S390_VEC_ELEM_SHRA_INT; + goto Iop_ShiftN_wrk; + case Iop_SarN32x4: + size = 4; + shift_op = S390_VEC_ELEM_SHRA_INT; + goto Iop_ShiftN_wrk; + case Iop_SarN64x2: + size = 8; + shift_op = S390_VEC_ELEM_SHRA_INT; + goto Iop_ShiftN_wrk; + + Iop_ShiftN_wrk: { + HReg vec = s390_isel_vec_expr(env, arg1); + s390_amode* number = s390_isel_amode(env,IRExpr_Unop(Iop_8Uto64, arg2)); + + addInstr(env, + s390_insn_vec_amodeop(size, shift_op, dst, vec, number)); + + return dst; + } + + case Iop_ShlV128: + vec_op = S390_VEC_SHL_BITS; + goto Iop_ShiftVV_wrk; + case Iop_ShrV128: + vec_op = S390_VEC_SHRL_BITS; + goto Iop_ShiftVV_wrk; + case Iop_SarV128: + vec_op = S390_VEC_SHRA_BITS; + goto Iop_ShiftVV_wrk; + + Iop_ShiftVV_wrk: { + reg1 = s390_isel_vec_expr(env, arg1); + reg2 = s390_isel_vec_expr(env, IRExpr_Unop(Iop_Dup8x16, arg2)); + + /* Handle special case */ + if (vec_is_bytes_only_shift(arg2)) + { + /* In this case we skip the BITS shift step. */ + addInstr(env, s390_insn_vec_binop(16, (vec_op + 1), + dst, reg1, reg2)); + + return dst; + } + + /* General case (BYTES shift & BITS shift) */ + addInstr(env, s390_insn_vec_binop(16, (vec_op + 1), + dst, reg1, reg2)); + + addInstr(env, s390_insn_vec_binop(16, vec_op, + dst, dst, reg2)); + + return dst; + } + Iop_VV_wrk: { reg1 = s390_isel_vec_expr(env, arg1); reg2 = s390_isel_vec_expr(env, arg2); @@ -3884,6 +4493,15 @@ s390_isel_vec_expr_wrk(ISelEnv *env, IRExpr *expr) return dst; } + case Iop_64HLtoV128: + reg1 = s390_isel_int_expr(env, arg1); + reg2 = s390_isel_int_expr(env, arg2); + + addInstr(env, s390_insn_vec_binop(size, S390_VEC_INIT_FROM_GPRS, + dst, reg1, reg2)); + + return dst; + default: goto irreducible; } diff --git a/VEX/priv/ir_defs.c b/VEX/priv/ir_defs.c index 15524bffb0..823b6be7dd 100644 --- a/VEX/priv/ir_defs.c +++ b/VEX/priv/ir_defs.c @@ -764,6 +764,7 @@ void ppIROp ( IROp op ) case Iop_CmpNEZ16x8: vex_printf("CmpNEZ16x8"); return; case Iop_CmpNEZ32x4: vex_printf("CmpNEZ32x4"); return; case Iop_CmpNEZ64x2: vex_printf("CmpNEZ64x2"); return; + case Iop_CmpNEZ128x1: vex_printf("CmpNEZ128x1"); return; case Iop_Abs8x16: vex_printf("Abs8x16"); return; case Iop_Abs16x8: vex_printf("Abs16x8"); return; @@ -774,6 +775,7 @@ void ppIROp ( IROp op ) case Iop_Add16x8: vex_printf("Add16x8"); return; case Iop_Add32x4: vex_printf("Add32x4"); return; case Iop_Add64x2: vex_printf("Add64x2"); return; + case Iop_Add128x1: vex_printf("Add128x1"); return; case Iop_QAdd8Ux16: vex_printf("QAdd8Ux16"); return; case Iop_QAdd16Ux8: vex_printf("QAdd16Ux8"); return; case Iop_QAdd32Ux4: vex_printf("QAdd32Ux4"); return; @@ -798,6 +800,7 @@ void ppIROp ( IROp op ) case Iop_PwAddL8Ux16: vex_printf("PwAddL8Ux16"); return; case Iop_PwAddL16Ux8: vex_printf("PwAddL16Ux8"); return; case Iop_PwAddL32Ux4: vex_printf("PwAddL32Ux4"); return; + case Iop_PwAddL64Ux2: vex_printf("PwAddL64Ux2"); return; case Iop_PwAddL8Sx16: vex_printf("PwAddL8Sx16"); return; case 
Iop_PwAddL16Sx8: vex_printf("PwAddL16Sx8"); return; case Iop_PwAddL32Sx4: vex_printf("PwAddL32Sx4"); return; @@ -806,6 +809,7 @@ void ppIROp ( IROp op ) case Iop_Sub16x8: vex_printf("Sub16x8"); return; case Iop_Sub32x4: vex_printf("Sub32x4"); return; case Iop_Sub64x2: vex_printf("Sub64x2"); return; + case Iop_Sub128x1: vex_printf("Sub128x1"); return; case Iop_QSub8Ux16: vex_printf("QSub8Ux16"); return; case Iop_QSub16Ux8: vex_printf("QSub16Ux8"); return; case Iop_QSub32Ux4: vex_printf("QSub32Ux4"); return; @@ -826,8 +830,10 @@ void ppIROp ( IROp op ) case Iop_Mull32Sx2: vex_printf("Mull32Sx2"); return; case Iop_PolynomialMul8x16: vex_printf("PolynomialMul8x16"); return; case Iop_PolynomialMull8x8: vex_printf("PolynomialMull8x8"); return; + case Iop_MulHi8Ux16: vex_printf("MulHi8Ux16"); return; case Iop_MulHi16Ux8: vex_printf("MulHi16Ux8"); return; case Iop_MulHi32Ux4: vex_printf("MulHi32Ux4"); return; + case Iop_MulHi8Sx16: vex_printf("MulHi8Sx16"); return; case Iop_MulHi16Sx8: vex_printf("MulHi16Sx8"); return; case Iop_MulHi32Sx4: vex_printf("MulHi32Sx4"); return; case Iop_QDMulHi16Sx8: vex_printf("QDMulHi16Sx8"); return; @@ -854,9 +860,11 @@ void ppIROp ( IROp op ) case Iop_Avg8Ux16: vex_printf("Avg8Ux16"); return; case Iop_Avg16Ux8: vex_printf("Avg16Ux8"); return; case Iop_Avg32Ux4: vex_printf("Avg32Ux4"); return; + case Iop_Avg64Ux2: vex_printf("Avg64Ux2"); return; case Iop_Avg8Sx16: vex_printf("Avg8Sx16"); return; case Iop_Avg16Sx8: vex_printf("Avg16Sx8"); return; case Iop_Avg32Sx4: vex_printf("Avg32Sx4"); return; + case Iop_Avg64Sx2: vex_printf("Avg64Sx2"); return; case Iop_Max8Sx16: vex_printf("Max8Sx16"); return; case Iop_Max16Sx8: vex_printf("Max16Sx8"); return; @@ -904,6 +912,7 @@ void ppIROp ( IROp op ) case Iop_ShlV128: vex_printf("ShlV128"); return; case Iop_ShrV128: vex_printf("ShrV128"); return; + case Iop_SarV128: vex_printf("SarV128"); return; case Iop_ShlN8x16: vex_printf("ShlN8x16"); return; case Iop_ShlN16x8: vex_printf("ShlN16x8"); return; @@ -1567,6 +1576,7 @@ void ppIRJumpKind ( IRJumpKind kind ) case Ijk_SigTRAP: vex_printf("SigTRAP"); break; case Ijk_SigSEGV: vex_printf("SigSEGV"); break; case Ijk_SigBUS: vex_printf("SigBUS"); break; + case Ijk_SigFPE: vex_printf("SigFPE"); break; case Ijk_SigFPE_IntDiv: vex_printf("SigFPE_IntDiv"); break; case Ijk_SigFPE_IntOvf: vex_printf("SigFPE_IntOvf"); break; case Ijk_Sys_syscall: vex_printf("Sys_syscall"); break; @@ -3038,7 +3048,7 @@ void typeOfPrimop ( IROp op, case Iop_Sub64F0x2: case Iop_AndV128: case Iop_OrV128: case Iop_XorV128: case Iop_Add8x16: case Iop_Add16x8: - case Iop_Add32x4: case Iop_Add64x2: + case Iop_Add32x4: case Iop_Add64x2: case Iop_Add128x1: case Iop_QAdd8Ux16: case Iop_QAdd16Ux8: case Iop_QAdd32Ux4: case Iop_QAdd64Ux2: case Iop_QAdd8Sx16: case Iop_QAdd16Sx8: @@ -3049,7 +3059,7 @@ void typeOfPrimop ( IROp op, case Iop_QAddExtSUsatUU32x4: case Iop_QAddExtSUsatUU64x2: case Iop_PwAdd8x16: case Iop_PwAdd16x8: case Iop_PwAdd32x4: case Iop_Sub8x16: case Iop_Sub16x8: - case Iop_Sub32x4: case Iop_Sub64x2: + case Iop_Sub32x4: case Iop_Sub64x2: case Iop_Sub128x1: case Iop_QSub8Ux16: case Iop_QSub16Ux8: case Iop_QSub32Ux4: case Iop_QSub64Ux2: case Iop_QSub8Sx16: case Iop_QSub16Sx8: @@ -3058,14 +3068,14 @@ void typeOfPrimop ( IROp op, case Iop_PolynomialMul8x16: case Iop_PolynomialMulAdd8x16: case Iop_PolynomialMulAdd16x8: case Iop_PolynomialMulAdd32x4: case Iop_PolynomialMulAdd64x2: - case Iop_MulHi16Ux8: case Iop_MulHi32Ux4: - case Iop_MulHi16Sx8: case Iop_MulHi32Sx4: + case Iop_MulHi8Ux16: case Iop_MulHi16Ux8: 
case Iop_MulHi32Ux4: + case Iop_MulHi8Sx16: case Iop_MulHi16Sx8: case Iop_MulHi32Sx4: case Iop_QDMulHi16Sx8: case Iop_QDMulHi32Sx4: case Iop_QRDMulHi16Sx8: case Iop_QRDMulHi32Sx4: case Iop_MullEven8Ux16: case Iop_MullEven16Ux8: case Iop_MullEven32Ux4: case Iop_MullEven8Sx16: case Iop_MullEven16Sx8: case Iop_MullEven32Sx4: - case Iop_Avg8Ux16: case Iop_Avg16Ux8: case Iop_Avg32Ux4: - case Iop_Avg8Sx16: case Iop_Avg16Sx8: case Iop_Avg32Sx4: + case Iop_Avg8Ux16: case Iop_Avg16Ux8: case Iop_Avg32Ux4: case Iop_Avg64Ux2: + case Iop_Avg8Sx16: case Iop_Avg16Sx8: case Iop_Avg32Sx4: case Iop_Avg64Sx2: case Iop_Max8Sx16: case Iop_Max16Sx8: case Iop_Max32Sx4: case Iop_Max64Sx2: case Iop_Max8Ux16: case Iop_Max16Ux8: case Iop_Max32Ux4: @@ -3144,11 +3154,12 @@ void typeOfPrimop ( IROp op, case Iop_Sqrt32F0x4: case Iop_Sqrt64F0x2: case Iop_CmpNEZ8x16: case Iop_CmpNEZ16x8: - case Iop_CmpNEZ32x4: case Iop_CmpNEZ64x2: + case Iop_CmpNEZ32x4: case Iop_CmpNEZ64x2: case Iop_CmpNEZ128x1: case Iop_Cnt8x16: case Iop_Clz8x16: case Iop_Clz16x8: case Iop_Clz32x4: case Iop_Clz64x2: case Iop_Cls8x16: case Iop_Cls16x8: case Iop_Cls32x4: case Iop_PwAddL8Ux16: case Iop_PwAddL16Ux8: case Iop_PwAddL32Ux4: + case Iop_PwAddL64Ux2: case Iop_PwAddL8Sx16: case Iop_PwAddL16Sx8: case Iop_PwAddL32Sx4: case Iop_Reverse8sIn64_x2: case Iop_Reverse16sIn64_x2: case Iop_Reverse32sIn64_x2: @@ -3170,7 +3181,7 @@ void typeOfPrimop ( IROp op, case Iop_BCD128toI128S: UNARY(Ity_V128, Ity_V128); - case Iop_ShlV128: case Iop_ShrV128: + case Iop_ShlV128: case Iop_ShrV128: case Iop_SarV128: case Iop_ShlN8x16: case Iop_ShlN16x8: case Iop_ShlN32x4: case Iop_ShlN64x2: case Iop_ShrN8x16: case Iop_ShrN16x8: diff --git a/VEX/priv/s390_disasm.c b/VEX/priv/s390_disasm.c index f95d7ec3b2..58189f1233 100644 --- a/VEX/priv/s390_disasm.c +++ b/VEX/priv/s390_disasm.c @@ -344,8 +344,8 @@ dvb_operand(HChar *p, UInt d, UInt v, UInt b, Bool displacement_is_signed) are separated by a ','. The command holds the arguments. Each argument is encoded using a 4-bit S390_ARG_xyz value. The first argument is placed in the least significant bits of the command and so on. There are at most - 5 arguments in an insn and a sentinel (S390_ARG_DONE) is needed to identify - the end of the argument list. 6 * 4 = 24 bits are required for the + 7 arguments in an insn and a sentinel (S390_ARG_DONE) is needed to identify + the end of the argument list. 8 * 4 = 32 bits are required for the command. */ void s390_disasm(UInt command, ...) 
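The comment above describes how s390_disasm receives its operand list: operand kinds are packed as 4-bit codes into a single command word, least significant nibble first, terminated by a sentinel. A minimal standalone sketch of that scheme (the enum values and function below are hypothetical stand-ins, not the actual S390_ARG_xyz constants or disassembler code):

    /* Operand kinds packed as 4-bit codes into one 32-bit word; the first
       operand occupies the least significant nibble and a DONE sentinel
       ends the list.  7 operands + sentinel = 8 nibbles = 32 bits. */
    enum { ARG_DONE = 0, ARG_MNM, ARG_GPR, ARG_VR, ARG_UINT };

    #define ENC3(a,b,c)  ((ARG_DONE << 12) | ((c) << 8) | ((b) << 4) | (a))

    static void walk_command(unsigned int command)
    {
       while ((command & 0xF) != ARG_DONE) {
          /* command & 0xF is the kind of the current operand; the real
             code fetches a matching va_arg and prints it here */
          command >>= 4;   /* advance to the next nibble */
       }
    }

This is why raising the maximum from 5 to 7 operands grows the command word from 24 to 32 bits: each extra operand costs one nibble, and the sentinel always takes one more.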
diff --git a/VEX/priv/s390_disasm.h b/VEX/priv/s390_disasm.h index 9d29b9600f..3cccceadfc 100644 --- a/VEX/priv/s390_disasm.h +++ b/VEX/priv/s390_disasm.h @@ -51,6 +51,10 @@ #undef ENC6 #define ENC6(a,b,c,d,e,f) ((P(DONE) << 24) | (P(f) << 20) | (P(e) << 16) | \ (P(d) << 12) | (P(c) << 8) | (P(b) << 4) | P(a)) +#undef ENC7 +#define ENC7(a,b,c,d,e,f,g) ((P(DONE) << 28) | (P(g) << 24) | (P(f) << 20) | \ + (P(e) << 16) | (P(d) << 12) | (P(c) << 8) | \ + (P(b) << 4) | P(a)) /* The different kinds of operands in an asm insn */ enum { diff --git a/VEX/pub/libvex_ir.h b/VEX/pub/libvex_ir.h index 4beaabd387..17bcb55840 100644 --- a/VEX/pub/libvex_ir.h +++ b/VEX/pub/libvex_ir.h @@ -1479,13 +1479,14 @@ typedef Iop_AndV128, Iop_OrV128, Iop_XorV128, /* VECTOR SHIFT (shift amt :: Ity_I8) */ - Iop_ShlV128, Iop_ShrV128, + Iop_ShlV128, Iop_ShrV128, Iop_SarV128, /* MISC (vector integer cmp != 0) */ Iop_CmpNEZ8x16, Iop_CmpNEZ16x8, Iop_CmpNEZ32x4, Iop_CmpNEZ64x2, + Iop_CmpNEZ128x1, /* ADDITION (normal / U->U sat / S->S sat) */ - Iop_Add8x16, Iop_Add16x8, Iop_Add32x4, Iop_Add64x2, + Iop_Add8x16, Iop_Add16x8, Iop_Add32x4, Iop_Add64x2, Iop_Add128x1, Iop_QAdd8Ux16, Iop_QAdd16Ux8, Iop_QAdd32Ux4, Iop_QAdd64Ux2, Iop_QAdd8Sx16, Iop_QAdd16Sx8, Iop_QAdd32Sx4, Iop_QAdd64Sx2, @@ -1500,14 +1501,14 @@ typedef Iop_QAddExtSUsatUU32x4, Iop_QAddExtSUsatUU64x2, /* SUBTRACTION (normal / unsigned sat / signed sat) */ - Iop_Sub8x16, Iop_Sub16x8, Iop_Sub32x4, Iop_Sub64x2, + Iop_Sub8x16, Iop_Sub16x8, Iop_Sub32x4, Iop_Sub64x2, Iop_Sub128x1, Iop_QSub8Ux16, Iop_QSub16Ux8, Iop_QSub32Ux4, Iop_QSub64Ux2, Iop_QSub8Sx16, Iop_QSub16Sx8, Iop_QSub32Sx4, Iop_QSub64Sx2, /* MULTIPLICATION (normal / high half of signed/unsigned) */ Iop_Mul8x16, Iop_Mul16x8, Iop_Mul32x4, - Iop_MulHi16Ux8, Iop_MulHi32Ux4, - Iop_MulHi16Sx8, Iop_MulHi32Sx4, + Iop_MulHi8Ux16, Iop_MulHi16Ux8, Iop_MulHi32Ux4, + Iop_MulHi8Sx16, Iop_MulHi16Sx8, Iop_MulHi32Sx4, /* (widening signed/unsigned of even lanes, with lowest lane=zero) */ Iop_MullEven8Ux16, Iop_MullEven16Ux8, Iop_MullEven32Ux4, Iop_MullEven8Sx16, Iop_MullEven16Sx8, Iop_MullEven32Sx4, @@ -1584,7 +1585,7 @@ typedef Example: Iop_PwAddL16Ux4( [a,b,c,d] ) = [a+b,c+d] where a+b and c+d are unsigned 32-bit values. 
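+ Correspondingly, Iop_PwAddL64Ux2( [a,b] ) = [a+b] where a+b is an unsigned 128-bit value.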
*/ - Iop_PwAddL8Ux16, Iop_PwAddL16Ux8, Iop_PwAddL32Ux4, + Iop_PwAddL8Ux16, Iop_PwAddL16Ux8, Iop_PwAddL32Ux4, Iop_PwAddL64Ux2, Iop_PwAddL8Sx16, Iop_PwAddL16Sx8, Iop_PwAddL32Sx4, /* Other unary pairwise ops */ @@ -1598,8 +1599,8 @@ typedef Iop_Abs8x16, Iop_Abs16x8, Iop_Abs32x4, Iop_Abs64x2, /* AVERAGING: note: (arg1 + arg2 + 1) >>u 1 */ - Iop_Avg8Ux16, Iop_Avg16Ux8, Iop_Avg32Ux4, - Iop_Avg8Sx16, Iop_Avg16Sx8, Iop_Avg32Sx4, + Iop_Avg8Ux16, Iop_Avg16Ux8, Iop_Avg32Ux4, Iop_Avg64Ux2, + Iop_Avg8Sx16, Iop_Avg16Sx8, Iop_Avg32Sx4, Iop_Avg64Sx2, /* MIN/MAX */ Iop_Max8Sx16, Iop_Max16Sx8, Iop_Max32Sx4, Iop_Max64Sx2, diff --git a/VEX/useful/test_main.c b/VEX/useful/test_main.c index 2d24aaf0a0..a7fc06b246 100644 --- a/VEX/useful/test_main.c +++ b/VEX/useful/test_main.c @@ -1416,6 +1416,10 @@ static IRAtom* mkPCast64x2 ( MCEnv* mce, IRAtom* at ) return assignNew(mce, Ity_V128, unop(Iop_CmpNEZ64x2, at)); } +static IRAtom* mkPCast128x1 ( MCEnv* mce, IRAtom* at ) +{ + return assignNew(mce, Ity_V128, unop(Iop_CmpNEZ128x1, at)); +} /* Here's a simple scheme capable of handling ops derived from SSE1 code and while only generating ops that can be efficiently @@ -1631,6 +1635,14 @@ IRAtom* binary64Ix2 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 ) return at; } +static +IRAtom* binary128Ix1 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 ) +{ + IRAtom* at; + at = mkUifUV128(mce, vatom1, vatom2); + at = mkPCast128x1(mce, at); + return at; +} /*------------------------------------------------------------*/ /*--- Generate shadow values from all kinds of IRExprs. ---*/ @@ -1674,6 +1686,8 @@ IRAtom* expr2vbits_Binop ( MCEnv* mce, case Iop_QSub8Ux16: case Iop_QSub8Sx16: case Iop_Sub8x16: + case Iop_MulHi8Sx16: + case Iop_MulHi8Ux16: case Iop_Min8Ux16: case Iop_Max8Ux16: case Iop_CmpGT8Sx16: @@ -1713,11 +1727,18 @@ IRAtom* expr2vbits_Binop ( MCEnv* mce, case Iop_Sub64x2: case Iop_QSub64Ux2: case Iop_QSub64Sx2: + case Iop_Avg64Ux2: + case Iop_Avg64Sx2: case Iop_Add64x2: case Iop_QAdd64Ux2: case Iop_QAdd64Sx2: return binary64Ix2(mce, vatom1, vatom2); + case Iop_Add128x1: + case Iop_Sub128x1: + case Iop_CmpNEZ128x1: + return binary128Ix1(mce, vatom1, vatom2); + case Iop_QNarrowBin32Sto16Sx8: case Iop_QNarrowBin16Sto8Sx16: case Iop_QNarrowBin16Sto8Ux16: diff --git a/docs/internals/3_13_BUGSTATUS.txt b/docs/internals/3_13_BUGSTATUS.txt index f803c9a8aa..944dde8f83 100644 --- a/docs/internals/3_13_BUGSTATUS.txt +++ b/docs/internals/3_13_BUGSTATUS.txt @@ -758,14 +758,6 @@ Should fold these to constant zero in iropt; awaiting test case === VEX/s390x ========================================================== -(carried over) -366413 s390x: New z13 instructions not implemented - [Per cborntraeger, is not important for 3.12.0] - -385409 s390x: z13 vector integer instructions not implemented - -385410 s390x: z13 vector string instructions not implemented - 385411 s390x: z13 vector floating-point instructions not implemented === VEX/x86 ============================================================ diff --git a/memcheck/mc_main.c b/memcheck/mc_main.c index a1edb9a749..9b2e654902 100644 --- a/memcheck/mc_main.c +++ b/memcheck/mc_main.c @@ -1350,7 +1350,16 @@ void mc_LOADV_128_or_256_slow ( /*OUT*/ULong* res, ok |= pessim[j] != V_BITS64_DEFINED; tl_assert(ok); - if (0 == (a & (szB - 1)) && n_addrs_bad < szB) { +# if defined(VGP_s390x_linux) + tl_assert(szB == 16); // s390 doesn't have > 128 bit SIMD + /* OK if all loaded bytes are from the same page. 
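+ On s390x vector loads need not be naturally aligned, so instead of demanding alignment we only demand that the access stays within a single 4K page.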
*/ + Bool alignedOK = ((a & 0xfff) <= 0x1000 - szB); +# else + /* OK if the address is aligned by the load size. */ + Bool alignedOK = (0 == (a & (szB - 1))); +# endif + + if (alignedOK && n_addrs_bad < szB) { /* Exemption applies. Use the previously computed pessimising value and return the combined result, but don't flag an addressing error. The pessimising value is Defined for valid diff --git a/memcheck/mc_translate.c b/memcheck/mc_translate.c index 5ed39ae21e..68a2ab3bb6 100644 --- a/memcheck/mc_translate.c +++ b/memcheck/mc_translate.c @@ -2324,6 +2324,11 @@ static IRAtom* mkPCast64x2 ( MCEnv* mce, IRAtom* at ) return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ64x2, at)); } +static IRAtom* mkPCast128x1 ( MCEnv* mce, IRAtom* at ) +{ + return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ128x1, at)); +} + static IRAtom* mkPCast64x4 ( MCEnv* mce, IRAtom* at ) { return assignNew('V', mce, Ity_V256, unop(Iop_CmpNEZ64x4, at)); @@ -2933,6 +2938,15 @@ IRAtom* binary64Ix2 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 ) return at; } +static +IRAtom* binary128Ix1 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 ) +{ + IRAtom* at; + at = mkUifUV128(mce, vatom1, vatom2); + at = mkPCast128x1(mce, at); + return at; +} + /* --- 64-bit versions --- */ static @@ -3609,6 +3623,8 @@ IRAtom* expr2vbits_Binop ( MCEnv* mce, case Iop_QShl8x16: case Iop_Add8x16: case Iop_Mul8x16: + case Iop_MulHi8Sx16: + case Iop_MulHi8Ux16: case Iop_PolynomialMul8x16: case Iop_PolynomialMulAdd8x16: return binary8Ix16(mce, vatom1, vatom2); @@ -3660,6 +3676,8 @@ IRAtom* expr2vbits_Binop ( MCEnv* mce, case Iop_Min32Ux4: case Iop_Min32Sx4: case Iop_Mul32x4: + case Iop_MulHi32Sx4: + case Iop_MulHi32Ux4: case Iop_QDMulHi32Sx4: case Iop_QRDMulHi32Sx4: case Iop_PolynomialMulAdd32x4: @@ -3667,6 +3685,8 @@ IRAtom* expr2vbits_Binop ( MCEnv* mce, case Iop_Sub64x2: case Iop_Add64x2: + case Iop_Avg64Ux2: + case Iop_Avg64Sx2: case Iop_Max64Sx2: case Iop_Max64Ux2: case Iop_Min64Sx2: @@ -3691,6 +3711,11 @@ IRAtom* expr2vbits_Binop ( MCEnv* mce, case Iop_MulI128by10ECarry: return binary64Ix2(mce, vatom1, vatom2); + case Iop_Add128x1: + case Iop_Sub128x1: + case Iop_CmpNEZ128x1: + return binary128Ix1(mce, vatom1, vatom2); + case Iop_QNarrowBin64Sto32Sx4: case Iop_QNarrowBin64Uto32Ux4: case Iop_QNarrowBin32Sto16Sx8: @@ -3998,6 +4023,7 @@ IRAtom* expr2vbits_Binop ( MCEnv* mce, binop(op, vatom1, vatom2)); case Iop_ShrV128: + case Iop_SarV128: case Iop_ShlV128: case Iop_I128StoBCD128: /* Same scheme as with all other shifts. 
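+ (The shadow of the shifted value is shifted by the concrete amount, after the amount's own definedness has been checked.)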
Note: 10 Nov 05: @@ -4950,6 +4976,10 @@ IRExpr* expr2vbits_Unop ( MCEnv* mce, IROp op, IRAtom* atom ) return mkPCast64x2(mce, assignNew('V', mce, Ity_V128, unop(op, mkPCast32x4(mce, vatom)))); + case Iop_PwAddL64Ux2: + return mkPCast128x1(mce, + assignNew('V', mce, Ity_V128, unop(op, mkPCast64x2(mce, vatom)))); + case Iop_PwAddL16Ux8: case Iop_PwAddL16Sx8: return mkPCast32x4(mce, diff --git a/memcheck/tests/vbit-test/irops.c b/memcheck/tests/vbit-test/irops.c index adc7845aee..bfd82fcec9 100644 --- a/memcheck/tests/vbit-test/irops.c +++ b/memcheck/tests/vbit-test/irops.c @@ -712,14 +712,17 @@ static irop_t irops[] = { { DEFOP(Iop_XorV128, UNDEF_UNKNOWN), }, { DEFOP(Iop_ShlV128, UNDEF_UNKNOWN), }, { DEFOP(Iop_ShrV128, UNDEF_UNKNOWN), }, + { DEFOP(Iop_SarV128, UNDEF_UNKNOWN), }, { DEFOP(Iop_CmpNEZ8x16, UNDEF_UNKNOWN), }, { DEFOP(Iop_CmpNEZ16x8, UNDEF_UNKNOWN), }, { DEFOP(Iop_CmpNEZ32x4, UNDEF_UNKNOWN), }, { DEFOP(Iop_CmpNEZ64x2, UNDEF_UNKNOWN), }, + { DEFOP(Iop_CmpNEZ128x1, UNDEF_UNKNOWN), }, { DEFOP(Iop_Add8x16, UNDEF_UNKNOWN), }, { DEFOP(Iop_Add16x8, UNDEF_UNKNOWN), }, { DEFOP(Iop_Add32x4, UNDEF_UNKNOWN), }, { DEFOP(Iop_Add64x2, UNDEF_UNKNOWN), }, + { DEFOP(Iop_Add128x1, UNDEF_UNKNOWN), }, { DEFOP(Iop_QAdd8Ux16, UNDEF_UNKNOWN), }, { DEFOP(Iop_QAdd16Ux8, UNDEF_UNKNOWN), }, { DEFOP(Iop_QAdd32Ux4, UNDEF_UNKNOWN), }, @@ -742,6 +745,7 @@ static irop_t irops[] = { { DEFOP(Iop_Sub16x8, UNDEF_UNKNOWN), }, { DEFOP(Iop_Sub32x4, UNDEF_UNKNOWN), }, { DEFOP(Iop_Sub64x2, UNDEF_UNKNOWN), }, + { DEFOP(Iop_Sub128x1, UNDEF_UNKNOWN), }, { DEFOP(Iop_QSub8Ux16, UNDEF_UNKNOWN), }, { DEFOP(Iop_QSub16Ux8, UNDEF_UNKNOWN), }, { DEFOP(Iop_QSub32Ux4, UNDEF_UNKNOWN), }, @@ -753,8 +757,10 @@ static irop_t irops[] = { { DEFOP(Iop_Mul8x16, UNDEF_UNKNOWN), }, { DEFOP(Iop_Mul16x8, UNDEF_UNKNOWN), }, { DEFOP(Iop_Mul32x4, UNDEF_UNKNOWN), }, + { DEFOP(Iop_MulHi8Ux16, UNDEF_UNKNOWN), }, { DEFOP(Iop_MulHi16Ux8, UNDEF_UNKNOWN), }, { DEFOP(Iop_MulHi32Ux4, UNDEF_UNKNOWN), }, + { DEFOP(Iop_MulHi8Sx16, UNDEF_UNKNOWN), }, { DEFOP(Iop_MulHi16Sx8, UNDEF_UNKNOWN), }, { DEFOP(Iop_MulHi32Sx4, UNDEF_UNKNOWN), }, /* Result of the Iop_MullEvenBxE is 2*BxE/2 */ @@ -785,6 +791,7 @@ static irop_t irops[] = { { DEFOP(Iop_PwAddL8Ux16, UNDEF_UNKNOWN), }, { DEFOP(Iop_PwAddL16Ux8, UNDEF_UNKNOWN), }, { DEFOP(Iop_PwAddL32Ux4, UNDEF_UNKNOWN), }, + { DEFOP(Iop_PwAddL64Ux2, UNDEF_UNKNOWN), }, { DEFOP(Iop_PwAddL8Sx16, UNDEF_UNKNOWN), }, { DEFOP(Iop_PwAddL16Sx8, UNDEF_UNKNOWN), }, { DEFOP(Iop_PwAddL32Sx4, UNDEF_UNKNOWN), }, @@ -795,9 +802,11 @@ static irop_t irops[] = { { DEFOP(Iop_Avg8Ux16, UNDEF_UNKNOWN), }, { DEFOP(Iop_Avg16Ux8, UNDEF_UNKNOWN), }, { DEFOP(Iop_Avg32Ux4, UNDEF_UNKNOWN), }, + { DEFOP(Iop_Avg64Ux2, UNDEF_UNKNOWN), }, { DEFOP(Iop_Avg8Sx16, UNDEF_UNKNOWN), }, { DEFOP(Iop_Avg16Sx8, UNDEF_UNKNOWN), }, { DEFOP(Iop_Avg32Sx4, UNDEF_UNKNOWN), }, + { DEFOP(Iop_Avg64Sx2, UNDEF_UNKNOWN), }, { DEFOP(Iop_Max8Sx16, UNDEF_UNKNOWN), }, { DEFOP(Iop_Max16Sx8, UNDEF_UNKNOWN), }, { DEFOP(Iop_Max32Sx4, UNDEF_UNKNOWN), },