From 3cc0232c46a5905b4a6c2fbd302b58bf5f90b3d5 Mon Sep 17 00:00:00 2001 From: Carl Love Date: Mon, 11 Jan 2021 16:00:57 -0600 Subject: [PATCH] PPC64: ISA 3.1 VSX PCV Generate Operations xgenpcvbm VSX Vector Generate PCV from Byte Mask xxgenpcvdmVSX Vector Generate PCV from Doubleword Mask xxgenpcvhmVSX Vector Generate PCV from Halfword Mask xxgenpcvwmVSX Vector Generate PCV from Word Mask --- VEX/priv/guest_ppc_defs.h | 17 + VEX/priv/guest_ppc_helpers.c | 732 +++++++++++++++++++++++++++++++++++ VEX/priv/guest_ppc_toIR.c | 163 +++++++- 3 files changed, 898 insertions(+), 14 deletions(-) diff --git a/VEX/priv/guest_ppc_defs.h b/VEX/priv/guest_ppc_defs.h index deda4dfcee..54ce923a9b 100644 --- a/VEX/priv/guest_ppc_defs.h +++ b/VEX/priv/guest_ppc_defs.h @@ -169,6 +169,23 @@ void write_ACC_entry (VexGuestPPC64State* gst, UInt offset, UInt acc, void get_ACC_entry (VexGuestPPC64State* gst, UInt offset, UInt acc, UInt reg, UInt *result); +extern void vector_gen_pvc_byte_mask_dirty_helper( VexGuestPPC64State* gst, + ULong src_hi, + ULong src_lo, + UInt rtn_val, UInt IMM ); +extern void vector_gen_pvc_hword_mask_dirty_helper( VexGuestPPC64State* gst, + ULong src_hi, + ULong src_lo, + UInt rtn_val, UInt IMM ); +extern void vector_gen_pvc_word_mask_dirty_helper( VexGuestPPC64State* gst, + ULong src_hi, + ULong src_lo, + UInt rtn_val, UInt IMM ); +extern void vector_gen_pvc_dword_mask_dirty_helper( VexGuestPPC64State* gst, + ULong src_hi, + ULong src_lo, + UInt rtn_val, UInt IMM ); + /* 8-bit XO value from instruction description */ #define XVI4GER8 0b00100011 #define XVI4GER8PP 0b00100010 diff --git a/VEX/priv/guest_ppc_helpers.c b/VEX/priv/guest_ppc_helpers.c index c24191ef3c..75497abb96 100644 --- a/VEX/priv/guest_ppc_helpers.c +++ b/VEX/priv/guest_ppc_helpers.c @@ -701,6 +701,738 @@ ULong vector_evaluate64_helper( ULong srcA, ULong srcB, ULong srcC, #undef MAX_IMM_BITS } +/*--------------------------------------------------*/ +/*---- VSX Vector Generate PCV from Mask helpers ---*/ +/*--------------------------------------------------*/ +static void write_VSX_entry (VexGuestPPC64State* gst, UInt reg_offset, + ULong *vsx_entry) +{ + U128* pU128_dst; + pU128_dst = (U128*) (((UChar*) gst) + reg_offset); + + /* The U128 type is defined as an array of unsigned intetgers. */ + /* Writing in LE order */ + (*pU128_dst)[0] = (UInt)(vsx_entry[1] & 0xFFFFFFFF); + (*pU128_dst)[1] = (UInt)(vsx_entry[1] >> 32); + (*pU128_dst)[2] = (UInt)(vsx_entry[0] & 0xFFFFFFFF); + (*pU128_dst)[3] = (UInt)(vsx_entry[0] >> 32); + return; +} + +/* CALLED FROM GENERATED CODE */ +void vector_gen_pvc_byte_mask_dirty_helper( VexGuestPPC64State* gst, + ULong src_hi, ULong src_lo, + UInt reg_offset, UInt imm ) { + /* The function computes the 128-bit result then writes it directly + into the guest state VSX register. */ + + UInt i, shift_by, sel_shift_by, half_sel; + ULong index, src, result[2]; + ULong j; + + result[0] = 0; + result[1] = 0; + j = 0; + + /* The algorithm in the ISA is written with IBM numbering zero on left and + N-1 on right. The loop index is converted to "i" to match the algorithm + for claritiy of matching the C code to the algorithm in the ISA. 
*/ + + if (imm == 0b00) { // big endian expansion + for( index = 0; index < 16; index++) { + i = 15 - index; + + shift_by = i*8; + + if ( i >= 8) { + src = src_hi; + shift_by = shift_by - 64; + half_sel = 0; + } else { + src = src_lo; + half_sel = 1; + } + + sel_shift_by = shift_by + 7; + + if ( ((src >> sel_shift_by) & 0x1) == 1) { + result[half_sel] |= j << shift_by; + j++; + } else { + result[half_sel] |= (index + (unsigned long long)0x10) << shift_by; + } + } + + + } else if (imm == 0b01) { // big endian compression + /* If IMM=0b00001, let pcv be the permute control vector required to + enable a left-indexed permute (vperm or xxperm) to implement a + compression of the sparse byte elements in a source vector specified + by the byte-element mask in VSR[VRB+32] into the leftmost byte + elements of a result vector. + */ + for( index = 0; index < 16; index++) { + i = 15 - index; + shift_by = i*8; + + if ( i >= 8) { + src = src_hi; + shift_by = shift_by - 64; + half_sel = 0; + } else { + src = src_lo; + half_sel = 1; + } + + sel_shift_by = shift_by + 7; + + if ( ((src >> sel_shift_by) & 0x1) == 1) { + if (j >= 8) + result[1] |= (index) << (15 - j)*8; + else + result[0] |= (index) << (7 - j)*8; + j++; + } + } + /* The algorithim says set to undefined, leave as 0 + for( index = 3 - j; index < 4; index++) { + result |= (0 << (index*8)); + } + */ + + } else if (imm == 0b10) { //little-endian expansion + /* If IMM=0b00010, let pcv be the permute control vector required to + enable a right-indexed permute (vpermr or xxpermr) to implement an + expansion of the rightmost byte elements of a source vector into the + byte elements of a result vector specified by the byte-element mask + in VSR[VRB+32]. */ + for( index = 0; index < 16; index++) { + i = index; + + shift_by = i*8; + + if ( i >= 8) { + src = src_hi; + shift_by = shift_by - 64; + half_sel = 0; + } else { + src = src_lo; + half_sel = 1; + } + + sel_shift_by = shift_by + 7; + + /* mod shift amount by 8 since src is either the upper or lower + 64-bits. */ + if ( ((src >> sel_shift_by) & 0x1) == 1) { + result[half_sel] |= j << shift_by; + j++; + } else { + result[half_sel] |= (index + (unsigned long long)0x10) << shift_by; + } + } + + } else if (imm == 0b11) { //little-endian compression + /* If IMM=0b00011, let pcv be the permute control vector required to + enable a right-indexed permute (vpermr or xxpermr) to implement a + compression of the sparse byte elements in a source vector specified + by the byte-element mask in VSR[VRB+32] into the rightmost byte + elements of a result vector. 
*/ + + for( index = 0; index < 16; index++) { + i = index; + + shift_by = i*8; + + if ( i >= 8) { + src = src_hi; + shift_by = shift_by - 64; + half_sel = 0; + } else { + src = src_lo; + half_sel = 1; + } + + sel_shift_by = shift_by + 7; + + if ( ((src >> sel_shift_by) & 0x1) == 1) { + if (j >= 8) + result[0] |= (index) << (j-8)*8; + else + result[1] |= (index) << j*8; + j++; + } + } + + /* The algorithim says set to undefined, leave as 0 + for( index = 3 - j; index < 4; index++) { + result |= (0 << (index*8)); + } + */ + + } else { + vex_printf("ERROR, vector_gen_pvc_byte_mask_dirty_helper, imm value %u not supported.\n", + imm); + vassert(0); + } + write_VSX_entry( gst, reg_offset, result); +} + +/* CALLED FROM GENERATED CODE */ +void vector_gen_pvc_hword_mask_dirty_helper( VexGuestPPC64State* gst, + ULong src_hi, ULong src_lo, + UInt reg_offset, + UInt imm ) { + /* The function computes the 128-bit result then writes it directly + into the guest state VSX register. */ + UInt i, shift_by, sel_shift_by, half_sel; + ULong index, src, result[2]; + ULong j; + + result[0] = 0; + result[1] = 0; + j = 0; + + /* The algorithm in the ISA is written with IBM numbering zero on left and + N-1 on right. The loop index is converted to "i" to match the algorithm + for claritiy of matching the C code to the algorithm in the ISA. */ + + if (imm == 0b00) { // big endian expansion + /* If IMM=0b00000, let pcv be the permute control vector required to + enable a left-indexed permute (vperm or xxperm) to implement an + expansion of the leftmost halfword elements of a source vector into + the halfword elements of a result vector specified by the halfword- + element mask in VSR[VRB+32]. + */ + for( index = 0; index < 8; index++) { + i = 7 - index; + + shift_by = i*16; + + if ( i >= 4) { + src = src_hi; + shift_by = shift_by - 64; + half_sel = 0; + } else { + src = src_lo; + half_sel = 1; + } + + sel_shift_by = shift_by + 15; + + if ( ((src >> sel_shift_by) & 0x1) == 1) { + // half-word i, byte 0 + result[half_sel] |= (2*j + 0x0) << (shift_by+8); + // half-word i, byte 1 + result[half_sel] |= (2*j + 0x1) << shift_by; + j++; + } else { + result[half_sel] |= (2*index + 0x10) << (shift_by+8); + result[half_sel] |= (2*index + 0x11) << shift_by; + } + } + + } else if (imm == 0b01) { // big endian expansion + /* If IMM=0b00001,let pcv be the permute control vector required to + enable a left-indexed permute (vperm or xxperm) to implement a + compression of the sparse halfword elements in a source vector + specified by the halfword-element mask in VSR[VRB+32] into the + leftmost halfword elements of a result vector. 
+ */ + for( index = 0; index < 8; index++) { + i = 7 - index; + + shift_by = i*16; + + if ( i >= 4) { + src = src_hi; + shift_by = shift_by - 64; + half_sel = 0; + } else { + src = src_lo; + half_sel = 1; + } + + sel_shift_by = shift_by + 15; + + if ( ((src >> sel_shift_by) & 0x1) == 1) { + if (j >= 4) { + // half-word i, byte 0 + result[1] |= (2*index + 0x0) << ((7 - j)*16 + 8); + // half-word i, byte 1 + result[1] |= (2*index + 0x1) << ((7 - j)*16); + } else { + // half-word i, byte 0 + result[0] |= (2*index + 0x0) << ((3 - j)*16 + 8); + // half-word i, byte 1 + result[0] |= (2*index + 0x1) << ((3 - j)*16); + } + j++; + } + } + + } else if (imm == 0b10) { //little-endian expansion + /* If IMM=0b00010, let pcv be the permute control vector required to + enable a right-indexed permute (vpermr or xxpermr) to implement an + expansion of the rightmost halfword elements of a source vector into + the halfword elements of a result vector specified by the halfword- + element mask in VSR[VRB+32]. + */ + for( index = 0; index < 8; index++) { + i = index; + shift_by = i*16; + + if ( i >= 4) { + src = src_hi; + shift_by = shift_by - 64; + half_sel = 0; + } else { + src = src_lo; + half_sel = 1; + } + + sel_shift_by = shift_by + 15; + + if ( ((src >> sel_shift_by) & 0x1) == 1) { + // half-word i, byte 0 + result[half_sel] |= (2*j + 0x00) << shift_by; + // half-word i, byte 1 + result[half_sel] |= (2*j + 0x01) << (shift_by+8); + j++; + + } else { + // half-word i, byte 0 + result[half_sel] |= (2*index + 0x10) << shift_by; + // half-word i, byte 1 + result[half_sel] |= (2*index + 0x11) << (shift_by+8); + } + } + + } else if (imm == 0b11) { //little-endian compression + /* If IMM=0b00011, let pcv be the permute control vector required to + enable a right-indexed permute (vpermr or xxpermr) to implement a + compression of the sparse halfword elements in a source vector + specified by the halfword-element mask in VSR[VRB+32] into the + rightmost halfword elements of a result vector. */ + for( index = 0; index < 8; index++) { + i = index; + shift_by = i*16; + + if ( i >= 4) { + src = src_hi; + shift_by = shift_by - 64; + half_sel = 0; + } else { + src = src_lo; + half_sel = 1; + } + + sel_shift_by = shift_by + 15; + + if ( ((src >> sel_shift_by) & 0x1) == 1) { + if (j >= 4) { + // half-word j, byte 0 + result[0] |= (2*index + 0x0) << ((j-4)*16); + // half-word j, byte 1 + result[0] |= (2*index + 0x1) << ((j-4)*16+8); + } else { + // half-word j, byte 0 + result[1] |= (2*index + 0x0) << (j*16); + // half-word j, byte 1 + result[1] |= (2*index + 0x1) << ((j*16)+8); + } + j++; + } + } + + } else { + vex_printf("ERROR, vector_gen_pvc_hword_dirty_mask_helper, imm value %u not supported.\n", + imm); + vassert(0); + } + write_VSX_entry( gst, reg_offset, result); +} + +/* CALLED FROM GENERATED CODE */ +void vector_gen_pvc_word_mask_dirty_helper( VexGuestPPC64State* gst, + ULong src_hi, ULong src_lo, + UInt reg_offset, UInt imm ) { + /* The function computes the 128-bit result then writes it directly + into the guest state VSX register. */ + UInt i, shift_by, sel_shift_by, half_sel; + ULong index, src, result[2]; + ULong j; + + result[0] = 0; + result[1] = 0; + j = 0; + + /* The algorithm in the ISA is written with IBM numbering zero on left and + N-1 on right. The loop index is converted to "i" to match the algorithm + for claritiy of matching the C code to the algorithm in the ISA. 
*/ + + if (imm == 0b00) { // big endian expansion + /* If IMM=0b00000, let pcv be the permute control vector required to + enable a left-indexed permute (vperm or xxperm) to implement an + expansion of the leftmost word elements of a source vector into the + word elements of a result vector specified by the word-element mask + in VSR[VRB+32]. + */ + for( index = 0; index < 4; index++) { + i = 3 - index; + + shift_by = i*32; + + if ( i >= 2) { + src = src_hi; + shift_by = shift_by - 64; + half_sel = 0; + } else { + src = src_lo; + half_sel = 1; + } + + sel_shift_by = shift_by + 31; + + if ( ((src >> sel_shift_by) & 0x1) == 1) { + result[half_sel] |= (4*j+0) << (shift_by+24); // word i, byte 0 + result[half_sel] |= (4*j+1) << (shift_by+16); // word i, byte 1 + result[half_sel] |= (4*j+2) << (shift_by+8); // word i, byte 2 + result[half_sel] |= (4*j+3) << shift_by; // word i, byte 3 + j++; + } else { + result[half_sel] |= (4*index + 0x10) << (shift_by+24); + result[half_sel] |= (4*index + 0x11) << (shift_by+16); + result[half_sel] |= (4*index + 0x12) << (shift_by+8); + result[half_sel] |= (4*index + 0x13) << shift_by; + } + } + + } else if (imm == 0b01) { // big endian compression + /* If IMM=0b00001, let pcv be the permute control vector required to + enable a left-indexed permute (vperm or xxperm) to implement a + compression of the sparse word elements in a source vector specified + by the word-element mask in VSR[VRB+32] into the leftmost word + elements of a result vector. + */ + for( index = 0; index < 4; index++) { + i = 3 - index; + + shift_by = i*32; + + if ( i >= 2) { + src = src_hi; + shift_by = shift_by - 64; + half_sel = 0; + } else { + src = src_lo; + half_sel = 1; + } + + sel_shift_by = shift_by + 31; + + if (((src >> sel_shift_by) & 0x1) == 1) { + if (j >= 2) { + // word j, byte 0 + result[1] |= (4*index+0) << ((3 - j)*32 + 24); + // word j, byte 1 + result[1] |= (4*index+1) << ((3 - j)*32 + 16); + // word j, byte 2 + result[1] |= (4*index+2) << ((3 - j)*32 + 8); + // word j, byte 3 + result[1] |= (4*index+3) << ((3 - j)*32 + 0); + } else { + result[0] |= (4*index+0) << ((1 - j)*32 + 24); + result[0] |= (4*index+1) << ((1 - j)*32 + 16); + result[0] |= (4*index+2) << ((1 - j)*32 + 8); + result[0] |= (4*index+3) << ((1 - j)*32 + 0); + } + j++; + } + } + + } else if (imm == 0b10) { //little-endian expansion + /* If IMM=0b00010, let pcv be the permute control vector required to + enable a right-indexed permute (vpermr or xxpermr) to implement an + expansion of the rightmost word elements of a source vector into the + word elements of a result vector specified by the word-element mask + in VSR[VRB+32]. 
+ */ + for( index = 0; index < 4; index++) { + i = index; + + shift_by = i*32; + + if ( i >= 2) { + src = src_hi; + shift_by = shift_by - 64; + half_sel = 0; + } else { + src = src_lo; + half_sel = 1; + } + + sel_shift_by = shift_by + 31; + + if (((src >> sel_shift_by) & 0x1) == 1) { + result[half_sel] |= (4*j+0) << (shift_by + 0); // word j, byte 0 + result[half_sel] |= (4*j+1) << (shift_by + 8); // word j, byte 1 + result[half_sel] |= (4*j+2) << (shift_by + 16); // word j, byte 2 + result[half_sel] |= (4*j+3) << (shift_by + 24); // word j, byte 3 + j++; + } else { + result[half_sel] |= (4*index + 0x10) << (shift_by + 0); + result[half_sel] |= (4*index + 0x11) << (shift_by + 8); + result[half_sel] |= (4*index + 0x12) << (shift_by + 16); + result[half_sel] |= (4*index + 0x13) << (shift_by + 24); + } + } + + } else if (imm == 0b11) { //little-endian compression + /* If IMM=0b00011, let pcv be the permute control vector required to + enable a right-indexed permute (vpermr or xxpermr) to implement a + compression of the sparse word elements in a source vector specified + by the word-element mask in VSR[VRB+32] into the rightmost word + elements of a result vector. */ + for( index = 0; index < 4; index++) { + i =index; + + shift_by = i*32; + + if ( i >= 2) { + src = src_hi; + shift_by = shift_by - 64; + half_sel = 0; + } else { + src = src_lo; + half_sel = 1; + } + + sel_shift_by = shift_by + 31; + + if (((src >> sel_shift_by) & 0x1) == 1) { + if (j >= 2){ + // word j, byte 0 + result[0] |= (4*index + 0x0) << ((j-2)*32+0); + // word j, byte 1 + result[0] |= (4*index + 0x1) << ((j-2)*32+8); + // word j, byte 2 + result[0] |= (4*index + 0x2) << ((j-2)*32+16); + // word j, byte 3 + result[0] |= (4*index + 0x3) << ((j-2)*32+24); + } else { + result[1] |= (4*index + 0x0) << (j*32+0); + result[1] |= (4*index + 0x1) << (j*32+8); + result[1] |= (4*index + 0x2) << (j*32+16); + result[1] |= (4*index + 0x3) << (j*32+24); + } + j++; + } + } + } else { + vex_printf("ERROR, vector_gen_pvc_word_mask_dirty_helper, imm value %u not supported.\n", + imm); + vassert(0); + } + + write_VSX_entry( gst, reg_offset, result); +} + +/* CALLED FROM GENERATED CODE */ +void vector_gen_pvc_dword_mask_dirty_helper( VexGuestPPC64State* gst, + ULong src_hi, ULong src_lo, + UInt reg_offset, UInt imm ) { + /* The function computes the 128-bit result then writes it directly + into the guest state VSX register. */ + UInt sel_shift_by, half_sel; + ULong index, src, result[2]; + ULong j, i; + + result[0] = 0; + result[1] = 0; + j = 0; + + /* The algorithm in the ISA is written with IBM numbering zero on left and + N-1 on right. The loop index is converted to "i" to match the algorithm + for claritiy of matching the C code to the algorithm in the ISA. */ + + if (imm == 0b00) { // big endian expansion + /* If IMM=0b00000, let pcv be the permute control vector required to + enable a left-indexed permute (vperm or xxperm) to implement an + expansion of the leftmost doubleword elements of a source vector into + the doubleword elements of a result vector specified by the + doubleword-element mask in VSR[VRB+32]. 
+ */ + for( index = 0; index < 2; index++) { + i = 1 - index; + + if ( i == 1) { + src = src_hi; + half_sel = 0; + } else { + src = src_lo; + half_sel = 1; + } + + sel_shift_by = 63; + + if ( ((src >> sel_shift_by) & 0x1) == 1) { + result[half_sel] |= (8*j + 0x0) << 56; // dword i, byte 0 + result[half_sel] |= (8*j + 0x1) << 48; // dword i, byte 1 + result[half_sel] |= (8*j + 0x2) << 40; // dword i, byte 2 + result[half_sel] |= (8*j + 0x3) << 32; // dword i, byte 3 + result[half_sel] |= (8*j + 0x4) << 24; // dword i, byte 4 + result[half_sel] |= (8*j + 0x5) << 16; // dword i, byte 5 + result[half_sel] |= (8*j + 0x6) << 8; // dword i, byte 6 + result[half_sel] |= (8*j + 0x7) << 0; // dword i, byte 7 + j++; + } else { + result[half_sel] |= (8*index + 0x10) << 56; + result[half_sel] |= (8*index + 0x11) << 48; + result[half_sel] |= (8*index + 0x12) << 40; + result[half_sel] |= (8*index + 0x13) << 32; + result[half_sel] |= (8*index + 0x14) << 24; + result[half_sel] |= (8*index + 0x15) << 16; + result[half_sel] |= (8*index + 0x16) << 8; + result[half_sel] |= (8*index + 0x17) << 0; + } + } + } else if (imm == 0b01) { // big endian compression + /* If IMM=0b00001, let pcv be the the permute control vector required to + enable a left-indexed permute (vperm or xxperm) to implement a + compression of the sparse doubleword elements in a source vector + specified by the doubleword-element mask in VSR[VRB+32] into the + leftmost doubleword elements of a result vector. + */ + for( index = 0; index < 2; index++) { + i = 1 - index; + + if ( i == 1) { + src = src_hi; + half_sel = 0; + } else { + src = src_lo; + half_sel = 1; + } + + sel_shift_by = 63; + + if ( ((src >> sel_shift_by) & 0x1) == 1) { + if (j == 1) { + result[1] |= (8*index + 0x0) << 56; // double-word j, byte 0 + result[1] |= (8*index + 0x1) << 48; // double-word j, byte 1 + result[1] |= (8*index + 0x2) << 40; // double-word j, byte 2 + result[1] |= (8*index + 0x3) << 32; // double-word j, byte 3 + result[1] |= (8*index + 0x4) << 24; // double-word j, byte 4 + result[1] |= (8*index + 0x5) << 16; // double-word j, byte 5 + result[1] |= (8*index + 0x6) << 8; // double-word j, byte 6 + result[1] |= (8*index + 0x7) << 0; // double-word j, byte 7 + } else { + result[0] |= (8*index + 0x0) << 56; // double-word j, byte 0 + result[0] |= (8*index + 0x1) << 48; // double-word j, byte 1 + result[0] |= (8*index + 0x2) << 40; // double-word j, byte 2 + result[0] |= (8*index + 0x3) << 32; // double-word j, byte 3 + result[0] |= (8*index + 0x4) << 24; // double-word j, byte 4 + result[0] |= (8*index + 0x5) << 16; // double-word j, byte 5 + result[0] |= (8*index + 0x6) << 8; // double-word j, byte 6 + result[0] |= (8*index + 0x7) << 0; // double-word j, byte 7 + } + j++; + } + } + } else if (imm == 0b10) { //little-endian expansion + /* If IMM=0b00010, let pcv be the permute control vector required to + enable a right-indexed permute (vpermr or xxpermr) to implement an + expansion of the rightmost doubleword elements of a source vector + into the doubleword elements of a result vector specified by the + doubleword-element mask in VSR[VRB+32]. 
+ */ + + for( index = 0; index < 2; index++) { + i = index; + + if ( i == 1) { + src = src_hi; + half_sel = 0; + } else { + src = src_lo; + half_sel = 1; + } + + sel_shift_by = 63; + + if ( ((src >> sel_shift_by) & 0x1) == 1) { + result[half_sel] |= (8*j+0) << 0; // double-word i, byte 0 + result[half_sel] |= (8*j+1) << 8; // double-word i, byte 1 + result[half_sel] |= (8*j+2) << 16; // double-word i, byte 2 + result[half_sel] |= (8*j+3) << 24; // double-word i, byte 3 + result[half_sel] |= (8*j+4) << 32; // double-word i, byte 4 + result[half_sel] |= (8*j+5) << 40; // double-word i, byte 5 + result[half_sel] |= (8*j+6) << 48; // double-word i, byte 6 + result[half_sel] |= (8*j+7) << 56; // double-word i, byte 7 + j++; + } else { + result[half_sel] |= (8*index + 0x10) << 0; + result[half_sel] |= (8*index + 0x11) << 8; + result[half_sel] |= (8*index + 0x12) << 16; + result[half_sel] |= (8*index + 0x13) << 24; + result[half_sel] |= (8*index + 0x14) << 32; + result[half_sel] |= (8*index + 0x15) << 40; + result[half_sel] |= (8*index + 0x16) << 48; + result[half_sel] |= (8*index + 0x17) << 56; + } + } + + } else if (imm == 0b11) { //little-endian compression + /* If IMM=0b00011, let pcv be the permute control vector required to + enable a right-indexed permute (vpermr or xxpermr) to implement a + compression of the sparse doubleword elements in a source vector + specified by the doubleword-element mask in VSR[VRB+32] into the + rightmost doubleword elements of a result vector. */ + for( index = 0; index < 2; index++) { + i = index; + + if ( i == 1) { + src = src_hi; + half_sel = 0; + } else { + src = src_lo; + half_sel = 1; + } + + sel_shift_by = 63; + + if (((src >> sel_shift_by) & 0x1) == 1) { + if (j == 1) { + result[0] |= (8*index + 0x0) << 0; // double-word j, byte 0 + result[0] |= (8*index + 0x1) << 8; // double-word j, byte 1 + result[0] |= (8*index + 0x2) << 16; // double-word j, byte 2 + result[0] |= (8*index + 0x3) << 24; // double-word j, byte 3 + result[0] |= (8*index + 0x4) << 32; // double-word j, byte 4 + result[0] |= (8*index + 0x5) << 40; // double-word j, byte 5 + result[0] |= (8*index + 0x6) << 48; // double-word j, byte 6 + result[0] |= (8*index + 0x7) << 56; // double-word j, byte 7 + } else { + result[1] |= (8*index + 0x0) << 0; + result[1] |= (8*index + 0x1) << 8; + result[1] |= (8*index + 0x2) << 16; + result[1] |= (8*index + 0x3) << 24; + result[1] |= (8*index + 0x4) << 32; + result[1] |= (8*index + 0x5) << 40; + result[1] |= (8*index + 0x6) << 48; + result[1] |= (8*index + 0x7) << 56; + } + j++; + } + } + } else { + vex_printf("ERROR, vector_gen_pvc_dword_mask_helper, imm value %u not supported.\n", + imm); + vassert(0); + } + + write_VSX_entry( gst, reg_offset, result); +} /*------------------------------------------------*/ /*---- VSX Matrix signed integer GER functions ---*/ diff --git a/VEX/priv/guest_ppc_toIR.c b/VEX/priv/guest_ppc_toIR.c index bcabf69dd7..354be6b53d 100644 --- a/VEX/priv/guest_ppc_toIR.c +++ b/VEX/priv/guest_ppc_toIR.c @@ -3322,6 +3322,7 @@ static IRExpr * locate_vector_ele_eq ( IRTemp src, IRExpr *value, #define DFORM_IMMASK 0xffffffff #define DSFORM_IMMASK 0xfffffffc #define DQFORM_IMMASK 0xfffffff0 +#define DA8LSFORM_IMMASK 0x3fffffff // Algebraic 8LS Dform #define ISA_3_1_PREFIX_CHECK if (prefix) {if (!allow_isa_3_1) goto decode_noIsa3_1;} @@ -6109,6 +6110,87 @@ static void vsx_matrix_64bit_float_ger ( const VexAbiInfo* vbi, stmt( IRStmt_Dirty(d) ); } +static void vector_gen_pvc_mask ( const VexAbiInfo* vbi, + IRExpr *src, UInt IMM, + UInt 
opc2, UInt VSX_addr ) { + /* The function takes a 64-bit source and an immediate value. The function + calls a helper to execute the xxgenpcvbm, xxgenpcvhm, xxgenpcvwm, + xxgenpcvdm instruction. The instructions are not practical to do with + Iops. The instruction is implemented with a dirty helper that + calculates the 128-bit result and writes it directly into the guest + state VSX register. + */ + IRTemp src_hi = newTemp( Ity_I64); + IRTemp src_lo = newTemp( Ity_I64); + + IRDirty* d; + + vassert( (VSX_addr >= 0) && (VSX_addr < 64) ); + UInt reg_offset = offsetofPPCGuestState( guest_VSR0 ) + + sizeof(U128) * VSX_addr; + + assign( src_hi, unop( Iop_V128HIto64, src ) ); + assign( src_lo, unop( Iop_V128to64, src ) ); + + IRExpr** args = mkIRExprVec_5( + IRExpr_GSPTR(), + mkexpr( src_hi ), + mkexpr( src_lo ), + mkU32( reg_offset ), + mkU64( IMM ) ); + + switch( opc2 ) { + case 0x394: // xxgenpcvbm + d = unsafeIRDirty_0_N ( + 0 /*regparms*/, + "vector_gen_pvc_byte_mask_dirty_helper", + fnptr_to_fnentry( vbi, + &vector_gen_pvc_byte_mask_dirty_helper ), + args); + break; + + case 0x395: // xxgenpcvhm + d = unsafeIRDirty_0_N ( + 0 /*regparms*/, + "vector_gen_pvc_hword_mask_dirty_helper", + fnptr_to_fnentry( vbi, + &vector_gen_pvc_hword_mask_dirty_helper ), + args); + break; + + case 0x3B4: // xxgenpcvwm + d = unsafeIRDirty_0_N ( + 0 /*regparms*/, + "vector_gen_pvc_word_mask_dirty_helper", + fnptr_to_fnentry( vbi, + &vector_gen_pvc_word_mask_dirty_helper ), + args); + break; + + case 0x3B5: // xxgenpcvdm + d = unsafeIRDirty_0_N ( + 0 /*regparms*/, + "vector_gen_pvc_dword_mask_dirty_helper", + fnptr_to_fnentry( vbi, + &vector_gen_pvc_dword_mask_dirty_helper ), + args); + break; + default: + vex_printf("ERROR: Unkown instruction = %u in vector_gen_pvc_mask()\n", + opc2); + return; + } + + d->nFxState = 1; + vex_bzero(&d->fxState, sizeof(d->fxState)); + d->fxState[0].fx = Ifx_Modify; + d->fxState[0].size = sizeof(U128); + d->fxState[0].offset = reg_offset; + + /* execute the dirty call, side-effecting guest state */ + stmt( IRStmt_Dirty(d) ); +} + static IRExpr * UNSIGNED_CMP_GT_V128 ( IRExpr *vA, IRExpr *vB ) { /* This function does an unsigned compare of two V128 values. The * function is for use in 32-bit mode only as it is expensive. The @@ -35227,6 +35309,54 @@ static Bool dis_vsx_accumulator_prefix ( UInt prefix, UInt theInstr, return True; } +static Bool dis_vector_generate_pvc_from_mask ( UInt prefix, + UInt theInstr, + const VexAbiInfo* vbi ) +{ + UChar XT_addr = ifieldRegXT(theInstr); + UChar vB_addr = ifieldRegB(theInstr); + IRTemp vB = newTemp( Ity_V128 ); + UInt opc2 = ifieldOPClo10(theInstr); + UInt IMM = IFIELD(theInstr, (31-15), 5); // bits[11:15] + + assign( vB, getVReg( vB_addr ) ); + + switch( opc2 ) { + case 0x394: + DIP("xxgenpcvbm v%u,v%u,%u\n", XT_addr, vB_addr, IMM); + /* vector_gen_pvc_mask uses a dirty helper to calculate the result and + write it to the VSX result register. */ + vector_gen_pvc_mask( vbi, mkexpr( vB ), IMM, opc2, XT_addr ); + break; + + case 0x395: + DIP("xxgenpcvhm v%u,v%u,%u\n", XT_addr, vB_addr, IMM); + /* vector_gen_pvc_mask uses a dirty helper to calculate the result and + write it to the VSX result register. */ + vector_gen_pvc_mask( vbi, mkexpr( vB ), IMM, opc2, XT_addr ); + break; + + case 0x3B4: + DIP("xxgenpcvwm v%u,v%u,%u\n", XT_addr, vB_addr, IMM); + /* vector_gen_pvc_mask uses a dirty helper to calculate the result and + write it to the VSX result register. 
*/ + vector_gen_pvc_mask( vbi, mkexpr( vB ), IMM, opc2, XT_addr ); + break; + + case 0x3B5: + DIP("xxgenpcvdm v%u,v%u,%u\n", XT_addr, vB_addr, IMM); + /* vector_gen_pvc_mask uses a dirty helper to calculate the result and + write it to the VSX result register. */ + vector_gen_pvc_mask( vbi, mkexpr( vB ), IMM, opc2, XT_addr ); + break; + + default: + return False; + } + + return True; +} + static Int dis_nop_prefix ( UInt prefix, UInt theInstr ) { Bool is_prefix = prefix_instruction( prefix ); @@ -35748,14 +35878,9 @@ DisResult disInstr_PPC_WRK ( } goto decode_failure; - case 0x31: // lfsu, stxv + case 0x31: // lfsu if (!allow_F) goto decode_noF; - if (prefix_instruction( prefix )) { // stxv - if ( !(allow_isa_3_1) ) goto decode_noIsa3_1; - if (dis_fp_pair_prefix( prefix, theInstr )) goto decode_success; - } else { // lfsu - if (dis_fp_load( prefix, theInstr )) goto decode_success; - } + if (dis_fp_load( prefix, theInstr )) goto decode_success; goto decode_failure; case 0x32: @@ -35842,7 +35967,6 @@ DisResult disInstr_PPC_WRK ( case 0x39: // pld, lxsd, lxssp, lfdp { UInt opc2tmp = ifieldOPC0o2(theInstr); - if (!allow_F) goto decode_noF; if (prefix_instruction( prefix )) { // pld if ( !(allow_isa_3_1) ) goto decode_noIsa3_1; @@ -36125,12 +36249,6 @@ DisResult disInstr_PPC_WRK ( goto decode_failure; } - /* The vsxOpc2 returned is the "normalized" value, representing the - * instructions secondary opcode as taken from the standard secondary - * opcode field [21:30] (IBM notatition), even if the actual field - * is non-standard. These normalized values are given in the opcode - * appendices of the ISA 2.06 document. - */ if ( ( opc2 == 0x168 ) && ( IFIELD( theInstr, 19, 2 ) == 0 ) )// xxspltib { /* This is a special case of the XX1 form where the RA, RB @@ -36153,6 +36271,23 @@ DisResult disInstr_PPC_WRK ( goto decode_failure; } + if ( ( opc2 == 0x394 ) || // xxgenpcvbm + ( opc2 == 0x395 ) || // xxgenpcvwm + ( opc2 == 0x3B4 ) || // xxgenpcvhm + ( opc2 == 0x3B5 ) ) { // xxgenpcvdm + if ( !(allow_isa_3_1) ) goto decode_noIsa3_1; + if (dis_vector_generate_pvc_from_mask( prefix, theInstr, + abiinfo )) + goto decode_success; + goto decode_failure; + } + + /* The vsxOpc2 returned is the "normalized" value, representing the + * instructions secondary opcode as taken from the standard secondary + * opcode field [21:30] (IBM notatition), even if the actual field + * is non-standard. These normalized values are given in the opcode + * appendices of the ISA 2.06 document. + */ vsxOpc2 = get_VSX60_opc2(opc2, theInstr); switch (vsxOpc2) { -- 2.47.2
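
For readers who want to experiment with the PCV computation outside of Valgrind, below is a minimal standalone C sketch. It is not part of the patch and is not meant to be applied; it simply mirrors the IMM=0b00 (big-endian expansion) branch of vector_gen_pvc_byte_mask_dirty_helper above so the loop can be compiled and inspected on its own. The typedefs, the function name pcv_byte_mask_be_expand, and the sample mask in main() are illustrative assumptions; only the loop body follows the helper in guest_ppc_helpers.c.

/* Standalone sketch: big-endian byte-mask expansion (IMM=0b00).
   Mirrors the corresponding branch of the dirty helper, but writes the
   128-bit PCV into a caller-supplied result[2] instead of guest state. */
#include <stdio.h>

typedef unsigned long long ULong;
typedef unsigned int       UInt;

static void pcv_byte_mask_be_expand( ULong src_hi, ULong src_lo,
                                     ULong result[2] )
{
   UInt i, shift_by, sel_shift_by, half_sel;
   ULong index, src, j = 0;

   result[0] = 0;
   result[1] = 0;

   for ( index = 0; index < 16; index++ ) {
      i = 15 - index;              /* convert to IBM left-to-right numbering */
      shift_by = i*8;

      if ( i >= 8 ) {
         src = src_hi;
         shift_by = shift_by - 64;
         half_sel = 0;
      } else {
         src = src_lo;
         half_sel = 1;
      }

      sel_shift_by = shift_by + 7; /* top bit of the mask byte selects it */

      if ( ((src >> sel_shift_by) & 0x1) == 1 ) {
         result[half_sel] |= j << shift_by;
         j++;
      } else {
         result[half_sel] |= (index + 0x10ULL) << shift_by;
      }
   }
}

int main( void )
{
   ULong pcv[2];
   /* Example mask: only the two leftmost byte elements are selected. */
   pcv_byte_mask_be_expand( 0xFFFF000000000000ULL, 0, pcv );
   printf( "pcv = %016llx %016llx\n", pcv[0], pcv[1] );
   return 0;
}

Compiling and running this with different src_hi/src_lo values is a quick way to sanity-check the permute control vectors the dirty helper writes into the target VSR, independent of the VEX guest-state plumbing.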