powerpc: Optimization for strlen for POWER8.

author Carlos Eduardo Seo <cseo@linux.vnet.ibm.com>

Wed, 11 Nov 2015 19:31:28 +0000 (17:31 -0200)

committer Tulio Magno Quites Machado Filho <tuliom@linux.vnet.ibm.com>

Fri, 15 Apr 2016 20:19:19 +0000 (17:19 -0300)
author Carlos Eduardo Seo <cseo@linux.vnet.ibm.com>
Wed, 11 Nov 2015 19:31:28 +0000 (17:31 -0200)
committer Tulio Magno Quites Machado Filho <tuliom@linux.vnet.ibm.com>
Fri, 15 Apr 2016 20:19:19 +0000 (17:19 -0300)
diff --git a/ChangeLog b/ChangeLog

index 9eafe63f55d346704a4a6b712bdfbb7e94ace91a..92d5557598089c331b7e1ca4d689256ab0fe3f16 100644 (file)
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,16 @@
+2016-04-15  Carlos Eduardo Seo  <cseo@linux.vnet.ibm.com>
+
+       * sysdeps/powerpc/powerpc64/multiarch/Makefile
+       (sysdep_routines): Added strlen-power8.
+       * sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c: Added
+       __strlen_power8 entry.
+       * sysdeps/powerpc/powerpc64/multiarch/strlen-power8.S: New file.
+       Implementation for POWER8.
+       * sysdeps/powerpc/powerpc64/multiarch/strlen.c: Added IFUNC selector
+       for __strlen_power8.
+       * sysdeps/powerpc/powerpc64/power8/strlen.S: New file.
+       Implementation for POWER8.
+
  2016-04-15  Mike Frysinger  <vapier@gentoo.org>
  
         * locale/iso-4217.def: Add SSP and change ZMK to ZMW.
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile

index 7ed56bf292ffe7561eec9305cd5c35c32833f272..57abe8ff3c9d87c1ed684b5764cc0e3d78014308 100644 (file)
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -20,7 +20,7 @@ sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
                    strcat-power8 strcat-power7 strcat-ppc64 \
                    memmove-power7 memmove-ppc64 wordcopy-ppc64 bcopy-ppc64 \
                    strncpy-power8 strstr-power7 strstr-ppc64 \
-                  strspn-power8 strspn-ppc64
+                  strspn-power8 strspn-ppc64 strlen-power8
  
  CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
  CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c

index 3e1f099d71e7f793a04a3ace243f82d4c0fb7b65..d30a661caf50c7e391d68701153c3697939be11e 100644 (file)
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -101,6 +101,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
  
    /* Support sysdeps/powerpc/powerpc64/multiarch/strlen.c.  */
    IFUNC_IMPL (i, name, strlen,
+             IFUNC_IMPL_ADD (array, i, strlen, hwcap2 & PPC_FEATURE2_ARCH_2_07,
+                             __strlen_power8)
               IFUNC_IMPL_ADD (array, i, strlen, hwcap & PPC_FEATURE_HAS_VSX,
                               __strlen_power7)
               IFUNC_IMPL_ADD (array, i, strlen, 1,
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strlen-power8.S b/sysdeps/powerpc/powerpc64/multiarch/strlen-power8.S

new file mode 100644 (file)

index 0000000..686dc3d
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/strlen-power8.S
@@ -0,0 +1,39 @@
+/* Optimized strlen implementation for POWER8.
+   Copyright (C) 2016 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#undef EALIGN
+#define EALIGN(name, alignt, words)                            \
+  .section ".text";                                            \
+  ENTRY_2(__strlen_power8)                                     \
+  .align ALIGNARG(alignt);                                     \
+  EALIGN_W_##words;                                            \
+  BODY_LABEL(__strlen_power8):                                 \
+  cfi_startproc;                                               \
+  LOCALENTRY(__strlen_power8)
+#undef END
+#define END(name)                                              \
+  cfi_endproc;                                                 \
+  TRACEBACK(__strlen_power8)                                   \
+  END_2(__strlen_power8)
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#include <sysdeps/powerpc/powerpc64/power8/strlen.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strlen.c b/sysdeps/powerpc/powerpc64/multiarch/strlen.c

index 94501fdf2f318430e69470f62742d2d8a9f40ad5..609a87ec9f2d9f22ffad6641088d0f5cad1c8b3e 100644 (file)
--- a/sysdeps/powerpc/powerpc64/multiarch/strlen.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strlen.c
@@ -29,11 +29,14 @@ extern __typeof (__redirect_strlen) __libc_strlen;
  
  extern __typeof (__redirect_strlen) __strlen_ppc attribute_hidden;
  extern __typeof (__redirect_strlen) __strlen_power7 attribute_hidden;
+extern __typeof (__redirect_strlen) __strlen_power8 attribute_hidden;
  
  libc_ifunc (__libc_strlen,
-            (hwcap & PPC_FEATURE_HAS_VSX)
-            ? __strlen_power7
-            : __strlen_ppc);
+           (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+           ? __strlen_power8 :
+             (hwcap & PPC_FEATURE_HAS_VSX)
+             ? __strlen_power7
+             : __strlen_ppc);
  
  #undef strlen
  strong_alias (__libc_strlen, strlen)
diff --git a/sysdeps/powerpc/powerpc64/power8/strlen.S b/sysdeps/powerpc/powerpc64/power8/strlen.S

new file mode 100644 (file)

index 0000000..0142747
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/strlen.S
@@ -0,0 +1,297 @@
+/* Optimized strlen implementation for PowerPC64/POWER8 using a vectorized
+   loop.
+   Copyright (C) 2016 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* TODO: change these to the actual instructions when the minimum required
+   binutils allows it.  */
+#define MFVRD(r,v)     .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16)))
+#define VBPERMQ(t,a,b) .long (0x1000054c \
+                              | ((t)<<(32-11)) \
+                              | ((a)<<(32-16)) \
+                              | ((b)<<(32-21)) )
+
+/* size_t [r3] strlen (const char *s [r3])  */
+
+/* TODO: change this to .machine power8 when the minimum required binutils
+   allows it.  */
+       .machine  power7
+EALIGN (strlen, 4, 0)
+       CALL_MCOUNT 1
+       dcbt    0,r3
+       clrrdi  r4,r3,3       /* Align the address to doubleword boundary.  */
+       rlwinm  r6,r3,3,26,28 /* Calculate padding.  */
+       li      r0,0          /* Doubleword with null chars to use
+                                with cmpb.  */
+       li      r5,-1         /* MASK = 0xffffffffffffffff.  */
+       ld      r12,0(r4)     /* Load doubleword from memory.  */
+#ifdef __LITTLE_ENDIAN__
+       sld     r5,r5,r6
+#else
+       srd     r5,r5,r6      /* MASK = MASK >> padding.  */
+#endif
+       orc     r9,r12,r5     /* Mask bits that are not part of the string.  */
+       cmpb    r10,r9,r0     /* Check for null bytes in DWORD1.  */
+       cmpdi   cr7,r10,0     /* If r10 == 0, no null's have been found.  */
+       bne     cr7,L(done)
+
+       /* For shorter strings (< 64 bytes), we will not use vector registers,
+          as the overhead isn't worth it.  So, let's use GPRs instead.  This
+          will be done the same way as we do in the POWER7 implementation.
+          Let's see if we are aligned to a quadword boundary.  If so, we can
+          jump to the first (non-vectorized) loop.  Otherwise, we have to
+          handle the next DWORD first.  */
+       mtcrf   0x01,r4
+       mr      r9,r4
+       addi    r9,r9,8
+       bt      28,L(align64)
+
+       /* Handle the next 8 bytes so we are aligned to a quadword
+          boundary.  */
+       ldu     r5,8(r4)
+       cmpb    r10,r5,r0
+       cmpdi   cr7,r10,0
+       addi    r9,r9,8
+       bne     cr7,L(done)
+
+L(align64):
+       /* Proceed to the old (POWER7) implementation, checking two doublewords
+          per iteration.  For the first 56 bytes, we will just check for null
+          characters.  After that, we will also check if we are 64-byte aligned
+          so we can jump to the vectorized implementation.  We will unroll
+          these loops to avoid excessive branching.  */
+       ld      r6,8(r4)
+       ldu     r5,16(r4)
+       cmpb    r10,r6,r0
+       cmpb    r11,r5,r0
+       or      r5,r10,r11
+       cmpdi   cr7,r5,0
+       addi    r9,r9,16
+       bne     cr7,L(dword_zero)
+
+       ld      r6,8(r4)
+       ldu     r5,16(r4)
+       cmpb    r10,r6,r0
+       cmpb    r11,r5,r0
+       or      r5,r10,r11
+       cmpdi   cr7,r5,0
+       addi    r9,r9,16
+       bne     cr7,L(dword_zero)
+
+       ld      r6,8(r4)
+       ldu     r5,16(r4)
+       cmpb    r10,r6,r0
+       cmpb    r11,r5,r0
+       or      r5,r10,r11
+       cmpdi   cr7,r5,0
+       addi    r9,r9,16
+       bne     cr7,L(dword_zero)
+
+       /* Are we 64-byte aligned? If so, jump to the vectorized loop.
+          Note: aligning to 64-byte will necessarily slow down performance for
+          strings around 64 bytes in length due to the extra comparisons
+          required to check alignment for the vectorized loop.  This is a
+          necessary tradeoff we are willing to take in order to speed up the
+          calculation for larger strings.  */
+       andi.   r10,r9,63
+       beq     cr0,L(preloop)
+       ld      r6,8(r4)
+       ldu     r5,16(r4)
+       cmpb    r10,r6,r0
+       cmpb    r11,r5,r0
+       or      r5,r10,r11
+       cmpdi   cr7,r5,0
+       addi    r9,r9,16
+       bne     cr7,L(dword_zero)
+
+       andi.   r10,r9,63
+       beq     cr0,L(preloop)
+       ld      r6,8(r4)
+       ldu     r5,16(r4)
+       cmpb    r10,r6,r0
+       cmpb    r11,r5,r0
+       or      r5,r10,r11
+       cmpdi   cr7,r5,0
+       addi    r9,r9,16
+       bne     cr7,L(dword_zero)
+
+       andi.   r10,r9,63
+       beq     cr0,L(preloop)
+       ld      r6,8(r4)
+       ldu     r5,16(r4)
+       cmpb    r10,r6,r0
+       cmpb    r11,r5,r0
+       or      r5,r10,r11
+       cmpdi   cr7,r5,0
+       addi    r9,r9,16
+       bne     cr7,L(dword_zero)
+
+       andi.   r10,r9,63
+       beq     cr0,L(preloop)
+       ld      r6,8(r4)
+       ldu     r5,16(r4)
+       cmpb    r10,r6,r0
+       cmpb    r11,r5,r0
+       or      r5,r10,r11
+       cmpdi   cr7,r5,0
+       addi    r9,r9,16
+
+       /* At this point, we are necessarily 64-byte aligned.  If no zeroes were
+          found, jump to the vectorized loop.  */
+       beq     cr7,L(preloop)
+
+L(dword_zero):
+       /* OK, one (or both) of the doublewords contains a null byte.  Check
+          the first doubleword and decrement the address in case the first
+          doubleword really contains a null byte.  */
+
+       cmpdi   cr6,r10,0
+       addi    r4,r4,-8
+       bne     cr6,L(done)
+
+       /* The null byte must be in the second doubleword.  Adjust the address
+          again and move the result of cmpb to r10 so we can calculate the
+          length.  */
+
+       mr      r10,r11
+       addi    r4,r4,8
+
+       /* If the null byte was found in the non-vectorized code, compute the
+          final length.  r10 has the output of the cmpb instruction, that is,
+          it contains 0xff in the same position as the null byte in the
+          original doubleword from the string.  Use that to calculate the
+          length.  */
+L(done):
+#ifdef __LITTLE_ENDIAN__
+       addi    r9, r10,-1    /* Form a mask from trailing zeros.  */
+       andc    r9, r9,r10
+       popcntd r0, r9        /* Count the bits in the mask.  */
+#else
+       cntlzd  r0,r10        /* Count leading zeros before the match.  */
+#endif
+       subf    r5,r3,r4
+       srdi    r0,r0,3       /* Convert leading/trailing zeros to bytes.  */
+       add     r3,r5,r0      /* Compute final length.  */
+       blr
+
+       /* Vectorized implementation starts here.  */
+       .p2align  4
+L(preloop):
+       /* Set up for the loop.  */
+       mr      r4,r9
+       li      r7, 16        /* Load required offsets.  */
+       li      r8, 32
+       li      r9, 48
+       li      r12, 8
+       vxor    v0,v0,v0      /* VR with null chars to use with
+                                vcmpequb.  */
+
+       /* Main loop to look for the end of the string.  We will read in
+          64-byte chunks.  Align it to 32 bytes and unroll it 3 times to
+          leverage the icache performance.  */
+       .p2align  5
+L(loop):
+       lvx       v1,r4,r0  /* Load 4 quadwords.  */
+       lvx       v2,r4,r7
+       lvx       v3,r4,r8
+       lvx       v4,r4,r9
+       vminub    v5,v1,v2  /* Compare and merge into one VR for speed.  */
+       vminub    v6,v3,v4
+       vminub    v7,v5,v6
+       vcmpequb. v7,v7,v0  /* Check for NULLs.  */
+       addi      r4,r4,64  /* Adjust address for the next iteration.  */
+       bne       cr6,L(vmx_zero)
+
+       lvx       v1,r4,r0  /* Load 4 quadwords.  */
+       lvx       v2,r4,r7
+       lvx       v3,r4,r8
+       lvx       v4,r4,r9
+       vminub    v5,v1,v2  /* Compare and merge into one VR for speed.  */
+       vminub    v6,v3,v4
+       vminub    v7,v5,v6
+       vcmpequb. v7,v7,v0  /* Check for NULLs.  */
+       addi      r4,r4,64  /* Adjust address for the next iteration.  */
+       bne       cr6,L(vmx_zero)
+
+       lvx       v1,r4,r0  /* Load 4 quadwords.  */
+       lvx       v2,r4,r7
+       lvx       v3,r4,r8
+       lvx       v4,r4,r9
+       vminub    v5,v1,v2  /* Compare and merge into one VR for speed.  */
+       vminub    v6,v3,v4
+       vminub    v7,v5,v6
+       vcmpequb. v7,v7,v0  /* Check for NULLs.  */
+       addi      r4,r4,64  /* Adjust address for the next iteration.  */
+       beq       cr6,L(loop)
+
+L(vmx_zero):
+       /* OK, we found a null byte.  Let's look for it in the current 64-byte
+          block and mark it in its corresponding VR.  */
+       vcmpequb  v1,v1,v0
+       vcmpequb  v2,v2,v0
+       vcmpequb  v3,v3,v0
+       vcmpequb  v4,v4,v0
+
+       /* We will now 'compress' the result into a single doubleword, so it
+          can be moved to a GPR for the final calculation.  First, we
+          generate an appropriate mask for vbpermq, so we can permute bits into
+          the first halfword.  */
+       vspltisb  v10,3
+       lvsl      v11,r0,r0
+       vslb      v10,v11,v10
+
+       /* Permute the first bit of each byte into bits 48-63.  */
+       VBPERMQ(v1,v1,v10)
+       VBPERMQ(v2,v2,v10)
+       VBPERMQ(v3,v3,v10)
+       VBPERMQ(v4,v4,v10)
+
+       /* Shift each component into its correct position for merging.  */
+#ifdef __LITTLE_ENDIAN__
+       vsldoi  v2,v2,v2,2
+       vsldoi  v3,v3,v3,4
+       vsldoi  v4,v4,v4,6
+#else
+       vsldoi  v1,v1,v1,6
+       vsldoi  v2,v2,v2,4
+       vsldoi  v3,v3,v3,2
+#endif
+
+       /* Merge the results and move to a GPR.  */
+       vor     v1,v2,v1
+       vor     v2,v3,v4
+       vor     v4,v1,v2
+       MFVRD(r10,v4)
+
+        /* Adjust address to the beginning of the current 64-byte block.  */
+       addi    r4,r4,-64
+
+#ifdef __LITTLE_ENDIAN__
+       addi    r9, r10,-1    /* Form a mask from trailing zeros.  */
+       andc    r9, r9,r10
+       popcntd r0, r9        /* Count the bits in the mask.  */
+#else
+       cntlzd  r0,r10        /* Count leading zeros before the match.  */
+#endif
+       subf    r5,r3,r4
+       add     r3,r5,r0      /* Compute final length.  */
+       blr
+
+END (strlen)
+libc_hidden_builtin_def (strlen)
author	Carlos Eduardo Seo <cseo@linux.vnet.ibm.com>
	Wed, 11 Nov 2015 19:31:28 +0000 (17:31 -0200)
committer	Tulio Magno Quites Machado Filho <tuliom@linux.vnet.ibm.com>
	Fri, 15 Apr 2016 20:19:19 +0000 (17:19 -0300)
ChangeLog		patch \| blob \| blame \| history
sysdeps/powerpc/powerpc64/multiarch/Makefile		patch \| blob \| blame \| history
sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c		patch \| blob \| blame \| history
sysdeps/powerpc/powerpc64/multiarch/strlen-power8.S	[new file with mode: 0644]	patch \| blob
sysdeps/powerpc/powerpc64/multiarch/strlen.c		patch \| blob \| blame \| history
sysdeps/powerpc/powerpc64/power8/strlen.S	[new file with mode: 0644]	patch \| blob