]> git.ipfire.org Git - thirdparty/glibc.git/commitdiff
powerpc: strcasecmp/strncasecmp optimization for power8
authorraji <raji@oc4354787705.ibm.com>
Tue, 14 Jun 2016 09:21:16 +0000 (14:51 +0530)
committerraji <raji@oc4354787705.ibm.com>
Tue, 14 Jun 2016 09:21:16 +0000 (14:51 +0530)
This implementation uses vector instructions to improve performance
compared to the current byte-by-byte implementation for POWER7.
The performance improvement is up to 4x.  This patch was tested
on powerpc64 and powerpc64le.

12 files changed:
ChangeLog
sysdeps/powerpc/powerpc64/multiarch/Makefile
sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
sysdeps/powerpc/powerpc64/multiarch/strcasecmp-power7.S
sysdeps/powerpc/powerpc64/multiarch/strcasecmp-power8.S [new file with mode: 0644]
sysdeps/powerpc/powerpc64/multiarch/strcasecmp-ppc64.c [new file with mode: 0644]
sysdeps/powerpc/powerpc64/multiarch/strcasecmp.c
sysdeps/powerpc/powerpc64/multiarch/strncase-power8.S [new file with mode: 0644]
sysdeps/powerpc/powerpc64/multiarch/strncase-ppc64.c [new file with mode: 0644]
sysdeps/powerpc/powerpc64/multiarch/strncase.c
sysdeps/powerpc/powerpc64/power8/strcasecmp.S [new file with mode: 0644]
sysdeps/powerpc/powerpc64/power8/strncase.S [new file with mode: 0644]

index 1d2c3e3d864b1480c9600b4a897255820bb5b41f..495f0881964fc3a6d1904731e86305805f170720 100644 (file)
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,25 @@
+2016-06-14  Rajalakshmi Srinivasaraghavan  <raji@linux.vnet.ibm.com>
+
+       * sysdeps/powerpc/powerpc64/multiarch/Makefile:
+       (sysdep_routines): Add P8 and PPC64 strcasecmp/strncasecmp targets.
+       * sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c:
+       (__libc_ifunc_impl_list): Add entries for P8 and PPC64
+       ifunc'ed strcasecmp/strncasecmp.
+       * sysdeps/powerpc/powerpc64/multiarch/strcasecmp-power7.S:
+       [ENTRY]: Removed.
+       [END]: Likewise.
+       [__strcasecmp]: Define instead of the above to control symbol name.
+       * sysdeps/powerpc/powerpc64/multiarch/strcasecmp.c: Add IFUNC selector
+       for __strcasecmp_power8.
+       * sysdeps/powerpc/powerpc64/multiarch/strncase.c: Add IFUNC selector
+       for __strncasecmp_power8.
+       * sysdeps/powerpc/powerpc64/multiarch/strcasecmp-power8.S: New File.
+       * sysdeps/powerpc/powerpc64/multiarch/strcasecmp-ppc64.c: Likewise.
+       * sysdeps/powerpc/powerpc64/multiarch/strncase-power8.S: Likewise.
+       * sysdeps/powerpc/powerpc64/multiarch/strncase-ppc64.c: Likewise.
+       * sysdeps/powerpc/powerpc64/power8/strcasecmp.S: Likewise.
+       * sysdeps/powerpc/powerpc64/power8/strncase.S: Likewise.
+
 2016-06-13  Joseph Myers  <joseph@codesourcery.com>
 
        [BZ #20252]
index 9ee9bc2044ca1e081a31fb8effe57eb3e5d3c336..e3ac285e003d8167d220d5303a1dd28d674116ea 100644 (file)
@@ -21,7 +21,9 @@ sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
                   memmove-power7 memmove-ppc64 wordcopy-ppc64 bcopy-ppc64 \
                   strncpy-power8 strstr-power7 strstr-ppc64 \
                   strspn-power8 strspn-ppc64 strcspn-power8 strcspn-ppc64 \
-                  strlen-power8 strcasestr-power8 strcasestr-ppc64
+                  strlen-power8 strcasestr-power8 strcasestr-ppc64 \
+                  strcasecmp-ppc64 strcasecmp-power8 strncase-ppc64 \
+                  strncase-power8
 
 CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
 CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
index a0dc8ad4c4e69f975453d0e221304074b3d18f69..9f6bd7cd9878345eb3d6b61a227cae8f2243e94c 100644 (file)
@@ -203,6 +203,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/powerpc/powerpc64/multiarch/strcasecmp.c.  */
   IFUNC_IMPL (i, name, strcasecmp,
+             IFUNC_IMPL_ADD (array, i, strcasecmp,
+                             hwcap2 & PPC_FEATURE2_ARCH_2_07,
+                             __strcasecmp_power8)
              IFUNC_IMPL_ADD (array, i, strcasecmp,
                              hwcap & PPC_FEATURE_HAS_VSX,
                              __strcasecmp_power7)
@@ -218,6 +221,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/powerpc/powerpc64/multiarch/strncase.c.  */
   IFUNC_IMPL (i, name, strncasecmp,
+             IFUNC_IMPL_ADD (array, i, strncasecmp,
+                             hwcap2 & PPC_FEATURE2_ARCH_2_07,
+                             __strncasecmp_power8)
              IFUNC_IMPL_ADD (array, i, strncasecmp,
                              hwcap & PPC_FEATURE_HAS_VSX,
                              __strncasecmp_power7)
index 013dc62867b62fac45517d2e2410dccab42fec34..99cd7bd4df6b31520da901adcdcdd7a652cf12be 100644 (file)
@@ -1,4 +1,4 @@
-/* Optimized strcasecmp implementation foOWER7.
+/* Optimized strcasecmp implementation for POWER7.
    Copyright (C) 2013-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
 
 #include <sysdep.h>
 
-#undef ENTRY
-#define ENTRY(name)                                            \
-  .section ".text";                                            \
-  ENTRY_2(__strcasecmp_power7)                                 \
-  .align ALIGNARG(2);                                          \
-  BODY_LABEL(__strcasecmp_power7):                             \
-  cfi_startproc;                                               \
-  LOCALENTRY(__strcasecmp_power7)
-
-#undef END
-#define END(name)                                              \
-  cfi_endproc;                                                 \
-  TRACEBACK(__strcasecmp_power7)                               \
-  END_2(__strcasecmp_power7)
-
+#define __strcasecmp __strcasecmp_power7
 #undef weak_alias
 #define weak_alias(name, alias)
 
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcasecmp-power8.S b/sysdeps/powerpc/powerpc64/multiarch/strcasecmp-power8.S
new file mode 100644 (file)
index 0000000..492047a
--- /dev/null
@@ -0,0 +1,28 @@
+/* Optimized strcasecmp implementation for POWER8.
+   Copyright (C) 2016 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#define __strcasecmp __strcasecmp_power8 /* The included file emits its entry point as __strcasecmp; rename it.  */
+#undef weak_alias
+#define weak_alias(name, alias) /* Empty: drop the weak strcasecmp alias made by the included file.  */
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name) /* Empty: drop the hidden definition made by the included file.  */
+
+#include <sysdeps/powerpc/powerpc64/power8/strcasecmp.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcasecmp-ppc64.c b/sysdeps/powerpc/powerpc64/multiarch/strcasecmp-ppc64.c
new file mode 100644 (file)
index 0000000..6318b4a
--- /dev/null
@@ -0,0 +1,21 @@
+/* Multiarch strcasecmp for PPC64.
+   Copyright (C) 2016 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define strcasecmp __strcasecmp_ppc /* Presumably renames the function defined by the generic file included below; string/strcasecmp.c is not shown here -- confirm.  */
+
+#include <string/strcasecmp.c>
index 1f22336d497b9611ee6b3406fac6619e1517d3f1..5ec6885dfcf5158eb54447dbd86e798b51463982 100644 (file)
@@ -1,4 +1,4 @@
-/* Multiple versions of strcasecmp.
+/* Multiple versions of strcasecmp
    Copyright (C) 2013-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#if IS_IN (libc)
-# include <string.h>
-# define strcasecmp __strcasecmp_ppc
-extern __typeof (__strcasecmp) __strcasecmp_ppc attribute_hidden;
-extern __typeof (__strcasecmp) __strcasecmp_power7 attribute_hidden;
-#endif
+#include <string.h>
+#include <shlib-compat.h>
+#include "init-arch.h"
 
-#include <string/strcasecmp.c>
-#undef strcasecmp
+extern __typeof (__strcasecmp) __libc_strcasecmp;
 
-#if IS_IN (libc)
-# include <shlib-compat.h>
-# include "init-arch.h"
+extern __typeof (__strcasecmp) __strcasecmp_ppc attribute_hidden;
+extern __typeof (__strcasecmp) __strcasecmp_power7 attribute_hidden;
+extern __typeof (__strcasecmp) __strcasecmp_power8 attribute_hidden;
 
-extern __typeof (__strcasecmp) __libc_strcasecmp;
 libc_ifunc (__libc_strcasecmp,
-           (hwcap & PPC_FEATURE_HAS_VSX)
-            ? __strcasecmp_power7
-            : __strcasecmp_ppc);
+            (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+             ? __strcasecmp_power8: /* Tested first, so it is chosen even when the VSX bit is also set.  */
+            (hwcap & PPC_FEATURE_HAS_VSX)
+             ? __strcasecmp_power7
+             : __strcasecmp_ppc); /* Fallback when neither feature bit is set.  */
 
 weak_alias (__libc_strcasecmp, strcasecmp)
-#endif
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncase-power8.S b/sysdeps/powerpc/powerpc64/multiarch/strncase-power8.S
new file mode 100644 (file)
index 0000000..01a63b5
--- /dev/null
@@ -0,0 +1,28 @@
+/* Optimized strncasecmp implementation for POWER8.
+   Copyright (C) 2016 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#define __strncasecmp __strncasecmp_power8 /* The included file emits its entry point as __strncasecmp; rename it.  */
+#undef weak_alias
+#define weak_alias(name, alias) /* Empty: drop the weak strncasecmp alias made by the included file.  */
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name) /* Empty: drop the hidden definition made by the included file.  */
+
+#include <sysdeps/powerpc/powerpc64/power8/strncase.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncase-ppc64.c b/sysdeps/powerpc/powerpc64/multiarch/strncase-ppc64.c
new file mode 100644 (file)
index 0000000..c245d77
--- /dev/null
@@ -0,0 +1,21 @@
+/* Multiarch strncasecmp for PPC64.
+   Copyright (C) 2016 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define strncasecmp __strncasecmp_ppc /* Presumably renames the function defined by the generic file included below; string/strncase.c is not shown here -- confirm.  */
+
+#include <string/strncase.c>
index 2729fcea8302fcf9129a7247291f402fbf0b16c8..0bfb25c381195dd36fcd0f85e1cd22a3ba1f988d 100644 (file)
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#if IS_IN (libc)
-# include <string.h>
-# define strncasecmp __strncasecmp_ppc
-extern __typeof (__strncasecmp) __strncasecmp_ppc attribute_hidden;
-extern __typeof (__strncasecmp) __strncasecmp_power7 attribute_hidden;
-#endif
+#include <string.h>
+#include <shlib-compat.h>
+#include "init-arch.h"
 
-#include <string/strncase.c>
-#undef strncasecmp
+extern __typeof (__strncasecmp) __libc_strncasecmp; /* Avoid DWARF definition
+   DIE on ifunc symbol so that GDB can handle ifunc symbol properly.  */
 
-#if IS_IN (libc)
-# include <shlib-compat.h>
-# include "init-arch.h"
+extern __typeof (__strncasecmp) __strncasecmp_ppc attribute_hidden;
+extern __typeof (__strncasecmp) __strncasecmp_power7 attribute_hidden;
+extern __typeof (__strncasecmp) __strncasecmp_power8 attribute_hidden;
 
-/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
-   ifunc symbol properly.  */
-extern __typeof (__strncasecmp) __libc_strncasecmp;
 libc_ifunc (__libc_strncasecmp,
+            (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+             ? __strncasecmp_power8: /* Tested first, so it is chosen even when the VSX bit is also set.  */
             (hwcap & PPC_FEATURE_HAS_VSX)
              ? __strncasecmp_power7
              : __strncasecmp_ppc); /* Fallback when neither feature bit is set.  */
+
 weak_alias (__libc_strncasecmp, strncasecmp)
-#endif
diff --git a/sysdeps/powerpc/powerpc64/power8/strcasecmp.S b/sysdeps/powerpc/powerpc64/power8/strcasecmp.S
new file mode 100644 (file)
index 0000000..63f6217
--- /dev/null
@@ -0,0 +1,446 @@
+/* Optimized strcasecmp implementation for POWER8.
+   Copyright (C) 2016 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <locale-defines.h>
+
+/* int [r3] strcasecmp (const char *s1 [r3], const char *s2 [r4] )
+   When USE_AS_STRNCASECMP is defined, the same source builds strncasecmp
+   instead, which additionally takes its byte limit in r5.  The two macros
+   below pick the symbol names used by ENTRY/END and by weak_alias.  */
+
+#ifndef USE_AS_STRNCASECMP
+#  define __STRCASECMP __strcasecmp
+#  define STRCASECMP   strcasecmp
+#else
+#  define __STRCASECMP __strncasecmp
+#  define STRCASECMP   strncasecmp
+#endif
+/* Convert the 16 bytes in v4 and the 16 bytes in v5 to lowercase, then
+   compare them.  Expects v1 = 0xbf, v2 = 0x19 and v3 = 0x20 in every byte
+   (loaded in the function body).  For a byte c, c + 0xbf is c - 'A'
+   modulo 256; where that value is unsigned-greater than 0x19 the byte is
+   kept, otherwise (c in 'A'..'Z') it is replaced by c + 0x20.
+   The final vcmpequb. leaves the per-byte equality mask in v7 and sets
+   cr6; callers use "blt cr6" to detect that all 16 bytes are equal.
+   Clobbers v7 and v8.  */
+#define TOLOWER()     \
+       vaddubm v8, v4, v1; \
+       vaddubm v7, v4, v3; \
+       vcmpgtub        v8, v8, v2; \
+       vsel    v4, v7, v4, v8; \
+       vaddubm v8, v5, v1; \
+       vaddubm v7, v5, v3; \
+       vcmpgtub        v8, v8, v2; \
+       vsel    v5, v7, v5, v8; \
+       vcmpequb.       v7, v5, v4;
+
+/* Get 16 bytes for unaligned case.
+   reg1 receives the result, reg2 holds the address and reg3 the permute
+   control vector computed for that address (lvsr on little endian, lvsl
+   on big endian, by the caller).  The quadword loaded from reg2 is first
+   checked for a zero byte against v0; only when none is found is the
+   quadword at reg2 + 16 loaded into v9, otherwise v9 is zeroed and used in
+   its place -- presumably so that nothing is read beyond the quadword that
+   holds the terminator.  vperm then combines the two quadwords; the only
+   difference between the two variants is the operand order of that vperm.
+   Clobbers r6, v8 and v9 and uses local labels 1 and 2.  */
+#ifdef __LITTLE_ENDIAN__
+#define GET16BYTES(reg1, reg2, reg3) \
+       lvx     reg1, 0, reg2; \
+       vcmpequb.       v8, v0, reg1; \
+       beq     cr6, 1f; \
+       vspltisb        v9, 0; \
+       b       2f; \
+       .align 4; \
+1: \
+       addi    r6, reg2, 16; \
+       lvx     v9, 0, r6; \
+2: \
+       vperm   reg1, v9, reg1, reg3;
+#else
+#define GET16BYTES(reg1, reg2, reg3) \
+       lvx     reg1, 0, reg2; \
+       vcmpequb.       v8, v0, reg1; \
+       beq     cr6, 1f; \
+       vspltisb        v9, 0; \
+       b       2f; \
+       .align 4; \
+1: \
+       addi    r6, reg2, 16; \
+       lvx     v9, 0, r6; \
+2: \
+       vperm   reg1, reg1, v9, reg3;
+#endif
+
+/* Check null in v4, v5 and convert to lower.
+   v5 is tested against v0 (zero) first and v4 only if v5 contains a zero
+   byte.  Control leaves through L(null_found) only when BOTH vectors
+   contain a zero byte; v7 then holds the zero-byte mask of v4.  In every
+   other case execution continues at local label 3 with TOLOWER, whose cr6
+   result is left for the caller to test.  */
+#define CHECKNULLANDCONVERT() \
+       vcmpequb.       v7, v0, v5; \
+       beq     cr6, 3f; \
+       vcmpequb.       v7, v0, v4; \
+       beq     cr6, 3f; \
+       b       L(null_found); \
+       .align  4; \
+3: \
+       TOLOWER()
+
+#ifdef _ARCH_PWR8 /* Fixed-operand wrappers for five instructions: written
+   as mnemonics when _ARCH_PWR8 is defined, as raw opcode words otherwise.
+   Each macro name lists the operands it is used with; note that
+   VADDUQM_V7_V8 writes its result to v9.  */
+#  define VCLZD_V8_v7  vclzd   v8, v7;
+#  define MFVRD_R3_V1  mfvrd   r3, v1;
+#  define VSUBUDM_V9_V8        vsubudm v9, v9, v8;
+#  define VPOPCNTD_V8_V8       vpopcntd v8, v8;
+#  define VADDUQM_V7_V8        vadduqm v9, v7, v8;
+#else /* Same five instructions, hand-encoded as 32-bit words.  */
+#  define VCLZD_V8_v7  .long   0x11003fc2
+#  define MFVRD_R3_V1  .long   0x7c230067
+#  define VSUBUDM_V9_V8        .long   0x112944c0
+#  define VPOPCNTD_V8_V8       .long   0x110047c3
+#  define VADDUQM_V7_V8        .long   0x11274100
+#endif
+
+       .machine  power7 /* NOTE(review): the _ARCH_PWR8 branch above uses POWER8 mnemonics while power7 is selected here -- confirm the assembler accepts them.  */
+
+ENTRY (__STRCASECMP) /* Compare s1 (r3) with s2 (r4) ignoring case; when
+   USE_AS_STRNCASECMP is defined r5 holds the byte limit.  r3 returns 0
+   when s1 and s2 are the same pointer or (strncasecmp) when r5 is 0, and
+   otherwise 0 or the difference of the first pair of lowercased bytes
+   that differ.  The 16-byte vector loops are used unless the limit is
+   below 16, a null byte is found in the part of the first aligned
+   quadword that belongs to either string, or the locale's
+   _NL_CTYPE_NONASCII_CASE value is 1; those cases take the table-driven
+   L(bytebybyte) path.  */
+#ifdef USE_AS_STRNCASECMP
+       CALL_MCOUNT 3
+#else
+       CALL_MCOUNT 2
+#endif
+#define rRTN   r3      /* Return value */
+#define rSTR1  r10     /* 1st string */
+#define rSTR2  r4      /* 2nd string */
+#define rCHAR1 r6      /* Byte read from 1st string */
+#define rCHAR2 r7      /* Byte read from 2nd string */
+#define rADDR1 r8      /* Table offset of tolower(rCHAR1), i.e. rCHAR1 << 2 */
+#define rADDR2 r12     /* Table offset of tolower(rCHAR2), i.e. rCHAR2 << 2 */
+#define rLWR1  r8      /* Word tolower(rCHAR1); same register as rADDR1 */
+#define rLWR2  r12     /* Word tolower(rCHAR2); same register as rADDR2 */
+#define rTMP   r9      /* Scratch */
+#define rLOC   r11     /* Default locale address */
+
+       cmpd    cr7, rRTN, rSTR2 /* s1 == s2 (same pointer)?  Tested by the beqlr below.  */
+
+       /* Get locale address.  */
+       ld      rTMP, __libc_tsd_LOCALE@got@tprel(r2)
+       add     rLOC, rTMP, __libc_tsd_LOCALE@tls
+       ld      rLOC, 0(rLOC)
+
+       mr      rSTR1, rRTN /* Keep s1 in r10; r3 is reused for the result.  */
+       li      rRTN, 0
+       beqlr   cr7 /* Same pointer: return 0.  */
+#ifdef USE_AS_STRNCASECMP
+       cmpdi   cr7, r5, 0
+       beq     cr7, L(retnull) /* Limit of 0: return 0.  */
+       cmpdi   cr7, r5, 16
+       blt     cr7, L(bytebybyte) /* Limit below 16: skip the vector path.  */
+#endif
+       vspltisb        v0, 0 /* v0 = 0x00 in every byte.  */
+       vspltisb        v8, -1 /* v8 = 0xff in every byte.  */
+       /* Check for null in initial characters.
+          Check max of 16 char depending on the alignment.
+          If null is present, proceed byte by byte.  */
+       lvx     v4, 0, rSTR1
+#ifdef  __LITTLE_ENDIAN__
+       lvsr    v10, 0, rSTR1   /* Compute mask.  */
+       vperm   v9, v8, v4, v10 /* Mask bits that are not part of string.  */
+#else
+       lvsl    v10, 0, rSTR1
+       vperm   v9, v4, v8, v10
+#endif
+       vcmpequb.       v9, v0, v9      /* Check for null bytes.  */
+       bne     cr6, L(bytebybyte)
+       lvx     v5, 0, rSTR2
+       /* Calculate alignment.  */
+#ifdef __LITTLE_ENDIAN__
+       lvsr    v6, 0, rSTR2
+       vperm   v9, v8, v5, v6  /* Mask bits that are not part of string.  */
+#else
+       lvsl    v6, 0, rSTR2
+       vperm   v9, v5, v8, v6
+#endif
+       vcmpequb.       v9, v0, v9      /* Check for null bytes.  */
+       bne     cr6, L(bytebybyte)
+       /* Check if locale has non ascii characters.  */
+       ld      rTMP, 0(rLOC)
+       addi r6, rTMP,LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES
+       lwz     rTMP, 0(r6)
+       cmpdi   cr7, rTMP, 1
+       beq     cr7, L(bytebybyte) /* Value 1: use the table-driven loop.  */
+
+       /* Load vector registers with values used for TOLOWER.  */
+       /* Load v1 = 0xbf, v2 = 0x19, v3 = 0x20 in each byte.  */
+       vspltisb        v3, 2
+       vspltisb        v9, 4
+       vsl     v3, v3, v9
+       vaddubm v1, v3, v3
+       vnor    v1, v1, v1
+       vspltisb        v2, 7
+       vsububm v2, v3, v2
+
+       andi.   rADDR1, rSTR1, 0xF
+       beq     cr0, L(align) /* s1 is 16-byte aligned: v4 already holds its first 16 bytes.  */
+       addi    r6, rSTR1, 16
+       lvx     v9, 0, r6
+       /* Compute 16 bytes from previous two loads.  */
+#ifdef __LITTLE_ENDIAN__
+       vperm   v4, v9, v4, v10
+#else
+       vperm   v4, v4, v9, v10
+#endif
+L(align):
+       andi.   rADDR2, rSTR2, 0xF
+       beq     cr0, L(align1) /* s2 is 16-byte aligned: v5 already holds its first 16 bytes.  */
+       addi    r6, rSTR2, 16
+       lvx     v9, 0, r6
+       /* Compute 16 bytes from previous two loads.  */
+#ifdef __LITTLE_ENDIAN__
+       vperm   v5, v9, v5, v6
+#else
+       vperm   v5, v5, v9, v6
+#endif
+L(align1):
+       CHECKNULLANDCONVERT()
+       blt     cr6, L(match) /* All 16 bytes equal after TOLOWER.  */
+       b       L(different)
+       .align  4
+L(match):
+       clrldi  r6, rSTR1, 60 /* r6 = s1 & 15.  */
+       subfic  r7, r6, 16 /* r7 = 16 - r6: distance to the next 16-byte boundary of s1 (16 if aligned).  */
+#ifdef USE_AS_STRNCASECMP
+       sub     r5, r5, r7 /* Only r7 bytes are counted as consumed.  */
+#endif
+       add     rSTR1, rSTR1, r7 /* s1 is now 16-byte aligned.  */
+       add     rSTR2, rSTR2, r7
+       andi.   rADDR2, rSTR2, 0xF /* Is s2 aligned at the same point?  Tested by the beq below.  */
+       addi    rSTR1, rSTR1, -16 /* Bias by -16: both loops add 16 before loading.  */
+       addi    rSTR2, rSTR2, -16
+       beq     cr0, L(aligned)
+#ifdef __LITTLE_ENDIAN__
+       lvsr    v6, 0, rSTR2
+#else
+       lvsl    v6, 0, rSTR2
+#endif
+       /* There are 2 loops depending on the input alignment.
+          Each loop gets 16 bytes from s1 and s2, check for null,
+          convert to lowercase and compare. Loop till difference
+          or null occurs. */
+L(s1_align):
+       addi    rSTR1, rSTR1, 16
+       addi    rSTR2, rSTR2, 16
+#ifdef USE_AS_STRNCASECMP
+       cmpdi   cr7, r5, 16
+       blt     cr7, L(bytebybyte) /* Fewer than 16 bytes of the limit left.  */
+       addi    r5, r5, -16
+#endif
+       lvx     v4, 0, rSTR1
+       GET16BYTES(v5, rSTR2, v6)
+       CHECKNULLANDCONVERT()
+       blt     cr6, L(s1_align)
+       b       L(different)
+       .align  4
+L(aligned):
+       addi    rSTR1, rSTR1, 16
+       addi    rSTR2, rSTR2, 16
+#ifdef USE_AS_STRNCASECMP
+       cmpdi   cr7, r5, 16
+       blt     cr7, L(bytebybyte) /* Fewer than 16 bytes of the limit left.  */
+       addi    r5, r5, -16
+#endif
+       lvx     v4, 0, rSTR1
+       lvx     v5, 0, rSTR2
+       CHECKNULLANDCONVERT()
+       blt     cr6, L(aligned)
+
+       /* Calculate and return the difference. */
+L(different):
+       vaddubm v1, v3, v3 /* v1 = 0x40 in every byte (v3 is 0x20).  */
+       vcmpequb        v7, v0, v7 /* Invert the equality mask: 0xff where the bytes differed.  */
+#ifdef __LITTLE_ENDIAN__
+       /* Count trailing zero.  */
+       vspltisb        v8, -1
+       VADDUQM_V7_V8
+       vandc   v8, v9, v7
+       VPOPCNTD_V8_V8
+       vspltb  v6, v8, 15
+       vcmpequb.       v6, v6, v1
+       blt     cr6, L(shift8)
+#else
+       /* Count leading zero.  */
+       VCLZD_V8_v7
+       vspltb  v6, v8, 7
+       vcmpequb.       v6, v6, v1
+       blt     cr6, L(shift8)
+       vsro    v8, v8, v1
+#endif
+       b       L(skipsum)
+       .align  4
+L(shift8):
+       vsumsws         v8, v8, v0
+L(skipsum):
+#ifdef __LITTLE_ENDIAN__
+       /* Shift registers based on the zero count computed above.  */
+       vsro    v6, v5, v8
+       vsro    v7, v4, v8
+       /* Merge and move to GPR.  */
+       vmrglb  v6, v6, v7
+       vslo    v1, v6, v1
+       MFVRD_R3_V1
+       /* Place the characters that are different in first position.  */
+       sldi    rSTR2, rRTN, 56
+       srdi    rSTR2, rSTR2, 56
+       sldi    rSTR1, rRTN, 48
+       srdi    rSTR1, rSTR1, 56
+#else
+       vslo    v6, v5, v8
+       vslo    v7, v4, v8
+       vmrghb  v1, v6, v7
+       MFVRD_R3_V1
+       srdi    rSTR2, rRTN, 48
+       sldi    rSTR2, rSTR2, 56
+       srdi    rSTR2, rSTR2, 56
+       srdi    rSTR1, rRTN, 56
+#endif
+       subf    rRTN, rSTR1, rSTR2 /* rRTN = rSTR2 - rSTR1; both hold extracted bytes here, not addresses.  */
+       extsw   rRTN, rRTN
+       blr
+
+       .align  4
+       /* OK. We've hit the end of the string. We need to be careful that
+          we don't compare two strings as different because of junk beyond
+          the end of the strings...  */
+L(null_found):
+       vaddubm v10, v3, v3 /* v10 = 0x40 in every byte (v3 is 0x20).  */
+#ifdef __LITTLE_ENDIAN__
+       /* Count trailing zero.  */
+       vspltisb        v8, -1
+       VADDUQM_V7_V8
+       vandc   v8, v9, v7
+       VPOPCNTD_V8_V8
+       vspltb  v6, v8, 15
+       vcmpequb.       v6, v6, v10
+       blt     cr6, L(shift_8)
+#else
+       /* Count leading zero.  */
+       VCLZD_V8_v7
+       vspltb  v6, v8, 7
+       vcmpequb.       v6, v6, v10
+       blt     cr6, L(shift_8)
+       vsro    v8, v8, v10
+#endif
+       b       L(skipsum1)
+       .align  4
+L(shift_8):
+       vsumsws v8, v8, v0
+L(skipsum1):
+       /* Calculate shift count based on count of zero.  */
+       vspltisb        v10, 7
+       vslb    v10, v10, v10
+       vsldoi  v9, v0, v10, 1
+       VSUBUDM_V9_V8
+       vspltisb        v8, 8
+       vsldoi  v8, v0, v8, 1
+       VSUBUDM_V9_V8
+       /* Shift and remove junk after null character.  */
+#ifdef __LITTLE_ENDIAN__
+       vslo    v5, v5, v9
+       vslo    v4, v4, v9
+#else
+       vsro    v5, v5, v9
+       vsro    v4, v4, v9
+#endif
+       /* Convert and compare 16 bytes.  */
+       TOLOWER()
+       blt     cr6, L(retnull) /* Everything up to the terminator matched.  */
+       b       L(different)
+       .align  4
+L(retnull):
+       li      rRTN, 0
+       blr
+       .align  4
+L(bytebybyte):
+       /* Unrolling loop for POWER: loads are done with 'lbz' plus
+       offset and string descriptors are only updated in the end
+       of loop unrolling. */
+       ld      rLOC, LOCALE_CTYPE_TOLOWER(rLOC)
+       lbz     rCHAR1, 0(rSTR1)        /* Load char from s1 */
+       lbz     rCHAR2, 0(rSTR2)        /* Load char from s2; NOTE(review): both loads precede the count checks below, also when r5 is 0 -- confirm.  */
+#ifdef USE_AS_STRNCASECMP
+       rldicl  rTMP, r5, 62, 2 /* rTMP = r5 >> 2: number of 4-byte unrolled passes.  */
+       cmpdi   cr7, rTMP, 0
+       beq     cr7, L(lessthan4)
+       mtctr   rTMP
+#endif
+L(loop):
+       cmpdi   rCHAR1, 0               /* *s1 == '\0' ? */
+       sldi    rADDR1, rCHAR1, 2       /* Calculate address for tolower(*s1) */
+       sldi    rADDR2, rCHAR2, 2       /* Calculate address for tolower(*s2) */
+       lwzx    rLWR1, rLOC, rADDR1     /* Load tolower(*s1) */
+       lwzx    rLWR2, rLOC, rADDR2     /* Load tolower(*s2) */
+       cmpw    cr1, rLWR1, rLWR2       /* r = tolower(*s1) == tolower(*s2) ? */
+       crorc   4*cr1+eq,eq,4*cr1+eq    /* cr1.eq = (*s1 == '\0') || (r == 0) */
+       beq     cr1, L(done)
+       lbz     rCHAR1, 1(rSTR1)
+       lbz     rCHAR2, 1(rSTR2)
+       cmpdi   rCHAR1, 0
+       sldi    rADDR1, rCHAR1, 2
+       sldi    rADDR2, rCHAR2, 2
+       lwzx    rLWR1, rLOC, rADDR1
+       lwzx    rLWR2, rLOC, rADDR2
+       cmpw    cr1, rLWR1, rLWR2
+       crorc   4*cr1+eq,eq,4*cr1+eq
+       beq     cr1, L(done)
+       lbz     rCHAR1, 2(rSTR1)
+       lbz     rCHAR2, 2(rSTR2)
+       cmpdi   rCHAR1, 0
+       sldi    rADDR1, rCHAR1, 2
+       sldi    rADDR2, rCHAR2, 2
+       lwzx    rLWR1, rLOC, rADDR1
+       lwzx    rLWR2, rLOC, rADDR2
+       cmpw    cr1, rLWR1, rLWR2
+       crorc   4*cr1+eq,eq,4*cr1+eq
+       beq     cr1, L(done)
+       lbz     rCHAR1, 3(rSTR1)
+       lbz     rCHAR2, 3(rSTR2)
+       cmpdi   rCHAR1, 0
+       /* Increment both string descriptors */
+       addi    rSTR1, rSTR1, 4
+       addi    rSTR2, rSTR2, 4
+       sldi    rADDR1, rCHAR1, 2
+       sldi    rADDR2, rCHAR2, 2
+       lwzx    rLWR1, rLOC, rADDR1
+       lwzx    rLWR2, rLOC, rADDR2
+       cmpw    cr1, rLWR1, rLWR2
+       crorc   4*cr1+eq,eq,4*cr1+eq
+       beq     cr1, L(done)
+       lbz     rCHAR1, 0(rSTR1)        /* Load char from s1 */
+       lbz     rCHAR2, 0(rSTR2)        /* Load char from s2 */
+#ifdef USE_AS_STRNCASECMP
+       bdnz    L(loop) /* NOTE(review): the two loads above run before this count check, so when r5 is a multiple of 4 the last pass reads the byte just past the limit -- confirm this is acceptable.  */
+#else
+       b       L(loop)
+#endif
+#ifdef USE_AS_STRNCASECMP
+L(lessthan4):
+       clrldi  r5, r5, 62 /* r5 = r5 & 3: bytes left after the unrolled passes.  */
+       cmpdi   cr7, r5, 0
+       beq     cr7, L(retnull)
+       mtctr   r5
+L(loop1):
+       cmpdi   rCHAR1, 0
+       sldi    rADDR1, rCHAR1, 2
+       sldi    rADDR2, rCHAR2, 2
+       lwzx    rLWR1, rLOC, rADDR1
+       lwzx    rLWR2, rLOC, rADDR2
+       cmpw    cr1, rLWR1, rLWR2
+       crorc   4*cr1+eq,eq,4*cr1+eq
+       beq     cr1, L(done)
+       addi    rSTR1, rSTR1, 1
+       addi    rSTR2, rSTR2, 1
+       lbz     rCHAR1, 0(rSTR1)
+       lbz     rCHAR2, 0(rSTR2)
+       bdnz    L(loop1) /* NOTE(review): as above, the last pass loads the byte just past the limit before the counter expires -- confirm.  */
+#endif
+L(done):
+       subf    r0, rLWR2, rLWR1 /* r0 = rLWR1 - rLWR2, the last pair of table words loaded.  */
+       extsw   rRTN, r0
+       blr
+END (__STRCASECMP)
+
+weak_alias (__STRCASECMP, STRCASECMP)
+libc_hidden_builtin_def (__STRCASECMP)
diff --git a/sysdeps/powerpc/powerpc64/power8/strncase.S b/sysdeps/powerpc/powerpc64/power8/strncase.S
new file mode 100644 (file)
index 0000000..7ce2ed0
--- /dev/null
@@ -0,0 +1,20 @@
+/* Optimized strncasecmp implementation for POWER8.
+   Copyright (C) 2016 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define USE_AS_STRNCASECMP 1 /* Makes the shared source below build __strncasecmp with its length-limit checks.  */
+#include <sysdeps/powerpc/powerpc64/power8/strcasecmp.S>