x86: Unifies 'strlen-evex' and 'strlen-evex512' implementations.

author Matthew Sterrett <matthew.sterrett@intel.com>

Fri, 15 Dec 2023 20:04:05 +0000 (12:04 -0800)

committer Noah Goldstein <goldstein.w.n@gmail.com>

Mon, 18 Dec 2023 18:38:01 +0000 (12:38 -0600)
author Matthew Sterrett <matthew.sterrett@intel.com>
Fri, 15 Dec 2023 20:04:05 +0000 (12:04 -0800)
committer Noah Goldstein <goldstein.w.n@gmail.com>
Mon, 18 Dec 2023 18:38:01 +0000 (12:38 -0600)
diff --git a/sysdeps/x86_64/multiarch/strlen-evex-base.S b/sysdeps/x86_64/multiarch/strlen-evex-base.S

index 7305b24e2869da35ba263bc9ecc7d3598cd6a596..77dc89900a53cfa3ebb80675f7945e0aba73cd6e 100644 (file)
--- a/sysdeps/x86_64/multiarch/strlen-evex-base.S
+++ b/sysdeps/x86_64/multiarch/strlen-evex-base.S
@@ -1,5 +1,5 @@
-/* Placeholder function, not used by any processor at the moment.
-   Copyright (C) 2022-2023 Free Software Foundation, Inc.
+/* strlen/wcslen optimized with 256/512-bit EVEX instructions.
+   Copyright (C) 2021-2023 Free Software Foundation, Inc.
     This file is part of the GNU C Library.
  
     The GNU C Library is free software; you can redistribute it and/or
@@ -16,7 +16,6 @@
     License along with the GNU C Library; if not, see
     <https://www.gnu.org/licenses/>.  */
  
-/* UNUSED. Exists purely as reference implementation.  */
  
  #include <isa-level.h>
  
@@ -26,272 +25,211 @@
  
  # ifdef USE_AS_WCSLEN
  #  define VPCMPEQ      vpcmpeqd
+#  define VPCMPNEQ     vpcmpneqd
  #  define VPTESTN      vptestnmd
+#  define VPTEST       vptestmd
  #  define VPMINU       vpminud
  #  define CHAR_SIZE    4
+#  define CHAR_SIZE_SHIFT_REG(reg)     sar $2, %reg
  # else
  #  define VPCMPEQ      vpcmpeqb
+#  define VPCMPNEQ     vpcmpneqb
  #  define VPTESTN      vptestnmb
+#  define VPTEST       vptestmb
  #  define VPMINU       vpminub
  #  define CHAR_SIZE    1
+#  define CHAR_SIZE_SHIFT_REG(reg)
+
+#  define REG_WIDTH    VEC_SIZE
  # endif
  
-# define PAGE_SIZE     4096
  # define CHAR_PER_VEC  (VEC_SIZE / CHAR_SIZE)
  
-       .section SECTION(.text),"ax",@progbits
-/* Aligning entry point to 64 byte, provides better performance for
-   one vector length string.  */
-ENTRY_P2ALIGN (STRLEN, 6)
-# ifdef USE_AS_STRNLEN
-       /* Check zero length.  */
-       test    %RSI_LP, %RSI_LP
-       jz      L(ret_max)
-#  ifdef __ILP32__
-       /* Clear the upper 32 bits.  */
-       movl    %esi, %esi
-#  endif
+# include "reg-macros.h"
+
+# if CHAR_PER_VEC == 64
+
+#  define TAIL_RETURN_LBL      first_vec_x2
+#  define TAIL_RETURN_OFFSET   (CHAR_PER_VEC * 2)
+
+#  define FALLTHROUGH_RETURN_LBL       first_vec_x3
+#  define FALLTHROUGH_RETURN_OFFSET    (CHAR_PER_VEC * 3)
+
+# else
+
+#  define TAIL_RETURN_LBL      first_vec_x3
+#  define TAIL_RETURN_OFFSET   (CHAR_PER_VEC * 3)
+
+#  define FALLTHROUGH_RETURN_LBL       first_vec_x2
+#  define FALLTHROUGH_RETURN_OFFSET    (CHAR_PER_VEC * 2)
  # endif
  
+# define XZERO VMM_128(0)
+# define VZERO VMM(0)
+# define PAGE_SIZE     4096
+
+       .section SECTION(.text), "ax", @progbits
+ENTRY_P2ALIGN(STRLEN, 6)
         movl    %edi, %eax
-       vpxorq  %VMM_128(0), %VMM_128(0), %VMM_128(0)
-       sall    $20, %eax
-       cmpl    $((PAGE_SIZE - VEC_SIZE) << 20), %eax
-       ja      L(page_cross)
-
-       /* Compare [w]char for null, mask bit will be set for match.  */
-       VPCMPEQ (%rdi), %VMM(0), %k0
-# ifdef USE_AS_STRNLEN
-       KMOV    %k0, %VRCX
-       /* Store max length in rax.  */
-       mov     %rsi, %rax
-       /* If rcx is 0, rax will have max length.  We can not use VRCX
-          and VRAX here for evex256 because, upper 32 bits may be
-          undefined for ecx and eax.  */
-       bsfq    %rcx, %rax
-       cmp     $CHAR_PER_VEC, %rax
-       ja      L(align_more)
-       cmpq    %rax, %rsi
-       cmovb   %esi, %eax
-# else
+       vpxorq  %XZERO, %XZERO, %XZERO
+       andl    $(PAGE_SIZE - 1), %eax
+       cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
+       ja      L(cross_page_boundary)
+
+       /* Check the first VEC_SIZE bytes.  Each bit in K0 represents a
+          null byte.  */
+       VPCMPEQ (%rdi), %VZERO, %k0
         KMOV    %k0, %VRAX
         test    %VRAX, %VRAX
-       jz      L(align_more)
+       jz      L(aligned_more)
         bsf     %VRAX, %VRAX
-# endif
         ret
  
-       /* At this point vector max length reached.  */
-# ifdef USE_AS_STRNLEN
-       .p2align 4,,3
-L(ret_max):
-       movq    %rsi, %rax
+       .p2align 4,, 8
+L(first_vec_x4):
+       bsf     %VRAX, %VRAX
+       subl    %ecx, %edi
+       CHAR_SIZE_SHIFT_REG (edi)
+       leal    (CHAR_PER_VEC * 4)(%rdi, %rax), %eax
         ret
-# endif
  
-L(align_more):
-       mov     %rdi, %rax
-       /* Align rax to VEC_SIZE.  */
-       andq    $-VEC_SIZE, %rax
-# ifdef USE_AS_STRNLEN
-       movq    %rdi, %rdx
-       subq    %rax, %rdx
-#  ifdef USE_AS_WCSLEN
-       shr     $2, %VRDX
-#  endif
-       /* At this point rdx contains [w]chars already compared.  */
-       leaq    -CHAR_PER_VEC(%rsi, %rdx), %rdx
-       /* At this point rdx contains number of w[char] needs to go.
-          Now onwards rdx will keep decrementing with each compare.  */
-# endif
-
-       /* Loop unroll 4 times for 4 vector loop.  */
-       VPCMPEQ VEC_SIZE(%rax), %VMM(0), %k0
-       subq    $-VEC_SIZE, %rax
-       KMOV    %k0, %VRCX
-       test    %VRCX, %VRCX
-       jnz     L(ret_vec_x1)
  
-# ifdef USE_AS_STRNLEN
-       subq    $CHAR_PER_VEC, %rdx
-       jbe     L(ret_max)
-# endif
  
-       VPCMPEQ VEC_SIZE(%rax), %VMM(0), %k0
-       KMOV    %k0, %VRCX
-       test    %VRCX, %VRCX
-       jnz     L(ret_vec_x2)
+       /* The first VEC had no null.  Save the original pointer in rcx
+          and align rdi down to VEC_SIZE, then check the next 4 VECs one
+          at a time before going to the loop.  */
+       .p2align 4,, 10
+L(aligned_more):
+       movq    %rdi, %rcx
+       andq    $(VEC_SIZE * -1), %rdi
+L(cross_page_continue):
+       /* strlen/wcslen has no length bound, so the next 4 VECs are
+          checked without any bounds checks.  */
+       VPCMPEQ (VEC_SIZE * 1)(%rdi), %VZERO, %k0
+       KMOV    %k0, %VRAX
+       test    %VRAX, %VRAX
+       jnz     L(first_vec_x1)
  
-# ifdef USE_AS_STRNLEN
-       subq    $CHAR_PER_VEC, %rdx
-       jbe     L(ret_max)
-# endif
+       VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %k0
+       KMOV    %k0, %VRAX
+       test    %VRAX, %VRAX
+       jnz     L(first_vec_x2)
  
-       VPCMPEQ (VEC_SIZE * 2)(%rax), %VMM(0), %k0
-       KMOV    %k0, %VRCX
-       test    %VRCX, %VRCX
-       jnz     L(ret_vec_x3)
+       VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0
+       KMOV    %k0, %VRAX
+       test    %VRAX, %VRAX
+       jnz     L(first_vec_x3)
  
-# ifdef USE_AS_STRNLEN
-       subq    $CHAR_PER_VEC, %rdx
-       jbe     L(ret_max)
-# endif
+       VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0
+       KMOV    %k0, %VRAX
+       test    %VRAX, %VRAX
+       jnz     L(first_vec_x4)
  
-       VPCMPEQ (VEC_SIZE * 3)(%rax), %VMM(0), %k0
-       KMOV    %k0, %VRCX
-       test    %VRCX, %VRCX
-       jnz     L(ret_vec_x4)
+       subq    $(VEC_SIZE * -1), %rdi
  
-# ifdef USE_AS_STRNLEN
-       subq    $CHAR_PER_VEC, %rdx
-       jbe     L(ret_max)
-       /* Save pointer before 4 x VEC_SIZE alignment.  */
-       movq    %rax, %rcx
+# if CHAR_PER_VEC == 64
+       /* No partial register stalls on processors that we use evex512
+          on and this saves code size.  */
+       xorb    %dil, %dil
+# else
+       andq    $-(VEC_SIZE * 4), %rdi
  # endif
  
-       /* Align address to VEC_SIZE * 4 for loop.  */
-       andq    $-(VEC_SIZE * 4), %rax
-
-# ifdef USE_AS_STRNLEN
-       subq    %rax, %rcx
-#  ifdef USE_AS_WCSLEN
-       shr     $2, %VRCX
-#  endif
-       /* rcx contains number of [w]char will be recompared due to
-          alignment fixes.  rdx must be incremented by rcx to offset
-          alignment adjustment.  */
-       addq    %rcx, %rdx
-       /* Need jump as we don't want to add/subtract rdx for first
-          iteration of 4 x VEC_SIZE aligned loop.  */
-# endif
  
-       .p2align 4,,11
-L(loop):
-       /* VPMINU and VPCMP combination provide better performance as
-          compared to alternative combinations.  */
-       VMOVA   (VEC_SIZE * 4)(%rax), %VMM(1)
-       VPMINU  (VEC_SIZE * 5)(%rax), %VMM(1), %VMM(2)
-       VMOVA   (VEC_SIZE * 6)(%rax), %VMM(3)
-       VPMINU  (VEC_SIZE * 7)(%rax), %VMM(3), %VMM(4)
  
+       /* Compare 4 * VEC at a time forward.  */
+       .p2align 4
+L(loop_4x_vec):
+       VMOVA   (VEC_SIZE * 4)(%rdi), %VMM(1)
+       VPMINU  (VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2)
+       VMOVA   (VEC_SIZE * 6)(%rdi), %VMM(3)
+       VPMINU  (VEC_SIZE * 7)(%rdi), %VMM(3), %VMM(4)
         VPTESTN %VMM(2), %VMM(2), %k0
-       VPTESTN %VMM(4), %VMM(4), %k1
+       VPTESTN %VMM(4), %VMM(4), %k2
  
-       subq    $-(VEC_SIZE * 4), %rax
-       KORTEST %k0, %k1
+       subq    $-(VEC_SIZE * 4), %rdi
+       KORTEST %k0, %k2
+       jz      L(loop_4x_vec)
  
-# ifndef USE_AS_STRNLEN
-       jz      L(loop)
+       VPTESTN %VMM(1), %VMM(1), %k1
+       KMOV    %k1, %VRAX
+       test    %VRAX, %VRAX
+       jnz     L(first_vec_x0)
+
+       KMOV    %k0, %VRAX
+       test    %VRAX, %VRAX
+       jnz     L(first_vec_x1)
+
+       VPTESTN %VMM(3), %VMM(3), %k0
+
+# if CHAR_PER_VEC == 64
+       KMOV    %k0, %VRAX
+       test    %VRAX, %VRAX
+       jnz     L(first_vec_x2)
+       KMOV    %k2, %VRAX
  # else
-       jnz     L(loopend)
-       subq    $(CHAR_PER_VEC * 4), %rdx
-       ja      L(loop)
-       mov     %rsi, %rax
+       /* We can only combine last 2x VEC masks if CHAR_PER_VEC <= 32.  */
+       kmovd   %k2, %edx
+       kmovd   %k0, %eax
+       salq    $CHAR_PER_VEC, %rdx
+       orq     %rdx, %rax
+# endif
+
+       /* first_vec_x3 for strlen-ZMM and first_vec_x2 for strlen-YMM.  */
+       .p2align 4,, 2
+L(FALLTHROUGH_RETURN_LBL):
+       bsfq    %rax, %rax
+       subq    %rcx, %rdi
+       CHAR_SIZE_SHIFT_REG (rdi)
+       leaq    (FALLTHROUGH_RETURN_OFFSET)(%rdi, %rax), %rax
         ret
-# endif
  
-L(loopend):
-
-       VPTESTN %VMM(1), %VMM(1), %k2
-       KMOV    %k2, %VRCX
-       test    %VRCX, %VRCX
-       jnz     L(ret_vec_x1)
-
-       KMOV    %k0, %VRCX
-       /* At this point, if k0 is non zero, null char must be in the
-          second vector.  */
-       test    %VRCX, %VRCX
-       jnz     L(ret_vec_x2)
-
-       VPTESTN %VMM(3), %VMM(3), %k3
-       KMOV    %k3, %VRCX
-       test    %VRCX, %VRCX
-       jnz     L(ret_vec_x3)
-       /* At this point null [w]char must be in the fourth vector so no
-          need to check.  */
-       KMOV    %k1, %VRCX
-
-       /* Fourth, third, second vector terminating are pretty much
-          same, implemented this way to avoid branching and reuse code
-          from pre loop exit condition.  */
-L(ret_vec_x4):
-       bsf     %VRCX, %VRCX
-       subq    %rdi, %rax
-# ifdef USE_AS_WCSLEN
-       subq    $-(VEC_SIZE * 3), %rax
-       shrq    $2, %rax
-       addq    %rcx, %rax
-# else
-       leaq    (VEC_SIZE * 3)(%rcx, %rax), %rax
-# endif
-# ifdef USE_AS_STRNLEN
-       cmpq    %rsi, %rax
-       cmovnb  %rsi, %rax
-# endif
+       .p2align 4,, 8
+L(first_vec_x0):
+       bsf     %VRAX, %VRAX
+       sub     %rcx, %rdi
+       CHAR_SIZE_SHIFT_REG (rdi)
+       addq    %rdi, %rax
         ret
  
-L(ret_vec_x3):
-       bsf     %VRCX, %VRCX
-       subq    %rdi, %rax
-# ifdef USE_AS_WCSLEN
-       subq    $-(VEC_SIZE * 2), %rax
-       shrq    $2, %rax
-       addq    %rcx, %rax
-# else
-       leaq    (VEC_SIZE * 2)(%rcx, %rax), %rax
-# endif
-# ifdef USE_AS_STRNLEN
-       cmpq    %rsi, %rax
-       cmovnb  %rsi, %rax
-# endif
+       .p2align 4,, 10
+L(first_vec_x1):
+       bsf     %VRAX, %VRAX
+       sub     %rcx, %rdi
+       CHAR_SIZE_SHIFT_REG (rdi)
+       leaq    (CHAR_PER_VEC)(%rdi, %rax), %rax
         ret
  
-L(ret_vec_x2):
-       subq    $-VEC_SIZE, %rax
-L(ret_vec_x1):
-       bsf     %VRCX, %VRCX
-       subq    %rdi, %rax
-# ifdef USE_AS_WCSLEN
-       shrq    $2, %rax
-# endif
-       addq    %rcx, %rax
-# ifdef USE_AS_STRNLEN
-       cmpq    %rsi, %rax
-       cmovnb  %rsi, %rax
-# endif
+       .p2align 4,, 10
+       /* first_vec_x2 for strlen-ZMM and first_vec_x3 for strlen-YMM.  */
+L(TAIL_RETURN_LBL):
+       bsf     %VRAX, %VRAX
+       sub     %VRCX, %VRDI
+       CHAR_SIZE_SHIFT_REG (VRDI)
+       lea     (TAIL_RETURN_OFFSET)(%rdi, %rax), %VRAX
         ret
  
-L(page_cross):
-       mov     %rdi, %rax
-       movl    %edi, %ecx
-       andl    $(VEC_SIZE - 1), %ecx
+       .p2align 4,, 8
+L(cross_page_boundary):
+       movq    %rdi, %rcx
+       /* Align data to VEC_SIZE.  */
+       andq    $-VEC_SIZE, %rdi
+
+       VPCMPEQ (%rdi), %VZERO, %k0
+
+       KMOV    %k0, %VRAX
  # ifdef USE_AS_WCSLEN
-       sarl    $2, %ecx
-# endif
-       /* ecx contains number of w[char] to be skipped as a result
-          of address alignment.  */
-       andq    $-VEC_SIZE, %rax
-       VPCMPEQ (%rax), %VMM(0), %k0
-       KMOV    %k0, %VRDX
-       /* Ignore number of character for alignment adjustment.  */
-       shr     %cl, %VRDX
-# ifdef USE_AS_STRNLEN
-       jnz     L(page_cross_end)
-       movl    $CHAR_PER_VEC, %eax
-       sub     %ecx, %eax
-       cmp     %rax, %rsi
-       ja      L(align_more)
+       movl    %ecx, %edx
+       shrl    $2, %edx
+       andl    $(CHAR_PER_VEC - 1), %edx
+       shrx    %edx, %eax, %eax
+       testl   %eax, %eax
  # else
-       jz      L(align_more)
-# endif
-
-L(page_cross_end):
-       bsf     %VRDX, %VRAX
-# ifdef USE_AS_STRNLEN
-       cmpq    %rsi, %rax
-       cmovnb  %esi, %eax
+       shr     %cl, %VRAX
  # endif
+       jz      L(cross_page_continue)
+       bsf     %VRAX, %VRAX
         ret
  
-END (STRLEN)
+END(STRLEN)
  #endif
diff --git a/sysdeps/x86_64/multiarch/strlen-evex.S b/sysdeps/x86_64/multiarch/strlen-evex.S

index 364eeffff6483b9cd14f25e22c746696fd837488..93ad15e3561937e43ff1dcec510812c4f6bdd8d0 100644 (file)
--- a/sysdeps/x86_64/multiarch/strlen-evex.S
+++ b/sysdeps/x86_64/multiarch/strlen-evex.S
@@ -1,245 +1,7 @@
-/* strlen/strnlen/wcslen/wcsnlen optimized with 256-bit EVEX instructions.
-   Copyright (C) 2021-2023 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#include <isa-level.h>
-
-#if ISA_SHOULD_BUILD (4)
-
-# include <sysdep.h>
-
-# ifndef STRLEN
-#  define STRLEN       __strlen_evex
-# endif
-
-# ifndef VEC_SIZE
-#  include "x86-evex256-vecs.h"
-# endif
-
-# ifdef USE_AS_WCSLEN
-#  define VPCMPEQ      vpcmpeqd
-#  define VPCMPNEQ     vpcmpneqd
-#  define VPTESTN      vptestnmd
-#  define VPTEST       vptestmd
-#  define VPMINU       vpminud
-#  define CHAR_SIZE    4
-#  define CHAR_SIZE_SHIFT_REG(reg)     sar $2, %reg
-# else
-#  define VPCMPEQ      vpcmpeqb
-#  define VPCMPNEQ     vpcmpneqb
-#  define VPTESTN      vptestnmb
-#  define VPTEST       vptestmb
-#  define VPMINU       vpminub
-#  define CHAR_SIZE    1
-#  define CHAR_SIZE_SHIFT_REG(reg)
-
-#  define REG_WIDTH    VEC_SIZE
-# endif
-
-# define CHAR_PER_VEC  (VEC_SIZE / CHAR_SIZE)
-
-# include "reg-macros.h"
-
-# if CHAR_PER_VEC == 64
-
-#  define TAIL_RETURN_LBL      first_vec_x2
-#  define TAIL_RETURN_OFFSET   (CHAR_PER_VEC * 2)
-
-#  define FALLTHROUGH_RETURN_LBL       first_vec_x3
-#  define FALLTHROUGH_RETURN_OFFSET    (CHAR_PER_VEC * 3)
-
-# else
-
-#  define TAIL_RETURN_LBL      first_vec_x3
-#  define TAIL_RETURN_OFFSET   (CHAR_PER_VEC * 3)
-
-#  define FALLTHROUGH_RETURN_LBL       first_vec_x2
-#  define FALLTHROUGH_RETURN_OFFSET    (CHAR_PER_VEC * 2)
-# endif
-
-# define XZERO VMM_128(0)
-# define VZERO VMM(0)
-# define PAGE_SIZE     4096
-
-       .section SECTION(.text), "ax", @progbits
-ENTRY_P2ALIGN (STRLEN, 6)
-       movl    %edi, %eax
-       vpxorq  %XZERO, %XZERO, %XZERO
-       andl    $(PAGE_SIZE - 1), %eax
-       cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
-       ja      L(cross_page_boundary)
-
-       /* Check the first VEC_SIZE bytes.  Each bit in K0 represents a
-          null byte.  */
-       VPCMPEQ (%rdi), %VZERO, %k0
-       KMOV    %k0, %VRAX
-       test    %VRAX, %VRAX
-       jz      L(aligned_more)
-       bsf     %VRAX, %VRAX
-       ret
-
-       .p2align 4,, 8
-L(first_vec_x4):
-       bsf     %VRAX, %VRAX
-       subl    %ecx, %edi
-       CHAR_SIZE_SHIFT_REG (edi)
-       leal    (CHAR_PER_VEC * 4)(%rdi, %rax), %eax
-       ret
-
-
-
-       /* Aligned more for strnlen compares remaining length vs 2 *
-          CHAR_PER_VEC, 4 * CHAR_PER_VEC, and 8 * CHAR_PER_VEC before
-          going to the loop.  */
-       .p2align 4,, 10
-L(aligned_more):
-       movq    %rdi, %rcx
-       andq    $(VEC_SIZE * -1), %rdi
-L(cross_page_continue):
-       /* Remaining length >= 2 * CHAR_PER_VEC so do VEC0/VEC1 without
-          rechecking bounds.  */
-       VPCMPEQ (VEC_SIZE * 1)(%rdi), %VZERO, %k0
-       KMOV    %k0, %VRAX
-       test    %VRAX, %VRAX
-       jnz     L(first_vec_x1)
-
-       VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %k0
-       KMOV    %k0, %VRAX
-       test    %VRAX, %VRAX
-       jnz     L(first_vec_x2)
-
-       VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0
-       KMOV    %k0, %VRAX
-       test    %VRAX, %VRAX
-       jnz     L(first_vec_x3)
-
-       VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0
-       KMOV    %k0, %VRAX
-       test    %VRAX, %VRAX
-       jnz     L(first_vec_x4)
-
-       subq    $(VEC_SIZE * -1), %rdi
-
-# if CHAR_PER_VEC == 64
-       /* No partial register stalls on processors that we use evex512
-          on and this saves code size.  */
-       xorb    %dil, %dil
-# else
-       andq    $-(VEC_SIZE * 4), %rdi
-# endif
-
-
-
-       /* Compare 4 * VEC at a time forward.  */
-       .p2align 4
-L(loop_4x_vec):
-       VMOVA   (VEC_SIZE * 4)(%rdi), %VMM(1)
-       VPMINU  (VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2)
-       VMOVA   (VEC_SIZE * 6)(%rdi), %VMM(3)
-       VPMINU  (VEC_SIZE * 7)(%rdi), %VMM(3), %VMM(4)
-       VPTESTN %VMM(2), %VMM(2), %k0
-       VPTESTN %VMM(4), %VMM(4), %k2
-
-       subq    $-(VEC_SIZE * 4), %rdi
-       KORTEST %k0, %k2
-       jz      L(loop_4x_vec)
-
-       VPTESTN %VMM(1), %VMM(1), %k1
-       KMOV    %k1, %VRAX
-       test    %VRAX, %VRAX
-       jnz     L(first_vec_x0)
-
-       KMOV    %k0, %VRAX
-       test    %VRAX, %VRAX
-       jnz     L(first_vec_x1)
-
-       VPTESTN %VMM(3), %VMM(3), %k0
-
-# if CHAR_PER_VEC == 64
-       KMOV    %k0, %VRAX
-       test    %VRAX, %VRAX
-       jnz     L(first_vec_x2)
-       KMOV    %k2, %VRAX
-# else
-       /* We can only combine last 2x VEC masks if CHAR_PER_VEC <= 32.
-        */
-       kmovd   %k2, %edx
-       kmovd   %k0, %eax
-       salq    $CHAR_PER_VEC, %rdx
-       orq     %rdx, %rax
-# endif
-
-       /* first_vec_x3 for strlen-ZMM and first_vec_x2 for strlen-YMM.
-        */
-       .p2align 4,, 2
-L(FALLTHROUGH_RETURN_LBL):
-       bsfq    %rax, %rax
-       subq    %rcx, %rdi
-       CHAR_SIZE_SHIFT_REG (rdi)
-       leaq    (FALLTHROUGH_RETURN_OFFSET)(%rdi, %rax), %rax
-       ret
-
-       .p2align 4,, 8
-L(first_vec_x0):
-       bsf     %VRAX, %VRAX
-       sub     %rcx, %rdi
-       CHAR_SIZE_SHIFT_REG (rdi)
-       addq    %rdi, %rax
-       ret
-
-       .p2align 4,, 10
-L(first_vec_x1):
-       bsf     %VRAX, %VRAX
-       sub     %rcx, %rdi
-       CHAR_SIZE_SHIFT_REG (rdi)
-       leaq    (CHAR_PER_VEC)(%rdi, %rax), %rax
-       ret
-
-       .p2align 4,, 10
-       /* first_vec_x2 for strlen-ZMM and first_vec_x3 for strlen-YMM.
-        */
-L(TAIL_RETURN_LBL):
-       bsf     %VRAX, %VRAX
-       sub     %VRCX, %VRDI
-       CHAR_SIZE_SHIFT_REG (VRDI)
-       lea     (TAIL_RETURN_OFFSET)(%rdi, %rax), %VRAX
-       ret
-
-       .p2align 4,, 8
-L(cross_page_boundary):
-       movq    %rdi, %rcx
-       /* Align data to VEC_SIZE.  */
-       andq    $-VEC_SIZE, %rdi
-
-       VPCMPEQ (%rdi), %VZERO, %k0
-
-       KMOV    %k0, %VRAX
-# ifdef USE_AS_WCSLEN
-       movl    %ecx, %edx
-       shrl    $2, %edx
-       andl    $(CHAR_PER_VEC - 1), %edx
-       shrx    %edx, %eax, %eax
-       testl   %eax, %eax
-# else
-       shr     %cl, %VRAX
-# endif
-       jz      L(cross_page_continue)
-       bsf     %VRAX, %VRAX
-       ret
-
-END (STRLEN)
+#ifndef STRLEN
+# define STRLEN                __strlen_evex
  #endif
+
+#include "x86-evex256-vecs.h"
+#include "reg-macros.h"
+#include "strlen-evex-base.S"
diff --git a/sysdeps/x86_64/multiarch/strnlen-evex512.S b/sysdeps/x86_64/multiarch/strnlen-evex512.S

index 0b7f220214a7c33c49b7fbbfc60614f86ec61308..ebf22c259f9b6362d1117753a337506c93d9e92b 100644 (file)
--- a/sysdeps/x86_64/multiarch/strnlen-evex512.S
+++ b/sysdeps/x86_64/multiarch/strnlen-evex512.S
@@ -1,4 +1,264 @@
-#define STRLEN __strnlen_evex512
-#define USE_AS_STRNLEN 1
+/* Placeholder function, not used by any processor at the moment.
+   Copyright (C) 2022-2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
  
-#include "strlen-evex512.S"
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef STRNLEN
+#define STRNLEN __strnlen_evex512
+#endif
+
+#include "x86-evex512-vecs.h"
+#include "reg-macros.h"
+
+#include <isa-level.h>
+
+#if ISA_SHOULD_BUILD (4)
+
+# include <sysdep.h>
+
+# ifdef USE_AS_WCSLEN
+#  define VPCMPEQ      vpcmpeqd
+#  define VPTESTN      vptestnmd
+#  define VPMINU       vpminud
+#  define CHAR_SIZE    4
+# else
+#  define VPCMPEQ      vpcmpeqb
+#  define VPTESTN      vptestnmb
+#  define VPMINU       vpminub
+#  define CHAR_SIZE    1
+# endif
+
+# define PAGE_SIZE     4096
+# define CHAR_PER_VEC  (VEC_SIZE / CHAR_SIZE)
+
+       .section SECTION(.text),"ax",@progbits
+/* Aligning entry point to 64 byte, provides better performance for
+   one vector length string.  */
+ENTRY_P2ALIGN (STRNLEN, 6)
+       /* Check zero length.  */
+       test    %RSI_LP, %RSI_LP
+       jz      L(ret_max)
+#  ifdef __ILP32__
+       /* Clear the upper 32 bits.  */
+       movl    %esi, %esi
+#  endif
+
+       movl    %edi, %eax
+       vpxorq  %VMM_128(0), %VMM_128(0), %VMM_128(0)
+       sall    $20, %eax
+       cmpl    $((PAGE_SIZE - VEC_SIZE) << 20), %eax
+       ja      L(page_cross)
+
+       /* Compare [w]char for null, mask bit will be set for match.  */
+       VPCMPEQ (%rdi), %VMM(0), %k0
+       KMOV    %k0, %VRCX
+       /* Store max length in rax.  */
+       mov     %rsi, %rax
+       /* If rcx is 0, rax will have max length.  We can not use VRCX
+          and VRAX here for evex256 because, upper 32 bits may be
+          undefined for ecx and eax.  */
+       bsfq    %rcx, %rax
+       cmp     $CHAR_PER_VEC, %rax
+       ja      L(align_more)
+       cmpq    %rax, %rsi
+       cmovb   %esi, %eax
+       ret
+
+       /* At this point vector max length reached.  */
+       .p2align 4,,3
+L(ret_max):
+       movq    %rsi, %rax
+       ret
+
+L(align_more):
+       mov     %rdi, %rax
+       /* Align rax to VEC_SIZE.  */
+       andq    $-VEC_SIZE, %rax
+       movq    %rdi, %rdx
+       subq    %rax, %rdx
+#  ifdef USE_AS_WCSLEN
+       shr     $2, %VRDX
+#  endif
+       /* rdx = offset of rdi from the aligned address, in [w]chars.  */
+       leaq    -CHAR_PER_VEC(%rsi, %rdx), %rdx
+       /* rdx now holds the number of [w]chars still to be checked.
+          From here on it is decremented after each compare.  */
+
+       /* Loop unroll 4 times for 4 vector loop.  */
+       VPCMPEQ VEC_SIZE(%rax), %VMM(0), %k0
+       subq    $-VEC_SIZE, %rax
+       KMOV    %k0, %VRCX
+       test    %VRCX, %VRCX
+       jnz     L(ret_vec_x1)
+
+       subq    $CHAR_PER_VEC, %rdx
+       jbe     L(ret_max)
+
+       VPCMPEQ VEC_SIZE(%rax), %VMM(0), %k0
+       KMOV    %k0, %VRCX
+       test    %VRCX, %VRCX
+       jnz     L(ret_vec_x2)
+
+       subq    $CHAR_PER_VEC, %rdx
+       jbe     L(ret_max)
+
+       VPCMPEQ (VEC_SIZE * 2)(%rax), %VMM(0), %k0
+       KMOV    %k0, %VRCX
+       test    %VRCX, %VRCX
+       jnz     L(ret_vec_x3)
+
+       subq    $CHAR_PER_VEC, %rdx
+       jbe     L(ret_max)
+
+       VPCMPEQ (VEC_SIZE * 3)(%rax), %VMM(0), %k0
+       KMOV    %k0, %VRCX
+       test    %VRCX, %VRCX
+       jnz     L(ret_vec_x4)
+
+       subq    $CHAR_PER_VEC, %rdx
+       jbe     L(ret_max)
+       /* Save pointer before 4 x VEC_SIZE alignment.  */
+       movq    %rax, %rcx
+
+       /* Align address to VEC_SIZE * 4 for loop.  */
+       andq    $-(VEC_SIZE * 4), %rax
+
+       subq    %rax, %rcx
+#  ifdef USE_AS_WCSLEN
+       shr     $2, %VRCX
+#  endif
+       /* rcx contains the number of [w]chars that will be re-compared
+          due to the alignment fix-up.  rdx must be incremented by rcx
+          to offset the alignment adjustment.  */
+       addq    %rcx, %rdx
+       /* Fall through into the 4 x VEC_SIZE aligned loop; rdx has
+          already been adjusted for the alignment above.  */
+
+       .p2align 4,,11
+L(loop):
+       /* VPMINU and VPCMP combination provide better performance as
+          compared to alternative combinations.  */
+       VMOVA   (VEC_SIZE * 4)(%rax), %VMM(1)
+       VPMINU  (VEC_SIZE * 5)(%rax), %VMM(1), %VMM(2)
+       VMOVA   (VEC_SIZE * 6)(%rax), %VMM(3)
+       VPMINU  (VEC_SIZE * 7)(%rax), %VMM(3), %VMM(4)
+
+       VPTESTN %VMM(2), %VMM(2), %k0
+       VPTESTN %VMM(4), %VMM(4), %k1
+
+       subq    $-(VEC_SIZE * 4), %rax
+       KORTEST %k0, %k1
+
+       jnz     L(loopend)
+       subq    $(CHAR_PER_VEC * 4), %rdx
+       ja      L(loop)
+       mov     %rsi, %rax
+       ret
+
+L(loopend):
+
+       VPTESTN %VMM(1), %VMM(1), %k2
+       KMOV    %k2, %VRCX
+       test    %VRCX, %VRCX
+       jnz     L(ret_vec_x1)
+
+       KMOV    %k0, %VRCX
+       /* At this point, if k0 is non zero, null char must be in the
+          second vector.  */
+       test    %VRCX, %VRCX
+       jnz     L(ret_vec_x2)
+
+       VPTESTN %VMM(3), %VMM(3), %k3
+       KMOV    %k3, %VRCX
+       test    %VRCX, %VRCX
+       jnz     L(ret_vec_x3)
+       /* At this point null [w]char must be in the fourth vector so no
+          need to check.  */
+       KMOV    %k1, %VRCX
+
+       /* The return paths for the fourth, third and second vectors are
+          nearly identical; they are laid out this way to avoid branching
+          and to reuse the code of the pre-loop exit conditions.  */
+L(ret_vec_x4):
+       bsf     %VRCX, %VRCX
+       subq    %rdi, %rax
+# ifdef USE_AS_WCSLEN
+       subq    $-(VEC_SIZE * 3), %rax
+       shrq    $2, %rax
+       addq    %rcx, %rax
+# else
+       leaq    (VEC_SIZE * 3)(%rcx, %rax), %rax
+# endif
+
+       cmpq    %rsi, %rax
+       cmovnb  %rsi, %rax
+       ret
+
+L(ret_vec_x3):
+       bsf     %VRCX, %VRCX
+       subq    %rdi, %rax
+# ifdef USE_AS_WCSLEN
+       subq    $-(VEC_SIZE * 2), %rax
+       shrq    $2, %rax
+       addq    %rcx, %rax
+# else
+       leaq    (VEC_SIZE * 2)(%rcx, %rax), %rax
+# endif
+       cmpq    %rsi, %rax
+       cmovnb  %rsi, %rax
+       ret
+
+L(ret_vec_x2):
+       subq    $-VEC_SIZE, %rax
+L(ret_vec_x1):
+       bsf     %VRCX, %VRCX
+       subq    %rdi, %rax
+# ifdef USE_AS_WCSLEN
+       shrq    $2, %rax
+# endif
+       addq    %rcx, %rax
+       cmpq    %rsi, %rax
+       cmovnb  %rsi, %rax
+       ret
+
+L(page_cross):
+       mov     %rdi, %rax
+       movl    %edi, %ecx
+       andl    $(VEC_SIZE - 1), %ecx
+# ifdef USE_AS_WCSLEN
+       sarl    $2, %ecx
+# endif
+       /* ecx contains the number of [w]chars to be skipped as a result
+          of address alignment.  */
+       andq    $-VEC_SIZE, %rax
+       VPCMPEQ (%rax), %VMM(0), %k0
+       KMOV    %k0, %VRDX
+       /* Shift out the mask bits of the [w]chars skipped for alignment.  */
+       shr     %cl, %VRDX
+       jnz     L(page_cross_end)
+       movl    $CHAR_PER_VEC, %eax
+       sub     %ecx, %eax
+       cmp     %rax, %rsi
+       ja      L(align_more)
+
+L(page_cross_end):
+       bsf     %VRDX, %VRAX
+       cmpq    %rsi, %rax
+       cmovnb  %esi, %eax
+       ret
+
+END (STRNLEN)
+#endif
diff --git a/sysdeps/x86_64/multiarch/wcslen-evex512.S b/sysdeps/x86_64/multiarch/wcslen-evex512.S

index f59c372b78b4fb8c8e85d056a933804055b91093..aff288a66b2831fbc45178b683ec43426856bad4 100644 (file)
--- a/sysdeps/x86_64/multiarch/wcslen-evex512.S
+++ b/sysdeps/x86_64/multiarch/wcslen-evex512.S
@@ -1,4 +1,8 @@
-#define STRLEN __wcslen_evex512
+#ifndef WCSLEN
+# define WCSLEN        __wcslen_evex512
+#endif
+
+#define STRLEN WCSLEN
  #define USE_AS_WCSLEN 1
  
  #include "strlen-evex512.S"
diff --git a/sysdeps/x86_64/multiarch/wcsnlen-evex512.S b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S

index 73dcf2f210a85aacc9a50acc332eef1fc8ed60f8..1c37d74fc90ff60cb764c9a04b564a539dac5f7f 100644 (file)
--- a/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
+++ b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
@@ -1,5 +1,8 @@
-#define STRLEN __wcsnlen_evex512
+#ifndef WCSNLEN
+# define WCSNLEN       __wcsnlen_evex512
+#endif
+
+#define STRNLEN        WCSNLEN
  #define USE_AS_WCSLEN 1
-#define USE_AS_STRNLEN 1
  
-#include "strlen-evex512.S"
+#include "strnlen-evex512.S"
author	Matthew Sterrett <matthew.sterrett@intel.com>
	Fri, 15 Dec 2023 20:04:05 +0000 (12:04 -0800)
committer	Noah Goldstein <goldstein.w.n@gmail.com>
	Mon, 18 Dec 2023 18:38:01 +0000 (12:38 -0600)
sysdeps/x86_64/multiarch/strlen-evex-base.S		patch \| blob \| blame \| history
sysdeps/x86_64/multiarch/strlen-evex.S		patch \| blob \| blame \| history
sysdeps/x86_64/multiarch/strnlen-evex512.S		patch \| blob \| blame \| history
sysdeps/x86_64/multiarch/wcslen-evex512.S		patch \| blob \| blame \| history
sysdeps/x86_64/multiarch/wcsnlen-evex512.S		patch \| blob \| blame \| history