x86: Shrink / minorly optimize strchr-evex and implement with VMM headers

author Noah Goldstein <goldstein.w.n@gmail.com>

Wed, 19 Oct 2022 00:44:04 +0000 (17:44 -0700)

committer Noah Goldstein <goldstein.w.n@gmail.com>

Thu, 20 Oct 2022 00:31:03 +0000 (17:31 -0700)
author Noah Goldstein <goldstein.w.n@gmail.com>
Wed, 19 Oct 2022 00:44:04 +0000 (17:44 -0700)
committer Noah Goldstein <goldstein.w.n@gmail.com>
Thu, 20 Oct 2022 00:31:03 +0000 (17:31 -0700)
diff --git a/sysdeps/x86_64/multiarch/strchr-evex.S b/sysdeps/x86_64/multiarch/strchr-evex.S

index a1c15c4419e22e08b2961c42864c8cc054e03985..c2a0d112f7b4e41a51bde59b342c04c54e5d2cc3 100644 (file)
--- a/sysdeps/x86_64/multiarch/strchr-evex.S
+++ b/sysdeps/x86_64/multiarch/strchr-evex.S
@@ -26,48 +26,75 @@
  #  define STRCHR       __strchr_evex
  # endif
  
-# define VMOVU         vmovdqu64
-# define VMOVA         vmovdqa64
+# ifndef VEC_SIZE
+#  include "x86-evex256-vecs.h"
+# endif
  
  # ifdef USE_AS_WCSCHR
  #  define VPBROADCAST  vpbroadcastd
-#  define VPCMP                vpcmpd
+#  define VPCMP        vpcmpd
+#  define VPCMPEQ      vpcmpeqd
  #  define VPTESTN      vptestnmd
+#  define VPTEST       vptestmd
  #  define VPMINU       vpminud
  #  define CHAR_REG     esi
-#  define SHIFT_REG    ecx
+#  define SHIFT_REG    rcx
  #  define CHAR_SIZE    4
+
+#  define USE_WIDE_CHAR
  # else
  #  define VPBROADCAST  vpbroadcastb
-#  define VPCMP                vpcmpb
+#  define VPCMP        vpcmpb
+#  define VPCMPEQ      vpcmpeqb
  #  define VPTESTN      vptestnmb
+#  define VPTEST       vptestmb
  #  define VPMINU       vpminub
  #  define CHAR_REG     sil
-#  define SHIFT_REG    edx
+#  define SHIFT_REG    rdi
  #  define CHAR_SIZE    1
  # endif
  
-# define XMMZERO       xmm16
-
-# define YMMZERO       ymm16
-# define YMM0          ymm17
-# define YMM1          ymm18
-# define YMM2          ymm19
-# define YMM3          ymm20
-# define YMM4          ymm21
-# define YMM5          ymm22
-# define YMM6          ymm23
-# define YMM7          ymm24
-# define YMM8          ymm25
-
-# define VEC_SIZE 32
-# define PAGE_SIZE 4096
-# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
-
-       .section .text.evex,"ax",@progbits
-ENTRY_P2ALIGN (STRCHR, 5)
-       /* Broadcast CHAR to YMM0.      */
-       VPBROADCAST     %esi, %YMM0
+# include "reg-macros.h"
+
+# if VEC_SIZE == 64
+#  define MASK_GPR     rcx
+#  define LOOP_REG     rax
+
+#  define COND_MASK(k_reg)     {%k_reg}
+# else
+#  define MASK_GPR     rax
+#  define LOOP_REG     rdi
+
+#  define COND_MASK(k_reg)
+# endif
+
+# define CHAR_PER_VEC  (VEC_SIZE / CHAR_SIZE)
+
+
+# if CHAR_PER_VEC == 64
+#  define LAST_VEC_OFFSET      (VEC_SIZE * 3)
+#  define TESTZ(reg)   incq %VGPR_SZ(reg, 64)
+# else
+
+#  if CHAR_PER_VEC == 32
+#   define TESTZ(reg)  incl %VGPR_SZ(reg, 32)
+#  elif CHAR_PER_VEC == 16
+#   define TESTZ(reg)  incw %VGPR_SZ(reg, 16)
+#  else
+#   define TESTZ(reg)  incb %VGPR_SZ(reg, 8)
+#  endif
+
+#  define LAST_VEC_OFFSET      (VEC_SIZE * 2)
+# endif
+
+# define VMATCH        VMM(0)
+
+# define PAGE_SIZE     4096
+
+       .section SECTION(.text), "ax", @progbits
+ENTRY_P2ALIGN (STRCHR, 6)
+       /* Broadcast CHAR to VEC_0.  */
+       VPBROADCAST %esi, %VMATCH
         movl    %edi, %eax
         andl    $(PAGE_SIZE - 1), %eax
         /* Check if we cross page boundary with one vector load.
@@ -75,19 +102,27 @@ ENTRY_P2ALIGN (STRCHR, 5)
         cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
         ja      L(cross_page_boundary)
  
+
         /* Check the first VEC_SIZE bytes. Search for both CHAR and the
            null bytes.  */
-       VMOVU   (%rdi), %YMM1
-
+       VMOVU   (%rdi), %VMM(1)
         /* Leaves only CHARS matching esi as 0.  */
-       vpxorq  %YMM1, %YMM0, %YMM2
-       VPMINU  %YMM2, %YMM1, %YMM2
-       /* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
-       VPTESTN %YMM2, %YMM2, %k0
-       kmovd   %k0, %eax
-       testl   %eax, %eax
+       vpxorq  %VMM(1), %VMATCH, %VMM(2)
+       VPMINU  %VMM(2), %VMM(1), %VMM(2)
+       /* Each bit in K0 represents a CHAR or a null byte in VEC_1.  */
+       VPTESTN %VMM(2), %VMM(2), %k0
+       KMOV    %k0, %VRAX
+# if VEC_SIZE == 64 && defined USE_AS_STRCHRNUL
+       /* If VEC_SIZE == 64 && STRCHRNUL use bsf to test condition so
+          that all logic for match/null in first VEC first in 1x cache
+          lines.  This has a slight cost to larger sizes.  */
+       bsf     %VRAX, %VRAX
+       jz      L(aligned_more)
+# else
+       test    %VRAX, %VRAX
         jz      L(aligned_more)
-       tzcntl  %eax, %eax
+       bsf     %VRAX, %VRAX
+# endif
  # ifndef USE_AS_STRCHRNUL
         /* Found CHAR or the null byte.  */
         cmp     (%rdi, %rax, CHAR_SIZE), %CHAR_REG
@@ -109,287 +144,374 @@ ENTRY_P2ALIGN (STRCHR, 5)
  # endif
         ret
  
-
-
-       .p2align 4,, 10
-L(first_vec_x4):
-# ifndef USE_AS_STRCHRNUL
-       /* Check to see if first match was CHAR (k0) or null (k1).  */
-       kmovd   %k0, %eax
-       tzcntl  %eax, %eax
-       kmovd   %k1, %ecx
-       /* bzhil will not be 0 if first match was null.  */
-       bzhil   %eax, %ecx, %ecx
-       jne     L(zero)
-# else
-       /* Combine CHAR and null matches.  */
-       kord    %k0, %k1, %k0
-       kmovd   %k0, %eax
-       tzcntl  %eax, %eax
-# endif
-       /* NB: Multiply sizeof char type (1 or 4) to get the number of
-          bytes.  */
-       leaq    (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
-       ret
-
  # ifndef USE_AS_STRCHRNUL
  L(zero):
         xorl    %eax, %eax
         ret
  # endif
  
-
-       .p2align 4
+       .p2align 4,, 2
+L(first_vec_x3):
+       subq    $-(VEC_SIZE * 2), %rdi
+# if VEC_SIZE == 32
+       /* Reuse L(first_vec_x3) for last VEC2 only for VEC_SIZE == 32.
+          For VEC_SIZE == 64 the registers don't match.  */
+L(last_vec_x2):
+# endif
  L(first_vec_x1):
         /* Use bsf here to save 1-byte keeping keeping the block in 1x
            fetch block. eax guranteed non-zero.  */
-       bsfl    %eax, %eax
+       bsf     %VRCX, %VRCX
  # ifndef USE_AS_STRCHRNUL
-       /* Found CHAR or the null byte.  */
-       cmp     (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
+       /* Found CHAR or the null byte.  */
+       cmp     (VEC_SIZE)(%rdi, %rcx, CHAR_SIZE), %CHAR_REG
         jne     L(zero)
-
  # endif
         /* NB: Multiply sizeof char type (1 or 4) to get the number of
            bytes.  */
-       leaq    (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
+       leaq    (VEC_SIZE)(%rdi, %rcx, CHAR_SIZE), %rax
         ret
  
-       .p2align 4,, 10
+       .p2align 4,, 2
+L(first_vec_x4):
+       subq    $-(VEC_SIZE * 2), %rdi
  L(first_vec_x2):
  # ifndef USE_AS_STRCHRNUL
         /* Check to see if first match was CHAR (k0) or null (k1).  */
-       kmovd   %k0, %eax
-       tzcntl  %eax, %eax
-       kmovd   %k1, %ecx
+       KMOV    %k0, %VRAX
+       tzcnt   %VRAX, %VRAX
+       KMOV    %k1, %VRCX
         /* bzhil will not be 0 if first match was null.  */
-       bzhil   %eax, %ecx, %ecx
+       bzhi    %VRAX, %VRCX, %VRCX
         jne     L(zero)
  # else
         /* Combine CHAR and null matches.  */
-       kord    %k0, %k1, %k0
-       kmovd   %k0, %eax
-       tzcntl  %eax, %eax
+       KOR     %k0, %k1, %k0
+       KMOV    %k0, %VRAX
+       bsf     %VRAX, %VRAX
  # endif
         /* NB: Multiply sizeof char type (1 or 4) to get the number of
            bytes.  */
         leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
         ret
  
-       .p2align 4,, 10
-L(first_vec_x3):
-       /* Use bsf here to save 1-byte keeping keeping the block in 1x
-          fetch block. eax guranteed non-zero.  */
-       bsfl    %eax, %eax
-# ifndef USE_AS_STRCHRNUL
-       /* Found CHAR or the null byte.  */
-       cmp     (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
-       jne     L(zero)
+# ifdef USE_AS_STRCHRNUL
+       /* We use this as a hook to get imm8 encoding for the jmp to
+          L(page_cross_boundary).  This allows the hot case of a
+          match/null-term in first VEC to fit entirely in 1 cache
+          line.  */
+L(cross_page_boundary):
+       jmp     L(cross_page_boundary_real)
  # endif
-       /* NB: Multiply sizeof char type (1 or 4) to get the number of
-          bytes.  */
-       leaq    (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
-       ret
  
         .p2align 4
  L(aligned_more):
+L(cross_page_continue):
         /* Align data to VEC_SIZE.  */
         andq    $-VEC_SIZE, %rdi
-L(cross_page_continue):
-       /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time since
-          data is only aligned to VEC_SIZE. Use two alternating methods
-          for checking VEC to balance latency and port contention.  */
  
-       /* This method has higher latency but has better port
-          distribution.  */
-       VMOVA   (VEC_SIZE)(%rdi), %YMM1
+       /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time
+          since data is only aligned to VEC_SIZE. Use two alternating
+          methods for checking VEC to balance latency and port
+          contention.  */
+
+    /* Method(1) with 8c latency:
+          For VEC_SIZE == 32:
+          p0 * 1.83, p1 * 0.83, p5 * 1.33
+          For VEC_SIZE == 64:
+          p0 * 2.50, p1 * 0.00, p5 * 1.50  */
+       VMOVA   (VEC_SIZE)(%rdi), %VMM(1)
         /* Leaves only CHARS matching esi as 0.  */
-       vpxorq  %YMM1, %YMM0, %YMM2
-       VPMINU  %YMM2, %YMM1, %YMM2
-       /* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
-       VPTESTN %YMM2, %YMM2, %k0
-       kmovd   %k0, %eax
-       testl   %eax, %eax
+       vpxorq  %VMM(1), %VMATCH, %VMM(2)
+       VPMINU  %VMM(2), %VMM(1), %VMM(2)
+       /* Each bit in K0 represents a CHAR or a null byte in VEC_1.  */
+       VPTESTN %VMM(2), %VMM(2), %k0
+       KMOV    %k0, %VRCX
+       test    %VRCX, %VRCX
         jnz     L(first_vec_x1)
  
-       /* This method has higher latency but has better port
-          distribution.  */
-       VMOVA   (VEC_SIZE * 2)(%rdi), %YMM1
-       /* Each bit in K0 represents a CHAR in YMM1.  */
-       VPCMP   $0, %YMM1, %YMM0, %k0
-       /* Each bit in K1 represents a CHAR in YMM1.  */
-       VPTESTN %YMM1, %YMM1, %k1
-       kortestd        %k0, %k1
+    /* Method(2) with 6c latency:
+          For VEC_SIZE == 32:
+          p0 * 1.00, p1 * 0.00, p5 * 2.00
+          For VEC_SIZE == 64:
+          p0 * 1.00, p1 * 0.00, p5 * 2.00  */
+       VMOVA   (VEC_SIZE * 2)(%rdi), %VMM(1)
+       /* Each bit in K0 represents a CHAR in VEC_1.  */
+       VPCMPEQ %VMM(1), %VMATCH, %k0
+       /* Each bit in K1 represents a CHAR in VEC_1.  */
+       VPTESTN %VMM(1), %VMM(1), %k1
+       KORTEST %k0, %k1
         jnz     L(first_vec_x2)
  
-       VMOVA   (VEC_SIZE * 3)(%rdi), %YMM1
+       /* By swapping between Method 1/2 we get more fair port
+          distrubition and better throughput.  */
+
+       VMOVA   (VEC_SIZE * 3)(%rdi), %VMM(1)
         /* Leaves only CHARS matching esi as 0.  */
-       vpxorq  %YMM1, %YMM0, %YMM2
-       VPMINU  %YMM2, %YMM1, %YMM2
-       /* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
-       VPTESTN %YMM2, %YMM2, %k0
-       kmovd   %k0, %eax
-       testl   %eax, %eax
+       vpxorq  %VMM(1), %VMATCH, %VMM(2)
+       VPMINU  %VMM(2), %VMM(1), %VMM(2)
+       /* Each bit in K0 represents a CHAR or a null byte in VEC_1.  */
+       VPTESTN %VMM(2), %VMM(2), %k0
+       KMOV    %k0, %VRCX
+       test    %VRCX, %VRCX
         jnz     L(first_vec_x3)
  
-       VMOVA   (VEC_SIZE * 4)(%rdi), %YMM1
-       /* Each bit in K0 represents a CHAR in YMM1.  */
-       VPCMP   $0, %YMM1, %YMM0, %k0
-       /* Each bit in K1 represents a CHAR in YMM1.  */
-       VPTESTN %YMM1, %YMM1, %k1
-       kortestd        %k0, %k1
+       VMOVA   (VEC_SIZE * 4)(%rdi), %VMM(1)
+       /* Each bit in K0 represents a CHAR in VEC_1.  */
+       VPCMPEQ %VMM(1), %VMATCH, %k0
+       /* Each bit in K1 represents a CHAR in VEC_1.  */
+       VPTESTN %VMM(1), %VMM(1), %k1
+       KORTEST %k0, %k1
         jnz     L(first_vec_x4)
  
         /* Align data to VEC_SIZE * 4 for the loop.  */
+# if VEC_SIZE == 64
+       /* Use rax for the loop reg as it allows to the loop to fit in
+          exactly 2-cache-lines. (more efficient imm32 + gpr
+          encoding).  */
+       leaq    (VEC_SIZE)(%rdi), %rax
+       /* No partial register stalls on evex512 processors.  */
+       xorb    %al, %al
+# else
+       /* For VEC_SIZE == 32 continue using rdi for loop reg so we can
+          reuse more code and save space.  */
         addq    $VEC_SIZE, %rdi
         andq    $-(VEC_SIZE * 4), %rdi
-
+# endif
         .p2align 4
  L(loop_4x_vec):
-       /* Check 4x VEC at a time. No penalty to imm32 offset with evex
-          encoding.  */
-       VMOVA   (VEC_SIZE * 4)(%rdi), %YMM1
-       VMOVA   (VEC_SIZE * 5)(%rdi), %YMM2
-       VMOVA   (VEC_SIZE * 6)(%rdi), %YMM3
-       VMOVA   (VEC_SIZE * 7)(%rdi), %YMM4
-
-       /* For YMM1 and YMM3 use xor to set the CHARs matching esi to
+       /* Check 4x VEC at a time. No penalty for imm32 offset with evex
+          encoding (if offset % VEC_SIZE == 0).  */
+       VMOVA   (VEC_SIZE * 4)(%LOOP_REG), %VMM(1)
+       VMOVA   (VEC_SIZE * 5)(%LOOP_REG), %VMM(2)
+       VMOVA   (VEC_SIZE * 6)(%LOOP_REG), %VMM(3)
+       VMOVA   (VEC_SIZE * 7)(%LOOP_REG), %VMM(4)
+
+       /* Collect bits where VEC_1 does NOT match esi.  This is later
+          use to mask of results (getting not matches allows us to
+          save an instruction on combining).  */
+       VPCMP   $4, %VMATCH, %VMM(1), %k1
+
+       /* Two methods for loop depending on VEC_SIZE.  This is because
+          with zmm registers VPMINU can only run on p0 (as opposed to
+          p0/p1 for ymm) so it is less prefered.  */
+# if VEC_SIZE == 32
+       /* For VEC_2 and VEC_3 use xor to set the CHARs matching esi to
            zero.  */
-       vpxorq  %YMM1, %YMM0, %YMM5
-       /* For YMM2 and YMM4 cmp not equals to CHAR and store result in
-          k register. Its possible to save either 1 or 2 instructions
-          using cmp no equals method for either YMM1 or YMM1 and YMM3
-          respectively but bottleneck on p5 makes it not worth it.  */
-       VPCMP   $4, %YMM0, %YMM2, %k2
-       vpxorq  %YMM3, %YMM0, %YMM7
-       VPCMP   $4, %YMM0, %YMM4, %k4
-
-       /* Use min to select all zeros from either xor or end of string).
-        */
-       VPMINU  %YMM1, %YMM5, %YMM1
-       VPMINU  %YMM3, %YMM7, %YMM3
+       vpxorq  %VMM(2), %VMATCH, %VMM(6)
+       vpxorq  %VMM(3), %VMATCH, %VMM(7)
  
-       /* Use min + zeromask to select for zeros. Since k2 and k4 will
-          have 0 as positions that matched with CHAR which will set
-          zero in the corresponding destination bytes in YMM2 / YMM4.
-        */
-       VPMINU  %YMM1, %YMM2, %YMM2{%k2}{z}
-       VPMINU  %YMM3, %YMM4, %YMM4
-       VPMINU  %YMM2, %YMM4, %YMM4{%k4}{z}
-
-       VPTESTN %YMM4, %YMM4, %k1
-       kmovd   %k1, %ecx
-       subq    $-(VEC_SIZE * 4), %rdi
-       testl   %ecx, %ecx
+       /* Find non-matches in VEC_4 while combining with non-matches
+          from VEC_1.  NB: Try and use masked predicate execution on
+          instructions that have mask result as it has no latency
+          penalty.  */
+       VPCMP   $4, %VMATCH, %VMM(4), %k4{%k1}
+
+       /* Combined zeros from VEC_1 / VEC_2 (search for null term).  */
+       VPMINU  %VMM(1), %VMM(2), %VMM(2)
+
+       /* Use min to select all zeros from either xor or end of
+          string).  */
+       VPMINU  %VMM(3), %VMM(7), %VMM(3)
+       VPMINU  %VMM(2), %VMM(6), %VMM(2)
+
+       /* Combined zeros from VEC_2 / VEC_3 (search for null term).  */
+       VPMINU  %VMM(3), %VMM(4), %VMM(4)
+
+       /* Combined zeros from VEC_2 / VEC_4 (this has all null term and
+          esi matches for VEC_2 / VEC_3).  */
+       VPMINU  %VMM(2), %VMM(4), %VMM(4)
+# else
+       /* Collect non-matches for VEC_2.  */
+       VPCMP   $4, %VMM(2), %VMATCH, %k2
+
+       /* Combined zeros from VEC_1 / VEC_2 (search for null term).  */
+       VPMINU  %VMM(1), %VMM(2), %VMM(2)
+
+       /* Find non-matches in VEC_3/VEC_4 while combining with non-
+          matches from VEC_1/VEC_2 respectively.  */
+       VPCMP   $4, %VMM(3), %VMATCH, %k3{%k1}
+       VPCMP   $4, %VMM(4), %VMATCH, %k4{%k2}
+
+       /* Finish combining zeros in all VECs.  */
+       VPMINU  %VMM(3), %VMM(4), %VMM(4)
+
+       /* Combine in esi matches for VEC_3 (if there was a match with
+          esi, the corresponding bit in %k3 is zero so the
+          VPMINU_MASKZ will have a zero in the result).  NB: This make
+          the VPMINU 3c latency.  The only way to avoid it is to
+          createa a 12c dependency chain on all the `VPCMP $4, ...`
+          which has higher total latency.  */
+       VPMINU  %VMM(2), %VMM(4), %VMM(4){%k3}{z}
+# endif
+       VPTEST  %VMM(4), %VMM(4), %k0{%k4}
+       KMOV    %k0, %VRDX
+       subq    $-(VEC_SIZE * 4), %LOOP_REG
+
+       /* TESTZ is inc using the proper register width depending on
+          CHAR_PER_VEC. An esi match or null-term match leaves a zero-
+          bit in rdx so inc won't overflow and won't be zero.  */
+       TESTZ   (rdx)
         jz      L(loop_4x_vec)
  
-       VPTESTN %YMM1, %YMM1, %k0
-       kmovd   %k0, %eax
-       testl   %eax, %eax
-       jnz     L(last_vec_x1)
+       VPTEST  %VMM(1), %VMM(1), %k0{%k1}
+       KMOV    %k0, %VGPR(MASK_GPR)
+       TESTZ   (MASK_GPR)
+# if VEC_SIZE == 32
+       /* We can reuse the return code in page_cross logic for VEC_SIZE
+          == 32.  */
+       jnz     L(last_vec_x1_vec_size32)
+# else
+       jnz     L(last_vec_x1_vec_size64)
+# endif
+
  
-       VPTESTN %YMM2, %YMM2, %k0
-       kmovd   %k0, %eax
-       testl   %eax, %eax
+       /* COND_MASK integates the esi matches for VEC_SIZE == 64. For
+          VEC_SIZE == 32 they are already integrated.  */
+       VPTEST  %VMM(2), %VMM(2), %k0 COND_MASK(k2)
+       KMOV    %k0, %VRCX
+       TESTZ   (rcx)
         jnz     L(last_vec_x2)
  
-       VPTESTN %YMM3, %YMM3, %k0
-       kmovd   %k0, %eax
-       /* Combine YMM3 matches (eax) with YMM4 matches (ecx).  */
-# ifdef USE_AS_WCSCHR
-       sall    $8, %ecx
-       orl     %ecx, %eax
-       bsfl    %eax, %eax
+       VPTEST  %VMM(3), %VMM(3), %k0 COND_MASK(k3)
+       KMOV    %k0, %VRCX
+# if CHAR_PER_VEC == 64
+       TESTZ   (rcx)
+       jnz     L(last_vec_x3)
  # else
-       salq    $32, %rcx
-       orq     %rcx, %rax
-       bsfq    %rax, %rax
+       salq    $CHAR_PER_VEC, %rdx
+       TESTZ   (rcx)
+       orq     %rcx, %rdx
  # endif
+
+       bsfq    %rdx, %rdx
+
  # ifndef USE_AS_STRCHRNUL
         /* Check if match was CHAR or null.  */
-       cmp     (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
+       cmp     (LAST_VEC_OFFSET)(%LOOP_REG, %rdx, CHAR_SIZE), %CHAR_REG
         jne     L(zero_end)
  # endif
         /* NB: Multiply sizeof char type (1 or 4) to get the number of
            bytes.  */
-       leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+       leaq    (LAST_VEC_OFFSET)(%LOOP_REG, %rdx, CHAR_SIZE), %rax
         ret
  
-       .p2align 4,, 8
-L(last_vec_x1):
-       bsfl    %eax, %eax
-# ifdef USE_AS_WCSCHR
-       /* NB: Multiply wchar_t count by 4 to get the number of bytes.
-          */
-       leaq    (%rdi, %rax, CHAR_SIZE), %rax
-# else
-       addq    %rdi, %rax
+# ifndef USE_AS_STRCHRNUL
+L(zero_end):
+       xorl    %eax, %eax
+       ret
  # endif
  
-# ifndef USE_AS_STRCHRNUL
+
+       /* Seperate return label for last VEC1 because for VEC_SIZE ==
+          32 we can reuse return code in L(page_cross) but VEC_SIZE ==
+          64 has mismatched registers.  */
+# if VEC_SIZE == 64
+       .p2align 4,, 8
+L(last_vec_x1_vec_size64):
+       bsf     %VRCX, %VRCX
+#  ifndef USE_AS_STRCHRNUL
         /* Check if match was null.  */
-       cmp     (%rax), %CHAR_REG
+       cmp     (%rax, %rcx, CHAR_SIZE), %CHAR_REG
         jne     L(zero_end)
-# endif
-
+#  endif
+#  ifdef USE_AS_WCSCHR
+       /* NB: Multiply wchar_t count by 4 to get the number of bytes.
+        */
+       leaq    (%rax, %rcx, CHAR_SIZE), %rax
+#  else
+       addq    %rcx, %rax
+#  endif
         ret
  
+       /* Since we can't combine the last 2x matches for CHAR_PER_VEC
+          == 64 we need return label for last VEC3.  */
+#  if CHAR_PER_VEC == 64
         .p2align 4,, 8
+L(last_vec_x3):
+       addq    $VEC_SIZE, %LOOP_REG
+#  endif
+
+       /* Duplicate L(last_vec_x2) for VEC_SIZE == 64 because we can't
+          reuse L(first_vec_x3) due to register mismatch.  */
  L(last_vec_x2):
-       bsfl    %eax, %eax
-# ifndef USE_AS_STRCHRNUL
+       bsf     %VGPR(MASK_GPR), %VGPR(MASK_GPR)
+#  ifndef USE_AS_STRCHRNUL
         /* Check if match was null.  */
-       cmp     (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
+       cmp     (VEC_SIZE * 1)(%LOOP_REG, %MASK_GPR, CHAR_SIZE), %CHAR_REG
         jne     L(zero_end)
-# endif
+#  endif
         /* NB: Multiply sizeof char type (1 or 4) to get the number of
            bytes.  */
-       leaq    (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
+       leaq    (VEC_SIZE * 1)(%LOOP_REG, %MASK_GPR, CHAR_SIZE), %rax
         ret
+# endif
  
-       /* Cold case for crossing page with first load.  */
-       .p2align 4,, 8
+       /* Cold case for crossing page with first load.  */
+       .p2align 4,, 10
+# ifndef USE_AS_STRCHRNUL
  L(cross_page_boundary):
-       movq    %rdi, %rdx
+# endif
+L(cross_page_boundary_real):
         /* Align rdi.  */
-       andq    $-VEC_SIZE, %rdi
-       VMOVA   (%rdi), %YMM1
-       /* Leaves only CHARS matching esi as 0.  */
-       vpxorq  %YMM1, %YMM0, %YMM2
-       VPMINU  %YMM2, %YMM1, %YMM2
-       /* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
-       VPTESTN %YMM2, %YMM2, %k0
-       kmovd   %k0, %eax
+       xorq    %rdi, %rax
+       VMOVA   (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1)
+       /* Use high latency method of getting matches to save code size.
+        */
+
+       /* K1 has 1s where VEC(1) does NOT match esi.  */
+       VPCMP   $4, %VMM(1), %VMATCH, %k1
+       /* K0 has ones where K1 is 1 (non-match with esi), and non-zero
+          (null).  */
+       VPTEST  %VMM(1), %VMM(1), %k0{%k1}
+       KMOV    %k0, %VRAX
         /* Remove the leading bits.  */
  # ifdef USE_AS_WCSCHR
-       movl    %edx, %SHIFT_REG
+       movl    %edi, %VGPR_SZ(SHIFT_REG, 32)
         /* NB: Divide shift count by 4 since each bit in K1 represent 4
            bytes.  */
-       sarl    $2, %SHIFT_REG
-       andl    $(CHAR_PER_VEC - 1), %SHIFT_REG
+       sarl    $2, %VGPR_SZ(SHIFT_REG, 32)
+       andl    $(CHAR_PER_VEC - 1), %VGPR_SZ(SHIFT_REG, 32)
+
+       /* if wcsrchr we need to reverse matches as we can't rely on
+          signed shift to bring in ones. There is not sarx for
+          gpr8/16. Also not we can't use inc here as the lower bits
+          represent matches out of range so we can't rely on overflow.
+        */
+       xorl    $((1 << CHAR_PER_VEC)- 1), %eax
+# endif
+       /* Use arithmatic shift so that leading 1s are filled in.  */
+       sarx    %VGPR(SHIFT_REG), %VRAX, %VRAX
+       /* If eax is all ones then no matches for esi or NULL.  */
+
+# ifdef USE_AS_WCSCHR
+       test    %VRAX, %VRAX
+# else
+       inc     %VRAX
  # endif
-       sarxl   %SHIFT_REG, %eax, %eax
-       /* If eax is zero continue.  */
-       testl   %eax, %eax
         jz      L(cross_page_continue)
-       bsfl    %eax, %eax
  
+       .p2align 4,, 10
+L(last_vec_x1_vec_size32):
+       bsf     %VRAX, %VRAX
  # ifdef USE_AS_WCSCHR
-       /* NB: Multiply wchar_t count by 4 to get the number of
-          bytes.  */
-       leaq    (%rdx, %rax, CHAR_SIZE), %rax
+       /* NB: Multiply wchar_t count by 4 to get the number of bytes.
+        */
+       leaq    (%rdi, %rax, CHAR_SIZE), %rax
  # else
-       addq    %rdx, %rax
+       addq    %rdi, %rax
  # endif
  # ifndef USE_AS_STRCHRNUL
         /* Check to see if match was CHAR or null.  */
         cmp     (%rax), %CHAR_REG
-       je      L(cross_page_ret)
-L(zero_end):
-       xorl    %eax, %eax
-L(cross_page_ret):
+       jne     L(zero_end_0)
  # endif
         ret
+# ifndef USE_AS_STRCHRNUL
+L(zero_end_0):
+       xorl    %eax, %eax
+       ret
+# endif
  
  END (STRCHR)
  #endif
author	Noah Goldstein <goldstein.w.n@gmail.com>
	Wed, 19 Oct 2022 00:44:04 +0000 (17:44 -0700)
committer	Noah Goldstein <goldstein.w.n@gmail.com>
	Thu, 20 Oct 2022 00:31:03 +0000 (17:31 -0700)