aarch64: MTE compatible strchrnul

author Andrea Corallo <andrea.corallo@arm.com>

Fri, 5 Jun 2020 15:18:49 +0000 (17:18 +0200)

committer Szabolcs Nagy <szabolcs.nagy@arm.com>

Tue, 9 Jun 2020 08:20:27 +0000 (09:20 +0100)
author Andrea Corallo <andrea.corallo@arm.com>
Fri, 5 Jun 2020 15:18:49 +0000 (17:18 +0200)
committer Szabolcs Nagy <szabolcs.nagy@arm.com>
Tue, 9 Jun 2020 08:20:27 +0000 (09:20 +0100)
diff --git a/sysdeps/aarch64/strchrnul.S b/sysdeps/aarch64/strchrnul.S

index a65be6cba83aa1c2124e8db4732e5938064ea653..1ae4598f82126b24eb918012768917ac35002432 100644 (file)
--- a/sysdeps/aarch64/strchrnul.S
+++ b/sysdeps/aarch64/strchrnul.S
@@ -22,109 +22,75 @@
  
  /* Assumptions:
   *
- * ARMv8-a, AArch64
- * Neon Available.
+ * ARMv8-a, AArch64, Advanced SIMD.
+ * MTE compatible.
   */
  
-/* Arguments and results.  */
  #define srcin          x0
  #define chrin          w1
-
  #define result         x0
  
-/* Locals and temporaries.  */
-
  #define src            x2
-#define tmp1           x3
-#define wtmp2          w4
-#define tmp3           x5
+#define tmp1           x1      /* Same register as chrin (w1); first written after chrin is read.  */
+#define tmp2           x3      /* Scratch: shift count for the first chunk.  */
+#define tmp2w          w3      /* 32-bit view of tmp2.  */
  
  #define vrepchr                v0
-#define vdata1         v1
-#define vdata2         v2
-#define vhas_nul1      v3
-#define vhas_nul2      v4
-#define vhas_chr1      v5
-#define vhas_chr2      v6
-#define vrepmask       v7
-#define vend1          v16
-
-/* Core algorithm.
-
-   For each 32-byte hunk we calculate a 64-bit syndrome value, with
-   two bits per byte (LSB is always in bits 0 and 1, for both big
-   and little-endian systems).  For each tuple, bit 0 is set iff
-   the relevant byte matched the requested character or nul.  Since the
-   bits in the syndrome reflect exactly the order in which things occur
-   in the original string a count_trailing_zeros() operation will
-   identify exactly which byte is causing the termination.  */
+#define vdata          v1      /* Current 16-byte chunk.  */
+#define qdata          q1      /* 128-bit view of vdata, used by the ldr in the loop.  */
+#define vhas_nul       v2      /* NOTE(review): not referenced by the new code in this hunk.  */
+#define vhas_chr       v3      /* Per-byte 0xff/0x00 mask: byte is chr or NUL.  */
+#define vrepmask       v4      /* 0xf00f replicated in every halfword.  */
+#define vend           v5      /* Pairwise-reduced match data.  */
+#define dend           d5      /* Low 64 bits of vend, moved to tmp1 with fmov.  */
+
+/* Core algorithm:
+
+   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
+   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
+   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
+   set likewise for odd bytes so that adjacent bytes can be merged. Since the
+   bits in the syndrome reflect the order in which things occur in the original
+   string, counting trailing zeros identifies exactly which byte matched.  */
  
  ENTRY (__strchrnul)
         DELOUSE (0)
-       /* Magic constant 0x40100401 to allow us to identify which lane
-          matches the termination condition.  */
-       mov     wtmp2, #0x0401
-       movk    wtmp2, #0x4010, lsl #16
+       bic     src, srcin, 15          /* src = srcin rounded down to a 16-byte boundary.  */
         dup     vrepchr.16b, chrin
-       bic     src, srcin, #31         /* Work with aligned 32-byte hunks.  */
-       dup     vrepmask.4s, wtmp2
-       ands    tmp1, srcin, #31
-       b.eq    L(loop)
-
-       /* Input string is not 32-byte aligned.  Rather than forcing
-          the padding bytes to a safe value, we calculate the syndrome
-          for all the bytes, but then mask off those bits of the
-          syndrome that are related to the padding.  */
-       ld1     {vdata1.16b, vdata2.16b}, [src], #32
-       neg     tmp1, tmp1
-       cmeq    vhas_nul1.16b, vdata1.16b, #0
-       cmeq    vhas_chr1.16b, vdata1.16b, vrepchr.16b
-       cmeq    vhas_nul2.16b, vdata2.16b, #0
-       cmeq    vhas_chr2.16b, vdata2.16b, vrepchr.16b
-       orr     vhas_chr1.16b, vhas_chr1.16b, vhas_nul1.16b
-       orr     vhas_chr2.16b, vhas_chr2.16b, vhas_nul2.16b
-       and     vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
-       and     vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
-       lsl     tmp1, tmp1, #1
-       addp    vend1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
-       mov     tmp3, #~0
-       addp    vend1.16b, vend1.16b, vend1.16b         // 128->64
-       lsr     tmp1, tmp3, tmp1
-
-       mov     tmp3, vend1.2d[0]
-       bic     tmp1, tmp3, tmp1        // Mask padding bits.
-       cbnz    tmp1, L(tail)
+       ld1     {vdata.16b}, [src]      /* Load the aligned 16-byte chunk that contains srcin.  */
+       mov     tmp2w, 0xf00f           /* Halfword mask: 0x0f for the even byte, 0xf0 for the odd byte.  */
+       dup     vrepmask.8h, tmp2w
+       cmeq    vhas_chr.16b, vdata.16b, vrepchr.16b    /* 0xff where byte == chr.  */
+       cmhs    vhas_chr.16b, vhas_chr.16b, vdata.16b   /* Unsigned >=: also 0xff where byte is NUL.  */
+       lsl     tmp2, srcin, 2          /* 4 bits per byte; lsr below uses only the low 6 bits.  */
+       and     vhas_chr.16b, vhas_chr.16b, vrepmask.16b        /* Keep one nibble per byte.  */
+       addp    vend.16b, vhas_chr.16b, vhas_chr.16b            /* 128->64 */
+       fmov    tmp1, dend              /* tmp1 = 64-bit syndrome, 4 bits per byte.  */
+       lsr     tmp1, tmp1, tmp2        /* Drop syndrome bits for bytes before srcin.  */
+       cbz     tmp1, L(loop)           /* No chr/NUL in the first chunk.  */
  
+       rbit    tmp1, tmp1              /* rbit + clz = count trailing zeros.  */
+       clz     tmp1, tmp1              /* tmp1 = 4 * offset of first match from srcin.  */
+       add     result, srcin, tmp1, lsr 2      /* result = srcin + offset.  */
+       ret
+
+       .p2align 4
  L(loop):
-       ld1     {vdata1.16b, vdata2.16b}, [src], #32
-       cmeq    vhas_nul1.16b, vdata1.16b, #0
-       cmeq    vhas_chr1.16b, vdata1.16b, vrepchr.16b
-       cmeq    vhas_nul2.16b, vdata2.16b, #0
-       cmeq    vhas_chr2.16b, vdata2.16b, vrepchr.16b
-       /* Use a fast check for the termination condition.  */
-       orr     vhas_chr1.16b, vhas_nul1.16b, vhas_chr1.16b
-       orr     vhas_chr2.16b, vhas_nul2.16b, vhas_chr2.16b
-       orr     vend1.16b, vhas_chr1.16b, vhas_chr2.16b
-       addp    vend1.2d, vend1.2d, vend1.2d
-       mov     tmp1, vend1.2d[0]
+       ldr     qdata, [src, 16]!       /* Pre-index: src += 16, then load the next chunk.  */
+       cmeq    vhas_chr.16b, vdata.16b, vrepchr.16b    /* 0xff where byte == chr.  */
+       cmhs    vhas_chr.16b, vhas_chr.16b, vdata.16b   /* Also 0xff where byte is NUL.  */
+       umaxp   vend.16b, vhas_chr.16b, vhas_chr.16b    /* Low 64 bits nonzero iff any byte matched.  */
+       fmov    tmp1, dend
         cbz     tmp1, L(loop)
  
-       /* Termination condition found.  Now need to establish exactly why
-          we terminated.  */
-       and     vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
-       and     vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
-       addp    vend1.16b, vhas_chr1.16b, vhas_chr2.16b         // 256->128
-       addp    vend1.16b, vend1.16b, vend1.16b         // 128->64
-
-       mov     tmp1, vend1.2d[0]
-L(tail):
-       /* Count the trailing zeros, by bit reversing...  */
+       and     vhas_chr.16b, vhas_chr.16b, vrepmask.16b        /* Match found: build the nibble syndrome.  */
+       addp    vend.16b, vhas_chr.16b, vhas_chr.16b            /* 128->64 */
+       fmov    tmp1, dend
+#ifndef __AARCH64EB__
         rbit    tmp1, tmp1
-       /* Re-bias source.  */
-       sub     src, src, #32
-       clz     tmp1, tmp1      /* ... and counting the leading zeros.  */
-       /* tmp1 is twice the offset into the fragment.  */
-       add     result, src, tmp1, lsr #1
+#endif
+       clz     tmp1, tmp1              /* 4 * offset of first match (rbit is skipped on big-endian).  */
+       add     result, src, tmp1, lsr 2        /* result = chunk base + offset.  */
         ret
  
  END(__strchrnul)
author	Andrea Corallo <andrea.corallo@arm.com>
	Fri, 5 Jun 2020 15:18:49 +0000 (17:18 +0200)
committer	Szabolcs Nagy <szabolcs.nagy@arm.com>
	Tue, 9 Jun 2020 08:20:27 +0000 (09:20 +0100)