1 /* Optimized strspn implementation for Power8.
3 Copyright (C) 2016-2018 Free Software Foundation, Inc.
4 This file is part of the GNU C Library.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
20 /* size_t [r3] strspn (const char *string [r3],
21 const char *needleAccept [r4]) */
23 /* This takes a novel approach by computing a 256 bit mask whereby
24 each set bit implies the byte is "accepted". P8 vector hardware
25 has extremely efficient hardware for selecting bits from a mask.
27 One might ask "why not use bpermd for short strings"? It is
28 so slow that its performance roughly matches the generic PPC64
29 variant without any fancy masking, with the added expense of
30 making the mask. That was the first variant of this. */
/* Shared implementation: built as strspn by default, or as strcspn
   when USE_AS_STRCSPN is defined.  strspn starts from an all-zero
   bitmap and ORs accepted bytes in; strcspn starts from an all-ones
   bitmap and clears rejected bytes with andc.  */
36 #ifndef USE_AS_STRCSPN
37 # define USE_AS_STRCSPN 0
39 # define STRSPN strspn
41 # define INITIAL_MASK 0
42 # define UPDATE_MASK(RA, RS, RB) or RA, RS, RB
/* NOTE(review): the '#else' separating the two variants is not visible
   in this excerpt; the definitions below are the strcspn variants.  */
45 # define STRSPN strcspn
47 # define INITIAL_MASK -1
48 # define UPDATE_MASK(RA, RS, RB) andc RA, RS, RB
51 /* Simple macro to use VSX instructions in overlapping VR's. */
52 #define XXVR(insn, vrt, vra, vrb) \
53 insn 32+vrt, 32+vra, 32+vrb
55 /* ISA 2.07B instructions are not all defined for older binutils.
56 Macros are defined below for these newer instructions in order
57 to maintain compatibility. */
59 /* Note, TX/SX is always set as VMX regs are the high 32 VSX regs. */
/* MTVRD = "mtvrd v,r" (move GPR to VR doubleword); MFVRD = "mfvrd r,v"
   (move VR doubleword to GPR), hand-encoded as raw opcodes.  */
60 #define MTVRD(v,r) .long (0x7c000167 | ((v)<<(32-11)) | ((r)<<(32-16)))
61 #define MFVRD(r,v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16)))
/* VBPERMQ = "vbpermq t,a,b" (vector bit permute quadword).
   NOTE(review): the macro's operand-encoding continuation lines are
   not visible in this excerpt; only the base opcode word is shown.  */
63 #define VBPERMQ(t,a,b) .long (0x1000054c \
68 /* This can be updated to power8 once the minimum version of
69 binutils supports power8 and the above instructions. */
/* size_t STRSPN (const char *string [r3], const char *needle [r4]).
   Builds a 256-bit membership bitmap of the needle bytes in r5-r8,
   merges it into vector registers, then scans the string 16 bytes at
   a time using vbpermq to select bits from the bitmap.
   NOTE(review): this excerpt is missing many interior lines (loads,
   labels, branches, and the END marker); the comments below describe
   only the visible instructions.  */
71 ENTRY_TOCLESS (STRSPN, 4)
74 /* Generate useful constants for later on. */
77 vslb v1, v1, v1 /* 0x80 to swap high bit for vbpermq. */
79 vsldoi v4, v10, v2, 2 /* 0xFFFF into vr4. */
80 XXVR(xxmrgld, v4, v4, v10) /* Mask for checking matches. */
82 /* Prepare to compute 256b mask. */
90 /* Ensure the null character never matches by clearing ISA bit 0 in
91 r5 which is the bit which will check for it in the later usage
99 /* Start interleaved Mask computation.
100 This will eventually or 1's into ignored bits from vbpermq. */
102 vspltb v11, v11, 0 /* Splat shift constant. */
104 /* Build a 256b mask in r5-r8. */
112 /* This is a little tricky. srd only uses the first 7 bits,
113 and if bit 7 is set, value is always 0. So, we can
114 effectively shift 128b in this case. */
115 xori r12, r9, 0x40 /* Invert bit 6. */
116 srd r10, r11, r9 /* Mask for bits 0-63. */
117 srd r12, r11, r12 /* Mask for bits 64-127. */
119 beq cr0, L(start_cmp)
121 /* Now, or the value into the correct GPR.  UPDATE_MASK is "or" for
122 strspn and "andc" for strcspn (see the configuration macros). */
122 bge cr1,L(needle_gt128)
123 UPDATE_MASK (r5, r5, r10) /* 0 - 63. */
124 UPDATE_MASK (r6, r6, r12) /* 64 - 127. */
129 UPDATE_MASK (r7, r7, r10) /* 128 - 191. */
130 UPDATE_MASK (r8, r8, r12) /* 192 - 255. */
136 /* Move and merge bitmap into 2 VRs. bpermd is slower on P8. */
137 mr r0, r3 /* Save r3 for final length computation. */
143 /* Continue interleaved mask generation. */
144 #ifdef __LITTLE_ENDIAN__
145 vsrw v11, v2, v11 /* Note, shift ignores higher order bits. */
146 vsplth v11, v11, 0 /* Only care about the high 16 bits of v10. */
/* NOTE(review): the '#else' of the endianness split is not visible in
   this excerpt; the two lines below are the big-endian variants.  */
148 vslw v11, v2, v11 /* Note, shift ignores higher order bits. */
149 vsplth v11, v11, 1 /* Only care about the low 16 bits of v10. */
151 lvx v0, 0, r3 /* Note, unaligned load ignores lower bits. */
153 /* Do the merging of the bitmask. */
154 XXVR(xxmrghd, v5, v5, v6)
155 XXVR(xxmrghd, v6, v7, v8)
157 /* Finish mask generation. */
158 vand v11, v11, v4 /* Throw away bits not in the mask. */
160 /* Compare the first 1-16B, while masking unwanted bytes. */
161 clrrdi r3, r3, 4 /* Note, counts from qw boundaries. */
162 vxor v9, v0, v1 /* Swap high bit. */
166 vor v7, v7, v11 /* Ignore non-participating bytes. */
176 vxor v9, v0, v1 /* Swap high bit. */
188 #ifdef __LITTLE_ENDIAN__
189 addi r0, r10, 1 /* Count the trailing 1's. */
193 xori r10, r10, 0xffff /* Count leading 1's by inverting. */
194 addi r3, r3, -48 /* Account for the extra leading zeros. */
202 libc_hidden_builtin_def (STRSPN)