1 /* Optimized strspn implementation for Power8.
3 Copyright (C) 2016-2018 Free Software Foundation, Inc.
4 This file is part of the GNU C Library.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
20 /* size_t [r3] strspn (const char *string [r3],
21 const char *needleAccept [r4]) */
23 /* This takes a novel approach by computing a 256 bit mask whereby
24 each set bit implies the byte is "accepted". P8 vector hardware
25 has extremely efficient hardware for selecting bits from a mask.
27 One might ask "why not use bpermd for short strings"? It is
28 so slow that its performance roughly matches the generic PPC64
29 variant without any fancy masking, with the added expense of
30 making the mask. That was the first variant of this. */
/* Shared implementation: built as strspn by default, or as strcspn
   when USE_AS_STRCSPN is defined.  strspn starts from an all-zero
   bitmap and ORs accepted bytes in; strcspn starts from an all-ones
   bitmap and clears rejected bytes with andc.  */
36 #ifndef USE_AS_STRCSPN
37 # define USE_AS_STRCSPN 0
39 # define STRSPN strspn
41 # define INITIAL_MASK 0
42 # define UPDATE_MASK(RA, RS, RB) or RA, RS, RB
/* NOTE(review): the '#else' separating the two variants is not visible
   in this excerpt; the definitions below are the strcspn variants.  */
45 # define STRSPN strcspn
47 # define INITIAL_MASK -1
48 # define UPDATE_MASK(RA, RS, RB) andc RA, RS, RB
51 /* Simple macro to use VSX instructions in overlapping VR's. */
52 #define XXVR(insn, vrt, vra, vrb) \
53 insn 32+vrt, 32+vra, 32+vrb
55 /* ISA 2.07B instructions are not all defined for older binutils.
56 Macros are defined below for these newer instructions in order
57 to maintain compatibility. */
59 /* Note, TX/SX is always set as VMX regs are the high 32 VSX regs. */
/* MTVRD = "mtvrd v,r" (move GPR to VR doubleword); MFVRD = "mfvrd r,v"
   (move VR doubleword to GPR), hand-encoded as raw opcodes.  */
60 #define MTVRD(v,r) .long (0x7c000167 | ((v)<<(32-11)) | ((r)<<(32-16)))
61 #define MFVRD(r,v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16)))
/* VBPERMQ = "vbpermq t,a,b" (vector bit permute quadword).
   NOTE(review): the macro's operand-encoding continuation lines are
   not visible in this excerpt; only the base opcode word is shown.  */
63 #define VBPERMQ(t,a,b) .long (0x1000054c \
68 /* This can be updated to power8 once the minimum version of
69 binutils supports power8 and the above instructions. */
/* size_t STRSPN (const char *string [r3], const char *needle [r4]).
   Builds a 256-bit membership bitmap of the needle bytes in r5-r8,
   merges it into vector registers, then scans the string 16 bytes at
   a time using vbpermq to select bits from the bitmap.
   NOTE(review): this excerpt is missing many interior lines (loads,
   labels, branches, and the END marker); the comments below describe
   only the visible instructions.  */
71 ENTRY_TOCLESS (STRSPN, 4)
74 /* Generate useful constants for later on. */
77 vslb v1, v1, v1 /* 0x80 to swap high bit for vbpermq. */
79 vsldoi v4, v10, v2, 2 /* 0xFFFF into vr4. */
80 XXVR(xxmrgld, v4, v4, v10) /* Mask for checking matches. */
82 /* Prepare to compute 256b mask. */
90 /* Ensure the null character never matches by clearing ISA bit 0 in
91 r5 which is the bit which will check for it in the later usage
99 /* Start interleaved Mask computation.
100 This will eventually or 1's into ignored bits from vbpermq. */
102 vspltb v11, v11, 0 /* Splat shift constant. */
104 /* Build a 256b mask in r5-r8. */
112 /* This is a little tricky. srd only uses the first 7 bits,
113 and if bit 7 is set, value is always 0. So, we can
114 effectively shift 128b in this case. */
115 xori r12, r9, 0x40 /* Invert bit 6. */
116 srd r10, r11, r9 /* Mask for bits 0-63. */
117 srd r12, r11, r12 /* Mask for bits 64-127. */
119 beq cr0, L(start_cmp)
121 /* Now, or the value into the correct GPR.  UPDATE_MASK is "or" for
122 strspn and "andc" for strcspn (see the configuration macros). */
122 bge cr1,L(needle_gt128)
123 UPDATE_MASK (r5, r5, r10) /* 0 - 63. */
124 UPDATE_MASK (r6, r6, r12) /* 64 - 127. */
129 UPDATE_MASK (r7, r7, r10) /* 128 - 191. */
130 UPDATE_MASK (r8, r8, r12) /* 192 - 255. */
136 /* Move and merge bitmap into 2 VRs. bpermd is slower on P8. */
137 mr r0, r3 /* Save r3 for final length computation. */
143 /* Continue interleaved mask generation. */
144 #ifdef __LITTLE_ENDIAN__
145 vsrw v11, v2, v11 /* Note, shift ignores higher order bits. */
146 vsplth v11, v11, 0 /* Only care about the high 16 bits of v10. */
/* NOTE(review): the '#else' of the endianness split is not visible in
   this excerpt; the two lines below are the big-endian variants.  */
148 vslw v11, v2, v11 /* Note, shift ignores higher order bits. */
149 vsplth v11, v11, 1 /* Only care about the low 16 bits of v10. */
151 lvx v0, 0, r3 /* Note, unaligned load ignores lower bits. */
153 /* Do the merging of the bitmask. */
154 XXVR(xxmrghd, v5, v5, v6)
155 XXVR(xxmrghd, v6, v7, v8)
157 /* Finish mask generation. */
158 vand v11, v11, v4 /* Throw away bits not in the mask. */
160 /* Compare the first 1-16B, while masking unwanted bytes. */
161 clrrdi r3, r3, 4 /* Note, counts from qw boundaries. */
162 vxor v9, v0, v1 /* Swap high bit. */
166 vor v7, v7, v11 /* Ignore non-participating bytes. */
176 vxor v9, v0, v1 /* Swap high bit. */
188 #ifdef __LITTLE_ENDIAN__
189 addi r0, r10, 1 /* Count the trailing 1's. */
193 xori r10, r10, 0xffff /* Count leading 1's by inverting. */
194 addi r3, r3, -48 /* Account for the extra leading zeros. */
202 libc_hidden_builtin_def (STRSPN)