/* Optimized strspn implementation for Power8.

   Copyright (C) 2016-2019 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

/* size_t [r3] strspn (const char *string [r3],
                       const char *needleAccept [r4])  */

/* This takes a novel approach by computing a 256 bit mask whereby
   each set bit implies the byte is "accepted".  P8 vector hardware
   has extremely efficient hardware for selecting bits from a mask.

   One might ask "why not use bpermd for short strings"?  It is
   so slow that its performance about matches the generic PPC64
   variant without any fancy masking, with the added expense of
   making the mask.  That was the first variant of this.  */
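
/* For example, with an accept string of "ab", bits 97 ('a') and 98
   ('b') of the 256b mask are set.  The mask lives in four GPRs, r5
   (chars 0-63) through r8 (chars 192-255), with bit 0 (the most
   significant bit, in ISA numbering) of r5 corresponding to the
   null character.  */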

#include <sysdep.h>

#ifndef USE_AS_STRCSPN
# define USE_AS_STRCSPN 0
# ifndef STRSPN
#  define STRSPN strspn
# endif
# define INITIAL_MASK 0
# define UPDATE_MASK(RA, RS, RB) or RA, RS, RB
#else
# ifndef STRSPN
#  define STRSPN strcspn
# endif
# define INITIAL_MASK -1
# define UPDATE_MASK(RA, RS, RB) andc RA, RS, RB
#endif

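/* When built as strcspn, the mask starts as all 1's and needle bytes
   clear bits instead of setting them, so the main scan runs while
   string bytes are absent from the needle rather than present.  */
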
/* Simple macro to use VSX instructions in overlapping VR's.  */
#define XXVR(insn, vrt, vra, vrb) \
        insn 32+vrt, 32+vra, 32+vrb

/* ISA 2.07B instructions are not all defined for older binutils.
   Macros are defined below for these newer instructions in order
   to maintain compatibility.  */

/* Note, TX/SX is always set as VMX regs are the high 32 VSX regs.  */
#define MTVRD(v,r) .long (0x7c000167 | ((v)<<(32-11)) | ((r)<<(32-16)))
#define MFVRD(r,v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16)))

#define VBPERMQ(t,a,b) .long (0x1000054c \
                              | ((t)<<(32-11)) \
                              | ((a)<<(32-16)) \
                              | ((b)<<(32-21)) )

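/* vbpermq gathers, for each of the 16 bytes of the second source,
   the bit of the first source indexed by that byte's value (bit 0
   being the MSB); byte values of 128 and above gather a 0.  The 16
   result bits land in bits 48:63 of the target VR.  */
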
        /* This can be updated to power8 once the minimum version of
           binutils supports power8 and the above instructions.  */
        .machine power7
ENTRY_TOCLESS (STRSPN, 4)
        CALL_MCOUNT 2

        /* Generate useful constants for later on.  */
        vspltisb v1, 7
        vspltisb v2, -1
        vslb    v1, v1, v1      /* 0x80 to swap high bit for vbpermq.  */
        vspltisb v10, 0
        vsldoi  v4, v10, v2, 2  /* 0xFFFF into vr4.  */
        XXVR(xxmrgld, v4, v4, v10) /* Mask for checking matches.  */

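        /* v4 now holds 0xFFFF aligned with the 16 result bits that
           vbpermq produces (bits 48:63), i.e. the value seen when all
           16 bytes of a quadword are accepted.  */
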
        /* Prepare to compute 256b mask.  */
        addi    r4, r4, -1
        li      r5, INITIAL_MASK
        li      r6, INITIAL_MASK
        li      r7, INITIAL_MASK
        li      r8, INITIAL_MASK

#if USE_AS_STRCSPN
        /* Ensure the null character never matches by clearing ISA bit 0
           in r5, which is the bit vbpermq will later check it against.  */
        srdi    r5, r5, 1
#endif

        li      r11, 1
        sldi    r11, r11, 63

        /* Start interleaved Mask computation.
           This will eventually or 1's into ignored bits from vbpermq.  */
        lvsr    v11, 0, r3
        vspltb  v11, v11, 0     /* Splat shift constant.  */

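        /* Every byte of v11 now holds 16-(r3 & 15); shifting a field
           of 1's by this amount below builds a mask covering the bytes
           that precede the string in its aligned quadword.  */
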
        /* Build a 256b mask in r5-r8.  */
        .align 4
L(next_needle):
        lbzu    r9, 1(r4)

        cmpldi  cr0, r9, 0
        cmpldi  cr1, r9, 128

        /* This is a little tricky.  srd only uses the low 7 bits of
           the shift amount, and any shift of 64 or more yields 0.
           So, we can effectively shift 128b in this case.  */
        xori    r12, r9, 0x40   /* Invert bit 6.  */
        srd     r10, r11, r9    /* Mask for bits 0-63.  */
        srd     r12, r11, r12   /* Mask for bits 64-127.  */
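        /* E.g. for 'a' (0x61) the shift of 97 clears r10, while
           97^0x40 = 33 leaves a single 1 at bit 33 of r12, i.e. bit
           97-64 of the 64-127 range; exactly one of r10/r12 is
           nonzero for any given needle byte.  */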

        beq     cr0, L(start_cmp)

        /* Now, or the value into the correct GPR.  */
        bge     cr1, L(needle_gt128)
        UPDATE_MASK (r5, r5, r10)       /* 0 - 63.  */
        UPDATE_MASK (r6, r6, r12)       /* 64 - 127.  */
        b       L(next_needle)

        .align 4
L(needle_gt128):
        UPDATE_MASK (r7, r7, r10)       /* 128 - 191.  */
        UPDATE_MASK (r8, r8, r12)       /* 192 - 255.  */
        b       L(next_needle)


        .align 4
L(start_cmp):
        /* Move and merge bitmap into 2 VRs.  bpermd is slower on P8.  */
        mr      r0, r3          /* Save r3 for final length computation.  */
        MTVRD (v5, r5)
        MTVRD (v6, r6)
        MTVRD (v7, r7)
        MTVRD (v8, r8)

        /* Continue interleaved mask generation.  */
#ifdef __LITTLE_ENDIAN__
        vsrw    v11, v2, v11    /* Note, shift ignores higher order bits.  */
        vsplth  v11, v11, 0     /* Only care about the high 16 bits of v11.  */
#else
        vslw    v11, v2, v11    /* Note, shift ignores higher order bits.  */
        vsplth  v11, v11, 1     /* Only care about the low 16 bits of v11.  */
#endif
        lvx     v0, 0, r3       /* Note, unaligned load ignores lower bits.  */

        /* Do the merging of the bitmask.  */
        XXVR(xxmrghd, v5, v5, v6)
        XXVR(xxmrghd, v6, v7, v8)
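        /* v5 now holds the mask bits for chars 0-127 and v6 those for
           chars 128-255, each ordered MSB-first to match vbpermq's
           indexing.  */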

        /* Finish mask generation.  */
        vand    v11, v11, v4    /* Throw away bits not in the mask.  */

        /* Compare the first 1-16B, while masking unwanted bytes.  */
        clrrdi  r3, r3, 4       /* Note, counts from qw boundaries.  */
        vxor    v9, v0, v1      /* Swap high bit.  */
        VBPERMQ (v8, v5, v0)
        VBPERMQ (v7, v6, v9)
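        /* The first vbpermq looks bytes 0-127 up in the low half of
           the mask; bytes with the high bit set gather 0.  Flipping
           the high bit (v9) makes the second lookup cover 128-255
           instead, so the vor below combines two disjoint results.  */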
        vor     v7, v7, v8
        vor     v7, v7, v11     /* Ignore non-participating bytes.  */
        vcmpequh. v8, v7, v4
        bnl     cr6, L(done)
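        /* cr6[lt] is set by the dot-form compare above only when every
           halfword matched, i.e. all 16 bytes were accepted; any miss
           branches to L(done) with the result bits left in v7.  */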

        addi    r3, r3, 16

        .align 4
L(vec):
        lvx     v0, 0, r3
        addi    r3, r3, 16
        vxor    v9, v0, v1      /* Swap high bit.  */
        VBPERMQ (v8, v5, v0)
        VBPERMQ (v7, v6, v9)
        vor     v7, v7, v8
        vcmpequh. v8, v7, v4
        blt     cr6, L(vec)

        addi    r3, r3, -16     /* Undo the increment past the quadword
                                   holding the first miss.  */
L(done):
        subf    r3, r0, r3
        MFVRD (r10, v7)

#ifdef __LITTLE_ENDIAN__
        addi    r0, r10, 1      /* Count the trailing 1's.  */
        andc    r10, r10, r0
        popcntd r10, r10
#else
        xori    r10, r10, 0xffff /* Count leading 1's by inverting.  */
        addi    r3, r3, -48     /* Account for the extra leading zeros.  */
        cntlzd  r10, r10
#endif

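        /* r10 is the number of leading bytes of the final quadword that
           matched; the bits forced to 1 for the unaligned head cancel
           against r3 having been rounded down to a qw boundary.  */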
        add     r3, r3, r10
        blr

END(STRSPN)
libc_hidden_builtin_def (STRSPN)