]> git.ipfire.org Git - thirdparty/glibc.git/blob - sysdeps/powerpc/powerpc64/power8/strchr.S
Update copyright dates with scripts/update-copyrights.
[thirdparty/glibc.git] / sysdeps / powerpc / powerpc64 / power8 / strchr.S
1 /* Optimized strchr implementation for PowerPC64/POWER8.
2 Copyright (C) 2016-2019 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <http://www.gnu.org/licenses/>. */
18
19 #include <sysdep.h>
20
21 #ifdef USE_AS_STRCHRNUL /* This source is being built as strchrnul. */
22 # ifndef STRCHRNUL
23 # define FUNC_NAME __strchrnul /* Default symbol when STRCHRNUL is not defined. */
24 # else
25 # define FUNC_NAME STRCHRNUL /* Use the name supplied through STRCHRNUL. */
26 # endif
27 #else /* Plain strchr build. */
28 # ifndef STRCHR
29 # define FUNC_NAME strchr /* Default symbol when STRCHR is not defined. */
30 # else
31 # define FUNC_NAME STRCHR /* Use the name supplied through STRCHR. */
32 # endif
33 #endif /* !USE_AS_STRCHRNUL */
34
35 /* char * [r3] strchr (const char *s [r3], int c [r4]) */
36 /* Instructions emitted as raw .long words: operands v/t are shifted to bit 21, r/a to bit 16 and b to bit 11 of the word. TODO: change these to the actual instructions when the minimum required
37 binutils allows it. */
38 #define MTVRD(v,r) .long (0x7c000167 | ((v)<<(32-11)) | ((r)<<(32-16)))
39 #define MFVRD(r,v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16)))
40 #define VBPERMQ(t,a,b) .long (0x1000054c \
41 | ((t)<<(32-11)) \
42 | ((a)<<(32-16)) \
43 | ((b)<<(32-21)) )
44 /* TODO: change this to .machine power8 when the minimum required binutils
45 allows it. */
46 .machine power7
47 ENTRY_TOCLESS (FUNC_NAME) /* In: r3 = s, r4 = c. Out: r3 = address of the first c byte; if the null terminator comes first, strchr returns 0 and strchrnul returns the terminator's address. */
48 CALL_MCOUNT 2
49 dcbt 0,r3
50 clrrdi r8,r3,3 /* Align the address to doubleword boundary. */
51 cmpdi cr7,r4,0 /* Is c == 0? (tested by the beq below) */
52 ld r12,0(r8) /* Load doubleword from memory. */
53 li r0,0 /* Doubleword with null chars to use
54 with cmpb. */
55
56 rlwinm r6,r3,3,26,28 /* r6 = (s & 7) * 8: number of bits that precede s in the doubleword. */
57
58 beq cr7,L(null_match) /* c == 0: only the terminator needs to be found. */
59
60 /* Replicate byte to doubleword. */
61 insrdi r4,r4,8,48
62 insrdi r4,r4,16,32
63 insrdi r4,r4,32,0
64
65 /* Now r4 has a doubleword of c bytes and r0 has
66 a doubleword of null bytes. */
67
68 cmpb r10,r12,r4 /* Compare each byte against c byte. */
69 cmpb r11,r12,r0 /* Compare each byte against null byte. */
70
71 /* Move the doublewords left and right to discard the bits that are
72 not part of the string and bring them back as zeros. */
73 #ifdef __LITTLE_ENDIAN__
74 srd r10,r10,r6
75 srd r11,r11,r6
76 sld r10,r10,r6
77 sld r11,r11,r6
78 #else
79 sld r10,r10,r6
80 sld r11,r11,r6
81 srd r10,r10,r6
82 srd r11,r11,r6
83 #endif
84 or r5,r10,r11 /* OR the results to speed things up. */
85 cmpdi cr7,r5,0 /* If r5 == 0, no c or null bytes
86 have been found. */
87 bne cr7,L(done)
88
89 mtcrf 0x01,r8 /* Copy the low four bits of r8 into cr7. */
90
91 /* Is the next doubleword (r8 + 8) on a quadword boundary? If so, skip to
92 the main loop. Otherwise, go through the alignment code. */
93
94 bt 28,L(loop)
95
96 /* Handle the second doubleword of the pair. */
97 ldu r12,8(r8)
98 cmpb r10,r12,r4
99 cmpb r11,r12,r0
100 or r5,r10,r11
101 cmpdi cr7,r5,0
102 bne cr7,L(done)
103 b L(loop) /* We branch here (rather than falling through)
104 to skip the nops due to heavy alignment
105 of the loop below. */
106
107 .p2align 5
108 L(loop):
109 /* Load two doublewords, compare and merge in a
110 single register for speed. This is an attempt
111 to speed up the null-checking process for bigger strings. */
112 ld r12,8(r8)
113 ldu r9,16(r8)
114 cmpb r10,r12,r4
115 cmpb r11,r12,r0
116 cmpb r6,r9,r4
117 cmpb r7,r9,r0
118 or r5,r10,r11 /* r5 = c/null hits in the first doubleword. */
119 or r9,r6,r7 /* r9 = c/null hits in the second doubleword. */
120 or r12,r5,r9
121 cmpdi cr7,r12,0
122 beq cr7,L(vector) /* Nothing in these 16 bytes: try to switch to the vector loop. */
123 /* OK, one (or both) of the doublewords contains a c/null byte. Check
124 the first doubleword and decrement the address in case the first
125 doubleword really contains a c/null byte. */
126
127 cmpdi cr6,r5,0
128 addi r8,r8,-8
129 bne cr6,L(done)
130
131 /* The c/null byte must be in the second doubleword. Adjust the
132 address again and move the result of cmpb to r10 so we can calculate
133 the pointer. */
134
135 mr r10,r6
136 mr r11,r7
137 addi r8,r8,8
138 #ifdef USE_AS_STRCHRNUL
139 mr r5, r9 /* r5 = combined c/null mask of the second doubleword. */
140 #endif
141 /* r10/r11 have the output of the cmpb instructions, that is,
142 0xff in the same position as the c/null byte in the original
143 doubleword from the string. Use that to calculate the pointer. */
144 L(done):
145 #ifdef USE_AS_STRCHRNUL
146 mr r10, r5 /* strchrnul: locate the first byte that is either c or null. */
147 #endif
148 #ifdef __LITTLE_ENDIAN__
149 addi r3,r10,-1
150 andc r3,r3,r10 /* r3 = mask of the bits below the lowest set bit of r10. */
151 popcntd r0,r3 /* r0 = number of zero bits before the first hit. */
152 # ifndef USE_AS_STRCHRNUL
153 addi r4,r11,-1
154 andc r4,r4,r11 /* r4 = mask of the bits below the first null byte. */
155 cmpld cr7,r3,r4
156 bgt cr7,L(no_match) /* The null byte precedes the first c (or there is no c). */
157 # endif
158 #else
159 cntlzd r0,r10 /* Count leading zeros before c matches. */
160 # ifndef USE_AS_STRCHRNUL
161 cmpld cr7,r11,r10
162 bgt cr7,L(no_match) /* The null byte precedes the first c (or there is no c). */
163 # endif
164 #endif
165 srdi r0,r0,3 /* Convert the bit count to a byte offset. */
166 add r3,r8,r0 /* Return address of the matching c byte
167 or null in case c was not found. */
168 blr
169
170 /* Check the first 32B in GPR's and move to vectorized loop. */
171 .p2align 5
172 L(vector):
173 addi r3, r8, 8 /* r3 = address of the next doubleword not yet examined. */
174 andi. r10, r3, 31
175 bne cr0, L(loop) /* Not 32-byte aligned yet: keep using the GPR loop. */
176 vspltisb v0, 0 /* v0 = sixteen zero bytes, used for the null compares. */
177 /* Precompute vbpermq constant. */
178 vspltisb v10, 3
179 lvsl v11, r0, r0
180 vslb v10, v11, v10 /* v10 = v11 with each byte shifted left by 3 bits. */
181 MTVRD(v1,r4)
182 li r5, 16
183 vspltb v1, v1, 7 /* v1 = the c byte replicated to all sixteen bytes. */
184 /* Compare 32 bytes in each loop. */
185 L(continue):
186 lvx v4, 0, r3
187 lvx v5, r3, r5
188 vcmpequb v2, v0, v4
189 vcmpequb v3, v0, v5
190 vcmpequb v6, v1, v4
191 vcmpequb v7, v1, v5
192 vor v8, v2, v3 /* v8 = null hits of both quadwords. */
193 vor v9, v6, v7 /* v9 = c hits of both quadwords. */
194 vor v11, v8, v9
195 vcmpequb. v11, v0, v11 /* cr6 "lt" is set when every byte of v11 is zero, i.e. no hit. */
196 addi r3, r3, 32
197 blt cr6, L(continue)
198 /* One (or both) of the quadwords contains a c/null byte. */
199 addi r3, r3, -32 /* Undo the last increment: r3 = start of the 32 bytes with the hit. */
200 #ifndef USE_AS_STRCHRNUL
201 vcmpequb. v11, v0, v9
202 blt cr6, L(no_match) /* No c byte at all in these 32 bytes, so only a null was found. */
203 #endif
204 /* Permute the first bit of each byte into bits 48-63. */
205 VBPERMQ(v2, v2, v10)
206 VBPERMQ(v3, v3, v10)
207 VBPERMQ(v6, v6, v10)
208 VBPERMQ(v7, v7, v10)
209 /* Shift each component into its correct position for merging. */
210 #ifdef __LITTLE_ENDIAN__
211 vsldoi v3, v3, v3, 2
212 vsldoi v7, v7, v7, 2
213 #else
214 vsldoi v2, v2, v2, 6
215 vsldoi v3, v3, v3, 4
216 vsldoi v6, v6, v6, 6
217 vsldoi v7, v7, v7, 4
218 #endif
219
220 /* Merge the results and move to a GPR. */
221 vor v1, v3, v2
222 vor v2, v6, v7
223 vor v4, v1, v2
224 MFVRD(r5, v4)
225 #ifdef __LITTLE_ENDIAN__
226 addi r6, r5, -1
227 andc r6, r6, r5
228 popcntd r6, r6
229 #else
230 cntlzd r6, r5 /* Count leading zeros before the match. */
231 #endif
232 add r3, r3, r6 /* Compute the final address (one mask bit per byte, so no shift). */
233 /* Return NULL if null found before c. */
234 #ifndef USE_AS_STRCHRNUL
235 lbz r4, 0(r3)
236 cmpdi cr7, r4, 0
237 beq cr7, L(no_match)
238 #endif
239 blr
240
241 #ifndef USE_AS_STRCHRNUL
242 .align 4
243 L(no_match):
244 li r3,0 /* c was not found: return a null pointer. */
245 blr
246 #endif
247
248 /* We are here because strchr was called with a null byte. */
249 .align 4
250 L(null_match):
251 /* r0 has a doubleword of null bytes. */
252
253 cmpb r5,r12,r0 /* Compare each byte against null bytes. */
254
255 /* Move the doublewords left and right to discard the bits that are
256 not part of the string and bring them back as zeros. */
257 #ifdef __LITTLE_ENDIAN__
258 srd r5,r5,r6
259 sld r5,r5,r6
260 #else
261 sld r5,r5,r6
262 srd r5,r5,r6
263 #endif
264 cmpdi cr7,r5,0 /* If r5 == 0, no null bytes
265 have been found. */
266 bne cr7,L(done_null)
267
268 mtcrf 0x01,r8 /* Copy the low four bits of r8 into cr7. */
269
270 /* Is the next doubleword (r8 + 8) on a quadword boundary? If so, skip to
271 the main loop. Otherwise, go through the alignment code. */
272
273 bt 28,L(loop_null)
274
275 /* Handle the second doubleword of the pair. */
276 ldu r12,8(r8)
277 cmpb r5,r12,r0
278 cmpdi cr7,r5,0
279 bne cr7,L(done_null)
280 b L(loop_null) /* We branch here (rather than falling through)
281 to skip the nops due to heavy alignment
282 of the loop below. */
283
284 /* Main loop to look for the end of the string. Since it's a
285 small loop (< 8 instructions), align it to 32-bytes. */
286 .p2align 5
287 L(loop_null):
288 /* Load two doublewords, compare and merge in a
289 single register for speed. This is an attempt
290 to speed up the null-checking process for bigger strings. */
291 ld r12,8(r8)
292 ldu r11,16(r8)
293 cmpb r5,r12,r0
294 cmpb r10,r11,r0
295 or r6,r5,r10
296 cmpdi cr7,r6,0
297 beq cr7,L(vector1) /* No null in these 16 bytes: try to switch to the vector loop. */
298
299 /* OK, one (or both) of the doublewords contains a null byte. Check
300 the first doubleword and decrement the address in case the first
301 doubleword really contains a null byte. */
302
303 cmpdi cr6,r5,0
304 addi r8,r8,-8
305 bne cr6,L(done_null)
306
307 /* The null byte must be in the second doubleword. Adjust the address
308 again and move the result of cmpb to r5 so we can calculate the
309 pointer. */
310
311 mr r5,r10
312 addi r8,r8,8
313
314 /* r5 has the output of the cmpb instruction, that is, it contains
315 0xff in the same position as the null byte in the original
316 doubleword from the string. Use that to calculate the pointer. */
317 L(done_null):
318 #ifdef __LITTLE_ENDIAN__
319 addi r0,r5,-1
320 andc r0,r0,r5 /* r0 = mask of the bits below the lowest set bit of r5. */
321 popcntd r0,r0
322 #else
323 cntlzd r0,r5 /* Count leading zeros before the match. */
324 #endif
325 srdi r0,r0,3 /* Convert the bit count to a byte offset. */
326 add r3,r8,r0 /* Return address of the matching null byte. */
327 blr
328 .p2align 5
329 L(vector1):
330 addi r3, r8, 8 /* r3 = address of the next doubleword not yet examined. */
331 andi. r10, r3, 31
332 bne cr0, L(loop_null) /* Not 32-byte aligned yet: keep using the GPR loop. */
333 vspltisb v8, -1 /* NOTE(review): v8 is overwritten by the vor in the loop below before it is read; this looks unused. */
334 vspltisb v0, 0 /* v0 = sixteen zero bytes, used for the null compares. */
335 vspltisb v10, 3
336 lvsl v11, r0, r0
337 vslb v10, v11, v10 /* v10 = vbpermq selector constant, built as in L(vector). */
338 li r5, 16
339 L(continue1):
340 lvx v4, 0, r3
341 lvx v5, r3, r5
342 vcmpequb v2, v0, v4
343 vcmpequb v3, v0, v5
344 vor v8, v2, v3
345 vcmpequb. v11, v0, v8 /* cr6 "lt" is set when every byte of v8 is zero, i.e. no null found. */
346 addi r3, r3, 32
347 blt cr6, L(continue1)
348 addi r3, r3, -32 /* Undo the last increment: r3 = start of the 32 bytes with the null. */
349 L(end1):
350 VBPERMQ(v2, v2, v10)
351 VBPERMQ(v3, v3, v10)
352 /* Shift each component into its correct position for merging. */
353 #ifdef __LITTLE_ENDIAN__
354 vsldoi v3, v3, v3, 2
355 #else
356 vsldoi v2, v2, v2, 6
357 vsldoi v3, v3, v3, 4
358 #endif
359
360 /* Merge the results and move to a GPR. */
361 vor v4, v3, v2
362 MFVRD(r5, v4)
363 #ifdef __LITTLE_ENDIAN__
364 addi r6, r5, -1
365 andc r6, r6, r5
366 popcntd r6, r6
367 #else
368 cntlzd r6, r5 /* Count leading zeros before the match. */
369 #endif
370 add r3, r3, r6 /* Compute the final address (one mask bit per byte, so no shift). */
371 blr
372 END (FUNC_NAME)
373
374 #ifndef USE_AS_STRCHRNUL /* Only emitted for the plain strchr build. */
375 weak_alias (strchr, index) /* Presumably makes index a weak alias of strchr; the macro is defined outside this file. */
376 libc_hidden_builtin_def (strchr) /* NOTE(review): macro defined outside this file; by its name it provides the hidden internal definition of strchr -- confirm. */
377 #endif