/* Optimized memrchr implementation for PowerPC64/POWER8.
   Copyright (C) 2017-2019 Free Software Foundation, Inc.
   Contributed by Luis Machado <luisgpm@br.ibm.com>.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* int [r3] memrchr (char *s [r3], int byte [r4], int size [r5])  */

#ifndef MEMRCHR
# define MEMRCHR __memrchr
#endif
	.machine  power8
ENTRY_TOCLESS (MEMRCHR)
	CALL_MCOUNT 3
	add	r7, r3, r5	/* Calculate the last acceptable address.  */
	neg	r0, r7
	addi	r7, r7, -1
	mr	r10, r3
	clrrdi	r6, r7, 7
	li	r9, 3<<5
	dcbt	r9, r6, 8	/* Stream hint, decreasing addresses.  */

	/* Replicate BYTE to doubleword.  */
	insrdi	r4, r4, 8, 48
	insrdi	r4, r4, 16, 32
	insrdi	r4, r4, 32, 0
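	/* r4 now holds BYTE in each of its eight byte positions, so a
	   single cmpb can test all eight bytes of a doubleword at once.  */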
	li	r6, -8
	li	r9, -1
	rlwinm	r0, r0, 3, 26, 28 /* Calculate padding.  */
	clrrdi	r8, r7, 3
	srd	r9, r9, r0
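	/* r9 is an all-ones mask shifted right by the padding, clearing
	   the bits that correspond to bytes beyond the last acceptable
	   address in the doubleword loaded below.  */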
	cmpldi	r5, 32
	clrrdi	r0, r10, 3
	ble	L(small_range)

#ifdef __LITTLE_ENDIAN__
	ldx	r12, 0, r8
#else
	ldbrx	r12, 0, r8	/* Load reversed doubleword from memory.  */
#endif
	cmpb	r3, r12, r4	/* Check for BYTE in DWORD1.  */
	and	r3, r3, r9
	cmpldi	cr7, r3, 0	/* If r3 == 0, no BYTEs have been found.  */
	bne	cr7, L(done)

	/* Are we now aligned to a quadword boundary?  If so, skip to
	   the main loop.  Otherwise, go through the alignment code.  */
	andi.	r12, r8, 15
	beq	cr0, L(align_qw)

	/* Handle DWORD2 of pair.  */
#ifdef __LITTLE_ENDIAN__
	ldx	r12, r8, r6
#else
	ldbrx	r12, r8, r6
#endif
	addi	r8, r8, -8
	cmpb	r3, r12, r4
	cmpldi	cr7, r3, 0
	bne	cr7, L(done)

	.align	4
	/* At this point, r8 is 16B aligned.  */
L(align_qw):
	sub	r5, r8, r0
	vspltisb	v0, 0
	/* Precompute vbpermq constant.  */
	vspltisb	v10, 3
	li	r0, 0
	lvsl	v11, r0, r0
	vslb	v10, v11, v10
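	/* v10 = {0, 8, 16, ..., 120}, the index of the most significant
	   bit of each byte.  vbpermq with this constant gathers the
	   per-byte result of a vcmpequb into a 16-bit mask in bits 48-63.  */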
	mtvrd	v1, r4
	vspltb	v1, v1, 7
	cmpldi	r5, 64
	ble	L(tail64)
	/* Are we 64-byte aligned?  If so, jump to the vectorized loop.
	   Note: aligning to 64-byte will necessarily slow down performance for
	   strings around 64 bytes in length due to the extra comparisons
	   required to check alignment for the vectorized loop.  This is a
	   necessary tradeoff we are willing to take in order to speed up the
	   calculation for larger strings.  */
	andi.	r11, r8, 63
	beq	cr0, L(preloop_64B)
	/* In order to begin the 64B loop, it needs to be 64
	   bytes aligned.  So read until it is 64B aligned.  */
	addi	r8, r8, -16
	lvx	v4, 0, r8
	vcmpequb	v6, v1, v4
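	/* vcmpequb. compares the match vector against zero and records the
	   result in CR6; bnl cr6 is taken when the "all bytes equal" bit is
	   clear, i.e. when at least one byte matched BYTE.  */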
	vcmpequb.	v11, v0, v6
	bnl	cr6, L(found_16B)
	addi	r5, r5, -16

	andi.	r11, r8, 63
	beq	cr0, L(preloop_64B)
	addi	r8, r8, -16
	lvx	v4, 0, r8
	vcmpequb	v6, v1, v4
	vcmpequb.	v11, v0, v6
	bnl	cr6, L(found_16B)
	addi	r5, r5, -16

	andi.	r11, r8, 63
	beq	cr0, L(preloop_64B)
	addi	r8, r8, -16
	lvx	v4, 0, r8
	vcmpequb	v6, v1, v4
	vcmpequb.	v11, v0, v6
	bnl	cr6, L(found_16B)
	addi	r5, r5, -16
	/* At this point it should be 64B aligned.
	   Prepare for the 64B loop.  */
L(preloop_64B):
	cmpldi	r5, 64		/* Check if r5 <= 64.  */
	ble	L(tail64)
	srdi	r9, r5, 6	/* Number of loop iterations.  */
	mtctr	r9		/* Setup the counter.  */
	li	r11, 16		/* Load required offsets.  */
	li	r9, 32
	li	r7, 48

	/* Handle r5 > 64.  Loop over the bytes in strides of 64B.  */
	.align	4
L(loop):
	addi	r8, r8, -64	/* Adjust address for the next iteration.  */
	lvx	v2, 0, r8	/* Load 4 quadwords.  */
	lvx	v3, r8, r11
	lvx	v4, r8, r9
	lvx	v5, r8, r7
	vcmpequb	v6, v1, v2
	vcmpequb	v7, v1, v3
	vcmpequb	v8, v1, v4
	vcmpequb	v9, v1, v5
	vor	v11, v6, v7
	vor	v12, v8, v9
	vor	v11, v11, v12	/* Compare and merge into one VR for speed.  */
	vcmpequb.	v11, v0, v11
	bnl	cr6, L(found)
	bdnz	L(loop)
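	/* The loop handled a multiple of 64 bytes; reduce r5 to the
	   remaining r5 % 64 bytes for the tail code below.  */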
	clrldi	r5, r5, 58

	/* Handle the remainder of the 64B loop or cases with at most 64
	   bytes left.  */
	.align	4
L(tail64):
	cmpldi	r5, 0
	beq	L(null)
	addi	r8, r8, -16
	lvx	v4, 0, r8
	vcmpequb	v6, v1, v4
	vcmpequb.	v11, v0, v6
	bnl	cr6, L(found_16B)
	cmpldi	cr6, r5, 16
	ble	cr6, L(null)
	addi	r5, r5, -16

	addi	r8, r8, -16
	lvx	v4, 0, r8
	vcmpequb	v6, v1, v4
	vcmpequb.	v11, v0, v6
	bnl	cr6, L(found_16B)
	cmpldi	cr6, r5, 16
	ble	cr6, L(null)
	addi	r5, r5, -16

	addi	r8, r8, -16
	lvx	v4, 0, r8
	vcmpequb	v6, v1, v4
	vcmpequb.	v11, v0, v6
	bnl	cr6, L(found_16B)
	cmpldi	cr6, r5, 16
	ble	cr6, L(null)
	addi	r5, r5, -16

	addi	r8, r8, -16
	lvx	v4, 0, r8
	vcmpequb	v6, v1, v4
	vcmpequb.	v11, v0, v6
	bnl	cr6, L(found_16B)
	li	r3, 0
	blr

	/* Found a match in 64B loop.  */
	.align	4
L(found):
	/* Permute the first bit of each byte into bits 48-63.  */
	vbpermq	v6, v6, v10
	vbpermq	v7, v7, v10
	vbpermq	v8, v8, v10
	vbpermq	v9, v9, v10
	/* Shift each component into its correct position for merging.  */
#ifdef __LITTLE_ENDIAN__
	vsldoi	v7, v7, v7, 2
	vsldoi	v8, v8, v8, 4
	vsldoi	v9, v9, v9, 6
#else
	vsldoi	v6, v6, v6, 6
	vsldoi	v7, v7, v7, 4
	vsldoi	v8, v8, v8, 2
#endif
	/* Merge the results and move to a GPR.  */
	vor	v11, v6, v7
	vor	v4, v9, v8
	vor	v4, v11, v4
	mfvrd	r5, v4
#ifdef __LITTLE_ENDIAN__
	cntlzd	r6, r5	/* Count leading zeros before the match.  */
#else
	addi	r6, r5, -1
	andc	r6, r6, r5
	popcntd	r6, r6
#endif
	addi	r8, r8, 63
	sub	r3, r8, r6	/* Compute final address.  */
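	/* A match located before the start of the string is not valid;
	   return NULL in that case.  */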
	cmpld	cr7, r3, r10
	bgelr	cr7
	li	r3, 0
	blr

	/* Found a match in last 16 bytes.  */
	.align	4
L(found_16B):
	cmpld	r8, r10		/* Are we on the last QW?  */
	bge	L(last)
	/* Now discard bytes before starting address.  */
	sub	r9, r10, r8
	mtvrd	v9, r9
	vspltisb	v8, 3
	/* Mask unwanted bytes.  */
#ifdef __LITTLE_ENDIAN__
	lvsr	v7, 0, r10
	vperm	v6, v0, v6, v7
	vsldoi	v9, v0, v9, 8
	vsl	v9, v9, v8
	vslo	v6, v6, v9
#else
	lvsl	v7, 0, r10
	vperm	v6, v6, v0, v7
	vsldoi	v9, v0, v9, 8
	vsl	v9, v9, v8
	vsro	v6, v6, v9
#endif
L(last):
	/* Permute the first bit of each byte into bits 48-63.  */
	vbpermq	v6, v6, v10
	/* Shift each component into its correct position for merging.  */
#ifdef __LITTLE_ENDIAN__
	vsldoi	v6, v6, v6, 6
	mfvrd	r7, v6
	cntlzd	r6, r7	/* Count leading zeros before the match.  */
#else
	mfvrd	r7, v6
	addi	r6, r7, -1
	andc	r6, r6, r7
	popcntd	r6, r6
#endif
	addi	r8, r8, 15
	sub	r3, r8, r6	/* Compute final address.  */
	cmpld	r6, r5
	bltlr
	li	r3, 0
	blr

	/* r3 has the output of the cmpb instruction, that is, it contains
	   0xff in the same position as BYTE in the original
	   word from the string.  Use that to calculate the pointer.
	   We need to make sure BYTE is *before* the end of the
	   range.  */
L(done):
	cntlzd	r9, r3	/* Count leading zeros before the match.  */
	cmpld	r8, r0	/* Are we on the last word?  */
	srdi	r6, r9, 3	/* Convert leading zeros to bytes.  */
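	/* The match sits 7 - r6 bytes above r8, so the final address is
	   r8 + 7 - r6.  */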
	addi	r0, r6, -7
	sub	r3, r8, r0
	cmpld	cr7, r3, r10
	bnelr
	bgelr	cr7
	li	r3, 0
	blr

	.align	4
L(null):
	li	r3, 0
	blr

/* Deals with size <= 32.  */
	.align	4
L(small_range):
	cmpldi	r5, 0
	beq	L(null)

#ifdef __LITTLE_ENDIAN__
	ldx	r12, 0, r8
#else
	ldbrx	r12, 0, r8	/* Load reversed doubleword from memory.  */
#endif
	cmpb	r3, r12, r4	/* Check for BYTE in DWORD1.  */
	and	r3, r3, r9
	cmpldi	cr7, r3, 0
	bne	cr7, L(done)

	/* Are we done already?  */
	cmpld	r8, r0
	addi	r8, r8, -8
	beqlr

	.align	5
L(loop_small):
#ifdef __LITTLE_ENDIAN__
	ldx	r12, 0, r8
#else
	ldbrx	r12, 0, r8
#endif
	cmpb	r3, r12, r4
	cmpld	r8, r0
	cmpldi	cr7, r3, 0
	bne	cr7, L(done)
	addi	r8, r8, -8
	bne	L(loop_small)
	blr

END (MEMRCHR)
weak_alias (__memrchr, memrchr)
libc_hidden_builtin_def (memrchr)