]>
Commit | Line | Data |
---|---|---|
96d6fd6c | 1 | /* Optimized strcpy/stpcpy implementation for PowerPC64/POWER8. |
688903eb | 2 | Copyright (C) 2015-2018 Free Software Foundation, Inc. |
96d6fd6c AZ |
3 | This file is part of the GNU C Library. |
4 | ||
5 | The GNU C Library is free software; you can redistribute it and/or | |
6 | modify it under the terms of the GNU Lesser General Public | |
7 | License as published by the Free Software Foundation; either | |
8 | version 2.1 of the License, or (at your option) any later version. | |
9 | ||
10 | The GNU C Library is distributed in the hope that it will be useful, | |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
13 | Lesser General Public License for more details. | |
14 | ||
15 | You should have received a copy of the GNU Lesser General Public | |
16 | License along with the GNU C Library; if not, see | |
17 | <http://www.gnu.org/licenses/>. */ | |
18 | ||
19 | #include <sysdep.h> | |
20 | ||
21 | #ifdef USE_AS_STPCPY /* Building the stpcpy variant of this file. */ | |
dbcc7d08 WSM |
22 | # ifndef STPCPY |
23 | # define FUNC_NAME __stpcpy /* Default symbol when STPCPY is not predefined. */ | |
24 | # else | |
25 | # define FUNC_NAME STPCPY /* Use the externally supplied symbol name. */ | |
26 | # endif | |
96d6fd6c | 27 | #else |
dbcc7d08 WSM |
28 | # ifndef STRCPY |
29 | # define FUNC_NAME strcpy /* Default symbol when STRCPY is not predefined. */ | |
30 | # else | |
31 | # define FUNC_NAME STRCPY /* Use the externally supplied symbol name. */ | |
32 | # endif | |
33 | #endif /* !USE_AS_STPCPY */ | |
96d6fd6c AZ |
34 | |
35 | /* Implements the function | |
36 | ||
37 | char * [r3] strcpy (char *dest [r3], const char *src [r4]) | |
38 | ||
39 | or | |
40 | ||
41 | char * [r3] stpcpy (char *dest [r3], const char *src [r4]) | |
42 | ||
43 | if USE_AS_STPCPY is defined. | |
44 | ||
45 | The implementation uses unaligned doubleword access to avoid specialized | |
46 | code paths depending on data alignment. Although recent powerpc64 uses | |
47 | 64K pages by default, the page cross handling assumes a minimum page size of | |
48 | 4K. */ | |
49 | ||
1e36806f | 50 | .machine power8 |
d5b41185 | 51 | ENTRY_TOCLESS (FUNC_NAME, 4) /* In: r3 = dest, r4 = src. r3 is only rewritten when USE_AS_STPCPY is defined. */ |
96d6fd6c AZ |
52 | li r0,0 /* r0 = 0: a doubleword of null chars, used as the
53 | cmpb comparand in every null check below. */ | |
54 | ||
55 | /* Check if the [src]+15 will cross a 4K page by checking if the bit | |
56 | indicating the page size changes. Basically: | |
57 | ||
58 | uint64_t srcin = (uint64_t)src; | |
59 | uint64_t ob = srcin & 4096UL; | |
60 | uint64_t nb = (srcin+15UL) & 4096UL; | |
61 | if (ob ^ nb) | |
62 | goto pagecross; */ | |
63 | ||
64 | addi r9,r4,15 /* r9 = src + 15. */ | |
65 | xor r9,r9,r4 /* Bits that differ between src and src + 15. */ | |
66 | rlwinm. r9,r9,0,19,19 /* Keep only the 4096 bit; the record form sets cr0. */ | |
67 | bne L(pagecross) /* The bit changed: avoid the unaligned 16-byte read. */ | |
68 | ||
69 | /* For short strings (less than 16 bytes), just find the length the way | |
70 | strlen would and do a memcpy-style copy once the null is found. */ | |
71 | mr r7,r4 /* r7 = read cursor, starting at src. */ | |
72 | ld r12,0(r7) /* Load doubleword from memory. */ | |
73 | cmpb r10,r12,r0 /* Check for null bytes in DWORD1. */ | |
74 | cmpdi cr7,r10,0 /* If r10 == 0, no null's have been found. */ | |
75 | bne cr7,L(done) | |
76 | ||
77 | ldu r8,8(r7) /* Load DWORD2 and advance r7 by 8. */ | |
78 | cmpb r10,r8,r0 /* Check for null bytes in DWORD2. */ | |
79 | cmpdi cr7,r10,0 | |
80 | bne cr7,L(done) | |
81 | ||
82 | b L(loop_before) /* No null in the first 16 bytes; r12 and r8 hold them. */ | |
83 | ||
84 | .align 4 | |
85 | L(pagecross): /* Reached when src and src + 15 differ in the 4096 bit. */ | |
86 | clrrdi r7,r4,3 /* Align the address to doubleword boundary. */ | |
87 | rlwinm r6,r4,3,26,28 /* Calculate padding: r6 = (src & 7) * 8 bits. */ | |
88 | li r5,-1 /* MASK = 0xffffffffffffffff. */ | |
89 | ld r12,0(r7) /* Load doubleword from memory. */ | |
90 | #ifdef __LITTLE_ENDIAN__ | |
91 | sld r5,r5,r6 /* MASK = MASK << padding. */ | |
92 | #else | |
93 | srd r5,r5,r6 /* MASK = MASK >> padding. */ | |
94 | #endif | |
95 | orc r9,r12,r5 /* r9 = r12 | ~MASK: bytes that are not part of the string become 0xff so they cannot match. */ | |
96 | cmpb r10,r9,r0 /* Check for null bytes in DWORD1. */ | |
97 | cmpdi cr7,r10,0 /* If r10 == 0, no null's have been found. */ | |
98 | bne cr7,L(done) | |
99 | ||
100 | ldu r6,8(r7) /* Second aligned doubleword; r7 advances by 8. */ | |
101 | cmpb r10,r6,r0 | |
102 | cmpdi cr7,r10,0 | |
103 | bne cr7,L(done) | |
104 | ||
105 | ld r12,0(r7) /* Reads the same address the ldu above just loaded. */ | |
106 | cmpb r10,r12,r0 | |
107 | cmpdi cr7,r10,0 | |
108 | bne cr7,L(done) | |
109 | ||
110 | ldu r6,8(r7) /* Third aligned doubleword; r7 advances by 8. */ | |
111 | cmpb r10,r6,r0 | |
112 | cmpdi cr7,r10,0 | |
113 | bne cr7,L(done) | |
114 | ||
115 | /* We checked for 24 - x bytes, with x being the source alignment | |
116 | (0 <= x <= 16), and no zero has been found. Start the loop | |
117 | copy with doubleword aligned address. */ | |
118 | mr r7,r4 /* Restart the read cursor at the original src. */ | |
119 | ld r12, 0(r7) /* First 8 bytes of the string. */ | |
120 | ldu r8, 8(r7) /* Next 8 bytes; r7 = src + 8. */ | |
121 | ||
122 | L(loop_before): /* In: r12, r8 = first 16 source bytes (no null); r7 = src + 8. */ | |
1e36806f | 123 | /* Save the two doublewords read from source and align the source
96d6fd6c AZ |
124 | to 16 bytes for the loop. */ |
125 | mr r11,r3 /* r11 = write cursor, starting at dest. */ | |
126 | std r12,0(r11) | |
127 | std r8,8(r11) | |
128 | addi r11,r11,16 | |
129 | rldicl r9,r4,0,60 /* r9 = src & 15. */ | |
130 | subf r7,r9,r7 /* r7 = r7 - r9. */ | |
131 | subf r11,r9,r11 /* r11 = r11 - r9. */ | |
1e36806f RS |
132 | /* Source is adjusted to 16B alignment and destination r11 is
133 | also moved based on that adjustment. Now check if r11 is | |
134 | also 16B aligned to move to vectorized loop. */ | |
135 | andi. r6, r11, 0xF | |
136 | bne L(loop_start) /* Destination not 16B aligned: use the doubleword loop. */ | |
137 | ||
138 | /* Prepare for the loop. */ | |
139 | subf r4, r9, r4 /* Adjust r4 based on alignment. */ | |
140 | li r7, 16 /* Load required offsets. */ | |
141 | li r8, 32 | |
142 | li r9, 48 | |
143 | vspltisb v0, 0 /* v0 = all-zero vector for the null compares. */ | |
144 | addi r4, r4, 16 /* r4 = next 16B-aligned source block to examine. */ | |
145 | /* Are we 64-byte aligned? If so, jump to the vectorized loop. | |
146 | Else copy 16B till r4 is 64B aligned. */ | |
147 | andi. r6, r4, 63 | |
148 | beq L(qw_loop) | |
149 | ||
150 | lvx v6, 0, r4 /* Load 16 bytes from memory. */ | |
151 | vcmpequb. v5, v0, v6 /* Check for null. */ | |
152 | bne cr6, L(qw_done) /* Null in v6: finish with r4/r11 at this quadword. */ | |
153 | stvx v6, 0, r11 /* Store 16 bytes. */ | |
154 | addi r4, r4, 16 /* Increment the address. */ | |
155 | addi r11, r11, 16 | |
156 | andi. r6, r4, 63 | |
157 | beq L(qw_loop) | |
158 | ||
159 | lvx v6, 0, r4 /* Second 16-byte alignment step. */ | |
160 | vcmpequb. v5, v0, v6 | |
161 | bne cr6, L(qw_done) | |
162 | stvx v6, 0, r11 | |
163 | addi r4, r4, 16 | |
164 | addi r11, r11, 16 | |
165 | andi. r6, r4, 63 | |
166 | beq L(qw_loop) | |
167 | ||
168 | lvx v6, 0, r4 /* Third 16-byte alignment step; falls through to the 64-byte loop. */ | |
169 | vcmpequb. v5, v0, v6 | |
170 | bne cr6, L(qw_done) | |
171 | stvx v6, 0, r11 | |
172 | addi r4, r4, 16 | |
173 | addi r11, r11, 16 | |
174 | ||
175 | .align 4 | |
176 | L(qw_loop): /* 64 bytes per step, unrolled three times. Uses r0 = 0, r7 = 16, r8 = 32, r9 = 48 as offsets. */ | |
177 | lvx v1, r4, r0 /* Load 4 quadwords. */ | |
178 | lvx v2, r4, r7 | |
179 | lvx v3, r4, r8 | |
180 | lvx v4, r4, r9 | |
181 | vminub v5, v1, v2 /* Byte-wise unsigned minimum: a zero byte in any input survives the merge. */ | |
182 | vminub v8, v3, v4 | |
183 | vminub v7, v5, v8 | |
184 | vcmpequb. v7, v7, v0 /* Check for NULLs. */ | |
185 | bne cr6, L(qw_loop_done) /* Null somewhere in these 64 bytes; none of them stored yet. */ | |
186 | stvx v1, r11, r0 /* Store 4 quadwords. */ | |
187 | stvx v2, r11, r7 | |
188 | stvx v3, r11, r8 | |
189 | stvx v4, r11, r9 | |
190 | addi r4, r4, 64 /* Advance the source cursor. */ | |
191 | addi r11, r11, 64 /* Advance the destination cursor. */ | |
192 | ||
193 | lvx v1, r4, r0 /* Load 4 quadwords (second unrolled copy). */ | |
194 | lvx v2, r4, r7 | |
195 | lvx v3, r4, r8 | |
196 | lvx v4, r4, r9 | |
197 | vminub v5, v1, v2 /* Compare and merge into one VR for speed. */ | |
198 | vminub v8, v3, v4 | |
199 | vminub v7, v5, v8 | |
200 | vcmpequb. v7, v7, v0 /* Check for NULLs. */ | |
201 | bne cr6, L(qw_loop_done) | |
202 | stvx v1, r11, r0 /* Store 4 quadwords. */ | |
203 | stvx v2, r11, r7 | |
204 | stvx v3, r11, r8 | |
205 | stvx v4, r11, r9 | |
206 | addi r4, r4, 64 /* Advance the source cursor. */ | |
207 | addi r11, r11, 64 /* Advance the destination cursor. */ | |
208 | ||
209 | lvx v1, r4, r0 /* Load 4 quadwords (third unrolled copy). */ | |
210 | lvx v2, r4, r7 | |
211 | lvx v3, r4, r8 | |
212 | lvx v4, r4, r9 | |
213 | vminub v5, v1, v2 /* Compare and merge into one VR for speed. */ | |
214 | vminub v8, v3, v4 | |
215 | vminub v7, v5, v8 | |
216 | vcmpequb. v7, v7, v0 /* Check for NULLs. */ | |
217 | bne cr6, L(qw_loop_done) | |
218 | stvx v1, r11, r0 /* Store 4 quadwords. */ | |
219 | stvx v2, r11, r7 | |
220 | stvx v3, r11, r8 | |
221 | stvx v4, r11, r9 | |
222 | addi r4, r4, 64 /* Advance the source cursor. */ | |
223 | addi r11, r11, 64 /* Advance the destination cursor. */ | |
224 | b L(qw_loop) | |
225 | ||
226 | .align 4 | |
227 | L(qw_loop_done): /* In: v1-v4 = the four quadwords at r4; r11 = matching destination. */ | |
228 | /* Null found in one of the 4 loads. */ | |
229 | vcmpequb. v7, v1, v0 | |
230 | vor v6, v1, v1 /* v6 = v1: L(qw_done) expects the candidate quadword in v6. */ | |
231 | bne cr6, L(qw_done) | |
232 | /* Not in the first 16B, so store it. */ | |
233 | stvx v1, r11, r0 | |
234 | addi r4, r4, 16 | |
235 | addi r11, r11, 16 | |
236 | vcmpequb. v7, v2, v0 | |
237 | vor v6, v2, v2 /* v6 = v2. */ | |
238 | bne cr6, L(qw_done) | |
239 | /* Not in the second 16B, so store it. */ | |
240 | stvx v2, r11, r0 | |
241 | addi r4, r4, 16 | |
242 | addi r11, r11, 16 | |
243 | vcmpequb. v7, v3, v0 | |
244 | vor v6, v3, v3 /* v6 = v3. */ | |
245 | bne cr6, L(qw_done) | |
246 | /* Not in the third 16B, so store it. */ | |
247 | stvx v6, r11, r0 /* v6 still holds the copy of v3 made above. */ | |
248 | addi r4, r4, 16 | |
249 | addi r11, r11, 16 | |
250 | vor v6, v4, v4 /* Only the fourth quadword is left; fall through with v6 = v4. */ | |
251 | ||
252 | .align 4 | |
253 | L(qw_done): /* In: v6 = quadword being examined; r4 = its source address; r11 = matching destination. */ | |
254 | mr r7, r4 /* r7 = address of the doubleword under test. */ | |
255 | /* Move the first 8 bytes (in memory order) of the quadword to a GPR. */ | |
256 | #ifdef __LITTLE_ENDIAN__ | |
257 | vsldoi v4, v6, v0, 8 /* Shift v6 left by 8 bytes, filling with zeros from v0. */ | |
258 | mfvrd r12, v4 | |
259 | #else | |
260 | mfvrd r12, v6 | |
261 | #endif | |
262 | /* Check for null in the first 8 bytes. */ | |
263 | cmpb r10, r12, r0 | |
264 | cmpdi cr6, r10, 0 | |
265 | bne cr6, L(done2) | |
266 | /* Otherwise look at the second doubleword. */ | |
267 | #ifdef __LITTLE_ENDIAN__ | |
268 | mfvrd r6, v6 | |
269 | #else | |
270 | vsldoi v6, v6, v0, 8 /* Shift v6 left by 8 bytes, filling with zeros from v0. */ | |
271 | mfvrd r6, v6 | |
272 | #endif | |
273 | cmpb r10, r6, r0 /* r10 = null-byte mask for the second doubleword. */ | |
274 | addi r7, r7, 8 /* r7 now addresses the second doubleword. */ | |
275 | b L(done2) | |
96d6fd6c AZ |
276 | |
277 | .align 5 | |
278 | L(loop): /* Store the pair of doublewords that was just found to be null-free. */ | |
279 | std r12, 0(r11) | |
280 | std r6, 8(r11) | |
281 | addi r11,r11,16 | |
282 | L(loop_start): | |
283 | /* Load two doublewords, compare and merge in a | |
284 | single register for speed. This is an attempt | |
285 | to speed up the null-checking process for bigger strings. */ | |
286 | ||
287 | ld r12, 8(r7) /* First doubleword of the pair. */ | |
288 | ldu r6, 16(r7) /* Second doubleword; r7 advances by 16 and now points at it. */ | |
289 | cmpb r10,r12,r0 /* r10 = null-byte mask of the first doubleword. */ | |
290 | cmpb r9,r6,r0 /* r9 = null-byte mask of the second doubleword. */ | |
291 | or r8,r9,r10 /* Merge everything in one doubleword. */ | |
292 | cmpdi cr7,r8,0 | |
293 | beq cr7,L(loop) /* Neither doubleword has a null: store them and continue. */ | |
294 | ||
295 | ||
296 | /* OK, one (or both) of the doublewords contains a null byte. Check | |
297 | the first doubleword and decrement the address in case the first | |
298 | doubleword really contains a null byte. */ | |
299 | ||
300 | addi r4,r7,-8 /* r4 = address of the first doubleword: source for the final copy. */ | |
301 | cmpdi cr6,r10,0 | |
302 | addi r7,r7,-8 /* r7 = address of the first doubleword as well. */ | |
303 | bne cr6,L(done2) | |
304 | ||
305 | /* The null byte must be in the second doubleword. Adjust the address | |
306 | again and move the result of cmpb to r10 so we can calculate the | |
307 | length. */ | |
308 | ||
309 | mr r10,r9 | |
310 | addi r7,r7,8 | |
311 | b L(done2) | |
312 | ||
313 | /* r10 has the output of the cmpb instruction, that is, it contains | |
314 | 0xff in the same position as the null byte in the original | |
315 | doubleword from the string. Use that to calculate the length. */ | |
316 | L(done): /* Entered before anything was stored: the copy starts at dest. */ | |
317 | mr r11,r3 | |
318 | L(done2): /* In: r4 = source of the final copy; r11 = its destination; r7 = address of the doubleword holding the null. */ | |
319 | #ifdef __LITTLE_ENDIAN__ | |
320 | addi r9, r10, -1 /* Form a mask from trailing zeros. */ | |
321 | andc r9, r9, r10 /* r9 = (r10 - 1) & ~r10. */ | |
322 | popcntd r6, r9 /* Count the bits in the mask. */ | |
323 | #else | |
324 | cntlzd r6,r10 /* Count leading zeros before the match. */ | |
325 | #endif | |
326 | subf r5,r4,r7 /* r5 = r7 - r4: byte offset of that doubleword from r4. */ | |
327 | srdi r6,r6,3 /* Convert leading/trailing zeros to bytes. */ | |
328 | add r8,r5,r6 /* Compute final length: bytes from r4 up to the null. */ | |
329 | #ifdef USE_AS_STPCPY | |
330 | /* stpcpy returns the dest address plus the size not counting the | |
331 | final '\0'. */ | |
332 | add r3,r11,r8 /* r3 = address where the terminating null will be stored. */ | |
333 | #endif | |
334 | addi r8,r8,1 /* Final '\0'. */ | |
335 | ||
336 | cmpldi cr6,r8,8 | |
337 | mtocrf 0x01,r8 /* Low 4 bits of r8 go to cr7 (CR bits 28-31) for the bf tests in the copy code. */ | |
338 | ble cr6,L(copy_LE_8) | |
339 | ||
340 | cmpldi cr1,r8,16 | |
341 | blt cr1,8f /* More than 8 but fewer than 16 bytes: skip the 16-byte copy. */ | |
342 | ||
343 | /* Handle copies of 0~31 bytes. */ | |
344 | .align 4 | |
345 | L(copy_LT_32): | |
346 | /* Fewer than 16 bytes left (cr1): skip the 16-byte copy. */ | |
347 | blt cr1,8f | |
348 | ||
349 | /* Copy 16 bytes. */ | |
350 | ld r6,0(r4) | |
351 | ld r8,8(r4) | |
352 | addi r4,r4,16 | |
353 | std r6,0(r11) | |
354 | std r8,8(r11) | |
355 | addi r11,r11,16 | |
356 | 8: /* Copy 8 bytes if CR bit 28 (the 8s bit of the count) is set. */ | |
357 | bf 28,L(tail4) | |
358 | ld r6,0(r4) | |
359 | addi r4,r4,8 | |
360 | std r6,0(r11) | |
361 | addi r11,r11,8 | |
362 | ||
363 | .align 4 | |
364 | /* Copies 4~7 bytes. */ | |
365 | L(tail4): | |
366 | bf 29,L(tail2) /* 4s bit clear: at most 3 bytes remain. */ | |
367 | lwz r6,0(r4) | |
368 | stw r6,0(r11) | |
369 | bf 30,L(tail5) /* 2s bit clear: at most one more byte, at offset 4. */ | |
370 | lhz r7,4(r4) | |
371 | sth r7,4(r11) | |
372 | bflr 31 /* Return unless the 1s bit is set. */ | |
373 | lbz r8,6(r4) | |
374 | stb r8,6(r11) | |
375 | blr | |
376 | ||
377 | .align 4 | |
378 | /* Copies 2~3 bytes. */ | |
379 | L(tail2): | |
380 | bf 30,1f /* 2s bit clear: at most one byte remains. */ | |
381 | lhz r6,0(r4) | |
382 | sth r6,0(r11) | |
383 | bflr 31 /* Return unless the 1s bit is set. */ | |
384 | lbz r7,2(r4) | |
385 | stb r7,2(r11) | |
386 | blr | |
387 | ||
388 | .align 4 | |
389 | L(tail5): /* 4 bytes already copied; copy a fifth if the 1s bit is set. */ | |
390 | bf 31,1f /* 1s bit clear: branch to 1:, whose bflr 31 then returns. */ | |
391 | lbz r6,4(r4) | |
392 | stb r6,4(r11) | |
393 | blr | |
394 | ||
395 | .align 4 | |
396 | 1: /* Copies 0~1 bytes. */ | |
397 | bflr 31 /* Return unless the 1s bit is set. */ | |
398 | lbz r6,0(r4) | |
399 | stb r6,0(r11) | |
400 | blr | |
401 | ||
402 | /* Handles copies of 0~8 bytes. cr6 holds the comparison of r8 with 8. */ | |
403 | .align 4 | |
404 | L(copy_LE_8): | |
405 | bne cr6,L(tail4) /* Not exactly 8 bytes: let the tail code copy by bits. */ | |
406 | ld r6,0(r4) /* Exactly 8 bytes: a single doubleword copy. */ | |
407 | std r6,0(r11) | |
408 | blr | |
409 | END (FUNC_NAME) | |
410 | ||
411 | #ifndef USE_AS_STPCPY /* Only the strcpy build emits the definition below. */ | |
412 | libc_hidden_builtin_def (strcpy) | |
413 | #endif |