/* Optimized strncpy/stpncpy implementation for PowerPC64/POWER8.
   Copyright (C) 2015-2019 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
#include <sysdep.h>

#ifdef USE_AS_STPNCPY
# ifndef STPNCPY
#  define FUNC_NAME __stpncpy
# else
#  define FUNC_NAME STPNCPY
# endif
#else
# ifndef STRNCPY
#  define FUNC_NAME strncpy
# else
#  define FUNC_NAME STRNCPY
# endif
#endif  /* !USE_AS_STPNCPY  */

#ifndef MEMSET
/* For builds without IFUNC support, local calls should be made to internal
   GLIBC symbol (created by libc_hidden_builtin_def).  */
# ifdef SHARED
#  define MEMSET_is_local
#  define MEMSET   __GI_memset
# else
#  define MEMSET   memset
# endif
#endif

#define FRAMESIZE (FRAME_MIN_SIZE+48)

/* Implements the function

   char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])

   or

   char * [r3] stpncpy (char *dest [r3], const char *src [r4], size_t n [r5])

   if USE_AS_STPNCPY is defined.

   The implementation uses unaligned doubleword access to avoid specialized
   code paths depending on data alignment.  Although recent powerpc64 uses
   64K as default, the page cross handling assumes minimum page size of
   4k.  */

63 | .machine power8 | |
64 | #ifdef MEMSET_is_local | |
65 | ENTRY_TOCLESS (FUNC_NAME, 4) | |
66 | #else | |
67 | ENTRY (FUNC_NAME, 4) | |
68 | #endif | |
69 | CALL_MCOUNT 3 | |
70 | ||
71 | /* Check if the [src]+15 will cross a 4K page by checking if the bit | |
72 | indicating the page size changes. Basically: | |
73 | ||
74 | uint64_t srcin = (uint64_t)src; | |
75 | uint64_t ob = srcin & 4096UL; | |
76 | uint64_t nb = (srcin+15UL) & 4096UL; | |
77 | if (ob ^ nb) | |
78 | goto pagecross; */ | |
79 | ||
80 | addi r10,r4,16 | |
81 | rlwinm r9,r4,0,19,19 | |
82 | ||
83 | /* Save some non-volatile registers on the stack. */ | |
84 | std r26,-48(r1) | |
85 | std r27,-40(r1) | |
86 | ||
87 | rlwinm r8,r10,0,19,19 | |
88 | ||
89 | std r28,-32(r1) | |
90 | std r29,-24(r1) | |
91 | ||
92 | cmpld cr7,r9,r8 | |
93 | ||
94 | std r30,-16(r1) | |
95 | std r31,-8(r1) | |
96 | ||
97 | /* Update CFI. */ | |
98 | cfi_offset(r26, -48) | |
99 | cfi_offset(r27, -40) | |
100 | cfi_offset(r28, -32) | |
101 | cfi_offset(r29, -24) | |
102 | cfi_offset(r30, -16) | |
103 | cfi_offset(r31, -8) | |
104 | ||
105 | beq cr7,L(unaligned_lt_16) | |
106 | rldicl r9,r4,0,61 | |
107 | subfic r8,r9,8 | |
108 | cmpld cr7,r5,r8 | |
109 | bgt cr7,L(pagecross) | |
110 | ||
111 | /* At this points there is 1 to 15 bytes to check and write. Since it could | |
112 | be either from first unaligned 16 bytes access or from bulk copy, the code | |
113 | uses an unrolled byte read/write instead of trying to analyze the cmpb | |
114 | results. */ | |
115 | L(short_path): | |
116 | mr r9,r3 | |
117 | L(short_path_1): | |
118 | /* Return if there are no more bytes to be written. */ | |
119 | cmpdi cr7,r5,0 | |
120 | beq cr7,L(short_path_loop_end_1) | |
121 | L(short_path_2): | |
122 | /* Copy one char from src (r4) and write it to dest (r9). If it is the | |
123 | end-of-string, start the null padding. Continue, otherwise. */ | |
124 | lbz r10,0(r4) | |
125 | cmpdi cr7,r10,0 | |
126 | stb r10,0(r9) | |
127 | beq cr7,L(zero_pad_start_1) | |
128 | /* If there are no more bytes to be written, return. */ | |
129 | cmpdi cr0,r5,1 | |
130 | addi r8,r9,1 | |
131 | addi r6,r5,-1 | |
132 | beq cr0,L(short_path_loop_end_0) | |
133 | /* Copy another char from src (r4) to dest (r9). Check again if it is | |
134 | the end-of-string. If so, start the null padding. */ | |
135 | lbz r10,1(r4) | |
136 | cmpdi cr7,r10,0 | |
137 | stb r10,1(r9) | |
138 | beq cr7,L(zero_pad_start_prepare_1) | |
139 | /* Eagerly decrement r5 by 3, which is the number of bytes already | |
140 | written, plus one write that will be performed later on. */ | |
141 | addi r10,r5,-3 | |
142 | b L(short_path_loop_1) | |
143 | ||
144 | .align 4 | |
145 | L(short_path_loop): | |
146 | /* At this point, the induction variable, r5, as well as the pointers | |
147 | to dest and src (r9 and r4, respectivelly) have been updated. | |
148 | ||
149 | Note: The registers r7 and r10 are induction variables derived from | |
150 | r5. They are used to determine if the total number of writes has | |
151 | been reached at every other write. | |
152 | ||
153 | Copy one char from src (r4) and write it to dest (r9). If it is the | |
154 | end-of-string, start the null padding. Continue, otherwise. */ | |
155 | lbz r8,0(r4) | |
156 | addi r7,r10,-2 | |
157 | cmpdi cr5,r8,0 | |
158 | stb r8,0(r9) | |
159 | beq cr5,L(zero_pad_start_1) | |
160 | beq cr7,L(short_path_loop_end_0) | |
161 | /* Copy another char from src (r4) to dest (r9). Check again if it is | |
162 | the end-of-string. If so, start the null padding. */ | |
163 | lbz r8,1(r4) | |
164 | cmpdi cr7,r8,0 | |
165 | stb r8,1(r9) | |
166 | beq cr7,L(zero_pad_start) | |
167 | mr r10,r7 | |
168 | L(short_path_loop_1): | |
169 | /* This block is reached after two chars have been already written to | |
170 | dest. Nevertheless, r5 (the induction variable), r9 (the pointer to | |
171 | dest), and r4 (the pointer to src) have not yet been updated. | |
172 | ||
173 | At this point: | |
174 | r5 holds the count of bytes yet to be written plus 2. | |
175 | r9 points to the last two chars that were already written to dest. | |
176 | r4 points to the last two chars that were already copied from src. | |
177 | ||
178 | The algorithm continues by decrementing r5, the induction variable, | |
179 | so that it reflects the last two writes. The pointers to dest (r9) | |
180 | and to src (r4) are increment by two, for the same reason. | |
181 | ||
182 | Note: Register r10 is another induction variable, derived from r5, | |
183 | which determines if the total number of writes has been reached. */ | |
184 | addic. r5,r5,-2 | |
185 | addi r9,r9,2 | |
186 | cmpdi cr7,r10,0 /* Eagerly check if the next write is the last. */ | |
187 | addi r4,r4,2 | |
188 | addi r6,r9,1 | |
189 | bne cr0,L(short_path_loop) /* Check if the total number of writes | |
190 | has been reached at every other | |
191 | write. */ | |
192 | #ifdef USE_AS_STPNCPY | |
193 | mr r3,r9 | |
194 | b L(short_path_loop_end) | |
195 | #endif | |
196 | ||
197 | L(short_path_loop_end_0): | |
198 | #ifdef USE_AS_STPNCPY | |
199 | addi r3,r9,1 | |
200 | b L(short_path_loop_end) | |
201 | #endif | |
202 | L(short_path_loop_end_1): | |
203 | #ifdef USE_AS_STPNCPY | |
204 | mr r3,r9 | |
205 | #endif | |
206 | L(short_path_loop_end): | |
207 | /* Restore non-volatile registers. */ | |
208 | ld r26,-48(r1) | |
209 | ld r27,-40(r1) | |
210 | ld r28,-32(r1) | |
211 | ld r29,-24(r1) | |
212 | ld r30,-16(r1) | |
213 | ld r31,-8(r1) | |
214 | blr | |
215 | ||
216 | /* This code pads the remainder of dest with NULL bytes. The algorithm | |
217 | calculates the remaining size and calls memset. */ | |
218 | .align 4 | |
219 | L(zero_pad_start): | |
220 | mr r5,r10 | |
221 | mr r9,r6 | |
222 | L(zero_pad_start_1): | |
223 | /* At this point: | |
224 | - r5 holds the number of bytes that still have to be written to | |
225 | dest. | |
226 | - r9 points to the position, in dest, where the first null byte | |
227 | will be written. | |
228 | The above statements are true both when control reaches this label | |
229 | from a branch or when falling through the previous lines. */ | |
230 | #ifndef USE_AS_STPNCPY | |
231 | mr r30,r3 /* Save the return value of strncpy. */ | |
232 | #endif | |
233 | /* Prepare the call to memset. */ | |
234 | mr r3,r9 /* Pointer to the area to be zero-filled. */ | |
235 | li r4,0 /* Byte to be written (zero). */ | |
236 | ||
237 | /* We delayed the creation of the stack frame, as well as the saving of | |
238 | the link register, because only at this point, we are sure that | |
239 | doing so is actually needed. */ | |
240 | ||
241 | /* Save the link register. */ | |
242 | mflr r0 | |
243 | std r0,16(r1) | |
244 | ||
245 | /* Create the stack frame. */ | |
246 | stdu r1,-FRAMESIZE(r1) | |
247 | cfi_adjust_cfa_offset(FRAMESIZE) | |
248 | cfi_offset(lr, 16) | |
249 | ||
250 | bl MEMSET | |
251 | #ifndef MEMSET_is_local | |
252 | nop | |
253 | #endif | |
254 | ||
255 | ld r0,FRAMESIZE+16(r1) | |
256 | ||
257 | #ifndef USE_AS_STPNCPY | |
258 | mr r3,r30 /* Restore the return value of strncpy, i.e.: | |
259 | dest. For stpncpy, the return value is the | |
260 | same as return value of memset. */ | |
261 | #endif | |
262 | ||
263 | /* Restore non-volatile registers and return. */ | |
264 | ld r26,FRAMESIZE-48(r1) | |
265 | ld r27,FRAMESIZE-40(r1) | |
266 | ld r28,FRAMESIZE-32(r1) | |
267 | ld r29,FRAMESIZE-24(r1) | |
268 | ld r30,FRAMESIZE-16(r1) | |
269 | ld r31,FRAMESIZE-8(r1) | |
270 | /* Restore the stack frame. */ | |
271 | addi r1,r1,FRAMESIZE | |
272 | cfi_adjust_cfa_offset(-FRAMESIZE) | |
273 | /* Restore the link register. */ | |
274 | mtlr r0 | |
275 | cfi_restore(lr) | |
276 | blr | |
277 | ||
278 | /* The common case where [src]+16 will not cross a 4K page boundary. | |
279 | In this case the code fast check the first 16 bytes by using doubleword | |
280 | read/compares and update destiny if neither total size or null byte | |
281 | is found in destiny. */ | |
282 | .align 4 | |
283 | L(unaligned_lt_16): | |
284 | cmpldi cr7,r5,7 | |
285 | ble cr7,L(short_path) | |
286 | ld r7,0(r4) | |
287 | li r8,0 | |
288 | cmpb r8,r7,r8 | |
289 | cmpdi cr7,r8,0 | |
290 | bne cr7,L(short_path_prepare_2) | |
291 | addi r6,r5,-8 | |
292 | std r7,0(r3) | |
293 | addi r9,r3,8 | |
294 | cmpldi cr7,r6,7 | |
295 | addi r7,r4,8 | |
296 | ble cr7,L(short_path_prepare_1_1) | |
297 | ld r4,8(r4) | |
298 | cmpb r8,r4,r8 | |
299 | cmpdi cr7,r8,0 | |
300 | bne cr7,L(short_path_prepare_2_1) | |
301 | std r4,8(r3) | |
302 | addi r29,r3,16 | |
303 | addi r5,r5,-16 | |
304 | /* Neither the null byte was found or total length was reached, | |
305 | align to 16 bytes and issue a bulk copy/compare. */ | |
306 | b L(align_to_16b) | |
307 | ||
308 | /* In the case of 4k page boundary cross, the algorithm first align | |
309 | the address to a doubleword, calculate a mask based on alignment | |
310 | to ignore the bytes and continue using doubleword. */ | |
311 | .align 4 | |
312 | L(pagecross): | |
313 | rldicr r11,r4,0,59 /* Align the address to 8 bytes boundary. */ | |
314 | li r6,-1 /* MASK = 0xffffffffffffffffUL. */ | |
315 | sldi r9,r9,3 /* Calculate padding. */ | |
316 | ld r7,0(r11) /* Load doubleword from memory. */ | |
317 | #ifdef __LITTLE_ENDIAN__ | |
318 | sld r9,r6,r9 /* MASK = MASK << padding. */ | |
319 | #else | |
320 | srd r9,r6,r9 /* MASK = MASK >> padding. */ | |
321 | #endif | |
322 | orc r9,r7,r9 /* Mask bits that are not part of the | |
323 | string. */ | |
324 | li r7,0 | |
325 | cmpb r9,r9,r7 /* Check for null bytes in DWORD1. */ | |
326 | cmpdi cr7,r9,0 | |
327 | bne cr7,L(short_path_prepare_2) | |
328 | subf r8,r8,r5 /* Adjust total length. */ | |
329 | cmpldi cr7,r8,8 /* Check if length was reached. */ | |
330 | ble cr7,L(short_path_prepare_2) | |
331 | ||
332 | /* For next checks we have aligned address, so we check for more | |
333 | three doublewords to make sure we can read 16 unaligned bytes | |
334 | to start the bulk copy with 16 aligned addresses. */ | |
335 | ld r7,8(r11) | |
336 | cmpb r9,r7,r9 | |
337 | cmpdi cr7,r9,0 | |
338 | bne cr7,L(short_path_prepare_2) | |
339 | addi r7,r8,-8 | |
340 | cmpldi cr7,r7,8 | |
341 | ble cr7,L(short_path_prepare_2) | |
342 | ld r7,16(r11) | |
343 | cmpb r9,r7,r9 | |
344 | cmpdi cr7,r9,0 | |
345 | bne cr7,L(short_path_prepare_2) | |
346 | addi r8,r8,-16 | |
347 | cmpldi cr7,r8,8 | |
348 | ble cr7,L(short_path_prepare_2) | |
349 | ld r8,24(r11) | |
350 | cmpb r9,r8,r9 | |
351 | cmpdi cr7,r9,0 | |
352 | bne cr7,L(short_path_prepare_2) | |
353 | ||
354 | /* No null byte found in the 32 bytes readed and length not reached, | |
355 | read source again using unaligned loads and store them. */ | |
356 | ld r9,0(r4) | |
357 | addi r29,r3,16 | |
358 | addi r5,r5,-16 | |
359 | std r9,0(r3) | |
360 | ld r9,8(r4) | |
361 | std r9,8(r3) | |
362 | ||
363 | /* Align source to 16 bytes and adjust destiny and size. */ | |
364 | L(align_to_16b): | |
365 | rldicl r9,r10,0,60 | |
366 | rldicr r28,r10,0,59 | |
367 | add r12,r5,r9 | |
368 | subf r29,r9,r29 | |
369 | ||
370 | /* The bulk read/compare/copy loads two doublewords, compare and merge | |
371 | in a single register for speed. This is an attempt to speed up the | |
372 | null-checking process for bigger strings. */ | |
373 | ||
374 | cmpldi cr7,r12,15 | |
375 | ble cr7,L(short_path_prepare_1_2) | |
376 | ||
377 | /* Main loop for large sizes, unrolled 2 times to get better use of | |
378 | pipeline. */ | |
379 | ld r8,0(28) | |
380 | ld r10,8(28) | |
381 | li r9,0 | |
382 | cmpb r7,r8,r9 | |
383 | cmpb r9,r10,r9 | |
384 | or. r6,r9,r7 | |
385 | bne cr0,L(short_path_prepare_2_3) | |
386 | addi r5,r12,-16 | |
387 | addi r4,r28,16 | |
388 | std r8,0(r29) | |
389 | std r10,8(r29) | |
390 | cmpldi cr7,r5,15 | |
391 | addi r9,r29,16 | |
392 | ble cr7,L(short_path_1) | |
393 | mr r11,r28 | |
394 | mr r6,r29 | |
395 | li r30,0 | |
396 | subfic r26,r4,48 | |
397 | subfic r27,r9,48 | |
398 | ||
399 | b L(loop_16b) | |
400 | ||
401 | .align 4 | |
402 | L(loop_start): | |
403 | ld r31,0(r11) | |
404 | ld r10,8(r11) | |
405 | cmpb r0,r31,r7 | |
406 | cmpb r8,r10,r7 | |
407 | or. r7,r0,r8 | |
408 | addi r5,r5,-32 | |
409 | cmpldi cr7,r5,15 | |
410 | add r4,r4,r26 | |
411 | add r9,r9,r27 | |
412 | bne cr0,L(short_path_prepare_2_2) | |
413 | add r4,r28,r4 | |
414 | std r31,0(r6) | |
415 | add r9,r29,r9 | |
416 | std r10,8(r6) | |
417 | ble cr7,L(short_path_1) | |
418 | ||
419 | L(loop_16b): | |
420 | ld r10,16(r11) | |
421 | ld r0,24(r11) | |
422 | cmpb r8,r10,r30 | |
423 | cmpb r7,r0,r30 | |
424 | or. r7,r8,r7 | |
425 | addi r12,r12,-32 | |
426 | cmpldi cr7,r12,15 | |
427 | addi r11,r11,32 | |
428 | bne cr0,L(short_path_2) | |
429 | std r10,16(r6) | |
430 | addi r6,r6,32 | |
431 | std r0,-8(r6) | |
432 | bgt cr7,L(loop_start) | |
433 | ||
434 | mr r5,r12 | |
435 | mr r4,r11 | |
436 | mr r9,r6 | |
437 | b L(short_path_1) | |
438 | ||
439 | .align 4 | |
440 | L(short_path_prepare_1_1): | |
441 | mr r5,r6 | |
442 | mr r4,r7 | |
443 | b L(short_path_1) | |
444 | L(short_path_prepare_1_2): | |
445 | mr r5,r12 | |
446 | mr r4,r28 | |
447 | mr r9,r29 | |
448 | b L(short_path_1) | |
449 | L(short_path_prepare_2): | |
450 | mr r9,r3 | |
451 | b L(short_path_2) | |
452 | L(short_path_prepare_2_1): | |
453 | mr r5,r6 | |
454 | mr r4,r7 | |
455 | b L(short_path_2) | |
456 | L(short_path_prepare_2_2): | |
457 | mr r5,r12 | |
458 | mr r4,r11 | |
459 | mr r9,r6 | |
460 | b L(short_path_2) | |
461 | L(short_path_prepare_2_3): | |
462 | mr r5,r12 | |
463 | mr r4,r28 | |
464 | mr r9,r29 | |
465 | b L(short_path_2) | |
466 | L(zero_pad_start_prepare_1): | |
467 | mr r5,r6 | |
468 | mr r9,r8 | |
469 | b L(zero_pad_start_1) | |
470 | END (FUNC_NAME) | |

#ifndef USE_AS_STPNCPY
libc_hidden_builtin_def (strncpy)
#endif