]> git.ipfire.org Git - thirdparty/glibc.git/blob - sysdeps/powerpc/powerpc64/power7/strncpy.S
51860df8c54216299b26ec50e5a095901a6a5130
[thirdparty/glibc.git] / sysdeps / powerpc / powerpc64 / power7 / strncpy.S
1 /* Copyright (C) 2014 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3
4 The GNU C Library is free software; you can redistribute it and/or
5 modify it under the terms of the GNU Lesser General Public
6 License as published by the Free Software Foundation; either
7 version 2.1 of the License, or (at your option) any later version.
8
9 The GNU C Library is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 Lesser General Public License for more details.
13
14 You should have received a copy of the GNU Lesser General Public
15 License along with the GNU C Library; if not, see
16 <http://www.gnu.org/licenses/>. */
17
18 #include <sysdep.h>
19
/* Implements the functions

   char * [r3] strncpy (char *dst [r3], const char *src [r4], size_t n [r5])

   AND

   char * [r3] stpncpy (char *dst [r3], const char *src [r4], size_t n [r5])

   The algorithm is as follows:
   > if src and dst are both 8-byte aligned, copy doubleword by doubleword
   else
   > copy byte by byte.

   In the aligned case, null bytes are detected with the cmpb instruction.  */
34
/* The performance optimizations focus on the following:
   1. Data alignment [gain from aligned memory access on read/write].
   2. Loop unrolling, which POWER7 benefits from
      [gain by reduction of branch penalty].
   3. The final padding with null bytes is done by calling an optimized
      memset.  */
41
/* Name of the entry point assembled from this file: strncpy by default,
   __stpncpy when USE_AS_STPNCPY is defined.  */
#ifndef USE_AS_STPNCPY
# define FUNC_NAME strncpy
#else
# define FUNC_NAME __stpncpy
#endif

/* Stack frame size: the minimum frame plus 32 bytes.  */
#define FRAMESIZE (FRAME_MIN_SIZE + 32)

/* Symbol called to zero-fill the tail of the destination.  It may be
   predefined before this point.  Otherwise, for builds with no IFUNC
   support, local calls in a shared build should go to the internal GLIBC
   symbol (created by libc_hidden_builtin_def).  */
#if !defined MEMSET
# if defined SHARED
#  define MEMSET __GI_memset
# else
#  define MEMSET memset
# endif
#endif
59
/* FUNC_NAME (dst [r3], src [r4], n [r5]) -- strncpy, or __stpncpy when
   USE_AS_STPNCPY is defined.

   Copies at most n bytes from src to dst.  If a null byte is copied before
   n bytes have been written, the remainder of dst (up to n bytes in total)
   is zero-filled by calling MEMSET.

   Returns in r3:
     strncpy    the original dst;
     __stpncpy  the address of the null byte written to dst, or dst + n if
		no null byte was written.

   Strategy:
     - If dst and src are both 8-byte aligned, copy doubleword by
       doubleword, using cmpb against zero to detect a null byte.  When at
       least 4 doublewords fit in n, the copy loop is unrolled 4 times.
     - The doubleword containing the null byte, the tail shorter than
       8 bytes, and every call with an unaligned pointer are handled by the
       byte loops.

   Register usage:
     r3       dst on entry, return value on exit; scratch in between
     r4       src cursor
     r5       bytes still to be written
     r9       dst cursor
     r11      doublewords still to be copied; r0 holds its updated value
     r6, r7   src/dst at the start of the current unrolled iteration
     r8, r10  loaded data and cmpb results
     r12      constant zero for cmpb in the unrolled loop
     r18      original dst (nonvolatile; caller's value saved in the frame)
     r19      in the unrolled loop, the length left after the current
	      iteration (r5 - 32); in the byte loops, the dst cursor and
	      later the start of the zero fill (nonvolatile; caller's value
	      saved in the frame)
     CTR      loop counter  */

	.machine power7
EALIGN(FUNC_NAME, 4, 0)
	CALL_MCOUNT 3

	mflr	r0			/* r0 = caller's return address.  */
	or	r10, r3, r4		/* Combine dst and src address bits.  */
	rldicl.	r8, r10, 0, 61		/* r8 = low 3 bits of both; zero (cr0
					   eq) iff both are 8-byte aligned.  */

	/* Save the nonvolatile registers we use and LR relative to the
	   caller's stack pointer, then allocate the frame.  The CFI notes
	   let unwinders find them while MEMSET is running.  */
	std	r19, -8(r1)
	cfi_offset(r19, -8)
	std	r18, -16(r1)
	cfi_offset(r18, -16)
	std	r0, 16(r1)
	cfi_offset(lr, 16)
	stdu	r1, -FRAMESIZE(r1)
	cfi_adjust_cfa_offset(FRAMESIZE)

	mr	r9, r3			/* r9 = dst cursor.  */
	mr	r18, r3			/* r18 = dst, for strncpy's return.  */
	bne	0, L(byte_by_byte)	/* cr0 from rldicl.: not aligned.  */

	/* Aligned path.  r8 is zero here, so it serves as the cmpb mask.  */
	srdi	r11, r5, 3		/* r11 = n / 8 doublewords.  */
	cmpldi	cr7, r11, 3
	ble	7, L(update1)		/* Fewer than 4: skip the unrolling.  */

	ld	r10, 0(r4)		/* First doubleword of src.  */
	cmpb	r8, r10, r8		/* r8 != 0 iff it has a zero byte.  */
	cmpdi	cr7, r8, 0
	bne	cr7, L(update3)

	std	r10, 0(r3)		/* Copy doubleword at offset 0.  */
	ld	r10, 8(r4)		/* Load doubleword at offset 8.  */
	cmpb	r8, r10, r8		/* r8 is still zero on this path.  */
	cmpdi	cr7, r8, 0
	bne	7,L(HopBy8)

	/* CTR = (n/8 - 4) / 4 + 1 passes through the unrolled loop, each
	   pass handling 4 doublewords.  */
	addi	r8, r11, -4
	mr	r7, r3			/* r7 = dst at start of iteration.  */
	srdi	r8, r8, 2
	mr	r6, r4			/* r6 = src at start of iteration.  */
	addi	r8, r8, 1
	li	r12, 0			/* Zero mask for cmpb.  */
	mtctr	r8
	b	L(dwordCopy)

	.p2align 4
L(dWordUnroll):
	std	r8, 16(r9)		/* Copy doubleword at offset 16.  */
	ld	r8, 24(r4)		/* Load doubleword at offset 24.  */
	cmpb	r10, r8, r10		/* r10 is zero here (beq was taken).  */
	cmpdi	cr7, r10, 0
	bne	cr7, L(HopBy24)

	std	r8, 24(r7)		/* Copy doubleword at offset 24.  */
	addi	r9, r9, 32		/* Four doublewords done: advance.  */
	addi	r4, r4, 32
	bdz	L(leftDwords)		/* No full group of four is left.  */

	ld	r3, 32(r6)		/* First doubleword of the next group
					   (same address as 0(r4)).  */
	cmpb	r8, r3, r10		/* r10 is still zero.  */
	cmpdi	cr7, r8, 0
	bne	cr7, L(update2)

	std	r3, 32(r7)
	ld	r10, 40(r6)		/* Second doubleword of next group.  */
	cmpb	r8, r10, r8		/* r8 is zero here.  */
	cmpdi	cr7, r8, 0
	bne	cr7, L(HopBy40)

	mr	r6, r4			/* Commit the iteration: new bases,  */
	mr	r7, r9
	mr	r11, r0			/* doubleword count (r11 - 4),  */
	mr	r5, r19			/* and byte count (r5 - 32).  */

L(dwordCopy):
	std	r10, 8(r9)		/* Copy doubleword at offset 8.  */
	addi	r19, r5, -32		/* Length after this iteration.  */
	addi	r0, r11, -4		/* Doublewords after this iteration.  */
	ld	r8, 16(r4)		/* Load doubleword at offset 16.  */
	cmpb	r10, r8, r12
	cmpdi	cr7, r10, 0
	beq	cr7, L(dWordUnroll)

	/* Zero byte in the doubleword at offset 16: two doublewords of this
	   iteration have been copied.  */
	addi	r9, r9, 16		/* Advance dst by 16.  */
	addi	r4, r4, 16		/* Advance src by 16.  */
	addi	r5, r5, -16		/* Decrease length by 16.  */
	addi	r0, r11, -2		/* Decrease doubleword count by 2.  */

	/* Non-unrolled doubleword copy of r0 doublewords starting at r4/r9.
	   Leaves for the byte loop as soon as a doubleword with a zero byte
	   is loaded.  */
L(dWordUnrollOFF):
	ld	r10, 0(r4)		/* Load first doubleword.  */
	li	r8, 0			/* Zero mask.  */
	cmpb	r8, r10, r8
	cmpdi	cr7, r8, 0
	bne	cr7, L(byte_by_byte)
	mtctr	r0
	li	r7, 0			/* Zero mask for the loop below.  */
	b	L(CopyDword)

	.p2align 4
L(loadDWordandCompare):
	ld	r10, 0(r4)
	cmpb	r8, r10, r7
	cmpdi	cr7, r8, 0
	bne	cr7, L(byte_by_byte)

L(CopyDword):
	addi	r9, r9, 8
	std	r10, -8(r9)
	addi	r4, r4, 8
	addi	r5, r5, -8
	bdnz	L(loadDWordandCompare)

	/* Byte loop, unrolled 4 times, over r5 / 4 groups of 4 bytes.  r5 is
	   not updated inside the loop: on exit only r5 & 3 is used
	   (L(verifyByte)), and when a null byte is found the remaining
	   length is derived from how far r19 has moved from r9.  */
L(byte_by_byte):
	cmpldi	cr7, r5, 3
	ble	cr7, L(verifyByte)	/* Fewer than 4 bytes left.  */
	srdi	r10, r5, 2
	mr	r19, r9			/* r19 = dst cursor for the loop.  */
	mtctr	r10
	b	L(firstByteUnroll)

	.p2align 4
L(bytes_unroll):
	lbz	r10, 1(r4)		/* Second byte of the group.  */
	cmpdi	cr7, r10, 0
	stb	r10, 1(r19)
	beq	cr7, L(updtDestComputeN2ndByte)

	addi	r4, r4, 4		/* Advance src past the group.  */

	lbz	r10, -2(r4)		/* Third byte of the group.  */
	cmpdi	cr7, r10, 0
	stb	r10, 2(r19)
	beq	cr7, L(updtDestComputeN3rdByte)

	lbz	r10, -1(r4)		/* Fourth byte of the group.  */
	addi	r19, r19, 4		/* Advance dst past the group.  */
	cmpdi	cr7, r10, 0
	stb	r10, -1(r19)
	beq	cr7, L(ComputeNByte)

	bdz	L(update0)		/* All groups of four copied.  */

L(firstByteUnroll):
	lbz	r10, 0(r4)		/* First byte of the group.  */
	cmpdi	cr7, r10, 0
	stb	r10, 0(r19)
	bne	cr7, L(bytes_unroll)
	addi	r19, r19, 1		/* r19 = one past the null byte.  */

	/* r19 points one past the null byte just stored.  */
L(ComputeNByte):
	subf	r9, r19, r9		/* r9 = -(bytes stored by the loop).  */
	add	r8, r9, r5		/* r8 = bytes left to zero-fill.  */

	/* Zero-fill r8 bytes starting at r19.  */
L(zeroFill):
	cmpdi	cr7, r8, 0		/* Nothing left to fill?  */
	beq	cr7, L(update3return)

	mr	r3, r19			/* memset (r19, 0, r8).  */
	li	r4, 0
	mr	r5, r8
	bl	MEMSET
	nop				/* Slot following the call, as is
					   conventional where the linker may
					   route a call through a stub.  */

L(update3return):
#ifdef USE_AS_STPNCPY
	addi	r3, r19, -1		/* Address of the null byte in dst.  */
#endif

	/* Common epilogue.  For __stpncpy, r3 has been set by the code that
	   branches or falls through to here.  */
L(hop2return):
#ifndef USE_AS_STPNCPY
	mr	r3, r18			/* Return the original dst.  */
#endif
	cfi_remember_state
	addi	r1, r1, FRAMESIZE	/* Release the stack frame.  */
	cfi_adjust_cfa_offset(-FRAMESIZE)
	ld	r0, 16(r1)		/* Reload the saved return address.  */
	ld	r18, -16(r1)		/* Restore caller's r18.  */
	cfi_restore(r18)
	ld	r19, -8(r1)		/* Restore caller's r19.  */
	cfi_restore(r19)
	mtlr	r0			/* Put the return address in LR.  */
	cfi_restore(lr)
	blr				/* Return.  */
	cfi_restore_state		/* Code below runs with the frame.  */

	/* The unrolled byte loop finished without finding a null byte.  */
	.p2align 4
L(update0):
	mr	r9, r19			/* Hand the dst cursor back to r9.  */

	/* Copy the last r5 & 3 bytes, if any.  */
	.p2align 4
L(verifyByte):
	rldicl.	r8, r5, 0, 62		/* r8 = r5 & 3.  */
#ifdef USE_AS_STPNCPY
	mr	r3, r9			/* Return value if nothing is left.  */
#endif
	beq	cr0, L(hop2return)
	mtctr	r8
	addi	r4, r4, -1		/* Bias src for the lbzu below.  */
	mr	r19, r9
	b	L(oneBYone)

	.p2align 4
L(proceed):
	bdz	L(done)

L(oneBYone):
	lbzu	r10, 1(r4)		/* Load next byte, advancing src.  */
	addi	r19, r19, 1
	addi	r8, r8, -1		/* Bytes left after this one.  */
	cmpdi	cr7, r10, 0
	stb	r10, -1(r19)		/* Store it.  */
	bne	cr7, L(proceed)
	b	L(zeroFill)		/* Null byte: pad the r8 bytes left.  */

	/* All n bytes were copied and the last one was not a null byte.  */
	.p2align 4
L(done):
#ifdef USE_AS_STPNCPY
	mr	r3, r19			/* Return the end of the copy.  */
#endif
	b	L(hop2return)

	/* Fewer than 4 doublewords in total: use the non-unrolled copy.  */
L(update1):
	mr	r0, r11
	mr	r19, r5

	/* r0 doublewords remain, r19 is the remaining length.  */
	.p2align 4
L(leftDwords):
	cmpdi	cr7, r0, 0
	mr	r5, r19
	bne	cr7, L(dWordUnrollOFF)
	b	L(byte_by_byte)

	/* Null byte was the second byte of a group in the byte loop.  */
	.p2align 4
L(updtDestComputeN2ndByte):
	addi	r19, r19, 2		/* r19 = one past the null byte.  */
	subf	r9, r19, r9		/* r9 = -(bytes stored by the loop).  */
	add	r8, r9, r5		/* r8 = bytes left to zero-fill.  */
	b	L(zeroFill)

	/* Null byte was the third byte of a group in the byte loop.  */
	.p2align 4
L(updtDestComputeN3rdByte):
	addi	r19, r19, 3		/* r19 = one past the null byte.  */
	subf	r9, r19, r9		/* r9 = -(bytes stored by the loop).  */
	add	r8, r9, r5		/* r8 = bytes left to zero-fill.  */
	b	L(zeroFill)

	/* The labels below are reached from the unrolled loop when a
	   doubleword with a zero byte has been loaded.  Each accounts for
	   the doublewords already copied and continues at
	   L(dWordUnrollOFF), which reloads the doubleword at r4.  */

	/* Zero byte in the doubleword at offset 24: three were copied.  */
	.p2align 4
L(HopBy24):
	addi	r9, r9, 24		/* Advance dst by 24.  */
	addi	r4, r4, 24		/* Advance src by 24.  */
	addi	r5, r5, -24		/* Decrease length by 24.  */
	addi	r0, r11, -3		/* Decrease doubleword count by 3.  */
	b	L(dWordUnrollOFF)

	/* Zero byte in the first doubleword of the next group: r4 and r9
	   were already advanced by 32 and r0 already holds r11 - 4.  */
	.p2align 4
L(update2):
	mr	r5, r19			/* Length after the completed group.  */
	b	L(dWordUnrollOFF)

	/* Zero byte in the second doubleword of the next group: five
	   doublewords were copied since r6/r7 were set.  */
	.p2align 4
L(HopBy40):
	addi	r9, r7, 40		/* Advance dst by 40.  */
	addi	r4, r6, 40		/* Advance src by 40.  */
	addi	r5, r5, -40		/* Decrease length by 40.  */
	addi	r0, r11, -5		/* Decrease doubleword count by 5.  */
	b	L(dWordUnrollOFF)

	/* Zero byte in the very first doubleword: nothing copied yet.  */
L(update3):
	mr	r0, r11
	b	L(dWordUnrollOFF)

	/* Zero byte in the doubleword at offset 8: one was copied.  */
L(HopBy8):
	addi	r9, r3, 8		/* Advance dst by 8.  */
	addi	r4, r4, 8		/* Advance src by 8.  */
	addi	r5, r5, -8		/* Decrease length by 8.  */
	addi	r0, r11, -1		/* Decrease doubleword count by 1.  */
	b	L(dWordUnrollOFF)
END(FUNC_NAME)
#ifndef USE_AS_STPNCPY
libc_hidden_builtin_def (strncpy)
#endif