/* A ThunderX2-optimized memcpy implementation for AArch64.
   Copyright (C) 2018-2019 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */
#include <sysdep.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */

#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
#define tmp2	x6
#define tmp3	x7
#define tmp3w	w7
#define A_l	x6
#define A_lw	w6
#define A_h	x7
#define A_hw	w7
#define B_l	x8
#define B_lw	w8
#define B_h	x9
#define C_l	x10
#define C_h	x11
#define D_l	x12
#define D_h	x13
#define E_l	src
#define E_h	count
#define F_l	srcend
#define F_h	dst
#define G_l	count
#define G_h	dst
#define tmp1	x14

#define A_q	q0
#define B_q	q1
#define C_q	q2
#define D_q	q3
#define E_q	q4
#define F_q	q5
#define G_q	q6
#define H_q	q7
#define I_q	q16
#define J_q	q17

#define A_v	v0
#define B_v	v1
#define C_v	v2
#define D_v	v3
#define E_v	v4
#define F_v	v5
#define G_v	v6
#define H_v	v7
#define I_v	v16
#define J_v	v17

#ifndef MEMMOVE
# define MEMMOVE memmove
#endif
#ifndef MEMCPY
# define MEMCPY memcpy
#endif

#if IS_IN (libc)

#undef MEMCPY
#undef MEMMOVE
#define MEMCPY __memcpy_thunderx2
#define MEMMOVE __memmove_thunderx2


/* Moves are split into 3 main cases: small copies of up to 16 bytes,
   medium copies of 17..96 bytes which are fully unrolled, and large
   copies of more than 96 bytes, which align the destination and use
   an unrolled loop processing 64 bytes per iteration.
   Overlapping large forward memmoves use a loop that copies backwards.  */
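
/* The dispatch below behaves roughly like this sketch (a simplification
   that ignores the exact flag usage; L(move_long) also returns
   immediately when src == dstin):

     if (count > 96 && (uint64_t) (dstin - src) < count)
       goto move_long;      // dst inside [src, src+count): copy backwards
     if (count <= 16)
       goto copy16;         // small copies
     if (count > 96)
       goto copy_long;      // align dst, 64 bytes per iteration
     // otherwise: 17..96 bytes, fully unrolled below
 */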

ENTRY_ALIGN (MEMMOVE, 6)

	DELOUSE (0)
	DELOUSE (1)
	DELOUSE (2)

	sub	tmp1, dstin, src
	cmp	count, 96
	ccmp	tmp1, count, 2, hi
	b.lo	L(move_long)

	prfm	PLDL1KEEP, [src]
	add	srcend, src, count
	add	dstend, dstin, count
	cmp	count, 16
	b.ls	L(copy16)
	cmp	count, 96
	b.hi	L(copy_long)

	/* Medium copies: 17..96 bytes.  */
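	/* Everything is loaded before anything is stored, and the 16-byte
	   copies overlap when count is not a multiple of 16: e.g. for
	   count == 40 they cover bytes 0..15, 16..31, 8..23 and 24..39.  */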
	sub	tmp1, count, 1
	ldp	A_l, A_h, [src]
	tbnz	tmp1, 6, L(copy96)	/* 65..96 bytes.  */
	ldp	D_l, D_h, [srcend, -16]
	tbz	tmp1, 5, 1f		/* 17..32 bytes.  */
	ldp	B_l, B_h, [src, 16]
	ldp	C_l, C_h, [srcend, -32]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstend, -32]
1:
	stp	A_l, A_h, [dstin]
	stp	D_l, D_h, [dstend, -16]
	ret

	.p2align 4
	/* Small copies: 0..16 bytes.  */
L(copy16):
	cmp	count, 8
	b.lo	1f
	ldr	A_l, [src]
	ldr	A_h, [srcend, -8]
	str	A_l, [dstin]
	str	A_h, [dstend, -8]
	ret
	.p2align 4
1:
	tbz	count, 2, 1f
	ldr	A_lw, [src]
	ldr	A_hw, [srcend, -4]
	str	A_lw, [dstin]
	str	A_hw, [dstend, -4]
	ret

	/* Copy 0..3 bytes.  Use a branchless sequence that copies the same
	   byte 3 times if count==1, or the 2nd byte twice if count==2.  */
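	/* For example, count == 3 gives tmp1 == 1: the loads fetch src[0],
	   src[2] and src[1], and the stores write dst[0], dst[1] and
	   dst[2].  */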
1:
	cbz	count, 2f
	lsr	tmp1, count, 1
	ldrb	A_lw, [src]
	ldrb	A_hw, [srcend, -1]
	ldrb	B_lw, [src, tmp1]
	strb	A_lw, [dstin]
	strb	B_lw, [dstin, tmp1]
	strb	A_hw, [dstend, -1]
2:	ret

	.p2align 4
	/* Copy 65..96 bytes.  Copy 64 bytes from the start and
	   32 bytes from the end.  */
L(copy96):
	ldp	B_l, B_h, [src, 16]
	ldp	C_l, C_h, [src, 32]
	ldp	D_l, D_h, [src, 48]
	ldp	E_l, E_h, [srcend, -32]
	ldp	F_l, F_h, [srcend, -16]
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstin, 32]
	stp	D_l, D_h, [dstin, 48]
	stp	E_l, E_h, [dstend, -32]
	stp	F_l, F_h, [dstend, -16]
	ret

	/* Align dst to 16 byte alignment so that stores don't cross cache
	   line boundaries.  There are at least 96 bytes to copy, so copy
	   16 bytes unaligned and then align.  The loop copies 64 bytes per
	   iteration and prefetches one iteration ahead.  */

	.p2align 4
L(copy_long):
	and	tmp1, dstin, 15
	bic	dst, dstin, 15
	ldp	D_l, D_h, [src]
	sub	src, src, tmp1
	add	count, count, tmp1	/* Count is now 16 too large.  */
	ldp	A_l, A_h, [src, 16]
	stp	D_l, D_h, [dstin]
	ldp	B_l, B_h, [src, 32]
	ldp	C_l, C_h, [src, 48]
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 128 + 16	/* Test and readjust count.  */
	b.ls	L(last64)
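
	/* Before the subtraction above, count was 16 larger than the number
	   of bytes still to be stored from dst + 16 onwards: 64 of those
	   bytes are already loaded in A_l..D_h and the final 64 are written
	   by L(last64), so the loop only runs while more than 128 bytes
	   remain.  */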
L(loop64):
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [src, 16]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [src, 32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [src, 48]
	stp	D_l, D_h, [dst, 64]!
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 64
	b.hi	L(loop64)

	/* Write the last full set of 64 bytes.  The remainder is at most 64
	   bytes, so it is safe to always copy 64 bytes from the end even if
	   there is just 1 byte left.  */
L(last64):
	ldp	E_l, E_h, [srcend, -64]
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [srcend, -48]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [srcend, -16]
	stp	D_l, D_h, [dst, 64]
	stp	E_l, E_h, [dstend, -64]
	stp	A_l, A_h, [dstend, -48]
	stp	B_l, B_h, [dstend, -32]
	stp	C_l, C_h, [dstend, -16]
	ret

	.p2align 4
L(move_long):
	cbz	tmp1, 3f	/* dst == src: nothing to copy.  */

	add	srcend, src, count
	add	dstend, dstin, count

	/* Align dstend to 16 byte alignment so that stores don't cross
	   cache line boundaries.  There are at least 96 bytes to copy, so
	   copy 16 bytes unaligned and then align.  The loop copies 64
	   bytes per iteration and prefetches one iteration ahead.  */

	and	tmp1, dstend, 15
	ldp	D_l, D_h, [srcend, -16]
	sub	srcend, srcend, tmp1
	sub	count, count, tmp1
	ldp	A_l, A_h, [srcend, -16]
	stp	D_l, D_h, [dstend, -16]
	ldp	B_l, B_h, [srcend, -32]
	ldp	C_l, C_h, [srcend, -48]
	ldp	D_l, D_h, [srcend, -64]!
	sub	dstend, dstend, tmp1
	subs	count, count, 128
	b.ls	2f

	nop
1:
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [srcend, -16]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [srcend, -48]
	stp	D_l, D_h, [dstend, -64]!
	ldp	D_l, D_h, [srcend, -64]!
	subs	count, count, 64
	b.hi	1b

	/* Write the last full set of 64 bytes.  The remainder is at most 64
	   bytes, so it is safe to always copy 64 bytes from the start even if
	   there is just 1 byte left.  */
2:
	ldp	G_l, G_h, [src, 48]
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [src, 32]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [src, 16]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [src]
	stp	D_l, D_h, [dstend, -64]
	stp	G_l, G_h, [dstin, 48]
	stp	A_l, A_h, [dstin, 32]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstin]
3:	ret

END (MEMMOVE)
libc_hidden_builtin_def (MEMMOVE)


/* Copies are split into 3 main cases: small copies of up to 16 bytes,
   medium copies of 17..96 bytes which are fully unrolled, and large
   copies of more than 96 bytes.  Large copies align the destination
   and, when src and dst are misaligned with respect to each other,
   use a load-and-merge approach so that loads and stores are always
   aligned.  Large copies use an unrolled loop processing 64 bytes per
   iteration.

   The memcpy implementation below is not compatible with memmove and
   is kept completely separate from it: its pipelined loads and stores
   are faster, but they cannot be used when the source and destination
   buffers overlap.  */

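/* Software prefetch distance, in bytes, used by the large-copy loops
   below: each prfm pldl1strm runs this far ahead of the current load
   position.  */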
#define MEMCPY_PREFETCH_LDR 640

ENTRY (MEMCPY)
	DELOUSE (0)
	DELOUSE (1)
	DELOUSE (2)

	add	srcend, src, count
	cmp	count, 16
	b.ls	L(memcopy16)
	ldr	A_q, [src], #16
	add	dstend, dstin, count
	and	tmp1, src, 15	/* tmp1 = src % 16.  */
	cmp	count, 96
	b.hi	L(memcopy_long)

	/* Medium copies: 17..96 bytes.  */
	ldr	E_q, [srcend, -16]
	cmp	count, 64
	b.gt	L(memcpy_copy96)
	cmp	count, 48
	b.le	L(bytes_17_to_48)
	/* 49..64 bytes.  */
	ldp	B_q, C_q, [src]
	str	E_q, [dstend, -16]
	stp	A_q, B_q, [dstin]
	str	C_q, [dstin, 32]
	ret

L(bytes_17_to_48):
	/* 17..48 bytes.  */
	cmp	count, 32
	b.gt	L(bytes_32_to_48)
	/* 17..32 bytes.  */
	str	A_q, [dstin]
	str	E_q, [dstend, -16]
	ret

L(bytes_32_to_48):
	/* 32..48 bytes.  */
	ldr	B_q, [src]
	str	A_q, [dstin]
	str	E_q, [dstend, -16]
	str	B_q, [dstin, 16]
	ret

	.p2align 4
	/* Small copies: 0..16 bytes.  */
L(memcopy16):
	cmp	count, 8
	b.lo	L(bytes_0_to_8)
	ldr	A_l, [src]
	ldr	A_h, [srcend, -8]
	add	dstend, dstin, count
	str	A_l, [dstin]
	str	A_h, [dstend, -8]
	ret
	.p2align 4

L(bytes_0_to_8):
	tbz	count, 2, L(bytes_0_to_3)
	ldr	A_lw, [src]
	ldr	A_hw, [srcend, -4]
	add	dstend, dstin, count
	str	A_lw, [dstin]
	str	A_hw, [dstend, -4]
	ret

	/* Copy 0..3 bytes.  Use a branchless sequence that copies the same
	   byte 3 times if count==1, or the 2nd byte twice if count==2.  */
L(bytes_0_to_3):
	cbz	count, L(end)
	lsr	tmp1, count, 1
	ldrb	A_lw, [src]
	ldrb	A_hw, [srcend, -1]
	add	dstend, dstin, count
	ldrb	B_lw, [src, tmp1]
	strb	A_lw, [dstin]
	strb	B_lw, [dstin, tmp1]
	strb	A_hw, [dstend, -1]
L(end):	ret

	.p2align 4

L(memcpy_copy96):
	/* Copying 65..96 bytes.  A_q (the first 16 bytes) and E_q (the
	   last 16 bytes) are already loaded.

	   The size is large enough to benefit from aligned loads.  */
	bic	src, src, 15
	ldp	B_q, C_q, [src]
	str	A_q, [dstin]
	/* 64 bytes have been loaded; the second 16-byte chunk may overlap
	   the first one by tmp1 bytes.  16 bytes have been stored.  */
	sub	dst, dstin, tmp1
	add	count, count, tmp1
	/* count, originally in [65..96], becomes [65..111] after tmp1
	   (0..15) is added to it; count is now <bytes-left-to-load> + 48.  */
	cmp	count, 80
	b.gt	L(copy96_medium)
	ldr	D_q, [src, 32]
	stp	B_q, C_q, [dst, 16]
	str	E_q, [dstend, -16]
	str	D_q, [dst, 48]
	ret

	.p2align 4
L(copy96_medium):
	ldp	D_q, A_q, [src, 32]
	str	B_q, [dst, 16]
	cmp	count, 96
	b.gt	L(copy96_large)
	str	E_q, [dstend, -16]
	stp	C_q, D_q, [dst, 32]
	str	A_q, [dst, 64]
	ret

L(copy96_large):
	ldr	F_q, [src, 64]
	stp	C_q, D_q, [dst, 32]
	str	E_q, [dstend, -16]
	stp	A_q, F_q, [dst, 64]
	ret

	.p2align 4
L(memcopy_long):
	bic	src, src, 15
	ldp	B_q, C_q, [src], #32
	str	A_q, [dstin]
	sub	dst, dstin, tmp1
	add	count, count, tmp1
	add	dst, dst, 16
	and	tmp1, dst, 15	/* tmp1 = dst % 16.  */
	ldp	D_q, E_q, [src], #32
	str	B_q, [dst], #16

	/* 64 + 16 bytes have already been loaded; check whether at least
	   64 more bytes are left.  */
	subs	count, count, 64+64+16
	b.lt	L(loop128_exit2)
	/* Copies too short to benefit from prefetching use the plain
	   L(loop128) even when misaligned; otherwise, if src and dst are
	   mutually misaligned (tmp1 != 0), use the load-and-merge path.  */
	cmp	count, MEMCPY_PREFETCH_LDR + 64 + 32
	b.lt	L(loop128)
	cbnz	tmp1, L(dst_unaligned)
	sub	count, count, MEMCPY_PREFETCH_LDR + 64 + 32
	.p2align 4

L(loop128_prefetch):
	str	C_q, [dst], #16
	prfm	pldl1strm, [src, MEMCPY_PREFETCH_LDR]
	str	D_q, [dst], #16
	ldp	F_q, G_q, [src], #32
	str	E_q, [dst], #16
	ldp	H_q, A_q, [src], #32
	str	F_q, [dst], #16
	prfm	pldl1strm, [src, MEMCPY_PREFETCH_LDR]
	str	G_q, [dst], #16
	ldp	B_q, C_q, [src], #32
	str	H_q, [dst], #16
	ldp	D_q, E_q, [src], #32
	stp	A_q, B_q, [dst], #32
	subs	count, count, 128
	b.ge	L(loop128_prefetch)

L(preloop128):
	add	count, count, MEMCPY_PREFETCH_LDR + 64 + 32
	.p2align 4
L(loop128):
	ldp	F_q, G_q, [src], #32
	str	C_q, [dst], #16
	ldp	B_q, A_q, [src], #32
	str	D_q, [dst], #16
	stp	E_q, F_q, [dst], #32
	stp	G_q, B_q, [dst], #32
	subs	count, count, 64
	b.lt	L(loop128_exit1)
L(loop128_proceed):
	ldp	B_q, C_q, [src], #32
	str	A_q, [dst], #16
	ldp	D_q, E_q, [src], #32
	str	B_q, [dst], #16
	subs	count, count, 64
	b.ge	L(loop128)

	.p2align 4
L(loop128_exit2):
	stp	C_q, D_q, [dst], #32
	str	E_q, [dst], #16
	b	L(copy_long_check32);

L(loop128_exit1):
	/* A_q is still not stored and 0..63 bytes are left, so count is
	   in -64..-1.  */
	str	A_q, [dst], #16
L(copy_long_check32):
	/* 0..63 bytes are left (count is in -64..-1): if none are left,
	   return; if more than 32 are left, copy 32 bytes from the current
	   position; always finish by copying the last 32 bytes from the
	   end of the buffer.  */
	cmn	count, 64
	b.eq	L(copy_long_done)
	cmn	count, 32
	b.le	L(copy_long_last32)
	ldp	B_q, C_q, [src]
	stp	B_q, C_q, [dst]

L(copy_long_last32):
	ldp	F_q, G_q, [srcend, -32]
	stp	F_q, G_q, [dstend, -32]

L(copy_long_done):
	ret

L(dst_unaligned):
	/* For the unaligned store case the code loads two aligned chunks
	   and then merges them using the ext instruction.  This can be up
	   to 30% faster than simple unaligned store accesses.

	   Current state: tmp1 = dst % 16; C_q, D_q, E_q contain data yet
	   to be stored; src and dst point to the next data to be
	   processed; A_q, B_q contain data already stored earlier;
	   count = the number of bytes left to be loaded, decremented
	   by 64.

	   Control is passed here if at least 64 bytes are left to be
	   loaded.  The code does two aligned loads and then forms each
	   16-byte store value from the top tmp1 bytes of one aligned
	   chunk and the low (16 - tmp1) bytes of the following chunk.

	   Since the ext instruction can only encode its index as an
	   immediate, there are 15 code chunks, one per possible index
	   value, and a computed goto is used to reach the required
	   chunk.  */
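
	/* For example, with tmp1 == 3 the merge in EXT_CHUNK(3) is
	   ext A_v.16b, C_v.16b, D_v.16b, 13: the result is the top 3
	   bytes of C_v followed by the low 13 bytes of D_v, which are
	   exactly the 16 bytes that belong at the next aligned
	   destination address.  */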

	/* Store 16 bytes to dst and align dst for further operations;
	   some of these bytes will be stored again at the same address.  */
	str	C_q, [dst], #16
	ldp	F_q, G_q, [src], #32
	bic	dst, dst, 15
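	/* Each L(ext_table) entry is a 32-bit offset from the entry itself
	   to the matching L(ext_size_N) chunk, so the sequence below loads
	   the entry for tmp1 and adds the sign-extended offset to the
	   entry's address to form the branch target.  */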
	adrp	tmp2, L(ext_table)
	add	tmp2, tmp2, :lo12:L(ext_table)
	add	tmp2, tmp2, tmp1, LSL #2
	ldr	tmp3w, [tmp2]
	add	tmp2, tmp2, tmp3w, SXTW
	br	tmp2

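/* Each EXT_CHUNK(shft) expansion carries its own copy of the merging
   loop.  The 2: loop merges and stores 64 bytes per iteration and
   prefetches ahead; the 1: tail stores the remaining merged data,
   moves dst back to its original (unaligned) offset by adding tmp1,
   stores the last loaded chunk and branches to L(copy_long_check32)
   for the final bytes.  */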
#define EXT_CHUNK(shft) \
.p2align 4 ;\
L(ext_size_ ## shft):;\
	ext	A_v.16b, C_v.16b, D_v.16b, 16-shft;\
	ext	B_v.16b, D_v.16b, E_v.16b, 16-shft;\
	subs	count, count, 32;\
	b.ge	2f;\
1:;\
	stp	A_q, B_q, [dst], #32;\
	ext	H_v.16b, E_v.16b, F_v.16b, 16-shft;\
	ext	I_v.16b, F_v.16b, G_v.16b, 16-shft;\
	stp	H_q, I_q, [dst], #16;\
	add	dst, dst, tmp1;\
	str	G_q, [dst], #16;\
	b	L(copy_long_check32);\
2:;\
	stp	A_q, B_q, [dst], #32;\
	prfm	pldl1strm, [src, MEMCPY_PREFETCH_LDR];\
	ldp	D_q, J_q, [src], #32;\
	ext	H_v.16b, E_v.16b, F_v.16b, 16-shft;\
	ext	I_v.16b, F_v.16b, G_v.16b, 16-shft;\
	mov	C_v.16b, G_v.16b;\
	stp	H_q, I_q, [dst], #32;\
	ldp	F_q, G_q, [src], #32;\
	ext	A_v.16b, C_v.16b, D_v.16b, 16-shft;\
	ext	B_v.16b, D_v.16b, J_v.16b, 16-shft;\
	mov	E_v.16b, J_v.16b;\
	subs	count, count, 64;\
	b.ge	2b;\
	b	1b;\

EXT_CHUNK(1)
EXT_CHUNK(2)
EXT_CHUNK(3)
EXT_CHUNK(4)
EXT_CHUNK(5)
EXT_CHUNK(6)
EXT_CHUNK(7)
EXT_CHUNK(8)
EXT_CHUNK(9)
EXT_CHUNK(10)
EXT_CHUNK(11)
EXT_CHUNK(12)
EXT_CHUNK(13)
EXT_CHUNK(14)
EXT_CHUNK(15)

END (MEMCPY)
	.section .rodata
	.p2align 4

L(ext_table):
	/* The first entry is for the alignment of 0 and is never
	   actually used (could be any value).  */
	.word	0
	.word	L(ext_size_1) -.
	.word	L(ext_size_2) -.
	.word	L(ext_size_3) -.
	.word	L(ext_size_4) -.
	.word	L(ext_size_5) -.
	.word	L(ext_size_6) -.
	.word	L(ext_size_7) -.
	.word	L(ext_size_8) -.
	.word	L(ext_size_9) -.
	.word	L(ext_size_10) -.
	.word	L(ext_size_11) -.
	.word	L(ext_size_12) -.
	.word	L(ext_size_13) -.
	.word	L(ext_size_14) -.
	.word	L(ext_size_15) -.

libc_hidden_builtin_def (MEMCPY)
#endif