/* A Thunderx2 Optimized memcpy implementation for AARCH64.
   Copyright (C) 2018-2019 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */
 * ARMv8-a, AArch64, unaligned accesses.

# define MEMMOVE memmove
# define MEMCPY memcpy

#define MEMCPY __memcpy_thunderx2
#define MEMMOVE __memmove_thunderx2
/* Moves are split into 3 main cases: small copies of up to 16 bytes,
   medium copies of 17..96 bytes which are fully unrolled, and large
   copies of more than 96 bytes, which align the destination and use an
   unrolled loop processing 64 bytes per iteration.
   Overlapping large forward memmoves use a loop that copies backwards.  */
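/* A minimal C sketch of this size dispatch (illustrative only; the
   function and enum names are hypothetical and not part of this file):

     #include <stddef.h>

     enum copy_path { COPY_SMALL, COPY_MEDIUM, COPY_LARGE };

     static enum copy_path
     classify (size_t count)
     {
       if (count <= 16)
         return COPY_SMALL;    // 0..16 bytes
       if (count <= 96)
         return COPY_MEDIUM;   // 17..96 bytes, fully unrolled
       return COPY_LARGE;      // > 96 bytes, 64-byte loop
     }
*/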
ENTRY_ALIGN (MEMMOVE, 6)
	ccmp	tmp1, count, 2, hi
	prfm	PLDL1KEEP, [src]
	add	srcend, src, count
	add	dstend, dstin, count

	/* Medium copies: 17..96 bytes.  */
	tbnz	tmp1, 6, L(copy96)
	ldp	D_l, D_h, [srcend, -16]
	ldp	B_l, B_h, [src, 16]
	ldp	C_l, C_h, [srcend, -32]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstend, -32]
	stp	A_l, A_h, [dstin]
	stp	D_l, D_h, [dstend, -16]
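	/* Note that the medium-size path above loads pairs from both ends of
	   the buffer (src/src+16 and srcend-32/srcend-16) and stores them to
	   the matching positions in the destination; for sizes that are not
	   a multiple of 32 the head and tail stores simply overlap, which
	   avoids any further branching on the exact size.  */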
	/* Small copies: 0..16 bytes.  */
	ldr	A_h, [srcend, -8]
	str	A_h, [dstend, -8]
	ldr	A_hw, [srcend, -4]
	str	A_hw, [dstend, -4]

	/* Copy 0..3 bytes.  Use a branchless sequence that copies the same
	   byte 3 times if count==1, or the 2nd byte twice if count==2.  */
	ldrb	A_hw, [srcend, -1]
	ldrb	B_lw, [src, tmp1]
	strb	B_lw, [dstin, tmp1]
	strb	A_hw, [dstend, -1]
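	/* A rough, self-contained C sketch of the branchless idea (assuming,
	   as set up by code not shown here, that tmp1 holds count/2 and that
	   the first byte is handled by an ldrb/strb pair alongside the ones
	   above):

	     #include <stddef.h>

	     static void
	     copy_0_to_3 (unsigned char *d, const unsigned char *s, size_t n)
	     {
	       if (n == 0)
	         return;
	       size_t mid = n >> 1;
	       unsigned char first = s[0];
	       unsigned char middle = s[mid];
	       unsigned char last = s[n - 1];
	       d[0] = first;        // byte 0
	       d[mid] = middle;     // the same byte again if n == 1
	       d[n - 1] = last;     // the 2nd byte again if n == 2
	     }
	*/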
	/* Copy 64..96 bytes.  Copy 64 bytes from the start and
	   32 bytes from the end.  */
	ldp	B_l, B_h, [src, 16]
	ldp	C_l, C_h, [src, 32]
	ldp	D_l, D_h, [src, 48]
	ldp	E_l, E_h, [srcend, -32]
	ldp	F_l, F_h, [srcend, -16]
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstin, 32]
	stp	D_l, D_h, [dstin, 48]
	stp	E_l, E_h, [dstend, -32]
	stp	F_l, F_h, [dstend, -16]
	/* Align DST to 16 byte alignment so that stores don't cross cache
	   line boundaries.  There are at least 96 bytes to copy, so copy
	   16 bytes unaligned and then align.  The loop copies 64 bytes per
	   iteration and prefetches one iteration ahead.  */
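	/* A hedged C sketch of the alignment trick used below (assuming, as
	   in code not shown here, that tmp1 holds dstin % 16; uintptr_t is
	   from <stdint.h>):

	     size_t tmp1 = (uintptr_t) dstin & 15;  // bytes past a 16-byte boundary
	     char *dst = (char *) dstin - tmp1;     // rounded down to alignment
	     const char *s = (const char *) src - tmp1;
	     count += tmp1;                         // count is now 16 too large
	     // The first 16 bytes are stored unaligned at dstin; the loop then
	     // stores to dst + 16, dst + 32, ... which are all 16-byte aligned
	     // and overlap the unaligned head store by up to 15 bytes.
	*/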
	add	count, count, tmp1	/* Count is now 16 too large.  */
	ldp	A_l, A_h, [src, 16]
	stp	D_l, D_h, [dstin]
	ldp	B_l, B_h, [src, 32]
	ldp	C_l, C_h, [src, 48]
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 128 + 16	/* Test and readjust count.  */
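	/* The copy loop below is software pipelined: each stp writes the data
	   that the ldp of the same registers fetched on the previous
	   iteration, so the loads run one iteration ahead of the stores.  */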
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [src, 16]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [src, 32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [src, 48]
	stp	D_l, D_h, [dst, 64]!
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 64

	/* Write the last full set of 64 bytes.  The remainder is at most 64
	   bytes, so it is safe to always copy 64 bytes from the end even if
	   there is just 1 byte left.  */
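	/* The registers still holding loop data (A..D) are flushed to
	   [dst, 16]..[dst, 64], and the final 64 bytes are re-copied from
	   srcend - 64 to dstend - 64; any overlap with bytes already stored
	   is harmless, the same data is simply written twice.  */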
	ldp	E_l, E_h, [srcend, -64]
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [srcend, -48]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [srcend, -16]
	stp	D_l, D_h, [dst, 64]
	stp	E_l, E_h, [dstend, -64]
	stp	A_l, A_h, [dstend, -48]
	stp	B_l, B_h, [dstend, -32]
	stp	C_l, C_h, [dstend, -16]

	add	srcend, src, count
	add	dstend, dstin, count
	/* Align dstend to 16 byte alignment so that stores don't cross cache
	   line boundaries.  There are at least 96 bytes to copy, so copy
	   16 bytes unaligned and then align.  The loop copies 64 bytes per
	   iteration and prefetches one iteration ahead.  */
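	/* This is the backward variant used when the buffers overlap and the
	   destination lies above the source: the copy starts at dstend/srcend
	   and walks down in 64-byte steps so that source bytes are read
	   before the overlapping destination stores can clobber them.  */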
	ldp	D_l, D_h, [srcend, -16]
	sub	srcend, srcend, tmp1
	sub	count, count, tmp1
	ldp	A_l, A_h, [srcend, -16]
	stp	D_l, D_h, [dstend, -16]
	ldp	B_l, B_h, [srcend, -32]
	ldp	C_l, C_h, [srcend, -48]
	ldp	D_l, D_h, [srcend, -64]!
	sub	dstend, dstend, tmp1
	subs	count, count, 128

	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [srcend, -16]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [srcend, -48]
	stp	D_l, D_h, [dstend, -64]!
	ldp	D_l, D_h, [srcend, -64]!
	subs	count, count, 64

	/* Write the last full set of 64 bytes.  The remainder is at most 64
	   bytes, so it is safe to always copy 64 bytes from the start even if
	   there is just 1 byte left.  */
	ldp	G_l, G_h, [src, 48]
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [src, 32]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [src, 16]
	stp	C_l, C_h, [dstend, -48]
	stp	D_l, D_h, [dstend, -64]
	stp	G_l, G_h, [dstin, 48]
	stp	A_l, A_h, [dstin, 32]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstin]

libc_hidden_builtin_def (MEMMOVE)
/* Copies are split into 3 main cases: small copies of up to 16 bytes,
   medium copies of 17..96 bytes which are fully unrolled, and large
   copies of more than 96 bytes, which align the destination and, when
   src and dst are not equally misaligned, use a load-and-merge approach
   so that loads and stores are always aligned.
   Large copies use an unrolled loop processing 64 bytes per iteration.
   The current optimized memcpy implementation is not compatible with
   memmove and is kept completely separate from it.

   The memcpy implementation below is not compatible with memmove
   because of its pipelined loads/stores, which are faster but cannot
   be used when the memmove source and destination overlap.  */
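/* Why the pipelined schedule is memcpy-only, in rough terms (a sketch,
   not code from this file): the loops below issue the store of one block
   alongside the loads of later blocks, e.g.

     store block0 ; load block2
     store block1 ; load block3
     ...

   If the destination overlaps the source a few bytes ahead, the store of
   block0 can clobber bytes of block2/block3 before they are loaded, so an
   overlapping memmove cannot reuse these loops; memmove above uses its own
   forward and backward loops instead.  */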
#define MEMCPY_PREFETCH_LDR 640
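/* MEMCPY_PREFETCH_LDR is the software prefetch distance in bytes: the
   prfm pldl1strm instructions in the loops below prefetch source data
   this far ahead of the current src pointer.  */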
	add	srcend, src, count
	add	dstend, dstin, count

	/* Medium copies: 17..96 bytes.  */
	ldr	E_q, [srcend, -16]
	b.gt	L(memcpy_copy96)
	b.le	L(bytes_17_to_48)
	str	E_q, [dstend, -16]
	stp	A_q, B_q, [dstin]
	b.gt	L(bytes_32_to_48)
	str	E_q, [dstend, -16]
	str	E_q, [dstend, -16]

	/* Small copies: 0..16 bytes.  */
	ldr	A_h, [srcend, -8]
	add	dstend, dstin, count
	str	A_h, [dstend, -8]
	tbz	count, 2, L(bytes_0_to_3)
	ldr	A_hw, [srcend, -4]
	add	dstend, dstin, count
	str	A_hw, [dstend, -4]

	/* Copy 0..3 bytes.  Use a branchless sequence that copies the same
	   byte 3 times if count==1, or the 2nd byte twice if count==2.  */
	ldrb	A_hw, [srcend, -1]
	add	dstend, dstin, count
	ldrb	B_lw, [src, tmp1]
	strb	B_lw, [dstin, tmp1]
	strb	A_hw, [dstend, -1]
	/* Copying 65..96 bytes.  A_q (first 16 bytes) and E_q (last 16
	   bytes) are already loaded.  The size is large enough to benefit
	   from aligned loads.  */
	/* 64 bytes have been loaded; the second 16-byte chunk may overlap
	   the first chunk by tmp1 bytes.  */
	add	count, count, tmp1
	/* The range of count, [65..96], becomes [65..111] after tmp1
	   [0..15] is added to it; count is now <bytes-left-to-load> + 48.  */
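	/* Illustrative arithmetic: for count == 80 and tmp1 == 7, count
	   becomes 87, i.e. 87 - 48 = 39 bytes still left to load.  */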
	b.gt	L(copy96_medium)
	stp	B_q, C_q, [dst, 16]
	str	E_q, [dstend, -16]

	ldp	D_q, A_q, [src, 32]
	str	E_q, [dstend, -16]
	stp	C_q, D_q, [dst, 32]

	stp	C_q, D_q, [dst, 32]
	str	E_q, [dstend, -16]
	stp	A_q, F_q, [dst, 64]
	ldp	B_q, C_q, [src], #32
	add	count, count, tmp1
	ldp	D_q, E_q, [src], #32
	/* 64+16 bytes have already been loaded; check whether at
	   least 64 more bytes are left.  */
	subs	count, count, 64+64+16
	b.lt	L(loop128_exit2)
	cmp	count, MEMCPY_PREFETCH_LDR + 64 + 32
	cbnz	tmp1, L(dst_unaligned)
	sub	count, count, MEMCPY_PREFETCH_LDR + 64 + 32
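	/* Main prefetching loop: each iteration below copies 128 bytes using
	   Q-register ldp/stp pairs and issues two pldl1strm prefetches
	   MEMCPY_PREFETCH_LDR bytes ahead of src.  */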
	prfm	pldl1strm, [src, MEMCPY_PREFETCH_LDR]
	ldp	F_q, G_q, [src], #32
	ldp	H_q, A_q, [src], #32
	prfm	pldl1strm, [src, MEMCPY_PREFETCH_LDR]
	ldp	B_q, C_q, [src], #32
	ldp	D_q, E_q, [src], #32
	stp	A_q, B_q, [dst], #32
	subs	count, count, 128
	b.ge	L(loop128_prefetch)

	add	count, count, MEMCPY_PREFETCH_LDR + 64 + 32

	ldp	F_q, G_q, [src], #32
	ldp	B_q, A_q, [src], #32
	stp	E_q, F_q, [dst], #32
	stp	G_q, B_q, [dst], #32
	subs	count, count, 64
	b.lt	L(loop128_exit1)
	ldp	B_q, C_q, [src], #32
	ldp	D_q, E_q, [src], #32
	subs	count, count, 64
	stp	C_q, D_q, [dst], #32
	b	L(copy_long_check32)

	/* A_q is still not stored and 0..63 bytes are left, so count
	   is -64..-1.  Check whether fewer than 32 bytes are left
	   (count < -32).  */
L(copy_long_check32):
	b.eq	L(copy_long_done)
	b.le	L(copy_long_last32)

	ldp	F_q, G_q, [srcend, -32]
	stp	F_q, G_q, [dstend, -32]
	/* For the unaligned store case the code loads two
	   aligned chunks and then merges them using the ext
	   instruction.  This can be up to 30% faster than
	   the simple unaligned store access.

	   Current state: tmp1 = dst % 16; C_q, D_q, E_q
	   contain data yet to be stored; src and dst point
	   to the next data to be processed; A_q, B_q contain
	   data already stored; count = bytes left to be
	   loaded, decremented by 64.

	   Control is passed here if at least 64 bytes are left
	   to be loaded.  The code does two aligned loads and then
	   extracts (16-tmp1) bytes from the first register and
	   tmp1 bytes from the next register, forming the value
	   for the aligned store.

	   As the ext instruction can only have its index encoded
	   as an immediate, 15 code chunks process each possible
	   index value.  A computed goto is used to reach the
	   needed code chunk.  */
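	/* A scalar C analogue of the load-and-merge idea (a sketch; the real
	   code works on 16-byte vectors with the ext instruction):

	     #include <stdint.h>

	     // Given two consecutive aligned 8-byte loads 'lo' and 'hi' and a
	     // source misalignment of 'shift' bytes (0 < shift < 8), build the
	     // next 8 destination bytes so the store can be aligned.
	     // Assumes a little-endian layout, as on AArch64.
	     static uint64_t
	     merge (uint64_t lo, uint64_t hi, unsigned shift)
	     {
	       return (lo >> (8 * shift)) | (hi << (8 * (8 - shift)));
	     }

	   The ext instruction does the same for a pair of 16-byte registers,
	   but its byte index must be an immediate, hence the 15 specialised
	   chunks below.  */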
	/* Store the 16 bytes to dst and align dst for further
	   operations; several bytes will be stored at this
	   address more than once.  */
	ldp	F_q, G_q, [src], #32

	adrp	tmp2, L(ext_table)
	add	tmp2, tmp2, :lo12:L(ext_table)
	add	tmp2, tmp2, tmp1, LSL #2
	add	tmp2, tmp2, tmp3w, SXTW
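	/* tmp2 now indexes entry tmp1 of L(ext_table); each entry is a
	   32-bit self-relative offset (see the .word L(ext_size_N) - .
	   entries below), so adding the sign-extended entry value (tmp3w)
	   yields the address of the matching L(ext_size_N) code chunk,
	   which is then reached with an indirect branch.  */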
#define EXT_CHUNK(shft) \
L(ext_size_ ## shft):;\
	ext	A_v.16b, C_v.16b, D_v.16b, 16-shft;\
	ext	B_v.16b, D_v.16b, E_v.16b, 16-shft;\
	subs	count, count, 32;\
	stp	A_q, B_q, [dst], #32;\
	ext	H_v.16b, E_v.16b, F_v.16b, 16-shft;\
	ext	I_v.16b, F_v.16b, G_v.16b, 16-shft;\
	stp	H_q, I_q, [dst], #16;\
	str	G_q, [dst], #16;\
	b	L(copy_long_check32);\
	stp	A_q, B_q, [dst], #32;\
	prfm	pldl1strm, [src, MEMCPY_PREFETCH_LDR];\
	ldp	D_q, J_q, [src], #32;\
	ext	H_v.16b, E_v.16b, F_v.16b, 16-shft;\
	ext	I_v.16b, F_v.16b, G_v.16b, 16-shft;\
	mov	C_v.16b, G_v.16b;\
	stp	H_q, I_q, [dst], #32;\
	ldp	F_q, G_q, [src], #32;\
	ext	A_v.16b, C_v.16b, D_v.16b, 16-shft;\
	ext	B_v.16b, D_v.16b, J_v.16b, 16-shft;\
	mov	E_v.16b, J_v.16b;\
	subs	count, count, 64;\

	/* The first entry is for the alignment of 0 and is never
	   actually used (could be any value).  */
	.word	L(ext_size_1) -.
	.word	L(ext_size_2) -.
	.word	L(ext_size_3) -.
	.word	L(ext_size_4) -.
	.word	L(ext_size_5) -.
	.word	L(ext_size_6) -.
	.word	L(ext_size_7) -.
	.word	L(ext_size_8) -.
	.word	L(ext_size_9) -.
	.word	L(ext_size_10) -.
	.word	L(ext_size_11) -.
	.word	L(ext_size_12) -.
	.word	L(ext_size_13) -.
	.word	L(ext_size_14) -.
	.word	L(ext_size_15) -.

libc_hidden_builtin_def (MEMCPY)