]> git.ipfire.org Git - thirdparty/glibc.git/blob - sysdeps/powerpc/powerpc64/le/power10/memcpy.S
Update copyright dates with scripts/update-copyrights
[thirdparty/glibc.git] / sysdeps / powerpc / powerpc64 / le / power10 / memcpy.S
1 /* Optimized memcpy implementation for POWER10.
2 Copyright (C) 2021-2023 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <http://www.gnu.org/licenses/>. */
18
19 #include <sysdep.h>
20
21
22 #ifndef MEMCPY
23 # define MEMCPY memcpy
24 #endif
25
26 /* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
27 Returns 'dst'. */
28
29 .machine power9
	/* .machine power9 is sufficient: every instruction used here
	   (lxv/stxv, lxvl/stxvl) is ISA 3.0; no POWER10-only opcodes
	   are emitted, the file is merely tuned for POWER10.  */
30 ENTRY_TOCLESS (MEMCPY, 5)
31 CALL_MCOUNT 3
32
	/* Register roles throughout:
	     r3  = dst (kept intact; it is also the return value)
	     r4  = src cursor
	     r5  = remaining length
	     r10 = dst cursor (so r3 survives)
	     r6-r9, r12 = scratch / secondary cursors for unrolling.  */
33 /* Copy up to 16 bytes. */
34 sldi r6,r5,56 /* Prepare [l|st]xvl counter. */
	/* lxvl/stxvl take the byte count in bits 0:7 (the top byte)
	   of the length register, hence the shift left by 56.  Counts
	   >= 16 simply transfer a full 16 bytes.  */
35 lxvl v10,r4,r6
36 stxvl v10,r3,r6
37 subic. r6,r5,16 /* Return if len <= 16. */
	/* Dot form sets CR0 from (len - 16); blelr returns when
	   len <= 16 -- everything was already copied above.  */
38 blelr
39
40 /* If len >= 256, assume nothing got copied before and copy
41 again. This might cause issues with overlapped memory, but memcpy
42 is not expected to treat overlapped memory. */
	/* I.e. the 16 bytes stored above may be re-stored by the
	   aligned path below; that is harmless for non-overlapping
	   buffers, which is all memcpy guarantees.  */
43 cmpdi r5,256
44 bge L(copy_ge_256)
45 /* 16 < len < 256 and the first 16 bytes have already been copied. */
46 addi r10,r3,16 /* Keep r3 intact as return value. */
47 addi r4,r4,16
48 subi r5,r5,16
49 b L(copy_lt_256) /* Avoid the main loop if len < 256. */
50
51 .p2align 5
52 L(copy_ge_256):
53 mr r10,r3 /* Keep r3 intact as return value. */
54 /* Align dst to 16 bytes. */
55 andi. r9,r10,0xf
56 beq L(dst_is_align_16)
	/* Misaligned head: copy 16 bytes (overlapping the earlier
	   stxvl), then advance src/dst by 16 - (dst & 0xf) so dst
	   becomes 16-byte aligned.  */
57 lxv v10,0(r4)
58 subfic r12,r9,16
59 subf r5,r12,r5
60 add r4,r4,r12
61 stxv v10,0(r3)
62 add r10,r3,r12
63
64 L(dst_is_align_16):
65 srdi r9,r5,7 /* Divide by 128. */
	/* CTR = number of full 128-byte iterations.  */
66 mtctr r9
67 addi r6,r4,64
68 addi r7,r10,64
69
70
71 /* Main loop, copy 128 bytes per iteration.
72 Use r6=src+64 and r7=dest+64 in order to reduce the dependency on
73 r4 and r10. */
	/* Two independent cursor pairs (r4/r10 for the low 64 bytes,
	   r6/r7 for the high 64) let the loads and stores of both
	   halves issue without serializing on a single address
	   register update.  */
74 .p2align 5
75 L(copy_128):
76
77 lxv v10, 0(r4)
78 lxv v11, 16(r4)
79 lxv v12, 32(r4)
80 lxv v13, 48(r4)
81
82 addi r4,r4,128
83
84 stxv v10, 0(r10)
85 stxv v11, 16(r10)
86 stxv v12, 32(r10)
87 stxv v13, 48(r10)
88
89 addi r10,r10,128
90
91 lxv v10, 0(r6)
92 lxv v11, 16(r6)
93 lxv v12, 32(r6)
94 lxv v13, 48(r6)
95
96 addi r6,r6,128
97
98 stxv v10, 0(r7)
99 stxv v11, 16(r7)
100 stxv v12, 32(r7)
101 stxv v13, 48(r7)
102
103 addi r7,r7,128
104
105 bdnz L(copy_128)
106
107 clrldi. r5,r5,64-7 /* Have we copied everything? */
	/* r5 = len mod 128 (clear all but the low 7 bits); CR0 tells
	   us whether a tail remains.  */
108 beqlr
109
110 .p2align 5
111 L(copy_lt_256):
	/* Tail path: 0 < r5 < 256 bytes remain; r4/r10 are current
	   src/dst cursors.  */
112 cmpdi r5,16
113 ble L(copy_le_16)
114 srdi. r9,r5,5 /* Divide by 32. */
115 beq L(copy_lt_32)
116 mtctr r9
117 /* Use r6=src+32, r7=dest+32, r8=src+64, r9=dest+64 in order to reduce
118 the dependency on r4 and r10. */
119 addi r6,r4,32
120 addi r7,r10,32
121 addi r8,r4,64
122 addi r9,r10,64
123
124 .p2align 5
125 /* Copy 32 bytes at a time, unaligned.
126 The loop is unrolled 3 times in order to reduce the dependency on
127 r4 and r10, copying up-to 96 bytes per iteration. */
	/* CTR counts 32-byte chunks; each cursor pair advances by 96
	   (3 chunks) per full trip, and the mid-loop bdz exits fix up
	   the cursors for the partially-completed trip.  */
128 L(copy_32):
129 lxv v10, 0(r4)
130 lxv v11, 16(r4)
131 stxv v10, 0(r10)
132 stxv v11, 16(r10)
133 bdz L(end_copy_32a)
134 addi r4,r4,96
135 addi r10,r10,96
136
137 lxv v10, 0(r6)
138 lxv v11, 16(r6)
139 addi r6,r6,96
140 stxv v10, 0(r7)
141 stxv v11, 16(r7)
142 bdz L(end_copy_32b)
143 addi r7,r7,96
144
145 lxv v12, 0(r8)
146 lxv v13, 16(r8)
147 addi r8,r8,96
148 stxv v12, 0(r9)
149 stxv v13, 16(r9)
150 addi r9,r9,96
151 bdnz L(copy_32)
152
153 clrldi. r5,r5,64-5 /* Have we copied everything? */
	/* r5 = len mod 32; at most one 16-byte chunk plus a sub-16
	   remainder is left.  */
154 beqlr
155 cmpdi r5,16
156 ble L(copy_le_16)
157 b L(copy_lt_32)
158
159 .p2align 5
160 L(end_copy_32a):
	/* CTR hit zero after the first 32-byte chunk of a trip.  */
161 clrldi. r5,r5,64-5 /* Have we copied everything? */
162 beqlr
163 /* 32 bytes have been copied since the last update of r4 and r10. */
164 addi r4,r4,32
165 addi r10,r10,32
166 cmpdi r5,16
167 ble L(copy_le_16)
168 b L(copy_lt_32)
169
170 .p2align 5
171 L(end_copy_32b):
	/* CTR hit zero after the second 32-byte chunk of a trip;
	   r4/r10 were already bumped by 96, i.e. 32 bytes too far.  */
172 clrldi. r5,r5,64-5 /* Have we copied everything? */
173 beqlr
174 /* The last iteration of the loop copied 64 bytes. Update r4 and r10
175 accordingly. */
176 addi r4,r4,-32
177 addi r10,r10,-32
178 cmpdi r5,16
179 ble L(copy_le_16)
	/* Fall through: 16 < r5 < 32, so one full 16-byte copy plus
	   a final variable-length copy handles the rest.  */
180
181 .p2align 5
182 L(copy_lt_32):
	/* 16 < r5 < 32: copy one 16-byte chunk, leaving r5 <= 16 for
	   the lxvl/stxvl epilogue.  */
183 lxv v10, 0(r4)
184 stxv v10, 0(r10)
185 addi r4,r4,16
186 addi r10,r10,16
187 subi r5,r5,16
188
189 .p2align 5
190 L(copy_le_16):
	/* Final 0..16 bytes via length-controlled vector copy; the
	   count again rides in the top byte of r6.  */
191 sldi r6,r5,56
192 lxvl v10,r4,r6
193 stxvl v10,r10,r6
194 blr
195
196
197 END_GEN_TB (MEMCPY,TB_TOCLESS)
198 libc_hidden_builtin_def (memcpy)