]> git.ipfire.org Git - thirdparty/glibc.git/blob - sysdeps/loongarch/lp64/multiarch/memcpy-unaligned.S
Update copyright dates with scripts/update-copyrights
[thirdparty/glibc.git] / sysdeps / loongarch / lp64 / multiarch / memcpy-unaligned.S
1 /* Optimized unaligned memcpy implementation using basic LoongArch instructions.
2 Copyright (C) 2023-2024 Free Software Foundation, Inc.
3
4 This file is part of the GNU C Library.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library. If not, see
18 <https://www.gnu.org/licenses/>. */
19
20 #include <sysdep.h>
21 #include <sys/regdef.h>
22 #include <sys/asm.h>
23
24 #if IS_IN (libc)
25
26 # define MEMCPY_NAME __memcpy_unaligned
27
/* LD_64(reg, n): load the 64 bytes at reg + n .. reg + n + 63 into
   t0..t7, one doubleword per register in ascending address order.
   Overwrites t0-t7; reg itself is not modified.  */
28 # define LD_64(reg, n) \
29 ld.d t0, reg, n; \
30 ld.d t1, reg, n + 8; \
31 ld.d t2, reg, n + 16; \
32 ld.d t3, reg, n + 24; \
33 ld.d t4, reg, n + 32; \
34 ld.d t5, reg, n + 40; \
35 ld.d t6, reg, n + 48; \
36 ld.d t7, reg, n + 56;
37
/* ST_64(reg, n): store t0..t7 to the 64 bytes at reg + n .. reg + n + 63,
   one doubleword per register in ascending address order.  Uses the same
   register-to-offset mapping as LD_64, so an LD_64/ST_64 pair moves a
   64-byte chunk.  No register is modified.  */
38 # define ST_64(reg, n) \
39 st.d t0, reg, n; \
40 st.d t1, reg, n + 8; \
41 st.d t2, reg, n + 16; \
42 st.d t3, reg, n + 24; \
43 st.d t4, reg, n + 32; \
44 st.d t5, reg, n + 40; \
45 st.d t6, reg, n + 48; \
46 st.d t7, reg, n + 56;
47
/* memcpy variant built from plain ld/st instructions issued at whatever
   alignment the pointers happen to have; the source pointer is never
   aligned or tested.

   Register use, as read from the code below:
     a0  destination; never written, so it is unchanged at return
     a1  source (advanced on the n > 64 paths)
     a2  byte count n (reused as the remainder in L(align_end_proc))
     a3  a0 + n, one past the last destination byte
     a4  a1 + n, one past the last source byte
     t8  8-byte-aligned destination cursor on the n > 64 paths
     t0-t7, a5-a7  scratch

   Strategy: most size classes copy a block from the front and then a
   fixed-size block addressed backwards from a3/a4.  The two blocks may
   overlap, which removes the need for a byte-granular remainder loop.

   NOTE(review): bge/blt compare their operands as signed values, so a
   count with the top bit set would be dispatched to the small-copy
   path -- presumably such counts never occur; confirm.  */
48 LEAF(MEMCPY_NAME, 3)
49 add.d a4, a1, a2 /* a4 = source end */
50 add.d a3, a0, a2 /* a3 = destination end */
51 li.w a6, 16
52 bge a6, a2, L(less_16bytes) /* n <= 16 */
53
54 li.w a6, 128
55 blt a6, a2, L(long_bytes) /* n > 128 */
56 li.w a6, 64
57 blt a6, a2, L(more_64bytes) /* 64 < n <= 128 */
58
59 li.w a6, 32
60 blt a6, a2, L(more_32bytes) /* 32 < n <= 64 */
61
/* 16 < n <= 32: first 16 bytes plus last 16 bytes; the two halves
   overlap whenever n < 32.  All loads are issued before any store.  */
62 ld.d t0, a1, 0
63 ld.d t1, a1, 8
64 ld.d t2, a4, -16
65 ld.d t3, a4, -8
66
67 st.d t0, a0, 0
68 st.d t1, a0, 8
69 st.d t2, a3, -16
70 st.d t3, a3, -8
71 jr ra
72
/* 64 < n <= 128: copy an 8-byte head, then 32-byte blocks with the
   destination 8-byte aligned, then the last 32 bytes from the end.  */
73 L(more_64bytes):
74 srli.d t8, a0, 3
75 slli.d t8, t8, 3 /* t8 = a0 rounded down to a multiple of 8 */
76 addi.d t8, t8, 0x8 /* t8 = first 8-byte boundary strictly above a0 */
77 sub.d a7, a0, t8 /* a7 = a0 - t8, a value in [-8, -1] */
78
79 ld.d t0, a1, 0 /* head: first 8 bytes, covers a0 .. t8 - 1 */
80 sub.d a1, a1, a7 /* advance source by t8 - a0 to match t8 */
81 st.d t0, a0, 0
82 add.d a7, a7, a2 /* a7 = bytes from t8 to destination end */
83 addi.d a7, a7, -0x20 /* bias by 32: the last 32 bytes are copied after the loop */
84
85 L(loop_32):
86 ld.d t0, a1, 0
87 ld.d t1, a1, 8
88 ld.d t2, a1, 16
89 ld.d t3, a1, 24
90
91 st.d t0, t8, 0
92 st.d t1, t8, 8
93 st.d t2, t8, 16
94 st.d t3, t8, 24
95
96 addi.d t8, t8, 0x20
97 addi.d a1, a1, 0x20
98 addi.d a7, a7, -0x20
99 blt zero, a7, L(loop_32) /* repeat while more than 32 bytes are still uncopied */
100
/* At most 32 bytes remain; copy the final 32 bytes relative to the end
   pointers, overlapping bytes the loop already wrote.  */
101 ld.d t4, a4, -32
102 ld.d t5, a4, -24
103 ld.d t6, a4, -16
104 ld.d t7, a4, -8
105
106 st.d t4, a3, -32
107 st.d t5, a3, -24
108 st.d t6, a3, -16
109 st.d t7, a3, -8
110
111 jr ra
112
/* 32 < n <= 64: first 32 bytes plus last 32 bytes; the halves overlap
   whenever n < 64.  All loads are issued before any store.  */
113 L(more_32bytes):
114 ld.d t0, a1, 0
115 ld.d t1, a1, 8
116 ld.d t2, a1, 16
117 ld.d t3, a1, 24
118
119 ld.d t4, a4, -32
120 ld.d t5, a4, -24
121 ld.d t6, a4, -16
122 ld.d t7, a4, -8
123
124 st.d t0, a0, 0
125 st.d t1, a0, 8
126 st.d t2, a0, 16
127 st.d t3, a0, 24
128
129 st.d t4, a3, -32
130 st.d t5, a3, -24
131 st.d t6, a3, -16
132 st.d t7, a3, -8
133
134 jr ra
135
/* n <= 16: halve the access width until it fits, then copy the first
   and last element of that width (they overlap unless n is exactly
   twice the width).  */
136 L(less_16bytes):
137 srai.d a6, a2, 3 /* a6 = n / 8; zero means n < 8 */
138 beqz a6, L(less_8bytes)
139
/* 8 <= n <= 16: first 8 and last 8 bytes.  */
140 ld.d t0, a1, 0
141 ld.d t1, a4, -8
142 st.d t0, a0, 0
143 st.d t1, a3, -8
144
145 jr ra
146
147 L(less_8bytes):
148 srai.d a6, a2, 2 /* a6 = n / 4; zero means n < 4 */
149 beqz a6, L(less_4bytes)
150
/* 4 <= n <= 7: first 4 and last 4 bytes.  */
151 ld.w t0, a1, 0
152 ld.w t1, a4, -4
153 st.w t0, a0, 0
154 st.w t1, a3, -4
155
156 jr ra
157
158 L(less_4bytes):
159 srai.d a6, a2, 1 /* a6 = n / 2; zero means n < 2 */
160 beqz a6, L(less_2bytes)
161
/* 2 <= n <= 3: first 2 and last 2 bytes.  */
162 ld.h t0, a1, 0
163 ld.h t1, a4, -2
164 st.h t0, a0, 0
165 st.h t1, a3, -2
166
167 jr ra
168
169 L(less_2bytes):
170 beqz a2, L(less_1bytes) /* n == 0: nothing to copy */
171
172 ld.b t0, a1, 0 /* n == 1 */
173 st.b t0, a0, 0
174 jr ra
175
176 L(less_1bytes):
177 jr ra
178
/* n > 128: align the destination cursor to 8 bytes, copy 128 bytes per
   iteration, then finish through a computed jump into an unrolled
   ld/st sequence.  */
179 L(long_bytes):
180 srli.d t8, a0, 3
181 slli.d t8, t8, 3 /* t8 = a0 rounded down to a multiple of 8 */
182 beq a0, t8, L(start) /* destination already aligned: no head copy needed */
183 ld.d t0, a1, 0 /* head: first 8 bytes */
184
185 addi.d t8, t8, 0x8 /* t8 = first 8-byte boundary above a0 */
186 st.d t0, a0, 0
187 sub.d a7, a0, t8 /* a7 = a0 - t8, a value in [-7, -1] */
188 sub.d a1, a1, a7 /* advance source by t8 - a0 to match t8 */
189
190 L(start):
191 addi.d a5, a3, -0x80 /* a5 = highest cursor value that still has 128 bytes ahead of it */
192 blt a5, t8, L(align_end_proc) /* fewer than 128 bytes left: skip the loop */
193
/* Each iteration moves 128 bytes as two 64-byte chunks through t0-t7.  */
194 L(loop_128):
195 LD_64(a1, 0)
196 ST_64(t8, 0)
197 LD_64(a1, 64)
198 addi.d a1, a1, 0x80
199 ST_64(t8, 64)
200 addi.d t8, t8, 0x80
201 bge a5, t8, L(loop_128) /* repeat while at least 128 bytes remain */
202
/* Tail dispatch.  Between 0 and 127 bytes remain.  The 15 ld/st pairs
   below are ordered from offset 112 down to offset 0; each pair is two
   instructions (8 bytes of code) and copies 8 bytes of data.  The jump
   target is the final "ld.d t0, a4, -8" minus 8 code bytes per whole
   doubleword remaining, so exactly the pairs for offsets
   (words - 1) * 8 .. 0 execute.  The trailing 8-byte copy taken from
   the end pointers covers the last 0..7 bytes.

   The constant 34 is the instruction distance from the pcaddi to that
   final ld.d (pcaddi, andi, sub.d, jr, then 30 instructions of pairs).
   Any instruction added or removed in between must be reflected in
   that constant.  */
203 L(align_end_proc):
204 sub.d a2, a3, t8 /* a2 = bytes remaining, 0..127 */
205 pcaddi t1, 34 /* t1 = PC + 34 * 4 */
206 andi t2, a2, 0x78 /* t2 = 8 * (whole doublewords remaining) */
207 sub.d t1, t1, t2 /* back up one ld/st pair per doubleword */
208 jr t1
209
210 ld.d t0, a1, 112
211 st.d t0, t8, 112
212 ld.d t0, a1, 104
213 st.d t0, t8, 104
214 ld.d t0, a1, 96
215 st.d t0, t8, 96
216 ld.d t0, a1, 88
217 st.d t0, t8, 88
218 ld.d t0, a1, 80
219 st.d t0, t8, 80
220 ld.d t0, a1, 72
221 st.d t0, t8, 72
222 ld.d t0, a1, 64
223 st.d t0, t8, 64
224 ld.d t0, a1, 56
225 st.d t0, t8, 56
226 ld.d t0, a1, 48
227 st.d t0, t8, 48
228 ld.d t0, a1, 40
229 st.d t0, t8, 40
230 ld.d t0, a1, 32
231 st.d t0, t8, 32
232 ld.d t0, a1, 24
233 st.d t0, t8, 24
234 ld.d t0, a1, 16
235 st.d t0, t8, 16
236 ld.d t0, a1, 8
237 st.d t0, t8, 8
238 ld.d t0, a1, 0
239 st.d t0, t8, 0
240 ld.d t0, a4, -8 /* last 8 bytes, addressed from the end pointers */
241 st.d t0, a3, -8
242
243 jr ra
244 END(MEMCPY_NAME)
245
246 libc_hidden_builtin_def (MEMCPY_NAME)
247 #endif