/* Optimized version of the standard memcpy() function.
   This file is part of the GNU C Library.
   Copyright (C) 2000-2023 Free Software Foundation, Inc.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

/* Return: dest

   Inputs:
	in0:	dest
	in1:	src
	in2:	byte count

   An assembly implementation of the algorithm used by the generic C
   version from glibc.  The case when source and dest are aligned is
   treated separately, for extra performance.

   In this form, memcpy assumes little endian mode.  For big endian mode,
   sh1 must be computed using an extra instruction: sub sh1 = 64, sh1
   and the order of r[MEMLAT] and r[MEMLAT+1] must be reverted in the
   shrp instruction.  */

/* Configuration: use lfetch prefetching and the floating-point-register
   (ldf8/stf8) data path; USE_INT below selects the integer path instead.  */
#define USE_LFETCH
#define USE_FLP
#include <sysdep.h>
#undef ret

/* Distance (in bytes) ahead of the current pointers that lfetch runs.  */
#define LFETCH_DIST	500

#define ALIGN_UNROLL_no	4	// no. of elements
#define ALIGN_UNROLL_sh	2	// (shift amount)

/* Software-pipeline depth (load-to-use stages).  Nrot is the rotating
   register count, rounded up to a multiple of 8 as the architecture
   requires for the rotating region.  */
#define MEMLAT	8
#define Nrot	((4*(MEMLAT+2) + 7) & ~7)

/* Copies of at most OP_T_THRES bytes go byte-by-byte; OPSIZ is the
   word size used for full-word copies.  */
#define OP_T_THRES	16
#define OPSIZ		8

/* Symbolic names for the static general registers used below.  */
#define loopcnt		r14
#define elemcnt		r15
#define saved_pr	r16
#define saved_lc	r17
#define adest		r18
#define dest		r19
#define asrc		r20
#define src		r21
#define len		r22
#define tmp2		r23
#define tmp3		r24
#define tmp4		r25
#define ptable		r26
#define ploop56		r27
#define loopaddr	r28
#define sh1		r29
#define ptr1		r30
#define ptr2		r31

#define movi0		mov

/* Scratch / steering predicates.  */
#define p_scr		p6
#define p_xtr		p7
#define p_nxtr		p8
#define p_few		p9

#if defined(USE_FLP)
/* Floating-point data path: 8-byte transfers through FP registers.  */
#define load		ldf8
#define store		stf8
#define tempreg		f6
#define the_r		fr
#define the_s		fs
#define the_t		ft
#define the_q		fq
#define the_w		fw
#define the_x		fx
#define the_y		fy
#define the_z		fz
#elif defined(USE_INT)
/* Integer data path: 8-byte transfers through general registers.  */
#define load		ld8
#define store		st8
#define tempreg		tmp2
#define the_r		r
#define the_s		s
#define the_t		t
#define the_q		q
#define the_w		w
#define the_x		x
#define the_y		y
#define the_z		z
#endif

#ifdef GAS_ALIGN_BREAKS_UNWIND_INFO
/* Manually force proper loop-alignment.  Note: be sure to
   double-check the code-layout after making any changes to
   this routine! */
# define ALIGN(n)	{ nop 0 }
#else
# define ALIGN(n)	.align n
#endif
111 | ||
/* LOOP(shift): one software-pipelined shift-and-merge copy loop for a
   source misalignment of shift/8 bytes.  Loads two 8-byte words per
   iteration from the aligned-down source, merges adjacent words with
   shrp (little endian), and stores two aligned 8-byte words.  Falls
   through to .copy_bytes for the tail.  The lfetch variant additionally
   prefetches LFETCH_DIST bytes ahead via ptr1/ptr2.  */
#if defined(USE_LFETCH)
#define LOOP(shift)							\
		ALIGN(32);						\
.loop##shift##:								\
	{ .mmb								\
(p[0])	ld8.nt1	r[0] = [asrc], 8 ;					\
(p[0])	lfetch.nt1	[ptr1], 16 ;					\
		nop.b 0 ;						\
	} { .mib							\
(p[MEMLAT+1]) st8 [dest] = tmp3, 8 ;					\
(p[MEMLAT])	shrp	tmp3 = r[MEMLAT], s[MEMLAT+1], shift ;		\
		nop.b 0 ;;						\
	 } { .mmb							\
(p[0])	ld8.nt1	s[0] = [asrc], 8 ;					\
(p[0])	lfetch.nt1	[ptr2], 16 ;					\
		nop.b 0 ;						\
	} { .mib							\
(p[MEMLAT+1]) st8 [dest] = tmp4, 8 ;					\
(p[MEMLAT])	shrp	tmp4 = s[MEMLAT], r[MEMLAT], shift ;		\
		br.ctop.sptk.many .loop##shift				\
		;; }							\
	{ .mib								\
		br.cond.sptk.many .copy_bytes ; /* deal with the remaining bytes */  \
	}
#else
#define LOOP(shift)							\
		ALIGN(32);						\
.loop##shift##:								\
	{ .mmb								\
(p[0])	ld8.nt1	r[0] = [asrc], 8 ;					\
		nop.b 0 ;						\
	} { .mib							\
(p[MEMLAT+1]) st8 [dest] = tmp3, 8 ;					\
(p[MEMLAT])	shrp	tmp3 = r[MEMLAT], s[MEMLAT+1], shift ;		\
		nop.b 0 ;;						\
	 } { .mmb							\
(p[0])	ld8.nt1	s[0] = [asrc], 8 ;					\
		nop.b 0 ;						\
	} { .mib							\
(p[MEMLAT+1]) st8 [dest] = tmp4, 8 ;					\
(p[MEMLAT])	shrp	tmp4 = s[MEMLAT], r[MEMLAT], shift ;		\
		br.ctop.sptk.many .loop##shift				\
		;; }							\
	{ .mib								\
		br.cond.sptk.many .copy_bytes ; /* deal with the remaining bytes */  \
	}
#endif
159 | ||
160 | ||
/* void *memcpy (void *dest /* in0 */, const void *src /* in1 */,
		 size_t n /* in2 */)
   Returns dest in ret0.  Saves/restores pr and ar.lc; uses the rotating
   register region allocated below for the software-pipelined loops.  */
ENTRY(memcpy)
{ .mmi
	.prologue
	alloc	r2 = ar.pfs, 3, Nrot - 3, 0, Nrot
	.rotr	r[MEMLAT+1], s[MEMLAT+2], q[MEMLAT+1], t[MEMLAT+1]
	.rotp	p[MEMLAT+2]
	.rotf	fr[MEMLAT+1], fq[MEMLAT+1], fs[MEMLAT+1], ft[MEMLAT+1]
	mov	ret0 = in0		// return value = dest
	.save	pr, saved_pr
	movi0	saved_pr = pr		// save the predicate registers
} { .mmi
	and	tmp4 = 7, in0		// check if destination is aligned
	mov	dest = in0		// dest
	mov	src = in1		// src
;; }
{ .mii
	cmp.eq	p_scr, p0 = in2, r0	// if (len == 0)
	.save	ar.lc, saved_lc
	movi0	saved_lc = ar.lc	// save the loop counter
	.body
	cmp.ge	p_few, p0 = OP_T_THRES, in2 // is len <= OP_T_THRESH
} { .mbb
	mov	len = in2		// len
(p_scr)	br.cond.dpnt.few .restore_and_exit // Branch no. 1: return dest
(p_few)	br.cond.dpnt.many .copy_bytes	// Branch no. 2: copy byte by byte
;; }
{ .mmi
#if defined(USE_LFETCH)
	lfetch.nt1 [dest]		// prime the prefetcher
	lfetch.nt1 [src]		//
#endif
	shr.u	elemcnt = len, 3	// elemcnt = len / 8
} { .mib
	cmp.eq	p_scr, p0 = tmp4, r0	// is destination aligned?
	sub	loopcnt = 7, tmp4	//
(p_scr)	br.cond.dptk.many .dest_aligned
;; }
{ .mmi
	ld1	tmp2 = [src], 1		//
	sub	len = len, loopcnt, 1	// reduce len
	movi0	ar.lc = loopcnt		//
} { .mib
	cmp.ne	p_scr, p0 = 0, loopcnt	// avoid loading beyond end-point
;; }

.l0:	// ---------------------------- // L0: Align dest on 8-byte boundary
{ .mmi
	st1	[dest] = tmp2, 1	//
(p_scr)	ld1	tmp2 = [src], 1		//
} { .mib
	cmp.lt	p_scr, p0 = 1, loopcnt	// avoid load beyond end-point
	add	loopcnt = -1, loopcnt
	br.cloop.dptk.few .l0		//
;; }

.dest_aligned:
{ .mmi
	and	tmp4 = 7, src		// ready for alignment check
	shr.u	elemcnt = len, 3	// elemcnt = len / 8
;; }
{ .mib
	cmp.ne	p_scr, p0 = tmp4, r0	// is source also aligned?
	tbit.nz	p_xtr, p_nxtr = src, 3	// prepare a separate move if src
} { .mib				// is not 16B aligned
	add	ptr2 = LFETCH_DIST, dest // prefetch address
	add	ptr1 = LFETCH_DIST, src
(p_scr)	br.cond.dptk.many .src_not_aligned
;; }

// The optimal case, when dest and src are both 8-byte aligned.

.both_aligned:
{ .mmi
	.pred.rel "mutex",p_xtr,p_nxtr
(p_xtr)	cmp.gt	p_scr, p0 = ALIGN_UNROLL_no+1, elemcnt // Need N + 1 to qualify
(p_nxtr) cmp.gt	p_scr, p0 = ALIGN_UNROLL_no, elemcnt // Need only N to qualify
	movi0	pr.rot = 1 << 16	// set rotating predicates
} { .mib
(p_scr)	br.cond.dpnt.many .copy_full_words
;; }

{ .mmi
(p_xtr)	load	tempreg = [src], 8	// fringe word to reach 16B alignment
(p_xtr)	add	elemcnt = -1, elemcnt
	movi0	ar.ec = MEMLAT + 1	// set the epilog counter
;; }
{ .mmi
(p_xtr)	add	len = -8, len		//
	add	asrc = 16, src		// one bank apart (for USE_INT)
	shr.u	loopcnt = elemcnt, ALIGN_UNROLL_sh // cater for unrolling
;; }
{ .mmi
	add	loopcnt = -1, loopcnt
(p_xtr)	store	[dest] = tempreg, 8	// copy the "extra" word
	nop.i	0
;; }
{ .mib
	add	adest = 16, dest
	movi0	ar.lc = loopcnt		// set the loop counter
;; }

#ifdef GAS_ALIGN_BREAKS_UNWIND_INFO
	{ nop 0 }
#else
	.align	32
#endif
#if defined(USE_FLP)
.l1: // ------------------------------- // L1: Everything a multiple of 8
{ .mmi
#if defined(USE_LFETCH)
(p[0])	lfetch.nt1	[ptr2], 32
#endif
(p[0])	ldfp8	the_r[0], the_q[0] = [src], 16 // paired 8-byte FP loads
(p[0])	add	len = -32, len		// 32 bytes per iteration
} { .mmb
(p[MEMLAT])	store	[dest] = the_r[MEMLAT], 8
(p[MEMLAT])	store	[adest] = the_s[MEMLAT], 8
;; }
{ .mmi
#if defined(USE_LFETCH)
(p[0])	lfetch.nt1	[ptr1], 32
#endif
(p[0])	ldfp8	the_s[0], the_t[0] = [src], 16
} { .mmb
(p[MEMLAT])	store	[dest] = the_q[MEMLAT], 24
(p[MEMLAT])	store	[adest] = the_t[MEMLAT], 24
	br.ctop.dptk.many .l1
;; }
#elif defined(USE_INT)
.l1: // ------------------------------- // L1: Everything a multiple of 8
{ .mmi
(p[0])	load	the_r[0] = [src], 8
(p[0])	load	the_q[0] = [asrc], 8
(p[0])	add	len = -32, len		// 32 bytes per iteration
} { .mmb
(p[MEMLAT])	store	[dest] = the_r[MEMLAT], 8
(p[MEMLAT])	store	[adest] = the_q[MEMLAT], 8
;; }
{ .mmi
(p[0])	load	the_s[0] = [src], 24
(p[0])	load	the_t[0] = [asrc], 24
} { .mmb
(p[MEMLAT])	store	[dest] = the_s[MEMLAT], 24
(p[MEMLAT])	store	[adest] = the_t[MEMLAT], 24
#if defined(USE_LFETCH)
;; }
{ .mmb
(p[0])	lfetch.nt1	[ptr2], 32
(p[0])	lfetch.nt1	[ptr1], 32
#endif
	br.ctop.dptk.many .l1
;; }
#endif

.copy_full_words:
{ .mib
	cmp.gt	p_scr, p0 = 8, len	// fewer than one word left?
	shr.u	elemcnt = len, 3	//
(p_scr)	br.cond.dpnt.many .copy_bytes
;; }
{ .mii
	load	tempreg = [src], 8
	add	loopcnt = -1, elemcnt	//
;; }
{ .mii
	cmp.ne	p_scr, p0 = 0, loopcnt	//
	mov	ar.lc = loopcnt		//
;; }

.l2: // ------------------------------- // L2: Max 4 words copied separately
{ .mmi
	store	[dest] = tempreg, 8
(p_scr)	load	tempreg = [src], 8	//
	add	len = -8, len
} { .mib
	cmp.lt	p_scr, p0 = 1, loopcnt	// avoid load beyond end-point
	add	loopcnt = -1, loopcnt
	br.cloop.dptk.few .l2
;; }

.copy_bytes:
{ .mib
	cmp.eq	p_scr, p0 = len, r0	// is len == 0 ?
	add	loopcnt = -1, len	// len--;
(p_scr)	br.cond.spnt	.restore_and_exit
;; }
{ .mii
	ld1	tmp2 = [src], 1
	movi0	ar.lc = loopcnt
	cmp.ne	p_scr, p0 = 0, loopcnt	// avoid load beyond end-point
;; }

.l3: // ------------------------------- // L3: Final byte move
{ .mmi
	st1	[dest] = tmp2, 1
(p_scr)	ld1	tmp2 = [src], 1
} { .mib
	cmp.lt	p_scr, p0 = 1, loopcnt	// avoid load beyond end-point
	add	loopcnt = -1, loopcnt
	br.cloop.dptk.few .l3
;; }

.restore_and_exit:
{ .mmi
	movi0	pr = saved_pr, -1	// restore the predicate registers
;; }
{ .mib
	movi0	ar.lc = saved_lc	// restore the loop counter
	br.ret.sptk.many b0
;; }


/* Misaligned source: pick the shrp-merge loop matching src % 8 via the
   offset table in .rodata, then indirect-branch to it through b6.  */
.src_not_aligned:
{ .mmi
	cmp.gt	p_scr, p0 = 16, len
	and	sh1 = 7, src		// sh1 = src % 8
	shr.u	loopcnt = len, 4	// element-cnt = len / 16
} { .mib
	add	tmp4 = @ltoff(.table), gp
	add	tmp3 = @ltoff(.loop56), gp
(p_scr)	br.cond.dpnt.many .copy_bytes	// do byte by byte if too few
;; }
{ .mmi
	and	asrc = -8, src		// asrc = src & (-8) -- align src for loop
	add	loopcnt = -1, loopcnt	// loopcnt--
	shl	sh1 = sh1, 3		// sh1 = 8 * (src % 8)
} { .mmi
	ld8	ptable = [tmp4]		// ptable = &table
	ld8	ploop56 = [tmp3]	// ploop56 = &loop56
	and	tmp2 = -16, len		// tmp2 = len & -OPSIZ
;; }
{ .mmi
	add	tmp3 = ptable, sh1	// tmp3 = &table + sh1
	add	src = src, tmp2		// src += len & (-16)
	movi0	ar.lc = loopcnt		// set LC
;; }
{ .mmi
	ld8	tmp4 = [tmp3]		// tmp4 = loop offset
	sub	len = len, tmp2		// len -= len & (-16)
	movi0	ar.ec = MEMLAT + 2	// one more pass needed
;; }
{ .mmi
	ld8	s[1] = [asrc], 8	// preload
	sub	loopaddr = ploop56, tmp4 // loopaddr = &loop56 - loop offset
	movi0	pr.rot = 1 << 16	// set rotating predicates
;; }
{ .mib
	nop.m	0
	movi0	b6 = loopaddr
	br	b6			// jump to the appropriate loop
;; }

	LOOP(8)
	LOOP(16)
	LOOP(24)
	LOOP(32)
	LOOP(40)
	LOOP(48)
	LOOP(56)
END(memcpy)
libc_hidden_builtin_def (memcpy)
422 | ||
/* Offsets of each LOOP(shift) entry point, relative to .loop56, indexed
   by 8 * (src % 8).  Read by .src_not_aligned to compute the indirect
   branch target: loopaddr = &loop56 - table[src % 8].  */
	.rodata
	.align	8
.table:
	data8	0			// dummy entry (src % 8 == 0 never gets here)
	data8	.loop56 - .loop8
	data8	.loop56 - .loop16
	data8	.loop56 - .loop24
	data8	.loop56 - .loop32
	data8	.loop56 - .loop40
	data8	.loop56 - .loop48
	data8	.loop56 - .loop56