]> git.ipfire.org Git - thirdparty/glibc.git/blob - sysdeps/ia64/memcpy.S
ia64: relocate out of ports/ subdir
[thirdparty/glibc.git] / sysdeps / ia64 / memcpy.S
1 /* Optimized version of the standard memcpy() function.
2 This file is part of the GNU C Library.
3 Copyright (C) 2000-2014 Free Software Foundation, Inc.
4 Contributed by Dan Pop for Itanium <Dan.Pop@cern.ch>.
5 Rewritten for McKinley by Sverre Jarp, HP Labs/CERN <Sverre.Jarp@cern.ch>
6
7 The GNU C Library is free software; you can redistribute it and/or
8 modify it under the terms of the GNU Lesser General Public
9 License as published by the Free Software Foundation; either
10 version 2.1 of the License, or (at your option) any later version.
11
12 The GNU C Library is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
16
17 You should have received a copy of the GNU Lesser General Public
18 License along with the GNU C Library; if not, see
19 <http://www.gnu.org/licenses/>. */
20
21 /* Return: dest
22
23 Inputs:
24 in0: dest
25 in1: src
26 in2: byte count
27
28 An assembly implementation of the algorithm used by the generic C
29 version from glibc. The case when source and dest are aligned is
30 treated separately, for extra performance.
31
32 In this form, memcpy assumes little endian mode. For big endian mode,
33 sh1 must be computed using an extra instruction: sub sh1 = 64, sh1
34 and the order of r[MEMLAT] and r[MEMLAT+1] must be reversed in the
35 shrp instruction. */
36
37 #define USE_LFETCH
38 #define USE_FLP
39 #include <sysdep.h>
40 #undef ret
41
// Distance (bytes) the lfetch pointers ptr1/ptr2 run ahead of the copy.
42 #define LFETCH_DIST 500
43
44 #define ALIGN_UNROLL_no 4 // no. of elements (unroll factor of .l1)
45 #define ALIGN_UNROLL_sh 2 // (shift amount: log2 of the above)
46
// Assumed load-to-use latency in pipeline stages; sizes the rotating
// banks and the load/store stage offsets in the pipelined loops.
47 #define MEMLAT 8
// Rotating-register count: four banks of MEMLAT+2 registers, rounded
// up to a multiple of 8 as the alloc rotating region requires.
48 #define Nrot ((4*(MEMLAT+2) + 7) & ~7)
49
// At or below this length the copy is done byte by byte (.copy_bytes).
50 #define OP_T_THRES 16
51 #define OPSIZ 8 // word size in bytes (referenced in comments)
52
// Register roles, fixed for the whole routine:
53 #define loopcnt r14 // counted-loop trip count (copied into ar.lc)
54 #define elemcnt r15 // number of 8-byte elements to move
55 #define saved_pr r16 // caller's predicate registers (restored on exit)
56 #define saved_lc r17 // caller's ar.lc (restored on exit)
57 #define adest r18 // second destination pointer (dest + 16)
58 #define dest r19 // running destination pointer
59 #define asrc r20 // second / 8B-aligned source pointer
60 #define src r21 // running source pointer
61 #define len r22 // bytes remaining
62 #define tmp2 r23 // scratch
63 #define tmp3 r24 // scratch (also shrp result in LOOP)
64 #define tmp4 r25 // scratch (also shrp result in LOOP)
65 #define ptable r26 // address of .table (loop-offset table)
66 #define ploop56 r27 // address of .loop56 (computed-branch base)
67 #define loopaddr r28 // resolved loop address = ploop56 - table entry
68 #define sh1 r29 // merge shift = 8 * (src % 8) bits
69 #define ptr1 r30 // lfetch pointer running ahead of src
70 #define ptr2 r31 // lfetch pointer running ahead of dest
71
// Plain mov today; kept as a macro so the move variant used for these
// unit moves can be changed in one place.
72 #define movi0 mov
73
74 #define p_scr p6 // scratch predicate
75 #define p_xtr p7 // src 8B- but not 16B-aligned: move one extra word
76 #define p_nxtr p8 // complement of p_xtr
77 #define p_few p9 // len <= OP_T_THRES: take the byte-copy path
78
// Select the aligned-loop flavour: floating-point (ldf8/stf8 through
// the rotating FP banks fr/fq/fs/ft) or integer (ld8/st8 through the
// rotating GR banks r/q/s/t).
79 #if defined(USE_FLP)
80 #define load ldf8
81 #define store stf8
82 #define tempreg f6
83 #define the_r fr
84 #define the_s fs
85 #define the_t ft
86 #define the_q fq
87 #define the_w fw
88 #define the_x fx
89 #define the_y fy
90 #define the_z fz
91 #elif defined(USE_INT)
92 #define load ld8
93 #define store st8
94 #define tempreg tmp2
95 #define the_r r
96 #define the_s s
97 #define the_t t
98 #define the_q q
99 #define the_w w
100 #define the_x x
101 #define the_y y
102 #define the_z z
103 #endif
104
105 #ifdef GAS_ALIGN_BREAKS_UNWIND_INFO
106 /* Manually force proper loop-alignment. Note: be sure to
107 double-check the code-layout after making any changes to
108 this routine! */
109 # define ALIGN(n) { nop 0 }
110 #else
111 # define ALIGN(n) .align n
112 #endif
113
// LOOP(shift): software-pipelined copy loop for a source that is
// misaligned by shift/8 bytes within its 8-byte word.  Each iteration
// loads two aligned 8-byte words from asrc and, MEMLAT stages later,
// merges adjacent words with shrp (funnel shift by `shift' bits) and
// stores two merged words; br.ctop rotates the register banks.  On
// loop exit it falls through to .copy_bytes for the tail.  This
// variant interleaves lfetch prefetches through ptr1/ptr2.
114 #if defined(USE_LFETCH)
115 #define LOOP(shift) \
116 ALIGN(32); \
117 .loop##shift##: \
118 { .mmb \
119 (p[0]) ld8.nt1 r[0] = [asrc], 8 ; \
120 (p[0]) lfetch.nt1 [ptr1], 16 ; \
121 nop.b 0 ; \
122 } { .mib \
123 (p[MEMLAT+1]) st8 [dest] = tmp3, 8 ; \
124 (p[MEMLAT]) shrp tmp3 = r[MEMLAT], s[MEMLAT+1], shift ; \
125 nop.b 0 ;; \
126 } { .mmb \
127 (p[0]) ld8.nt1 s[0] = [asrc], 8 ; \
128 (p[0]) lfetch.nt1 [ptr2], 16 ; \
129 nop.b 0 ; \
130 } { .mib \
131 (p[MEMLAT+1]) st8 [dest] = tmp4, 8 ; \
132 (p[MEMLAT]) shrp tmp4 = s[MEMLAT], r[MEMLAT], shift ; \
133 br.ctop.sptk.many .loop##shift \
134 ;; } \
135 { .mib \
136 br.cond.sptk.many .copy_bytes ; /* deal with the remaining bytes */ \
137 }
138 #else
// Same pipelined merge-copy loop, without the lfetch prefetches.
139 #define LOOP(shift) \
140 ALIGN(32); \
141 .loop##shift##: \
142 { .mmb \
143 (p[0]) ld8.nt1 r[0] = [asrc], 8 ; \
144 nop.b 0 ; \
145 } { .mib \
146 (p[MEMLAT+1]) st8 [dest] = tmp3, 8 ; \
147 (p[MEMLAT]) shrp tmp3 = r[MEMLAT], s[MEMLAT+1], shift ; \
148 nop.b 0 ;; \
149 } { .mmb \
150 (p[0]) ld8.nt1 s[0] = [asrc], 8 ; \
151 nop.b 0 ; \
152 } { .mib \
153 (p[MEMLAT+1]) st8 [dest] = tmp4, 8 ; \
154 (p[MEMLAT]) shrp tmp4 = s[MEMLAT], r[MEMLAT], shift ; \
155 br.ctop.sptk.many .loop##shift \
156 ;; } \
157 { .mib \
158 br.cond.sptk.many .copy_bytes ; /* deal with the remaining bytes */ \
159 }
160 #endif
161
162
/* memcpy: in0 = dest, in1 = src, in2 = byte count; returns dest (ret0).
   Overall flow:
     len == 0            -> .restore_and_exit
     len <= OP_T_THRES   -> .copy_bytes (byte loop .l3)
     align dest to 8B    (.l0)
     src also 8B-aligned -> unrolled pipelined loop .l1, then .l2/.l3 tail
     otherwise           -> shrp-merging LOOP(sh1), selected via .table
                            and entered through a computed branch (b6).  */
163 ENTRY(memcpy)
164 { .mmi
165 .prologue
166 alloc r2 = ar.pfs, 3, Nrot - 3, 0, Nrot // 3 ins, Nrot-3 locals, 0 outs, Nrot rotating
167 .rotr r[MEMLAT+1], s[MEMLAT+2], q[MEMLAT+1], t[MEMLAT+1]
168 .rotp p[MEMLAT+2]
169 .rotf fr[MEMLAT+1], fq[MEMLAT+1], fs[MEMLAT+1], ft[MEMLAT+1]
170 mov ret0 = in0 // return value = dest
171 .save pr, saved_pr
172 movi0 saved_pr = pr // save the predicate registers
173 } { .mmi
174 and tmp4 = 7, in0 // check if destination is aligned
175 mov dest = in0 // dest
176 mov src = in1 // src
177 ;; }
178 { .mii
179 cmp.eq p_scr, p0 = in2, r0 // if (len == 0)
180 .save ar.lc, saved_lc
181 movi0 saved_lc = ar.lc // save the loop counter
182 .body
183 cmp.ge p_few, p0 = OP_T_THRES, in2 // is len <= OP_T_THRES
184 } { .mbb
185 mov len = in2 // len
186 (p_scr) br.cond.dpnt.few .restore_and_exit // Branch no. 1: return dest
187 (p_few) br.cond.dpnt.many .copy_bytes // Branch no. 2: copy byte by byte
188 ;; }
// len > OP_T_THRES: warm up the prefetch streams and check dest alignment.
189 { .mmi
190 #if defined(USE_LFETCH)
191 lfetch.nt1 [dest] // prefetch first dest line
192 lfetch.nt1 [src] // prefetch first src line
193 #endif
194 shr.u elemcnt = len, 3 // elemcnt = len / 8
195 } { .mib
196 cmp.eq p_scr, p0 = tmp4, r0 // is destination aligned?
197 sub loopcnt = 7, tmp4 // (8 - dest%8) - 1, for ar.lc
198 (p_scr) br.cond.dptk.many .dest_aligned
199 ;; }
200 { .mmi
201 ld1 tmp2 = [src], 1 // first byte of the alignment copy
202 sub len = len, loopcnt, 1 // reduce len
203 movi0 ar.lc = loopcnt //
204 } { .mib
205 cmp.ne p_scr, p0 = 0, loopcnt // avoid loading beyond end-point
206 ;; }
207
208 .l0: // ---------------------------- // L0: Align src on 8-byte boundary
209 { .mmi
210 st1 [dest] = tmp2, 1 //
211 (p_scr) ld1 tmp2 = [src], 1 //
212 } { .mib
213 cmp.lt p_scr, p0 = 1, loopcnt // avoid load beyond end-point
214 add loopcnt = -1, loopcnt
215 br.cloop.dptk.few .l0 // runs ar.lc+1 times
216 ;; }
217
// dest is now 8B-aligned; decide between the aligned loop and the
// shrp-based misaligned loop, and set up the prefetch pointers.
218 .dest_aligned:
219 { .mmi
220 and tmp4 = 7, src // ready for alignment check
221 shr.u elemcnt = len, 3 // elemcnt = len / 8
222 ;; }
223 { .mib
224 cmp.ne p_scr, p0 = tmp4, r0 // is source also aligned
225 tbit.nz p_xtr, p_nxtr = src, 3 // prepare a separate move if src
226 } { .mib // is not 16B aligned
227 add ptr2 = LFETCH_DIST, dest // prefetch address
228 add ptr1 = LFETCH_DIST, src
229 (p_scr) br.cond.dptk.many .src_not_aligned
230 ;; }
231
232 // The optimal case, when dest, and src are aligned
// 4-way-unrolled pipelined loop; needs at least ALIGN_UNROLL_no
// elements (one more when p_xtr forces the extra leading word).
233
234 .both_aligned:
235 { .mmi
236 .pred.rel "mutex",p_xtr,p_nxtr
237 (p_xtr) cmp.gt p_scr, p0 = ALIGN_UNROLL_no+1, elemcnt // Need N + 1 to qualify
238 (p_nxtr) cmp.gt p_scr, p0 = ALIGN_UNROLL_no, elemcnt // Need only N to qualify
239 movi0 pr.rot = 1 << 16 // set rotating predicates
240 } { .mib
241 (p_scr) br.cond.dpnt.many .copy_full_words
242 ;; }
243
244 { .mmi
245 (p_xtr) load tempreg = [src], 8 // extra word so src becomes 16B-aligned
246 (p_xtr) add elemcnt = -1, elemcnt
247 movi0 ar.ec = MEMLAT + 1 // set the epilog counter
248 ;; }
249 { .mmi
250 (p_xtr) add len = -8, len // account for the extra word
251 add asrc = 16, src // one bank apart (for USE_INT)
252 shr.u loopcnt = elemcnt, ALIGN_UNROLL_sh // cater for unrolling
253 ;;}
254 { .mmi
255 add loopcnt = -1, loopcnt
256 (p_xtr) store [dest] = tempreg, 8 // copy the "extra" word
257 nop.i 0
258 ;; }
259 { .mib
260 add adest = 16, dest
261 movi0 ar.lc = loopcnt // set the loop counter
262 ;; }
263
264 #ifdef GAS_ALIGN_BREAKS_UNWIND_INFO
265 { nop 0 }
266 #else
267 .align 32
268 #endif
269 #if defined(USE_FLP)
270 .l1: // ------------------------------- // L1: Everything a multiple of 8
271 { .mmi
272 #if defined(USE_LFETCH)
273 (p[0]) lfetch.nt1 [ptr2],32
274 #endif
275 (p[0]) ldfp8 the_r[0],the_q[0] = [src], 16 // paired 8-byte FP loads
276 (p[0]) add len = -32, len // 32 bytes per iteration
277 } {.mmb
278 (p[MEMLAT]) store [dest] = the_r[MEMLAT], 8
279 (p[MEMLAT]) store [adest] = the_s[MEMLAT], 8
280 ;; }
281 { .mmi
282 #if defined(USE_LFETCH)
283 (p[0]) lfetch.nt1 [ptr1],32
284 #endif
285 (p[0]) ldfp8 the_s[0], the_t[0] = [src], 16
286 } {.mmb
287 (p[MEMLAT]) store [dest] = the_q[MEMLAT], 24
288 (p[MEMLAT]) store [adest] = the_t[MEMLAT], 24
289 br.ctop.dptk.many .l1
290 ;; }
291 #elif defined(USE_INT)
292 .l1: // ------------------------------- // L1: Everything a multiple of 8
293 { .mmi
294 (p[0]) load the_r[0] = [src], 8
295 (p[0]) load the_q[0] = [asrc], 8 // second stream, 16B apart
296 (p[0]) add len = -32, len // 32 bytes per iteration
297 } {.mmb
298 (p[MEMLAT]) store [dest] = the_r[MEMLAT], 8
299 (p[MEMLAT]) store [adest] = the_q[MEMLAT], 8
300 ;; }
301 { .mmi
302 (p[0]) load the_s[0] = [src], 24
303 (p[0]) load the_t[0] = [asrc], 24
304 } {.mmb
305 (p[MEMLAT]) store [dest] = the_s[MEMLAT], 24
306 (p[MEMLAT]) store [adest] = the_t[MEMLAT], 24
307 #if defined(USE_LFETCH)
308 ;; }
309 { .mmb
310 (p[0]) lfetch.nt1 [ptr2],32
311 (p[0]) lfetch.nt1 [ptr1],32
312 #endif
313 br.ctop.dptk.many .l1
314 ;; }
315 #endif
316
// Copy up to ALIGN_UNROLL_no leftover 8-byte words one at a time.
317 .copy_full_words:
318 { .mib
319 cmp.gt p_scr, p0 = 8, len // fewer than 8 bytes left?
320 shr.u elemcnt = len, 3 // remaining full words
321 (p_scr) br.cond.dpnt.many .copy_bytes
322 ;; }
323 { .mii
324 load tempreg = [src], 8
325 add loopcnt = -1, elemcnt // elemcnt - 1 for ar.lc
326 ;; }
327 { .mii
328 cmp.ne p_scr, p0 = 0, loopcnt // more than one word?
329 mov ar.lc = loopcnt //
330 ;; }
331
332 .l2: // ------------------------------- // L2: Max 4 words copied separately
333 { .mmi
334 store [dest] = tempreg, 8
335 (p_scr) load tempreg = [src], 8 // next word, unless at end
336 add len = -8, len
337 } { .mib
338 cmp.lt p_scr, p0 = 1, loopcnt // avoid load beyond end-point
339 add loopcnt = -1, loopcnt
340 br.cloop.dptk.few .l2
341 ;; }
342
// Tail / short-copy path: move the remaining len bytes one at a time.
343 .copy_bytes:
344 { .mib
345 cmp.eq p_scr, p0 = len, r0 // is len == 0 ?
346 add loopcnt = -1, len // len--;
347 (p_scr) br.cond.spnt .restore_and_exit
348 ;; }
349 { .mii
350 ld1 tmp2 = [src], 1
351 movi0 ar.lc = loopcnt
352 cmp.ne p_scr, p0 = 0, loopcnt // avoid load beyond end-point
353 ;; }
354
355 .l3: // ------------------------------- // L3: Final byte move
356 { .mmi
357 st1 [dest] = tmp2, 1
358 (p_scr) ld1 tmp2 = [src], 1
359 } { .mib
360 cmp.lt p_scr, p0 = 1, loopcnt // avoid load beyond end-point
361 add loopcnt = -1, loopcnt
362 br.cloop.dptk.few .l3
363 ;; }
364
365 .restore_and_exit:
366 { .mmi
367 movi0 pr = saved_pr, -1 // restore the predicate registers
368 ;; }
369 { .mib
370 movi0 ar.lc = saved_lc // restore the loop counter
371 br.ret.sptk.many b0
372 ;; }
373
374
// src and dest are mutually misaligned: select LOOP(sh1), where
// sh1 = 8*(src%8), through the .table offset table and enter it via b6.
375 .src_not_aligned:
376 { .mmi
377 cmp.gt p_scr, p0 = 16, len
378 and sh1 = 7, src // sh1 = src % 8
379 shr.u loopcnt = len, 4 // element-cnt = len / 16
380 } { .mib
381 add tmp4 = @ltoff(.table), gp // GOT slot for .table
382 add tmp3 = @ltoff(.loop56), gp // GOT slot for .loop56
383 (p_scr) br.cond.dpnt.many .copy_bytes // do byte by byte if too few
384 ;; }
385 { .mmi
386 and asrc = -8, src // asrc = (-8) -- align src for loop
387 add loopcnt = -1, loopcnt // loopcnt--
388 shl sh1 = sh1, 3 // sh1 = 8 * (src % 8)
389 } { .mmi
390 ld8 ptable = [tmp4] // ptable = &table
391 ld8 ploop56 = [tmp3] // ploop56 = &loop56
392 and tmp2 = -16, len // tmp2 = len & -OPSIZ
393 ;; }
394 { .mmi
395 add tmp3 = ptable, sh1 // tmp3 = &table + sh1
396 add src = src, tmp2 // src += len & (-16)
397 movi0 ar.lc = loopcnt // set LC
398 ;; }
399 { .mmi
400 ld8 tmp4 = [tmp3] // tmp4 = loop offset
401 sub len = len, tmp2 // len -= len & (-16)
402 movi0 ar.ec = MEMLAT + 2 // one more pass needed
403 ;; }
404 { .mmi
405 ld8 s[1] = [asrc], 8 // preload
406 sub loopaddr = ploop56,tmp4 // loopadd = &loop56 - loop offset
407 movi0 pr.rot = 1 << 16 // set rotating predicates
408 ;; }
409 { .mib
410 nop.m 0
411 movi0 b6 = loopaddr
412 br b6 // jump to the appropriate loop
413 ;; }
414
// One pipelined merge loop per possible bit-shift (8..56).
415 LOOP(8)
416 LOOP(16)
417 LOOP(24)
418 LOOP(32)
419 LOOP(40)
420 LOOP(48)
421 LOOP(56)
422 END(memcpy)
423 libc_hidden_builtin_def (memcpy)
424
// Offset table for the misaligned-copy computed branch: the entry at
// index sh1/8 holds .loop56 - .loopN (N = sh1), and the code branches
// to ploop56 - entry.  Entry 0 is a placeholder: sh1 == 0 means the
// source is aligned and this path is never taken.
425 .rodata
426 .align 8
427 .table:
428 data8 0 // dummy entry
429 data8 .loop56 - .loop8
430 data8 .loop56 - .loop16
431 data8 .loop56 - .loop24
432 data8 .loop56 - .loop32
433 data8 .loop56 - .loop40
434 data8 .loop56 - .loop48
435 data8 .loop56 - .loop56