sysdeps/ia64/memcpy.S
/* Optimized version of the standard memcpy() function.
   This file is part of the GNU C Library.
   Copyright (C) 2000, 2001, 2003 Free Software Foundation, Inc.
   Contributed by Dan Pop for Itanium <Dan.Pop@cern.ch>.
   Rewritten for McKinley by Sverre Jarp, HP Labs/CERN <Sverre.Jarp@cern.ch>

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, write to the Free
   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
   02111-1307 USA.  */

/* Return: dest

   Inputs:
        in0:    dest
        in1:    src
        in2:    byte count

   An assembly implementation of the algorithm used by the generic C
   version from glibc.  The case when source and dest are aligned is
   treated separately, for extra performance.

   In this form, memcpy assumes little endian mode.  For big endian mode,
   sh1 must be computed using an extra instruction: sub sh1 = 64, sh1,
   and the order of r[MEMLAT] and r[MEMLAT+1] must be reversed in the
   shrp instruction.  */
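// Editor's note (not part of the original file): a minimal C sketch of the
// strategy implemented below -- align dest, then either plain 8-byte word
// copies or little-endian merging of neighbouring aligned source words --
// under the assumptions stated above.  The helper name memcpy_sketch is
// hypothetical; like the assembly, it reads whole aligned source words in
// the misaligned case.
//
//   #include <stddef.h>
//   #include <stdint.h>
//
//   static void *memcpy_sketch (void *dst, const void *src, size_t n)
//   {
//     unsigned char *d = dst;
//     const unsigned char *s = src;
//
//     if (n > 16)                                   /* OP_T_THRES */
//       {
//         while (((uintptr_t) d & 7) != 0 && n != 0)        /* align dest */
//           { *d++ = *s++; n--; }
//
//         if (((uintptr_t) s & 7) == 0)             /* src aligned too */
//           for (; n >= 8; n -= 8, d += 8, s += 8)
//             *(uint64_t *) d = *(const uint64_t *) s;
//         else                                      /* src misaligned: merge */
//           {
//             unsigned sh = ((uintptr_t) s & 7) * 8;        /* sh1 */
//             const uint64_t *as = (const uint64_t *) ((uintptr_t) s & ~(uintptr_t) 7);
//             uint64_t lo = *as++;
//             for (; n >= 8; n -= 8, d += 8, s += 8)
//               {
//                 uint64_t hi = *as++;
//                 *(uint64_t *) d = (lo >> sh) | (hi << (64 - sh));
//                 lo = hi;
//               }
//           }
//       }
//     for (; n != 0; n--)                           /* trailing bytes */
//       *d++ = *s++;
//     return dst;
//   }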

#define USE_LFETCH
#define USE_FLP
#include <sysdep.h>
#undef ret

#define LFETCH_DIST     500

#define ALIGN_UNROLL_no 4       // no. of elements
#define ALIGN_UNROLL_sh 2       // (shift amount)

#define MEMLAT  8
#define Nrot    ((4*(MEMLAT+2) + 7) & ~7)
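// Editor's note: with MEMLAT = 8 this works out to (4*(8+2) + 7) & ~7
// = 47 & ~7 = 40 rotating registers, already a multiple of 8 as the
// rotating-register region of alloc requires; the "+ 7) & ~7" rounds up
// to the next multiple of 8 for other MEMLAT values.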

#define OP_T_THRES      16
#define OPSIZ           8

#define loopcnt         r14
#define elemcnt         r15
#define saved_pr        r16
#define saved_lc        r17
#define adest           r18
#define dest            r19
#define asrc            r20
#define src             r21
#define len             r22
#define tmp2            r23
#define tmp3            r24
#define tmp4            r25
#define ptable          r26
#define ploop56         r27
#define loopaddr        r28
#define sh1             r29
#define ptr1            r30
#define ptr2            r31

#define movi0           mov

#define p_scr           p6
#define p_xtr           p7
#define p_nxtr          p8
#define p_few           p9

#if defined(USE_FLP)
#define load            ldf8
#define store           stf8
#define tempreg         f6
#define the_r           fr
#define the_s           fs
#define the_t           ft
#define the_q           fq
#define the_w           fw
#define the_x           fx
#define the_y           fy
#define the_z           fz
#elif defined(USE_INT)
#define load            ld8
#define store           st8
#define tempreg         tmp2
#define the_r           r
#define the_s           s
#define the_t           t
#define the_q           q
#define the_w           w
#define the_x           x
#define the_y           y
#define the_z           z
#endif

#ifdef GAS_ALIGN_BREAKS_UNWIND_INFO
/* Manually force proper loop-alignment.  Note: be sure to
   double-check the code-layout after making any changes to
   this routine! */
# define ALIGN(n)       { nop 0 }
#else
# define ALIGN(n)       .align n
#endif

#if defined(USE_LFETCH)
#define LOOP(shift)                                                     \
                ALIGN(32);                                              \
.loop##shift##:                                                         \
        { .mmb                                                          \
        (p[0])  ld8.nt1 r[0] = [asrc], 8 ;                              \
        (p[0])  lfetch.nt1 [ptr1], 16 ;                                 \
                nop.b 0 ;                                               \
        } { .mib                                                        \
        (p[MEMLAT+1]) st8 [dest] = tmp3, 8 ;                            \
        (p[MEMLAT]) shrp tmp3 = r[MEMLAT], s[MEMLAT+1], shift ;         \
                nop.b 0 ;;                                              \
        } { .mmb                                                        \
        (p[0])  ld8.nt1 s[0] = [asrc], 8 ;                              \
        (p[0])  lfetch.nt1 [ptr2], 16 ;                                 \
                nop.b 0 ;                                               \
        } { .mib                                                        \
        (p[MEMLAT+1]) st8 [dest] = tmp4, 8 ;                            \
        (p[MEMLAT]) shrp tmp4 = s[MEMLAT], r[MEMLAT], shift ;           \
                br.ctop.sptk.many .loop##shift                          \
        ;; }                                                            \
        { .mib                                                          \
                br.cond.sptk.many .copy_bytes ; /* deal with the remaining bytes */ \
        }
#else
#define LOOP(shift)                                                     \
                ALIGN(32);                                              \
.loop##shift##:                                                         \
        { .mmb                                                          \
        (p[0])  ld8.nt1 r[0] = [asrc], 8 ;                              \
                nop.b 0 ;                                               \
        } { .mib                                                        \
        (p[MEMLAT+1]) st8 [dest] = tmp3, 8 ;                            \
        (p[MEMLAT]) shrp tmp3 = r[MEMLAT], s[MEMLAT+1], shift ;         \
                nop.b 0 ;;                                              \
        } { .mmb                                                        \
        (p[0])  ld8.nt1 s[0] = [asrc], 8 ;                              \
                nop.b 0 ;                                               \
        } { .mib                                                        \
        (p[MEMLAT+1]) st8 [dest] = tmp4, 8 ;                            \
        (p[MEMLAT]) shrp tmp4 = s[MEMLAT], r[MEMLAT], shift ;           \
                br.ctop.sptk.many .loop##shift                          \
        ;; }                                                            \
        { .mib                                                          \
                br.cond.sptk.many .copy_bytes ; /* deal with the remaining bytes */ \
        }
#endif
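// Editor's note (descriptive, not in the original file): each LOOP(shift)
// iteration loads two more aligned 8-byte source words and stores two 8-byte
// destination words, each assembled from a pair of neighbouring source words.
// In C terms, the little-endian shrp merge is roughly
//   out = (w_lo >> shift) | (w_hi << (64 - shift));
// where w_lo is the lower-addressed aligned word and shift = 8 * (src % 8).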


ENTRY(memcpy)
{ .mmi
        .prologue
        alloc   r2 = ar.pfs, 3, Nrot - 3, 0, Nrot
        .rotr   r[MEMLAT+1], s[MEMLAT+2], q[MEMLAT+1], t[MEMLAT+1]
        .rotp   p[MEMLAT+2]
        .rotf   fr[MEMLAT+1], fq[MEMLAT+1], fs[MEMLAT+1], ft[MEMLAT+1]
        mov     ret0 = in0              // return value = dest
        .save pr, saved_pr
        movi0   saved_pr = pr           // save the predicate registers
} { .mmi
        and     tmp4 = 7, in0           // check if destination is aligned
        mov     dest = in0              // dest
        mov     src = in1               // src
;; }
{ .mii
        cmp.eq  p_scr, p0 = in2, r0     // if (len == 0)
        .save ar.lc, saved_lc
        movi0   saved_lc = ar.lc        // save the loop counter
        .body
        cmp.ge  p_few, p0 = OP_T_THRES, in2 // is len <= OP_T_THRES
} { .mbb
        mov     len = in2               // len
(p_scr) br.cond.dpnt.few .restore_and_exit // Branch no. 1: return dest
(p_few) br.cond.dpnt.many .copy_bytes   // Branch no. 2: copy byte by byte
;; }
{ .mmi
#if defined(USE_LFETCH)
        lfetch.nt1 [dest]               //
        lfetch.nt1 [src]                //
#endif
        shr.u   elemcnt = len, 3        // elemcnt = len / 8
} { .mib
        cmp.eq  p_scr, p0 = tmp4, r0    // is destination aligned?
        sub     loopcnt = 7, tmp4       //
(p_scr) br.cond.dptk.many .dest_aligned
;; }
{ .mmi
        ld1     tmp2 = [src], 1         //
        sub     len = len, loopcnt, 1   // reduce len
        movi0   ar.lc = loopcnt         //
} { .mib
        cmp.ne  p_scr, p0 = 0, loopcnt  // avoid loading beyond end-point
;; }

.l0:    // ---------------------------- // L0: Align dest on 8-byte boundary
{ .mmi
        st1     [dest] = tmp2, 1        //
(p_scr) ld1     tmp2 = [src], 1         //
} { .mib
        cmp.lt  p_scr, p0 = 1, loopcnt  // avoid load beyond end-point
        add     loopcnt = -1, loopcnt
        br.cloop.dptk.few .l0           //
;; }
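// Editor's note: entering .l0, tmp4 = dest & 7 is nonzero and loopcnt = 7 - tmp4,
// so br.cloop runs the loop ar.lc + 1 = 8 - tmp4 times -- exactly the number of
// bytes needed to reach the next 8-byte boundary; len was already reduced by
// loopcnt + 1 above.  For example, dest % 8 == 3 gives loopcnt = 4 and five
// single-byte copies.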

.dest_aligned:
{ .mmi
        and     tmp4 = 7, src           // ready for alignment check
        shr.u   elemcnt = len, 3        // elemcnt = len / 8
;; }
{ .mib
        cmp.ne  p_scr, p0 = tmp4, r0    // is source also aligned?
        tbit.nz p_xtr, p_nxtr = src, 3  // prepare a separate move if src
} { .mib                                // is not 16B aligned
        add     ptr2 = LFETCH_DIST, dest // prefetch address
        add     ptr1 = LFETCH_DIST, src
(p_scr) br.cond.dptk.many .src_not_aligned
;; }

// The optimal case, when dest and src are aligned

.both_aligned:
{ .mmi
        .pred.rel "mutex",p_xtr,p_nxtr
(p_xtr) cmp.gt  p_scr, p0 = ALIGN_UNROLL_no+1, elemcnt // Need N + 1 to qualify
(p_nxtr) cmp.gt p_scr, p0 = ALIGN_UNROLL_no, elemcnt   // Need only N to qualify
        movi0   pr.rot = 1 << 16        // set rotating predicates
} { .mib
(p_scr) br.cond.dpnt.many .copy_full_words
;; }

{ .mmi
(p_xtr) load    tempreg = [src], 8
(p_xtr) add     elemcnt = -1, elemcnt
        movi0   ar.ec = MEMLAT + 1      // set the epilog counter
;; }
{ .mmi
(p_xtr) add     len = -8, len           //
        add     asrc = 16, src          // one bank apart (for USE_INT)
        shr.u   loopcnt = elemcnt, ALIGN_UNROLL_sh // cater for unrolling
;; }
{ .mmi
        add     loopcnt = -1, loopcnt
(p_xtr) store   [dest] = tempreg, 8     // copy the "extra" word
        nop.i   0
;; }
{ .mib
        add     adest = 16, dest
        movi0   ar.lc = loopcnt         // set the loop counter
;; }
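// Editor's note: the predicated "extra" word copy above appears to serve the
// 16-byte ldfp8 load pairs in the USE_FLP loop below: when src is 8-byte but
// not 16-byte aligned (p_xtr), one 8-byte word is copied first so the unrolled
// loop can start on a 16-byte source boundary.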

#ifdef GAS_ALIGN_BREAKS_UNWIND_INFO
        { nop 0 }
#else
        .align  32
#endif
#if defined(USE_FLP)
.l1: // ------------------------------- // L1: Everything a multiple of 8
{ .mmi
#if defined(USE_LFETCH)
(p[0])  lfetch.nt1 [ptr2], 32
#endif
(p[0])  ldfp8   the_r[0], the_q[0] = [src], 16
(p[0])  add     len = -32, len
} { .mmb
(p[MEMLAT])     store [dest] = the_r[MEMLAT], 8
(p[MEMLAT])     store [adest] = the_s[MEMLAT], 8
;; }
{ .mmi
#if defined(USE_LFETCH)
(p[0])  lfetch.nt1 [ptr1], 32
#endif
(p[0])  ldfp8   the_s[0], the_t[0] = [src], 16
} { .mmb
(p[MEMLAT])     store [dest] = the_q[MEMLAT], 24
(p[MEMLAT])     store [adest] = the_t[MEMLAT], 24
        br.ctop.dptk.many .l1
;; }
#elif defined(USE_INT)
.l1: // ------------------------------- // L1: Everything a multiple of 8
{ .mmi
(p[0])  load    the_r[0] = [src], 8
(p[0])  load    the_q[0] = [asrc], 8
(p[0])  add     len = -32, len
} { .mmb
(p[MEMLAT])     store [dest] = the_r[MEMLAT], 8
(p[MEMLAT])     store [adest] = the_q[MEMLAT], 8
;; }
{ .mmi
(p[0])  load    the_s[0] = [src], 24
(p[0])  load    the_t[0] = [asrc], 24
} { .mmb
(p[MEMLAT])     store [dest] = the_s[MEMLAT], 24
(p[MEMLAT])     store [adest] = the_t[MEMLAT], 24
#if defined(USE_LFETCH)
;; }
{ .mmb
(p[0])  lfetch.nt1 [ptr2], 32
(p[0])  lfetch.nt1 [ptr1], 32
#endif
        br.ctop.dptk.many .l1
;; }
#endif

.copy_full_words:
{ .mib
        cmp.gt  p_scr, p0 = 8, len      //
        shr.u   elemcnt = len, 3        //
(p_scr) br.cond.dpnt.many .copy_bytes
;; }
{ .mii
        load    tempreg = [src], 8
        add     loopcnt = -1, elemcnt   //
;; }
{ .mii
        cmp.ne  p_scr, p0 = 0, loopcnt  //
        mov     ar.lc = loopcnt         //
;; }

.l2: // ------------------------------- // L2: Max 4 words copied separately
{ .mmi
        store   [dest] = tempreg, 8
(p_scr) load    tempreg = [src], 8      //
        add     len = -8, len
} { .mib
        cmp.lt  p_scr, p0 = 1, loopcnt  // avoid load beyond end-point
        add     loopcnt = -1, loopcnt
        br.cloop.dptk.few .l2
;; }

.copy_bytes:
{ .mib
        cmp.eq  p_scr, p0 = len, r0     // is len == 0 ?
        add     loopcnt = -1, len       // len--;
(p_scr) br.cond.spnt .restore_and_exit
;; }
{ .mii
        ld1     tmp2 = [src], 1
        movi0   ar.lc = loopcnt
        cmp.ne  p_scr, p0 = 0, loopcnt  // avoid load beyond end-point
;; }

.l3: // ------------------------------- // L3: Final byte move
{ .mmi
        st1     [dest] = tmp2, 1
(p_scr) ld1     tmp2 = [src], 1
} { .mib
        cmp.lt  p_scr, p0 = 1, loopcnt  // avoid load beyond end-point
        add     loopcnt = -1, loopcnt
        br.cloop.dptk.few .l3
;; }

.restore_and_exit:
{ .mmi
        movi0   pr = saved_pr, -1       // restore the predicate registers
;; }
{ .mib
        movi0   ar.lc = saved_lc        // restore the loop counter
        br.ret.sptk.many b0
;; }


.src_not_aligned:
{ .mmi
        cmp.gt  p_scr, p0 = 16, len
        and     sh1 = 7, src            // sh1 = src % 8
        shr.u   loopcnt = len, 4        // element-cnt = len / 16
} { .mib
        add     tmp4 = @ltoff(.table), gp
        add     tmp3 = @ltoff(.loop56), gp
(p_scr) br.cond.dpnt.many .copy_bytes   // do byte by byte if too few
;; }
{ .mmi
        and     asrc = -8, src          // asrc = src & -8 -- align src for loop
        add     loopcnt = -1, loopcnt   // loopcnt--
        shl     sh1 = sh1, 3            // sh1 = 8 * (src % 8)
} { .mmi
        ld8     ptable = [tmp4]         // ptable = &table
        ld8     ploop56 = [tmp3]        // ploop56 = &loop56
        and     tmp2 = -16, len         // tmp2 = len & -16
;; }
{ .mmi
        add     tmp3 = ptable, sh1      // tmp3 = &table + sh1
        add     src = src, tmp2         // src += len & (-16)
        movi0   ar.lc = loopcnt         // set LC
;; }
{ .mmi
        ld8     tmp4 = [tmp3]           // tmp4 = loop offset
        sub     len = len, tmp2         // len -= len & (-16)
        movi0   ar.ec = MEMLAT + 2      // one more pass needed
;; }
{ .mmi
        ld8     s[1] = [asrc], 8        // preload
        sub     loopaddr = ploop56, tmp4 // loopaddr = &loop56 - loop offset
        movi0   pr.rot = 1 << 16        // set rotating predicates
;; }
{ .mib
        nop.m   0
        movi0   b6 = loopaddr
        br      b6                      // jump to the appropriate loop
;; }
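// Editor's note: the computed branch above works as follows.  tmp4 holds
// table[src % 8] = .loop56 - .loop{8*(src % 8)}, so
//   loopaddr = &.loop56 - tmp4 = &.loop{8*(src % 8)}
// i.e. the LOOP variant whose shrp shift count matches the source
// misalignment.  For example, src % 8 == 3 selects .loop24 (shift = 24).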

LOOP(8)
LOOP(16)
LOOP(24)
LOOP(32)
LOOP(40)
LOOP(48)
LOOP(56)
END(memcpy)
libc_hidden_builtin_def (memcpy)

        .rodata
        .align 8
.table:
        data8   0                       // dummy entry
        data8   .loop56 - .loop8
        data8   .loop56 - .loop16
        data8   .loop56 - .loop24
        data8   .loop56 - .loop32
        data8   .loop56 - .loop40
        data8   .loop56 - .loop48
        data8   .loop56 - .loop56