]> git.ipfire.org Git - thirdparty/glibc.git/blob - sysdeps/ia64/memmove.S
Update copyright dates with scripts/update-copyrights.
[thirdparty/glibc.git] / sysdeps / ia64 / memmove.S
1 /* Optimized version of the standard memmove() function.
2 This file is part of the GNU C Library.
3 Copyright (C) 2000-2017 Free Software Foundation, Inc.
4 Contributed by Dan Pop <Dan.Pop@cern.ch>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
19
20 /* Return: dest
21
22 Inputs:
23 in0: dest
24 in1: src
25 in2: byte count
26
27 The core of the function is the memcpy implementation used in memcpy.S.
28 When bytes have to be copied backwards, only the easy case, when
29 all arguments are multiples of 8, is optimised.
30
31 In this form, it assumes little endian mode. For big endian mode,
32 sh1 must be computed using an extra instruction: sub sh1 = 64, sh1
33 or the UM.be bit should be cleared at the beginning and set at the end. */
34
35 #include <sysdep.h>
36 #undef ret
37
38 #define OP_T_THRES 16 /* misaligned forward copies of at most this many bytes go byte by byte */
39 #define OPSIZ 8 /* word size in bytes; in the visible code it is only referenced from comments */
40
41 #define adest r15 /* second store pointer, kept one word ahead of dest in .l0 */
42 #define saved_pr r17 /* copy of the predicate registers taken on entry */
43 #define saved_lc r18 /* copy of ar.lc taken on entry */
44 #define dest r19 /* running destination pointer */
45 #define src r20 /* running source pointer */
46 #define len r21 /* byte count; reduced as the misaligned forward path progresses */
47 #define asrc r22 /* second load pointer (.l0) or src rounded down to 8 bytes (shift loops) */
48 #define tmp2 r23 /* scratch: -dest, later len & -8 */
49 #define tmp3 r24 /* scratch */
50 #define tmp4 r25 /* scratch: (dest | src | len) & 7, later GOT slot / table entry */
51 #define ptable r26 /* address of .table */
52 #define ploop56 r27 /* address of .loop56 */
53 #define loopaddr r28 /* address of the shift loop selected through .table */
54 #define sh1 r29 /* src % 8, later scaled to 8 * (src % 8) */
55 #define loopcnt r30 /* iteration count that gets loaded into ar.lc */
56 #define value r31 /* datum in flight between a load and its store */
57
58 #ifdef GAS_ALIGN_BREAKS_UNWIND_INFO
59 # define ALIGN(n) { nop 0 } /* substitute an explicit nop for the .align directive */
60 #else
61 # define ALIGN(n) .align n
62 #endif
63
64 #define LOOP(shift) /* pipelined copy loop for a source misaligned by shift bits */ \
65 ALIGN(32); \
66 .loop##shift##: \
67 (p[0]) ld8 r[0] = [asrc], 8 ; /* w1 */ \
68 (p[MEMLAT+1]) st8 [dest] = value, 8 ; /* store the word merged one stage earlier */ \
69 (p[MEMLAT]) shrp value = r[MEMLAT], r[MEMLAT+1], shift ; /* merge two consecutive source words */ \
70 nop.b 0 ; \
71 nop.b 0 ; \
72 br.ctop.sptk .loop##shift ; /* rotate registers and predicates, loop while work remains */ \
73 br.cond.sptk .cpyfew ; /* deal with the remaining bytes */
74
75 #define MEMLAT 21 /* number of pipeline stages between a load and its matching store */
76 #define Nrot (((2*MEMLAT+3) + 7) & ~7) /* 2*MEMLAT+3 registers (r[] plus q[]) rounded up to a multiple of 8 */
77
78 ENTRY(memmove) /* memmove (in0 = dest, in1 = src, in2 = len): copy len bytes, choosing the direction so overlapping blocks are handled; returns dest */
79 .prologue
80 alloc r2 = ar.pfs, 3, Nrot - 3, 0, Nrot // 3 inputs, Nrot - 3 locals, 0 outputs, Nrot rotating registers
81 .rotr r[MEMLAT + 2], q[MEMLAT + 1] // rotating register arrays used by the pipelined loops
82 .rotp p[MEMLAT + 2] // rotating stage predicates
83 mov ret0 = in0 // return value = dest
84 .save pr, saved_pr
85 mov saved_pr = pr // save the predicate registers
86 .save ar.lc, saved_lc
87 mov saved_lc = ar.lc // save the loop counter
88 .body
89 or tmp3 = in0, in1 ;; // tmp3 = dest | src
90 or tmp3 = tmp3, in2 // tmp3 = dest | src | len
91 mov dest = in0 // dest
92 mov src = in1 // src
93 mov len = in2 // len
94 sub tmp2 = r0, in0 // tmp2 = -dest
95 cmp.eq p6, p0 = in2, r0 // if (len == 0)
96 (p6) br.cond.spnt .restore_and_exit;;// return dest;
97 and tmp4 = 7, tmp3 // tmp4 = (dest | src | len) & 7
98 cmp.le p6, p0 = dest, src // if dest <= src it's always safe; NOTE(review): cmp.le is a signed compare -- confirm that is intended for pointers
99 (p6) br.cond.spnt .forward // to copy forward
100 add tmp3 = src, len;; // tmp3 = src + len, one byte past the source block
101 cmp.lt p6, p0 = dest, tmp3 // if dest > src && dest < src + len (signed compare, see note above)
102 (p6) br.cond.spnt .backward // we have to copy backward
103 // dest >= src + len: the blocks do not overlap, fall through to the forward copy
104 .forward:
105 shr.u loopcnt = len, 4 ;; // loopcnt = len / 16
106 cmp.ne p6, p0 = tmp4, r0 // if ((dest | src | len) & 7 != 0)
107 (p6) br.cond.sptk .next // goto next;
108
109 // The optimal case, when dest, src and len are all multiples of 8
110
111 and tmp3 = 0xf, len // tmp3 = len % 16 (0 or 8 here)
112 mov pr.rot = 1 << 16 // set rotating predicates: only p16 (p[0]) starts out true
113 mov ar.ec = MEMLAT + 1 ;; // set the epilog counter
114 cmp.ne p6, p0 = tmp3, r0 // do we have to copy an extra word?
115 adds loopcnt = -1, loopcnt;; // --loopcnt
116 (p6) ld8 value = [src], 8;;
117 (p6) st8 [dest] = value, 8 // copy the "odd" word
118 mov ar.lc = loopcnt // set the loop counter
119 cmp.eq p6, p0 = 8, len
120 (p6) br.cond.spnt .restore_and_exit;;// the one-word special case
121 adds adest = 8, dest // set adest one word ahead of dest
122 adds asrc = 8, src ;; // set asrc one word ahead of src
123 nop.b 0 // get the "golden" alignment for
124 nop.b 0 // the next loop
125 .l0: // 16 bytes per iteration through two interleaved word streams
126 (p[0]) ld8 r[0] = [src], 16
127 (p[0]) ld8 q[0] = [asrc], 16
128 (p[MEMLAT]) st8 [dest] = r[MEMLAT], 16 // store the words loaded MEMLAT stages earlier
129 (p[MEMLAT]) st8 [adest] = q[MEMLAT], 16
130 br.ctop.dptk .l0 ;;
131 // aligned forward copy done; same exit sequence as .restore_and_exit
132 mov pr = saved_pr, -1 // restore the predicate registers
133 mov ar.lc = saved_lc // restore the loop counter
134 br.ret.sptk.many b0
135 .next: // forward copy where dest, src or len is not a multiple of 8
136 cmp.ge p6, p0 = OP_T_THRES, len // is len <= OP_T_THRES
137 and loopcnt = 7, tmp2 // loopcnt = -dest % 8
138 (p6) br.cond.spnt .cpyfew // copy byte by byte
139 ;;
140 cmp.eq p6, p0 = loopcnt, r0 // is dest already 8-byte aligned?
141 (p6) br.cond.sptk .dest_aligned
142 sub len = len, loopcnt // len -= -dest % 8
143 adds loopcnt = -1, loopcnt // --loopcnt
144 ;;
145 mov ar.lc = loopcnt
146 .l1: // copy -dest % 8 bytes
147 ld1 value = [src], 1 // value = *src++
148 ;;
149 st1 [dest] = value, 1 // *dest++ = value
150 br.cloop.dptk .l1
151 .dest_aligned: // dest is now a multiple of 8; src may still be misaligned
152 and sh1 = 7, src // sh1 = src % 8
153 and tmp2 = -8, len // tmp2 = len & -OPSIZ
154 and asrc = -8, src // asrc = src & -OPSIZ -- align src
155 shr.u loopcnt = len, 3 // loopcnt = len / 8
156 and len = 7, len;; // len = len % 8
157 adds loopcnt = -1, loopcnt // --loopcnt
158 addl tmp4 = @ltoff(.table), gp // tmp4 = address of the linkage-table slot for .table
159 addl tmp3 = @ltoff(.loop56), gp // tmp3 = address of the linkage-table slot for .loop56
160 mov ar.ec = MEMLAT + 1 // set EC
161 mov pr.rot = 1 << 16;; // set rotating predicates
162 mov ar.lc = loopcnt // set LC
163 cmp.eq p6, p0 = sh1, r0 // is the src aligned?
164 (p6) br.cond.sptk .src_aligned
165 add src = src, tmp2 // src += len & -OPSIZ
166 shl sh1 = sh1, 3 // sh1 = 8 * (src % 8)
167 ld8 ploop56 = [tmp3] // ploop56 = &loop56
168 ld8 ptable = [tmp4];; // ptable = &table
169 add tmp3 = ptable, sh1;; // tmp3 = &table + sh1
170 mov ar.ec = MEMLAT + 1 + 1 // one more pass needed (the shrp stage sits between load and store)
171 ld8 tmp4 = [tmp3];; // tmp4 = loop offset
172 sub loopaddr = ploop56,tmp4 // loopadd = &loop56 - loop offset
173 ld8 r[1] = [asrc], 8;; // w0: first aligned source word, loaded ahead of the loop
174 mov b6 = loopaddr;;
175 br b6 // jump to the appropriate loop
176
177 LOOP(8) /* selected when src % 8 == 1 */
178 LOOP(16) /* selected when src % 8 == 2 */
179 LOOP(24) /* selected when src % 8 == 3 */
180 LOOP(32) /* selected when src % 8 == 4 */
181 LOOP(40) /* selected when src % 8 == 5 */
182 LOOP(48) /* selected when src % 8 == 6 */
183 LOOP(56) /* selected when src % 8 == 7 */
184
185 .src_aligned: // src and dest both 8-byte aligned: plain word copy, len / 8 words
186 .l3:
187 (p[0]) ld8 r[0] = [src], 8
188 (p[MEMLAT]) st8 [dest] = r[MEMLAT], 8
189 br.ctop.dptk .l3
190 .cpyfew: // copy the remaining len bytes one at a time, forward
191 cmp.eq p6, p0 = len, r0 // is len == 0 ?
192 adds len = -1, len // --len;
193 (p6) br.cond.spnt .restore_and_exit ;;
194 mov ar.lc = len // loop runs ar.lc + 1 times, i.e. once per remaining byte
195 .l4:
196 ld1 value = [src], 1
197 ;;
198 st1 [dest] = value, 1
199 br.cloop.dptk .l4 ;;
200 .restore_and_exit: // common exit: ret0 already holds dest
201 mov pr = saved_pr, -1 // restore the predicate registers
202 mov ar.lc = saved_lc // restore the loop counter
203 br.ret.sptk.many b0
204
205 // In the case of a backward copy, optimise only the case when everything
206 // is a multiple of 8, otherwise copy byte by byte. The backward copy is
207 // used only when the blocks are overlapping and dest > src.
208
209 .backward:
210 shr.u loopcnt = len, 3 // loopcnt = len / 8
211 add src = src, len // src points one byte past the end
212 add dest = dest, len ;; // dest points one byte past the end
213 mov ar.ec = MEMLAT + 1 // set the epilog counter
214 mov pr.rot = 1 << 16 // set rotating predicates
215 adds loopcnt = -1, loopcnt // --loopcnt
216 cmp.ne p6, p0 = tmp4, r0 // if ((dest | src | len) & 7 != 0)
217 (p6) br.cond.sptk .bytecopy ;; // copy byte by byte backward
218 adds src = -8, src // src points to the last word
219 adds dest = -8, dest // dest points to the last word
220 mov ar.lc = loopcnt;; // set the loop counter
221 .l5: // pipelined word copy from the last word down to the first
222 (p[0]) ld8 r[0] = [src], -8
223 (p[MEMLAT]) st8 [dest] = r[MEMLAT], -8
224 br.ctop.dptk .l5
225 br.cond.sptk .restore_and_exit
226 .bytecopy: // backward byte copy; ar.ec and pr.rot were already set in .backward
227 adds src = -1, src // src points to the last byte
228 adds dest = -1, dest // dest points to the last byte
229 adds loopcnt = -1, len;; // loopcnt = len - 1
230 mov ar.lc = loopcnt;; // set the loop counter
231 .l6: // pipelined byte copy from the last byte down to the first
232 (p[0]) ld1 r[0] = [src], -1
233 (p[MEMLAT]) st1 [dest] = r[MEMLAT], -1
234 br.ctop.dptk .l6
235 br.cond.sptk .restore_and_exit
236 END(memmove)
237
238 .rodata
239 .align 8
240 .table: // byte distances from each shift loop to .loop56; read at offset 8 * (src % 8)
241 data8 0 // dummy entry
242 data8 .loop56 - .loop8 // src % 8 == 1
243 data8 .loop56 - .loop16 // src % 8 == 2
244 data8 .loop56 - .loop24 // src % 8 == 3
245 data8 .loop56 - .loop32 // src % 8 == 4
246 data8 .loop56 - .loop40 // src % 8 == 5
247 data8 .loop56 - .loop48 // src % 8 == 6
248 data8 .loop56 - .loop56 // src % 8 == 7 (distance 0)
249
250 libc_hidden_builtin_def (memmove) /* macro defined outside this file; presumably provides the hidden internal alias -- confirm */