/* Copy SIZE bytes from SRC to DEST.
   For UltraSPARC-III.
   Copyright (C) 2001-2018 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   Contributed by David S. Miller (davem@redhat.com)

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* Block-store ASI: 64-byte block stores to primary address space.  */
#define ASI_BLK_P 0xf0
/* FPRS.FEF bit: enables the FPU.  */
#define FPRS_FEF  0x04
/* Save caller's %fprs in %o5 and enable the FPU; the matching exit
   restores the FEF state from %o5 (so %o5 must be preserved between
   the two -- see the notes above __memcpy_ultra3).  */
#define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs
#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs

#ifndef XCC
#define USE_BPR
#define XCC xcc
#endif

#if IS_IN (libc)

	/* Tell the assembler these globals are used as scratch here.  */
	.register	%g2, #scratch
	.register	%g3, #scratch
	.register	%g6, #scratch

	.text
/* mempcpy returns dst + n rather than dst.  Seed %g5 (the value the
   shared "out:" epilogue returns) with dst + n, then fall into the
   memcpy body at local label 101.  */
ENTRY(__mempcpy_ultra3)
	ba,pt		%XCC, 101f
	 add		%o0, %o2, %g5
END(__mempcpy_ultra3)
bb769ab6 UD |
46 | /* Special/non-trivial issues of this code: |
47 | * | |
48 | * 1) %o5 is preserved from VISEntryHalf to VISExitHalf | |
49 | * 2) Only low 32 FPU registers are used so that only the | |
50 | * lower half of the FPU register set is dirtied by this | |
51 | * code. This is especially important in the kernel. | |
52 | * 3) This code never prefetches cachelines past the end | |
53 | * of the source buffer. | |
54 | * | |
55 | * The cheetah's flexible spine, oversized liver, enlarged heart, | |
56 | * slender muscular body, and claws make it the swiftest hunter | |
57 | * in Africa and the fastest animal on land. Can reach speeds | |
58 | * of up to 2.4GB per second. | |
59 | */ | |
60 | .align 32 | |
3afd5a3b | 61 | ENTRY(__memcpy_ultra3) |
bb769ab6 UD |
62 | |
63 | 100: /* %o0=dst, %o1=src, %o2=len */ | |
3ee3a002 | 64 | mov %o0, %g5 |
88d85d4f | 65 | 101: |
3ee3a002 UD |
66 | cmp %o2, 0 |
67 | be,pn %XCC, out | |
68 | 218: or %o0, %o1, %o3 | |
69 | cmp %o2, 16 | |
70 | bleu,a,pn %XCC, small_copy | |
71 | or %o3, %o2, %o3 | |
bb769ab6 | 72 | |
3ee3a002 UD |
73 | cmp %o2, 256 |
74 | blu,pt %XCC, medium_copy | |
75 | andcc %o3, 0x7, %g0 | |
bb769ab6 | 76 | |
3ee3a002 UD |
77 | ba,pt %xcc, enter |
78 | andcc %o0, 0x3f, %g2 | |
bb769ab6 | 79 | |
3ee3a002 | 80 | /* Here len >= 256 and condition codes reflect execution |
bb769ab6 UD |
81 | * of "andcc %o0, 0x7, %g2", done by caller. |
82 | */ | |
83 | .align 64 | |
3ee3a002 | 84 | enter: |
bb769ab6 | 85 | /* Is 'dst' already aligned on an 64-byte boundary? */ |
3ee3a002 | 86 | be,pt %XCC, 2f |
bb769ab6 UD |
87 | |
88 | /* Compute abs((dst & 0x3f) - 0x40) into %g2. This is the number | |
89 | * of bytes to copy to make 'dst' 64-byte aligned. We pre- | |
90 | * subtract this from 'len'. | |
91 | */ | |
3ee3a002 UD |
92 | sub %g2, 0x40, %g2 |
93 | sub %g0, %g2, %g2 | |
94 | sub %o2, %g2, %o2 | |
bb769ab6 UD |
95 | |
96 | /* Copy %g2 bytes from src to dst, one byte at a time. */ | |
3ee3a002 UD |
97 | 1: ldub [%o1 + 0x00], %o3 |
98 | add %o1, 0x1, %o1 | |
99 | add %o0, 0x1, %o0 | |
100 | subcc %g2, 0x1, %g2 | |
bb769ab6 | 101 | |
3ee3a002 UD |
102 | bg,pt %XCC, 1b |
103 | stb %o3, [%o0 + -1] | |
bb769ab6 | 104 | |
3ee3a002 UD |
105 | 2: VISEntryHalf |
106 | and %o1, 0x7, %g1 | |
107 | ba,pt %xcc, begin | |
108 | alignaddr %o1, %g0, %o1 | |
bb769ab6 UD |
109 | |
110 | .align 64 | |
3ee3a002 UD |
111 | begin: |
112 | prefetch [%o1 + 0x000], #one_read | |
113 | prefetch [%o1 + 0x040], #one_read | |
114 | andn %o2, (0x40 - 1), %o4 | |
115 | prefetch [%o1 + 0x080], #one_read | |
116 | prefetch [%o1 + 0x0c0], #one_read | |
117 | ldd [%o1 + 0x000], %f0 | |
118 | prefetch [%o1 + 0x100], #one_read | |
119 | ldd [%o1 + 0x008], %f2 | |
120 | prefetch [%o1 + 0x140], #one_read | |
121 | ldd [%o1 + 0x010], %f4 | |
122 | prefetch [%o1 + 0x180], #one_read | |
123 | faligndata %f0, %f2, %f16 | |
124 | ldd [%o1 + 0x018], %f6 | |
125 | faligndata %f2, %f4, %f18 | |
126 | ldd [%o1 + 0x020], %f8 | |
127 | faligndata %f4, %f6, %f20 | |
128 | ldd [%o1 + 0x028], %f10 | |
129 | faligndata %f6, %f8, %f22 | |
130 | ||
131 | ldd [%o1 + 0x030], %f12 | |
132 | faligndata %f8, %f10, %f24 | |
133 | ldd [%o1 + 0x038], %f14 | |
134 | faligndata %f10, %f12, %f26 | |
135 | ldd [%o1 + 0x040], %f0 | |
136 | ||
137 | sub %o4, 0x80, %o4 | |
138 | add %o1, 0x40, %o1 | |
139 | ba,pt %xcc, loop | |
140 | srl %o4, 6, %o3 | |
141 | ||
142 | .align 64 | |
143 | loop: | |
144 | ldd [%o1 + 0x008], %f2 | |
145 | faligndata %f12, %f14, %f28 | |
146 | ldd [%o1 + 0x010], %f4 | |
147 | faligndata %f14, %f0, %f30 | |
148 | stda %f16, [%o0] ASI_BLK_P | |
149 | ldd [%o1 + 0x018], %f6 | |
150 | faligndata %f0, %f2, %f16 | |
151 | ||
152 | ldd [%o1 + 0x020], %f8 | |
153 | faligndata %f2, %f4, %f18 | |
154 | ldd [%o1 + 0x028], %f10 | |
155 | faligndata %f4, %f6, %f20 | |
156 | ldd [%o1 + 0x030], %f12 | |
157 | faligndata %f6, %f8, %f22 | |
158 | ldd [%o1 + 0x038], %f14 | |
159 | faligndata %f8, %f10, %f24 | |
160 | ||
161 | ldd [%o1 + 0x040], %f0 | |
162 | prefetch [%o1 + 0x180], #one_read | |
163 | faligndata %f10, %f12, %f26 | |
164 | subcc %o3, 0x01, %o3 | |
165 | add %o1, 0x40, %o1 | |
166 | bg,pt %XCC, loop | |
167 | add %o0, 0x40, %o0 | |
bb769ab6 UD |
168 | |
169 | /* Finally we copy the last full 64-byte block. */ | |
3ee3a002 UD |
170 | loopfini: |
171 | ldd [%o1 + 0x008], %f2 | |
172 | faligndata %f12, %f14, %f28 | |
173 | ldd [%o1 + 0x010], %f4 | |
174 | faligndata %f14, %f0, %f30 | |
175 | stda %f16, [%o0] ASI_BLK_P | |
176 | ldd [%o1 + 0x018], %f6 | |
177 | faligndata %f0, %f2, %f16 | |
178 | ldd [%o1 + 0x020], %f8 | |
179 | faligndata %f2, %f4, %f18 | |
180 | ldd [%o1 + 0x028], %f10 | |
181 | faligndata %f4, %f6, %f20 | |
182 | ldd [%o1 + 0x030], %f12 | |
183 | faligndata %f6, %f8, %f22 | |
184 | ldd [%o1 + 0x038], %f14 | |
185 | faligndata %f8, %f10, %f24 | |
186 | cmp %g1, 0 | |
187 | be,pt %XCC, 1f | |
188 | add %o0, 0x40, %o0 | |
189 | ldd [%o1 + 0x040], %f0 | |
190 | 1: faligndata %f10, %f12, %f26 | |
191 | faligndata %f12, %f14, %f28 | |
192 | faligndata %f14, %f0, %f30 | |
193 | stda %f16, [%o0] ASI_BLK_P | |
194 | add %o0, 0x40, %o0 | |
195 | add %o1, 0x40, %o1 | |
196 | membar #Sync | |
bb769ab6 UD |
197 | |
198 | /* Now we copy the (len modulo 64) bytes at the end. | |
199 | * Note how we borrow the %f0 loaded above. | |
200 | * | |
201 | * Also notice how this code is careful not to perform a | |
3ee3a002 | 202 | * load past the end of the src buffer. |
bb769ab6 | 203 | */ |
3ee3a002 UD |
204 | loopend: |
205 | and %o2, 0x3f, %o2 | |
206 | andcc %o2, 0x38, %g2 | |
207 | be,pn %XCC, endcruft | |
208 | subcc %g2, 0x8, %g2 | |
209 | be,pn %XCC, endcruft | |
210 | cmp %g1, 0 | |
211 | ||
212 | be,a,pt %XCC, 1f | |
213 | ldd [%o1 + 0x00], %f0 | |
214 | ||
215 | 1: ldd [%o1 + 0x08], %f2 | |
216 | add %o1, 0x8, %o1 | |
217 | sub %o2, 0x8, %o2 | |
218 | subcc %g2, 0x8, %g2 | |
219 | faligndata %f0, %f2, %f8 | |
220 | std %f8, [%o0 + 0x00] | |
221 | be,pn %XCC, endcruft | |
222 | add %o0, 0x8, %o0 | |
223 | ldd [%o1 + 0x08], %f0 | |
224 | add %o1, 0x8, %o1 | |
225 | sub %o2, 0x8, %o2 | |
226 | subcc %g2, 0x8, %g2 | |
227 | faligndata %f2, %f0, %f8 | |
228 | std %f8, [%o0 + 0x00] | |
229 | bne,pn %XCC, 1b | |
230 | add %o0, 0x8, %o0 | |
bb769ab6 UD |
231 | |
232 | /* If anything is left, we copy it one byte at a time. | |
233 | * Note that %g1 is (src & 0x3) saved above before the | |
234 | * alignaddr was performed. | |
235 | */ | |
3ee3a002 | 236 | endcruft: |
bb769ab6 UD |
237 | cmp %o2, 0 |
238 | add %o1, %g1, %o1 | |
239 | VISExitHalf | |
3ee3a002 UD |
240 | be,pn %XCC, out |
241 | sub %o0, %o1, %o3 | |
bb769ab6 | 242 | |
3ee3a002 UD |
243 | andcc %g1, 0x7, %g0 |
244 | bne,pn %icc, small_copy_unaligned | |
245 | andcc %o2, 0x8, %g0 | |
246 | be,pt %icc, 1f | |
247 | nop | |
248 | ldx [%o1], %o5 | |
249 | stx %o5, [%o1 + %o3] | |
250 | add %o1, 0x8, %o1 | |
bb769ab6 | 251 | |
3ee3a002 UD |
252 | 1: andcc %o2, 0x4, %g0 |
253 | be,pt %icc, 1f | |
254 | nop | |
255 | lduw [%o1], %o5 | |
256 | stw %o5, [%o1 + %o3] | |
257 | add %o1, 0x4, %o1 | |
bb769ab6 | 258 | |
3ee3a002 UD |
259 | 1: andcc %o2, 0x2, %g0 |
260 | be,pt %icc, 1f | |
261 | nop | |
262 | lduh [%o1], %o5 | |
263 | sth %o5, [%o1 + %o3] | |
264 | add %o1, 0x2, %o1 | |
bb769ab6 | 265 | |
3ee3a002 UD |
266 | 1: andcc %o2, 0x1, %g0 |
267 | be,pt %icc, out | |
268 | nop | |
269 | ldub [%o1], %o5 | |
270 | ba,pt %xcc, out | |
271 | stb %o5, [%o1 + %o3] | |
272 | ||
273 | medium_copy: /* 16 < len <= 64 */ | |
274 | bne,pn %XCC, small_copy_unaligned | |
275 | sub %o0, %o1, %o3 | |
276 | ||
277 | medium_copy_aligned: | |
278 | andn %o2, 0x7, %o4 | |
279 | and %o2, 0x7, %o2 | |
280 | 1: subcc %o4, 0x8, %o4 | |
281 | ldx [%o1], %o5 | |
282 | stx %o5, [%o1 + %o3] | |
283 | bgu,pt %XCC, 1b | |
284 | add %o1, 0x8, %o1 | |
285 | andcc %o2, 0x4, %g0 | |
286 | be,pt %XCC, 1f | |
287 | nop | |
288 | sub %o2, 0x4, %o2 | |
289 | lduw [%o1], %o5 | |
290 | stw %o5, [%o1 + %o3] | |
291 | add %o1, 0x4, %o1 | |
292 | 1: cmp %o2, 0 | |
293 | be,pt %XCC, out | |
294 | nop | |
295 | ba,pt %xcc, small_copy_unaligned | |
296 | nop | |
bb769ab6 | 297 | |
3ee3a002 UD |
298 | small_copy: /* 0 < len <= 16 */ |
299 | andcc %o3, 0x3, %g0 | |
300 | bne,pn %XCC, small_copy_unaligned | |
301 | sub %o0, %o1, %o3 | |
bb769ab6 | 302 | |
3ee3a002 UD |
303 | small_copy_aligned: |
304 | subcc %o2, 4, %o2 | |
305 | lduw [%o1], %g1 | |
306 | stw %g1, [%o1 + %o3] | |
307 | bgu,pt %XCC, small_copy_aligned | |
308 | add %o1, 4, %o1 | |
bb769ab6 | 309 | |
3ee3a002 UD |
310 | out: retl |
311 | mov %g5, %o0 | |
bb769ab6 | 312 | |
3ee3a002 UD |
313 | .align 32 |
314 | small_copy_unaligned: | |
315 | subcc %o2, 1, %o2 | |
316 | ldub [%o1], %g1 | |
317 | stb %g1, [%o1 + %o3] | |
318 | bgu,pt %XCC, small_copy_unaligned | |
319 | add %o1, 1, %o1 | |
320 | retl | |
321 | mov %g5, %o0 | |
bb769ab6 | 322 | |
3afd5a3b | 323 | END(__memcpy_ultra3) |
bb769ab6 | 324 | |
3afd5a3b | 325 | #endif |