]>
Commit | Line | Data |
---|---|---|
d4697bc9 | 1 | /* Copyright (C) 2011-2014 Free Software Foundation, Inc. |
63d143a2 CM |
2 | This file is part of the GNU C Library. |
3 | Contributed by Chris Metcalf <cmetcalf@tilera.com>, 2011. | |
4 | ||
5 | The GNU C Library is free software; you can redistribute it and/or | |
6 | modify it under the terms of the GNU Lesser General Public | |
7 | License as published by the Free Software Foundation; either | |
8 | version 2.1 of the License, or (at your option) any later version. | |
9 | ||
10 | The GNU C Library is distributed in the hope that it will be useful, | |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
13 | Lesser General Public License for more details. | |
14 | ||
15 | You should have received a copy of the GNU Lesser General Public | |
ab84e3ff PE |
16 | License along with the GNU C Library. If not, see |
17 | <http://www.gnu.org/licenses/>. */ | |
63d143a2 CM |
18 | |
19 | #include <string.h> | |
20 | #include <stdint.h> | |
21 | #include <stdlib.h> | |
cd84016e | 22 | #include <memcopy.h> |
63d143a2 CM |
23 | #include <arch/chip.h> |
24 | ||
63d143a2 CM |
25 | /* How many cache lines ahead should we prefetch?  __memcpy issues this
   many source prefetches before copying, then one more per cache line
   copied in its aligned bulk loop. */ |
26 | #define PREFETCH_LINES_AHEAD 3 | |
27 | ||
/* Copy N bytes from SRCV to DSTV and return DSTV.

   Copies shorter than 16 bytes are done one byte at a time.  Longer
   copies issue PREFETCH_LINES_AHEAD source prefetches, copy single
   bytes until the destination is aligned to sizeof (op_t), and then
   take one of two paths.  If the source is not op_t-aligned at that
   point, words are loaded from the rounded-down source address and
   merged pairwise with __insn_dblalign.  Otherwise whole words are
   copied directly, eight at a time (with __insn_wh64 on the
   destination) while at least CHIP_L2_LINE_SIZE () bytes remain.
   Both paths return early when nothing is left over; otherwise they
   leave the leftover bytes (fewer than one op_t) in FINAL, which the
   shared tail stores in 4-, 2- and 1-byte pieces.  */
28 | void * | |
29 | __memcpy (void *__restrict dstv, const void *__restrict srcv, size_t n) | |
30 | { | |
31 | char *__restrict dst1 = (char *) dstv; | |
32 | const char *__restrict src1 = (const char *) srcv; | |
33 | const char *__restrict src1_end; | |
34 | const char *__restrict prefetch; | |
cd84016e CM |
35 | op_t *__restrict dst8; /* 8-byte pointer to destination memory. */ |
36 | op_t final; /* Final bytes to write to trailing word, if any */ | |
63d143a2 CM |
37 | long i; |
38 | ||
/* Short copies: a plain byte loop, with no prefetching or alignment
   work.  */
39 | if (n < 16) | |
40 | { | |
41 | for (; n; n--) | |
42 | *dst1++ = *src1++; | |
43 | return dstv; | |
44 | } | |
45 | ||
46 | /* Locate the end of source memory we will copy. Don't prefetch | |
47 | past this. */ | |
48 | src1_end = src1 + n - 1; | |
/* Note: SRC1_END is the address of the last source byte
   (src1 + n - 1), not one past it.  */
49 | ||
50 | /* Prefetch ahead a few cache lines, but not past the end. */ | |
51 | prefetch = src1; | |
52 | for (i = 0; i < PREFETCH_LINES_AHEAD; i++) | |
53 | { | |
54 | __insn_prefetch (prefetch); | |
55 | prefetch += CHIP_L2_LINE_SIZE (); | |
/* If the advanced pointer is at or beyond SRC1_END, start over from
   SRC1 rather than prefetching past the source.  */
cd84016e | 56 | prefetch = (prefetch < src1_end) ? prefetch : src1; |
63d143a2 CM |
57 | } |
58 | ||
59 | /* Copy bytes until dst is word-aligned. */ | |
cd84016e | 60 | for (; (uintptr_t) dst1 & (sizeof (op_t) - 1); n--) |
63d143a2 CM |
61 | *dst1++ = *src1++; |
/* At most sizeof (op_t) - 1 bytes were consumed above, and N was
   decremented for each of them.  */
62 | ||
63 | /* 8-byte pointer to destination memory. */ | |
cd84016e | 64 | dst8 = (op_t *) dst1; |
63d143a2 | 65 | |
/* __builtin_expect marks the misaligned-source branch as the unlikely
   one.  */
cd84016e | 66 | if (__builtin_expect ((uintptr_t) src1 & (sizeof (op_t) - 1), 0)) |
63d143a2 | 67 | { |
cd84016e CM |
68 | /* Misaligned copy. Use glibc's _wordcopy_fwd_dest_aligned, but |
69 | inline it to avoid prologue/epilogue. TODO: Consider | |
70 | prefetching and using wh64 as well. */ | |
71 | void * srci; | |
72 | op_t a0, a1, a2, a3; | |
73 | long int dstp = (long int) dst1; | |
74 | long int srcp = (long int) src1; | |
75 | long int len = n / OPSIZ; | |
/* LEN is the number of whole OPSIZ-byte words still to be stored.  */
63d143a2 | 76 | |
cd84016e CM |
77 | /* Save the initial source pointer so we know the number of |
78 | bytes to shift for merging two unaligned results. */ | |
79 | srci = (void *) srcp; | |
63d143a2 | 80 | |
cd84016e CM |
81 | /* Make SRCP aligned by rounding it down to the beginning of the |
82 | `op_t' it points in the middle of. */ | |
83 | srcp &= -OPSIZ; | |
84 | ||
/* Preload two source words and enter the four-way unrolled loop below
   at the label that matches LEN % 4.  Each case adjusts LEN to a
   multiple of 4 so that the loop's "len -= 4" ends exactly at zero.  */
85 | switch (len % 4) | |
86 | { | |
87 | case 2: | |
88 | a1 = ((op_t *) srcp)[0]; | |
89 | a2 = ((op_t *) srcp)[1]; | |
90 | len += 2; | |
91 | srcp += 2 * OPSIZ; | |
92 | goto do1; | |
93 | case 3: | |
94 | a0 = ((op_t *) srcp)[0]; | |
95 | a1 = ((op_t *) srcp)[1]; | |
96 | len += 1; | |
97 | srcp += 2 * OPSIZ; | |
98 | goto do2; | |
99 | case 0: | |
100 | if (OP_T_THRES <= 3 * OPSIZ && len == 0) | |
101 | return dstv; | |
102 | a3 = ((op_t *) srcp)[0]; | |
103 | a0 = ((op_t *) srcp)[1]; | |
104 | len += 0; | |
105 | srcp += 2 * OPSIZ; | |
106 | goto do3; | |
107 | case 1: | |
108 | a2 = ((op_t *) srcp)[0]; | |
109 | a3 = ((op_t *) srcp)[1]; | |
110 | srcp += 2 * OPSIZ; | |
111 | len -= 1; | |
112 | if (OP_T_THRES <= 3 * OPSIZ && len == 0) | |
113 | goto do0; | |
114 | goto do4; /* No-op. */ | |
115 | } | |
63d143a2 | 116 | |
/* Each step loads the next aligned source word, merges the previously
   loaded pair with __insn_dblalign (passing the original source
   address SRCI), and stores one destination word.  The last whole
   word is stored at do0, after the loop.  */
cd84016e CM |
117 | do |
118 | { | |
119 | do4: | |
120 | a0 = ((op_t *) srcp)[0]; | |
121 | a2 = __insn_dblalign (a2, a3, srci); | |
122 | ((op_t *) dstp)[0] = a2; | |
123 | srcp += OPSIZ; | |
124 | dstp += OPSIZ; | |
125 | do3: | |
126 | a1 = ((op_t *) srcp)[0]; | |
127 | a3 = __insn_dblalign (a3, a0, srci); | |
128 | ((op_t *) dstp)[0] = a3; | |
129 | srcp += OPSIZ; | |
130 | dstp += OPSIZ; | |
131 | do2: | |
132 | a2 = ((op_t *) srcp)[0]; | |
133 | a0 = __insn_dblalign (a0, a1, srci); | |
134 | ((op_t *) dstp)[0] = a0; | |
135 | srcp += OPSIZ; | |
136 | dstp += OPSIZ; | |
137 | do1: | |
138 | a3 = ((op_t *) srcp)[0]; | |
139 | a1 = __insn_dblalign (a1, a2, srci); | |
140 | ((op_t *) dstp)[0] = a1; | |
141 | srcp += OPSIZ; | |
142 | dstp += OPSIZ; | |
143 | len -= 4; | |
144 | } | |
145 | while (len != 0); | |
146 | ||
147 | /* This is the right position for do0. Please don't move | |
148 | it into the loop. */ | |
149 | do0: | |
150 | ((op_t *) dstp)[0] = __insn_dblalign (a2, a3, srci); | |
151 | ||
/* Only the bytes left over after the whole words remain.  */
152 | n = n % OPSIZ; | |
63d143a2 | 153 | if (n == 0) |
cd84016e | 154 | return dstv; |
63d143a2 | 155 | |
/* Load one more source word only if SRCP is not beyond the last
   source byte; otherwise merge with zero instead.  */
cd84016e | 156 | a0 = ((const char *) srcp <= src1_end) ? ((op_t *) srcp)[0] : 0; |
63d143a2 | 157 | |
cd84016e CM |
158 | final = __insn_dblalign (a3, a0, srci); |
/* DSTP still addresses the word stored at do0, so the trailing bytes
   begin one word later.  */
159 | dst8 = (op_t *)(dstp + OPSIZ); | |
63d143a2 CM |
160 | } |
161 | else | |
162 | { | |
163 | /* Aligned copy. */ | |
164 | ||
cd84016e | 165 | const op_t *__restrict src8 = (const op_t *) src1; |
63d143a2 CM |
166 | |
167 | /* src8 and dst8 are both word-aligned. */ | |
168 | if (n >= CHIP_L2_LINE_SIZE ()) | |
169 | { | |
170 | /* Copy until 'dst' is cache-line-aligned. */ | |
171 | for (; (uintptr_t) dst8 & (CHIP_L2_LINE_SIZE () - 1); | |
cd84016e | 172 | n -= sizeof (op_t)) |
63d143a2 CM |
173 | *dst8++ = *src8++; |
174 | ||
/* Copy eight op_t words per iteration.  N drops by 64 to match, and
   the #if after the loop rejects any other L2 line size.  */
175 | for (; n >= CHIP_L2_LINE_SIZE ();) | |
cd84016e CM |
176 | { |
177 | op_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; | |
178 | ||
179 | /* Prefetch and advance to next line to prefetch, but | |
180 | don't go past the end. */ | |
181 | __insn_prefetch (prefetch); | |
182 | prefetch += CHIP_L2_LINE_SIZE (); | |
183 | prefetch = (prefetch < src1_end) ? prefetch : | |
184 | (const char *) src8; | |
185 | ||
186 | /* Do all the loads before wh64. This is necessary if | |
187 | [src8, src8+7] and [dst8, dst8+7] share the same | |
188 | cache line and dst8 <= src8, as can be the case when | |
189 | called from memmove, or with code tested on x86 whose | |
190 | memcpy always works with forward copies. */ | |
191 | tmp0 = *src8++; | |
192 | tmp1 = *src8++; | |
193 | tmp2 = *src8++; | |
194 | tmp3 = *src8++; | |
195 | tmp4 = *src8++; | |
196 | tmp5 = *src8++; | |
197 | tmp6 = *src8++; | |
198 | tmp7 = *src8++; | |
199 | ||
/* NOTE(review): __insn_wh64 is applied to the destination line just
   before all eight of its words are overwritten; its exact effect is
   not visible in this file -- confirm against the Tile-Gx ISA.  */
200 | __insn_wh64 (dst8); | |
201 | ||
202 | *dst8++ = tmp0; | |
203 | *dst8++ = tmp1; | |
204 | *dst8++ = tmp2; | |
205 | *dst8++ = tmp3; | |
206 | *dst8++ = tmp4; | |
207 | *dst8++ = tmp5; | |
208 | *dst8++ = tmp6; | |
209 | *dst8++ = tmp7; | |
210 | ||
211 | n -= 64; | |
212 | } | |
63d143a2 CM |
213 | #if CHIP_L2_LINE_SIZE() != 64 |
214 | # error "Fix code that assumes particular L2 cache line size." | |
215 | #endif | |
63d143a2 CM |
216 | } |
217 | ||
/* Copy any remaining whole words one at a time.  */
cd84016e | 218 | for (; n >= sizeof (op_t); n -= sizeof (op_t)) |
63d143a2 CM |
219 | *dst8++ = *src8++; |
220 | ||
221 | if (__builtin_expect (n == 0, 1)) | |
222 | return dstv; | |
223 | ||
/* Fewer than sizeof (op_t) bytes remain.  A whole word is loaded
   here; only N bytes of it are stored by the tail below.  */
224 | final = *src8; | |
225 | } | |
226 | ||
227 | /* n != 0 if we get here. Write out any trailing bytes. */ | |
228 | dst1 = (char *) dst8; | |
/* The shifts below treat FINAL as a 64-bit value.  Without
   __BIG_ENDIAN__ the next bytes to store are taken from the low-order
   end and FINAL is shifted right after each store; with
   __BIG_ENDIAN__ they are taken from the high-order end, and FINAL is
   shifted right only when a piece is skipped.  */
229 | #ifndef __BIG_ENDIAN__ | |
230 | if (n & 4) | |
231 | { | |
232 | *(uint32_t *) dst1 = final; | |
233 | dst1 += 4; | |
234 | final >>= 32; | |
235 | n &= 3; | |
236 | } | |
237 | if (n & 2) | |
238 | { | |
239 | *(uint16_t *) dst1 = final; | |
240 | dst1 += 2; | |
241 | final >>= 16; | |
242 | n &= 1; | |
243 | } | |
244 | if (n) | |
245 | *(uint8_t *) dst1 = final; | |
246 | #else | |
247 | if (n & 4) | |
248 | { | |
249 | *(uint32_t *) dst1 = final >> 32; | |
250 | dst1 += 4; | |
251 | } | |
252 | else | |
253 | { | |
254 | final >>= 32; | |
255 | } | |
256 | if (n & 2) | |
257 | { | |
258 | *(uint16_t *) dst1 = final >> 16; | |
259 | dst1 += 2; | |
260 | } | |
261 | else | |
262 | { | |
263 | final >>= 16; | |
264 | } | |
265 | if (n & 1) | |
266 | *(uint8_t *) dst1 = final >> 8; | |
267 | #endif | |
268 | ||
269 | return dstv; | |
270 | } | |
/* NOTE(review): weak_alias and libc_hidden_builtin_def are macros
   defined outside this file; presumably they publish __memcpy under
   the public name memcpy -- confirm against their definitions.  */
271 | weak_alias (__memcpy, memcpy) | |
272 | libc_hidden_builtin_def (memcpy) |