/* Copyright (C) 2012-2020 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <https://www.gnu.org/licenses/>.  */

#ifdef ANDROID_CHANGES
# include "machine/asm.h"
# include "machine/regdef.h"
# define USE_MEMMOVE_FOR_OVERLAP
# define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD_STREAMED
# define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE
#elif _LIBC
# include <sysdep.h>
# include <regdef.h>
# include <sys/asm.h>
# define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD_STREAMED
# define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE
#elif defined _COMPILING_NEWLIB
# include "machine/asm.h"
# include "machine/regdef.h"
# define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD_STREAMED
# define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE
#else
# include <regdef.h>
# include <sys/asm.h>
#endif

#if (_MIPS_ISA == _MIPS_ISA_MIPS4) || (_MIPS_ISA == _MIPS_ISA_MIPS5) || \
    (_MIPS_ISA == _MIPS_ISA_MIPS32) || (_MIPS_ISA == _MIPS_ISA_MIPS64)
# ifndef DISABLE_PREFETCH
#  define USE_PREFETCH
# endif
#endif

#if defined(_MIPS_SIM) && ((_MIPS_SIM == _ABI64) || (_MIPS_SIM == _ABIN32))
# ifndef DISABLE_DOUBLE
#  define USE_DOUBLE
# endif
#endif

/* Some asm.h files do not have the L macro definition.  */
#ifndef L
# if _MIPS_SIM == _ABIO32
#  define L(label) $L ## label
# else
#  define L(label) .L ## label
# endif
#endif

/* Some asm.h files do not have the PTR_ADDIU macro definition.  */
#ifndef PTR_ADDIU
# ifdef USE_DOUBLE
#  define PTR_ADDIU daddiu
# else
#  define PTR_ADDIU addiu
# endif
#endif

/* Some asm.h files do not have the PTR_SRA macro definition.  */
#ifndef PTR_SRA
# ifdef USE_DOUBLE
#  define PTR_SRA dsra
# else
#  define PTR_SRA sra
# endif
#endif

/* New R6 instructions that may not be in asm.h.  */
#ifndef PTR_LSA
# if _MIPS_SIM == _ABI64
#  define PTR_LSA dlsa
# else
#  define PTR_LSA lsa
# endif
#endif

/*
 * Using PREFETCH_HINT_LOAD_STREAMED instead of PREFETCH_LOAD on load
 * prefetches appears to offer a slight performance advantage.
 *
 * Using PREFETCH_HINT_PREPAREFORSTORE instead of PREFETCH_STORE
 * or PREFETCH_STORE_STREAMED offers a large performance advantage
 * but PREPAREFORSTORE has some special restrictions to consider.
 *
 * Prefetch with the 'prepare for store' hint does not copy a memory
 * location into the cache, it just allocates a cache line and zeroes
 * it out.  This means that if you do not write to the entire cache
 * line before writing it out to memory some data will get zeroed out
 * when the cache line is written back to memory and data will be lost.
 *
 * Also if you are using this memcpy to copy overlapping buffers it may
 * not behave correctly when using the 'prepare for store' hint.  If you
 * use the 'prepare for store' prefetch on a memory area that is in the
 * memcpy source (as well as the memcpy destination), then you will get
 * some data zeroed out before you have a chance to read it and data will
 * be lost.
 *
 * If you are going to use this memcpy routine with the 'prepare for store'
 * prefetch you may want to set USE_MEMMOVE_FOR_OVERLAP in order to avoid
 * the problem of running memcpy on overlapping buffers.
 *
 * There are ifdef'ed sections of this memcpy to make sure that it does not
 * do prefetches on cache lines that are not going to be completely written.
 * This code is only needed and only used when PREFETCH_STORE_HINT is set to
 * PREFETCH_HINT_PREPAREFORSTORE.  This code assumes that cache lines are
 * 32 bytes and if the cache line is larger it will not work correctly.
 */
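
/*
 * Illustrative example (assuming the 32-byte cache lines described
 * above): a 'prepare for store' prefetch of address 0x1000 allocates
 * and zeroes the line covering 0x1000-0x101f without reading memory.
 * If the copy stores only bytes 0x1000-0x100f before that line is
 * written back, the old contents of 0x1010-0x101f are lost.  Likewise,
 * if 0x1000-0x101f overlaps the memcpy source, those source bytes are
 * zeroed before they can be loaded.
 */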

#ifdef USE_PREFETCH
# define PREFETCH_HINT_LOAD		0
# define PREFETCH_HINT_STORE		1
# define PREFETCH_HINT_LOAD_STREAMED	4
# define PREFETCH_HINT_STORE_STREAMED	5
# define PREFETCH_HINT_LOAD_RETAINED	6
# define PREFETCH_HINT_STORE_RETAINED	7
# define PREFETCH_HINT_WRITEBACK_INVAL	25
# define PREFETCH_HINT_PREPAREFORSTORE	30

/*
 * If we have not picked out what hints to use at this point use the
 * standard load and store prefetch hints.
 */
# ifndef PREFETCH_STORE_HINT
#  define PREFETCH_STORE_HINT PREFETCH_HINT_STORE
# endif
# ifndef PREFETCH_LOAD_HINT
#  define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD
# endif

/*
 * We double everything when USE_DOUBLE is true so we do 2 prefetches to
 * get 64 bytes in that case.  The assumption is that each individual
 * prefetch brings in 32 bytes.
 */

# ifdef USE_DOUBLE
#  define PREFETCH_CHUNK 64
#  define PREFETCH_FOR_LOAD(chunk, reg) \
	pref PREFETCH_LOAD_HINT, (chunk)*64(reg); \
	pref PREFETCH_LOAD_HINT, ((chunk)*64)+32(reg)
#  define PREFETCH_FOR_STORE(chunk, reg) \
	pref PREFETCH_STORE_HINT, (chunk)*64(reg); \
	pref PREFETCH_STORE_HINT, ((chunk)*64)+32(reg)
# else
#  define PREFETCH_CHUNK 32
#  define PREFETCH_FOR_LOAD(chunk, reg) \
	pref PREFETCH_LOAD_HINT, (chunk)*32(reg)
#  define PREFETCH_FOR_STORE(chunk, reg) \
	pref PREFETCH_STORE_HINT, (chunk)*32(reg)
# endif
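/*
 * For example (illustrative expansion of the macros above): with
 * USE_DOUBLE defined, PREFETCH_FOR_LOAD (2, a1) issues two prefetches,
 * at 128(a1) and 160(a1); without USE_DOUBLE it issues a single
 * prefetch at 64(a1).
 */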
/* MAX_PREFETCH_SIZE is the maximum size of a prefetch, it must not be less
 * than PREFETCH_CHUNK, the assumed size of each prefetch.  If the real size
 * of a prefetch is greater than MAX_PREFETCH_SIZE and the PREPAREFORSTORE
 * hint is used, the code will not work correctly.  If PREPAREFORSTORE is not
 * used then MAX_PREFETCH_SIZE does not matter.  */
# define MAX_PREFETCH_SIZE 128
/* PREFETCH_LIMIT is set based on the fact that we never use an offset greater
 * than 5 on a STORE prefetch and that a single prefetch can never be larger
 * than MAX_PREFETCH_SIZE.  We add the extra 32 when USE_DOUBLE is set because
 * we actually do two prefetches in that case, one 32 bytes after the other.  */
# ifdef USE_DOUBLE
#  define PREFETCH_LIMIT (5 * PREFETCH_CHUNK) + 32 + MAX_PREFETCH_SIZE
# else
#  define PREFETCH_LIMIT (5 * PREFETCH_CHUNK) + MAX_PREFETCH_SIZE
# endif
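/*
 * Worked values (derived from the definitions above): with USE_DOUBLE,
 * PREFETCH_LIMIT is 5*64 + 32 + 128 = 480 bytes; without USE_DOUBLE it
 * is 5*32 + 128 = 288 bytes.
 */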
# if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE) \
    && ((PREFETCH_CHUNK * 4) < MAX_PREFETCH_SIZE)
/* We cannot handle this because the initial prefetches may fetch bytes that
 * are before the buffer being copied.  We start copies with an offset
 * of 4 so avoid this situation when using PREPAREFORSTORE.  */
#error "PREFETCH_CHUNK is too large and/or MAX_PREFETCH_SIZE is too small."
# endif
#else /* USE_PREFETCH not defined */
# define PREFETCH_FOR_LOAD(offset, reg)
# define PREFETCH_FOR_STORE(offset, reg)
#endif

#if __mips_isa_rev > 5
# if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
#  undef PREFETCH_STORE_HINT
#  define PREFETCH_STORE_HINT PREFETCH_HINT_STORE_STREAMED
# endif
# define R6_CODE
#endif

/* Allow the routine to be named something else if desired.  */
#ifndef MEMCPY_NAME
# define MEMCPY_NAME memcpy
#endif

/* We use these 32/64 bit registers as temporaries to do the copying.  */
#define REG0 t0
#define REG1 t1
#define REG2 t2
#define REG3 t3
#if defined(_MIPS_SIM) && ((_MIPS_SIM == _ABIO32) || (_MIPS_SIM == _ABIO64))
# define REG4 t4
# define REG5 t5
# define REG6 t6
# define REG7 t7
#else
# define REG4 ta0
# define REG5 ta1
# define REG6 ta2
# define REG7 ta3
#endif

/* We load/store 64 bits at a time when USE_DOUBLE is true.
 * The C_ prefix stands for CHUNK and is used to avoid macro name
 * conflicts with system header files.  */

#ifdef USE_DOUBLE
# define C_ST	sd
# define C_LD	ld
# ifdef __MIPSEB
#  define C_LDHI	ldl	/* high part is left in big-endian */
#  define C_STHI	sdl	/* high part is left in big-endian */
#  define C_LDLO	ldr	/* low part is right in big-endian */
#  define C_STLO	sdr	/* low part is right in big-endian */
# else
#  define C_LDHI	ldr	/* high part is right in little-endian */
#  define C_STHI	sdr	/* high part is right in little-endian */
#  define C_LDLO	ldl	/* low part is left in little-endian */
#  define C_STLO	sdl	/* low part is left in little-endian */
# endif
# define C_ALIGN	dalign	/* r6 align instruction */
#else
# define C_ST	sw
# define C_LD	lw
# ifdef __MIPSEB
#  define C_LDHI	lwl	/* high part is left in big-endian */
#  define C_STHI	swl	/* high part is left in big-endian */
#  define C_LDLO	lwr	/* low part is right in big-endian */
#  define C_STLO	swr	/* low part is right in big-endian */
# else
#  define C_LDHI	lwr	/* high part is right in little-endian */
#  define C_STHI	swr	/* high part is right in little-endian */
#  define C_LDLO	lwl	/* low part is left in little-endian */
#  define C_STLO	swl	/* low part is left in little-endian */
# endif
# define C_ALIGN	align	/* r6 align instruction */
#endif
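
/*
 * Usage note (see the pre-R6 code below): an unaligned (d)word is read
 * with a C_LDHI/C_LDLO pair into a single register, e.g.
 *	C_LDHI	v1,UNIT(0)(a1)
 *	C_LDLO	v1,UNITM1(1)(a1)
 * together load NSIZE unaligned bytes starting at (a1).
 */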

/* Bookkeeping values for 32 vs. 64 bit mode.  */
#ifdef USE_DOUBLE
# define NSIZE 8
# define NSIZEMASK 0x3f
# define NSIZEDMASK 0x7f
#else
# define NSIZE 4
# define NSIZEMASK 0x1f
# define NSIZEDMASK 0x3f
#endif
#define UNIT(unit) ((unit)*NSIZE)
#define UNITM1(unit) (((unit)*NSIZE)-1)
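/*
 * For example, with USE_DOUBLE (NSIZE == 8), UNIT(16) is 128, so the
 * main loop below advances 128 bytes per iteration, and UNITM1(1) is 7,
 * the offset of the last byte of the first chunk.  Without USE_DOUBLE
 * (NSIZE == 4) the same expressions give 64 and 3.
 */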

#ifdef ANDROID_CHANGES
LEAF(MEMCPY_NAME, 0)
#else
LEAF(MEMCPY_NAME)
#endif
	.set	nomips16
	.set	noreorder
/*
 * Below we handle the case where memcpy is called with overlapping src and dst.
 * Although memcpy is not required to handle this case, some parts of Android
 * like Skia rely on such usage.  We call memmove to handle such cases.
 */
#ifdef USE_MEMMOVE_FOR_OVERLAP
	PTR_SUBU t0,a0,a1		/* t0 = dst - src */
	PTR_SRA	t2,t0,31		/* t2 = arithmetic sign fill of t0 */
	xor	t1,t0,t2
	PTR_SUBU t0,t1,t2		/* t0 = |dst - src| */
	sltu	t2,t0,a2		/* overlap if |dst - src| < count */
	beq	t2,zero,L(memcpy)
	la	t9,memmove
	jr	t9			/* tail-call memmove for overlap */
	nop
L(memcpy):
#endif
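/*
 * Note on the sequence above: for a 32-bit value, t2 = t0 >> 31
 * (arithmetic) is all ones when t0 is negative and zero otherwise, so
 * (t0 ^ t2) - t2 is the standard branchless absolute-value idiom: it
 * yields -t0 for negative t0 and leaves t0 unchanged otherwise.
 */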
/*
 * If the size is less than 2*NSIZE (8 or 16), go to L(lasts).  Regardless of
 * size, copy dst pointer to v0 for the return value.
 */
	slti	t2,a2,(2 * NSIZE)
	bne	t2,zero,L(lasts)
#if defined(RETURN_FIRST_PREFETCH) || defined(RETURN_LAST_PREFETCH)
	move	v0,zero
#else
	move	v0,a0
#endif

#ifndef R6_CODE

/*
 * If src and dst have different alignments, go to L(unaligned), if they
 * have the same alignment (but are not actually aligned) do a partial
 * load/store to make them aligned.  If they are both already aligned
 * we can start copying at L(aligned).
 */
	xor	t8,a1,a0
	andi	t8,t8,(NSIZE-1)		/* t8 is a0/a1 word-displacement */
	bne	t8,zero,L(unaligned)
	PTR_SUBU a3, zero, a0

	andi	a3,a3,(NSIZE-1)		/* copy a3 bytes to align a0/a1 */
	beq	a3,zero,L(aligned)	/* if a3=0, it is already aligned */
	PTR_SUBU a2,a2,a3		/* a2 is the remaining bytes count */

	C_LDHI	t8,0(a1)
	PTR_ADDU a1,a1,a3
	C_STHI	t8,0(a0)
	PTR_ADDU a0,a0,a3

#else /* R6_CODE */

/*
 * Align the destination and hope that the source gets aligned too.  If it
 * doesn't we jump to L(r6_unaligned*) to do unaligned copies using the r6
 * align instruction.  The indirect jump below lands on L(atable) entry
 * t8; since the byte-copy labels fall through to one another, the entry
 * for destination misalignment t8 copies exactly 8-t8 bytes.
 */
	andi	t8,a0,7
	lapc	t9,L(atable)
	PTR_LSA	t9,t8,t9,2
	jrc	t9
L(atable):
	bc	L(lb0)
	bc	L(lb7)
	bc	L(lb6)
	bc	L(lb5)
	bc	L(lb4)
	bc	L(lb3)
	bc	L(lb2)
	bc	L(lb1)
L(lb7):
	lb	a3, 6(a1)
	sb	a3, 6(a0)
L(lb6):
	lb	a3, 5(a1)
	sb	a3, 5(a0)
L(lb5):
	lb	a3, 4(a1)
	sb	a3, 4(a0)
L(lb4):
	lb	a3, 3(a1)
	sb	a3, 3(a0)
L(lb3):
	lb	a3, 2(a1)
	sb	a3, 2(a0)
L(lb2):
	lb	a3, 1(a1)
	sb	a3, 1(a0)
L(lb1):
	lb	a3, 0(a1)
	sb	a3, 0(a0)

	li	t9,8
	subu	t8,t9,t8
	PTR_SUBU a2,a2,t8
	PTR_ADDU a0,a0,t8
	PTR_ADDU a1,a1,t8
L(lb0):

	andi	t8,a1,(NSIZE-1)
	lapc	t9,L(jtable)
	PTR_LSA	t9,t8,t9,2
	jrc	t9
L(jtable):
	bc	L(aligned)
	bc	L(r6_unaligned1)
	bc	L(r6_unaligned2)
	bc	L(r6_unaligned3)
# ifdef USE_DOUBLE
	bc	L(r6_unaligned4)
	bc	L(r6_unaligned5)
	bc	L(r6_unaligned6)
	bc	L(r6_unaligned7)
# endif
#endif /* R6_CODE */

L(aligned):

/*
 * Now dst/src are both aligned to (word or double word) aligned addresses
 * Set a2 to count how many bytes we have to copy after all the 64/128 byte
 * chunks are copied and a3 to the dst pointer after all the 64/128 byte
 * chunks have been copied.  We will loop, incrementing a0 and a1 until a0
 * equals a3.
 */

	andi	t8,a2,NSIZEDMASK /* any whole 64-byte/128-byte chunks? */
	beq	a2,t8,L(chkw)	 /* if a2==t8, no 64-byte/128-byte chunks */
	PTR_SUBU a3,a2,t8	 /* subtract from a2 the remainder */
	PTR_ADDU a3,a0,a3	 /* Now a3 is the final dst after loop */

/* When in the loop we may prefetch with the 'prepare to store' hint,
 * in this case the a0+x should not be past the "t0-32" address.  This
 * means: for x=128 the last "safe" a0 address is "t0-160".  Alternatively,
 * for x=64 the last "safe" a0 address is "t0-96".  In the current version we
 * will use "prefetch hint,128(a0)", so "t0-160" is the limit.
 */
#if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
	PTR_ADDU t0,a0,a2		/* t0 is the "past the end" address */
	PTR_SUBU t9,t0,PREFETCH_LIMIT	/* t9 is the "last safe pref" address */
#endif
	PREFETCH_FOR_LOAD  (0, a1)
	PREFETCH_FOR_LOAD  (1, a1)
	PREFETCH_FOR_LOAD  (2, a1)
	PREFETCH_FOR_LOAD  (3, a1)
#if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT != PREFETCH_HINT_PREPAREFORSTORE)
	PREFETCH_FOR_STORE (1, a0)
	PREFETCH_FOR_STORE (2, a0)
	PREFETCH_FOR_STORE (3, a0)
#endif
#if defined(RETURN_FIRST_PREFETCH) && defined(USE_PREFETCH)
# if PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE
	sltu	v1,t9,a0
	bgtz	v1,L(skip_set)
	nop
	PTR_ADDIU v0,a0,(PREFETCH_CHUNK*4)
L(skip_set):
# else
	PTR_ADDIU v0,a0,(PREFETCH_CHUNK*1)
# endif
#endif
#if defined(RETURN_LAST_PREFETCH) && defined(USE_PREFETCH) \
    && (PREFETCH_STORE_HINT != PREFETCH_HINT_PREPAREFORSTORE)
	PTR_ADDIU v0,a0,(PREFETCH_CHUNK*3)
# ifdef USE_DOUBLE
	PTR_ADDIU v0,v0,32
# endif
#endif
L(loop16w):
	C_LD	t0,UNIT(0)(a1)
#if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
	sltu	v1,t9,a0		/* If a0 > t9 don't use next prefetch */
	bgtz	v1,L(skip_pref)
#endif
	C_LD	t1,UNIT(1)(a1)
#ifdef R6_CODE
	PREFETCH_FOR_STORE (2, a0)
#else
	PREFETCH_FOR_STORE (4, a0)
	PREFETCH_FOR_STORE (5, a0)
#endif
#if defined(RETURN_LAST_PREFETCH) && defined(USE_PREFETCH)
	PTR_ADDIU v0,a0,(PREFETCH_CHUNK*5)
# ifdef USE_DOUBLE
	PTR_ADDIU v0,v0,32
# endif
#endif
L(skip_pref):
	C_LD	REG2,UNIT(2)(a1)
	C_LD	REG3,UNIT(3)(a1)
	C_LD	REG4,UNIT(4)(a1)
	C_LD	REG5,UNIT(5)(a1)
	C_LD	REG6,UNIT(6)(a1)
	C_LD	REG7,UNIT(7)(a1)
#ifdef R6_CODE
	PREFETCH_FOR_LOAD (3, a1)
#else
	PREFETCH_FOR_LOAD (4, a1)
#endif
	C_ST	t0,UNIT(0)(a0)
	C_ST	t1,UNIT(1)(a0)
	C_ST	REG2,UNIT(2)(a0)
	C_ST	REG3,UNIT(3)(a0)
	C_ST	REG4,UNIT(4)(a0)
	C_ST	REG5,UNIT(5)(a0)
	C_ST	REG6,UNIT(6)(a0)
	C_ST	REG7,UNIT(7)(a0)

	C_LD	t0,UNIT(8)(a1)
	C_LD	t1,UNIT(9)(a1)
	C_LD	REG2,UNIT(10)(a1)
	C_LD	REG3,UNIT(11)(a1)
	C_LD	REG4,UNIT(12)(a1)
	C_LD	REG5,UNIT(13)(a1)
	C_LD	REG6,UNIT(14)(a1)
	C_LD	REG7,UNIT(15)(a1)
#ifndef R6_CODE
	PREFETCH_FOR_LOAD (5, a1)
#endif
	C_ST	t0,UNIT(8)(a0)
	C_ST	t1,UNIT(9)(a0)
	C_ST	REG2,UNIT(10)(a0)
	C_ST	REG3,UNIT(11)(a0)
	C_ST	REG4,UNIT(12)(a0)
	C_ST	REG5,UNIT(13)(a0)
	C_ST	REG6,UNIT(14)(a0)
	C_ST	REG7,UNIT(15)(a0)
	PTR_ADDIU a0,a0,UNIT(16)	/* adding 64/128 to dest */
	bne	a0,a3,L(loop16w)
	PTR_ADDIU a1,a1,UNIT(16)	/* adding 64/128 to src */
	move	a2,t8

/* Here we have src and dest word-aligned but less than 64-bytes or
 * 128 bytes to go.  Check for a 32(64) byte chunk and copy if there
 * is one.  Otherwise jump down to L(chk1w) to handle the tail end of
 * the copy.
 */

L(chkw):
	PREFETCH_FOR_LOAD (0, a1)
	andi	t8,a2,NSIZEMASK	/* Is there a 32-byte/64-byte chunk.  */
				/* t8 is the remainder count past 32-bytes */
	beq	a2,t8,L(chk1w)	/* When a2=t8, no 32-byte chunk */
	nop
	C_LD	t0,UNIT(0)(a1)
	C_LD	t1,UNIT(1)(a1)
	C_LD	REG2,UNIT(2)(a1)
	C_LD	REG3,UNIT(3)(a1)
	C_LD	REG4,UNIT(4)(a1)
	C_LD	REG5,UNIT(5)(a1)
	C_LD	REG6,UNIT(6)(a1)
	C_LD	REG7,UNIT(7)(a1)
	PTR_ADDIU a1,a1,UNIT(8)
	C_ST	t0,UNIT(0)(a0)
	C_ST	t1,UNIT(1)(a0)
	C_ST	REG2,UNIT(2)(a0)
	C_ST	REG3,UNIT(3)(a0)
	C_ST	REG4,UNIT(4)(a0)
	C_ST	REG5,UNIT(5)(a0)
	C_ST	REG6,UNIT(6)(a0)
	C_ST	REG7,UNIT(7)(a0)
	PTR_ADDIU a0,a0,UNIT(8)

/*
 * Here we have less than 32(64) bytes to copy.  Set up for a loop to
 * copy one word (or double word) at a time.  Set a2 to count how many
 * bytes we have to copy after all the word (or double word) chunks are
 * copied and a3 to the dst pointer after all the (d)word chunks have
 * been copied.  We will loop, incrementing a0 and a1 until a0 equals a3.
 */
L(chk1w):
	andi	a2,t8,(NSIZE-1)	/* a2 is the remainder past one (d)word chunks */
	beq	a2,t8,L(lastw)
	PTR_SUBU a3,t8,a2	/* a3 is count of bytes in one (d)word chunks */
	PTR_ADDU a3,a0,a3	/* a3 is the dst address after loop */

/* copying in words (4-byte or 8-byte chunks) */
L(wordCopy_loop):
	C_LD	REG3,UNIT(0)(a1)
	PTR_ADDIU a0,a0,UNIT(1)
	PTR_ADDIU a1,a1,UNIT(1)
	bne	a0,a3,L(wordCopy_loop)
	C_ST	REG3,UNIT(-1)(a0)

/* If we have been copying double words, see if we can copy a single word
   before doing byte copies.  We can have, at most, one word to copy.  */

L(lastw):
#ifdef USE_DOUBLE
	andi	t8,a2,3		/* a2 is the remainder past 4 byte chunks.  */
	beq	t8,a2,L(lastb)
	move	a2,t8
	lw	REG3,0(a1)
	sw	REG3,0(a0)
	PTR_ADDIU a0,a0,4
	PTR_ADDIU a1,a1,4
#endif

/* Copy the last 8 (or 16) bytes */
L(lastb):
	blez	a2,L(leave)
	PTR_ADDU a3,a0,a2	/* a3 is the last dst address */
L(lastbloop):
	lb	v1,0(a1)
	PTR_ADDIU a0,a0,1
	PTR_ADDIU a1,a1,1
	bne	a0,a3,L(lastbloop)
	sb	v1,-1(a0)
L(leave):
	j	ra
	nop

/* We jump here with a memcpy of less than 8 or 16 bytes, depending on
   whether or not USE_DOUBLE is defined.  Instead of just doing byte
   copies, check the alignment and size and use lw/sw if possible.
   Otherwise, do byte copies.  */

L(lasts):
	andi	t8,a2,3
	beq	t8,a2,L(lastb)

	andi	t9,a0,3
	bne	t9,zero,L(lastb)
	andi	t9,a1,3
	bne	t9,zero,L(lastb)

	PTR_SUBU a3,a2,t8
	PTR_ADDU a3,a0,a3

L(wcopy_loop):
	lw	REG3,0(a1)
	PTR_ADDIU a0,a0,4
	PTR_ADDIU a1,a1,4
	bne	a0,a3,L(wcopy_loop)
	sw	REG3,-4(a0)

	b	L(lastb)
	move	a2,t8

#ifndef R6_CODE
/*
 * UNALIGNED case, got here with a3 = "negu a0"
 * This code is nearly identical to the aligned code above
 * but only the destination (not the source) gets aligned
 * so we need to do partial loads of the source followed
 * by normal stores to the destination (once we have aligned
 * the destination).
 */

L(unaligned):
	andi	a3,a3,(NSIZE-1)	/* copy a3 bytes to align a0/a1 */
	beqz	a3,L(ua_chk16w)	/* if a3=0, it is already aligned */
	PTR_SUBU a2,a2,a3	/* a2 is the remaining bytes count */

	C_LDHI	v1,UNIT(0)(a1)
	C_LDLO	v1,UNITM1(1)(a1)
	PTR_ADDU a1,a1,a3
	C_STHI	v1,UNIT(0)(a0)
	PTR_ADDU a0,a0,a3

/*
 * Now the destination (but not the source) is aligned
 * Set a2 to count how many bytes we have to copy after all the 64/128 byte
 * chunks are copied and a3 to the dst pointer after all the 64/128 byte
 * chunks have been copied.  We will loop, incrementing a0 and a1 until a0
 * equals a3.
 */

L(ua_chk16w):
	andi	t8,a2,NSIZEDMASK /* any whole 64-byte/128-byte chunks? */
	beq	a2,t8,L(ua_chkw) /* if a2==t8, no 64-byte/128-byte chunks */
	PTR_SUBU a3,a2,t8	 /* subtract from a2 the remainder */
	PTR_ADDU a3,a0,a3	 /* Now a3 is the final dst after loop */

# if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
	PTR_ADDU t0,a0,a2		/* t0 is the "past the end" address */
	PTR_SUBU t9,t0,PREFETCH_LIMIT	/* t9 is the "last safe pref" address */
# endif
	PREFETCH_FOR_LOAD  (0, a1)
	PREFETCH_FOR_LOAD  (1, a1)
	PREFETCH_FOR_LOAD  (2, a1)
# if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT != PREFETCH_HINT_PREPAREFORSTORE)
	PREFETCH_FOR_STORE (1, a0)
	PREFETCH_FOR_STORE (2, a0)
	PREFETCH_FOR_STORE (3, a0)
# endif
# if defined(RETURN_FIRST_PREFETCH) && defined(USE_PREFETCH)
#  if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
	sltu	v1,t9,a0
	bgtz	v1,L(ua_skip_set)
	nop
	PTR_ADDIU v0,a0,(PREFETCH_CHUNK*4)
L(ua_skip_set):
#  else
	PTR_ADDIU v0,a0,(PREFETCH_CHUNK*1)
#  endif
# endif
L(ua_loop16w):
	PREFETCH_FOR_LOAD  (3, a1)
	C_LDHI	t0,UNIT(0)(a1)
	C_LDHI	t1,UNIT(1)(a1)
	C_LDHI	REG2,UNIT(2)(a1)
# if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
	sltu	v1,t9,a0
	bgtz	v1,L(ua_skip_pref)
# endif
	C_LDHI	REG3,UNIT(3)(a1)
	PREFETCH_FOR_STORE (4, a0)
	PREFETCH_FOR_STORE (5, a0)
L(ua_skip_pref):
	C_LDHI	REG4,UNIT(4)(a1)
	C_LDHI	REG5,UNIT(5)(a1)
	C_LDHI	REG6,UNIT(6)(a1)
	C_LDHI	REG7,UNIT(7)(a1)
	C_LDLO	t0,UNITM1(1)(a1)
	C_LDLO	t1,UNITM1(2)(a1)
	C_LDLO	REG2,UNITM1(3)(a1)
	C_LDLO	REG3,UNITM1(4)(a1)
	C_LDLO	REG4,UNITM1(5)(a1)
	C_LDLO	REG5,UNITM1(6)(a1)
	C_LDLO	REG6,UNITM1(7)(a1)
	C_LDLO	REG7,UNITM1(8)(a1)
	PREFETCH_FOR_LOAD (4, a1)
	C_ST	t0,UNIT(0)(a0)
	C_ST	t1,UNIT(1)(a0)
	C_ST	REG2,UNIT(2)(a0)
	C_ST	REG3,UNIT(3)(a0)
	C_ST	REG4,UNIT(4)(a0)
	C_ST	REG5,UNIT(5)(a0)
	C_ST	REG6,UNIT(6)(a0)
	C_ST	REG7,UNIT(7)(a0)
	C_LDHI	t0,UNIT(8)(a1)
	C_LDHI	t1,UNIT(9)(a1)
	C_LDHI	REG2,UNIT(10)(a1)
	C_LDHI	REG3,UNIT(11)(a1)
	C_LDHI	REG4,UNIT(12)(a1)
	C_LDHI	REG5,UNIT(13)(a1)
	C_LDHI	REG6,UNIT(14)(a1)
	C_LDHI	REG7,UNIT(15)(a1)
	C_LDLO	t0,UNITM1(9)(a1)
	C_LDLO	t1,UNITM1(10)(a1)
	C_LDLO	REG2,UNITM1(11)(a1)
	C_LDLO	REG3,UNITM1(12)(a1)
	C_LDLO	REG4,UNITM1(13)(a1)
	C_LDLO	REG5,UNITM1(14)(a1)
	C_LDLO	REG6,UNITM1(15)(a1)
	C_LDLO	REG7,UNITM1(16)(a1)
	PREFETCH_FOR_LOAD (5, a1)
	C_ST	t0,UNIT(8)(a0)
	C_ST	t1,UNIT(9)(a0)
	C_ST	REG2,UNIT(10)(a0)
	C_ST	REG3,UNIT(11)(a0)
	C_ST	REG4,UNIT(12)(a0)
	C_ST	REG5,UNIT(13)(a0)
	C_ST	REG6,UNIT(14)(a0)
	C_ST	REG7,UNIT(15)(a0)
	PTR_ADDIU a0,a0,UNIT(16)	/* adding 64/128 to dest */
	bne	a0,a3,L(ua_loop16w)
	PTR_ADDIU a1,a1,UNIT(16)	/* adding 64/128 to src */
	move	a2,t8

/* Here we have src and dest word-aligned but less than 64-bytes or
 * 128 bytes to go.  Check for a 32(64) byte chunk and copy if there
 * is one.  Otherwise jump down to L(ua_chk1w) to handle the tail end of
 * the copy.  */

L(ua_chkw):
	PREFETCH_FOR_LOAD (0, a1)
	andi	t8,a2,NSIZEMASK	  /* Is there a 32-byte/64-byte chunk.  */
				  /* t8 is the remainder count past 32-bytes */
	beq	a2,t8,L(ua_chk1w) /* When a2=t8, no 32-byte chunk */
	nop
	C_LDHI	t0,UNIT(0)(a1)
	C_LDHI	t1,UNIT(1)(a1)
	C_LDHI	REG2,UNIT(2)(a1)
	C_LDHI	REG3,UNIT(3)(a1)
	C_LDHI	REG4,UNIT(4)(a1)
	C_LDHI	REG5,UNIT(5)(a1)
	C_LDHI	REG6,UNIT(6)(a1)
	C_LDHI	REG7,UNIT(7)(a1)
	C_LDLO	t0,UNITM1(1)(a1)
	C_LDLO	t1,UNITM1(2)(a1)
	C_LDLO	REG2,UNITM1(3)(a1)
	C_LDLO	REG3,UNITM1(4)(a1)
	C_LDLO	REG4,UNITM1(5)(a1)
	C_LDLO	REG5,UNITM1(6)(a1)
	C_LDLO	REG6,UNITM1(7)(a1)
	C_LDLO	REG7,UNITM1(8)(a1)
	PTR_ADDIU a1,a1,UNIT(8)
	C_ST	t0,UNIT(0)(a0)
	C_ST	t1,UNIT(1)(a0)
	C_ST	REG2,UNIT(2)(a0)
	C_ST	REG3,UNIT(3)(a0)
	C_ST	REG4,UNIT(4)(a0)
	C_ST	REG5,UNIT(5)(a0)
	C_ST	REG6,UNIT(6)(a0)
	C_ST	REG7,UNIT(7)(a0)
	PTR_ADDIU a0,a0,UNIT(8)
/*
 * Here we have less than 32(64) bytes to copy.  Set up for a loop to
 * copy one word (or double word) at a time.
 */
L(ua_chk1w):
	andi	a2,t8,(NSIZE-1)	/* a2 is the remainder past one (d)word chunks */
	beq	a2,t8,L(ua_smallCopy)
	PTR_SUBU a3,t8,a2	/* a3 is count of bytes in one (d)word chunks */
	PTR_ADDU a3,a0,a3	/* a3 is the dst address after loop */

/* copying in words (4-byte or 8-byte chunks) */
L(ua_wordCopy_loop):
	C_LDHI	v1,UNIT(0)(a1)
	C_LDLO	v1,UNITM1(1)(a1)
	PTR_ADDIU a0,a0,UNIT(1)
	PTR_ADDIU a1,a1,UNIT(1)
	bne	a0,a3,L(ua_wordCopy_loop)
	C_ST	v1,UNIT(-1)(a0)

/* Copy the last 8 (or 16) bytes */
L(ua_smallCopy):
	beqz	a2,L(leave)
	PTR_ADDU a3,a0,a2	/* a3 is the last dst address */
L(ua_smallCopy_loop):
	lb	v1,0(a1)
	PTR_ADDIU a0,a0,1
	PTR_ADDIU a1,a1,1
	bne	a0,a3,L(ua_smallCopy_loop)
	sb	v1,-1(a0)

	j	ra
	nop

#else /* R6_CODE */

# ifdef __MIPSEB
#  define SWAP_REGS(X,Y) X, Y
#  define ALIGN_OFFSET(N) (N)
# else
#  define SWAP_REGS(X,Y) Y, X
#  define ALIGN_OFFSET(N) (NSIZE-N)
# endif
# define R6_UNALIGNED_WORD_COPY(BYTEOFFSET) \
	andi	REG7, a2, (NSIZE-1);/* REG7 is # of bytes to copy byte by byte. */ \
	beq	REG7, a2, L(lastb); /* Check for bytes to copy by word.  */ \
	PTR_SUBU a3, a2, REG7;	/* a3 is number of bytes to be copied in */ \
				/* (d)word chunks.  */ \
	move	a2, REG7;	/* a2 is # of bytes to copy byte by byte */ \
				/* after word loop is finished.  */ \
	PTR_ADDU REG6, a0, a3;	/* REG6 is the dst address after loop.  */ \
	PTR_SUBU REG2, a1, t8;	/* REG2 is the aligned src address.  */ \
	PTR_ADDU a1, a1, a3;	/* a1 is addr of source after word loop.  */ \
	C_LD	t0, UNIT(0)(REG2); /* Load first part of source.  */ \
L(r6_ua_wordcopy##BYTEOFFSET): \
	C_LD	t1, UNIT(1)(REG2); /* Load second part of source.  */ \
	C_ALIGN	REG3, SWAP_REGS(t1,t0), ALIGN_OFFSET(BYTEOFFSET); \
	PTR_ADDIU a0, a0, UNIT(1); /* Increment destination pointer.  */ \
	PTR_ADDIU REG2, REG2, UNIT(1); /* Increment aligned source pointer.  */ \
	move	t0, t1;		/* Move second part of source to first.  */ \
	bne	a0, REG6,L(r6_ua_wordcopy##BYTEOFFSET); \
	C_ST	REG3, UNIT(-1)(a0); \
	j	L(lastb); \
	nop
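
/*
 * Sketch of the technique above: the true source address is
 * REG2 + BYTEOFFSET, so t0 and t1 hold two consecutive aligned
 * (d)words and the R6 align/dalign instruction concatenates them,
 * shifted by the byte offset, to produce one (d)word of the unaligned
 * source stream per iteration.
 */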

/* We are generating R6 code, the destination is 4 byte aligned and
   the source is not 4 byte aligned.  t8 is 1, 2, or 3 depending on the
   alignment of the source.  */

L(r6_unaligned1):
	R6_UNALIGNED_WORD_COPY(1)
L(r6_unaligned2):
	R6_UNALIGNED_WORD_COPY(2)
L(r6_unaligned3):
	R6_UNALIGNED_WORD_COPY(3)
# ifdef USE_DOUBLE
L(r6_unaligned4):
	R6_UNALIGNED_WORD_COPY(4)
L(r6_unaligned5):
	R6_UNALIGNED_WORD_COPY(5)
L(r6_unaligned6):
	R6_UNALIGNED_WORD_COPY(6)
L(r6_unaligned7):
	R6_UNALIGNED_WORD_COPY(7)
# endif
#endif /* R6_CODE */

	.set	at
	.set	reorder
END(MEMCPY_NAME)
#ifndef ANDROID_CHANGES
# ifdef _LIBC
libc_hidden_builtin_def (MEMCPY_NAME)
# endif
#endif