]> git.ipfire.org Git - people/ms/linux.git/blame - arch/powerpc/lib/copy_32.S
powerpc32: memcpy: only use dcbz once cache is enabled
[people/ms/linux.git] / arch / powerpc / lib / copy_32.S
CommitLineData
/*
 * Memory copy functions for 32-bit PowerPC.
 *
 * Copyright (C) 1996-2005 Paul Mackerras.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
11#include <asm/processor.h>
12#include <asm/cache.h>
13#include <asm/errno.h>
14#include <asm/ppc_asm.h>
15
/*
 * COPY_16_BYTES: copy four words (16 bytes) from 4(r4).. to 4(r6)..
 *
 * All four loads are issued before the four stores.  The last load and
 * the last store use the update forms (lwzu/stwu), so r4 and r6 are both
 * advanced by 16 when the macro completes.
 * Scratch registers: r7, r8, r9, r10.
 */
#define COPY_16_BYTES		\
	lwz	r7,4(r4);	\
	lwz	r8,8(r4);	\
	lwz	r9,12(r4);	\
	lwzu	r10,16(r4);	\
	stw	r7,4(r6);	\
	stw	r8,8(r6);	\
	stw	r9,12(r6);	\
	stwu	r10,16(r6)

/*
 * COPY_16_BYTES_WITHEX(n): same instruction sequence as COPY_16_BYTES,
 * but every instruction carries its own numeric local label so that it
 * can be listed in the exception table by COPY_16_BYTES_EXCODE(n).
 *
 * Labels are built by token pasting 8, n and a digit:
 *   8n0..8n3  the four loads  (lwz/lwz/lwz/lwzu from r4)
 *   8n4..8n7  the four stores (stw/stw/stw/stwu to r6)
 *
 * r4 and r6 are advanced by 16; r7-r10 are used as scratch.
 */
#define COPY_16_BYTES_WITHEX(n)	\
8 ## n ## 0:			\
	lwz	r7,4(r4);	\
8 ## n ## 1:			\
	lwz	r8,8(r4);	\
8 ## n ## 2:			\
	lwz	r9,12(r4);	\
8 ## n ## 3:			\
	lwzu	r10,16(r4);	\
8 ## n ## 4:			\
	stw	r7,4(r6);	\
8 ## n ## 5:			\
	stw	r8,8(r6);	\
8 ## n ## 6:			\
	stw	r9,12(r6);	\
8 ## n ## 7:			\
	stwu	r10,16(r6)

/*
 * COPY_16_BYTES_EXCODE(n): fault fixups for COPY_16_BYTES_WITHEX(n).
 *
 * Emits two fixup stubs and the matching __ex_table entries:
 *   9n0  reached from a fault on any of the loads  8n0..8n3
 *   9n1  reached from a fault on any of the stores 8n4..8n7
 *
 * Both stubs subtract 16*n from r5 (the n 16-byte groups that precede
 * this one in the cache line) and then branch forward to the common
 * handler: 104f for the load case, 105f for the store case.
 *
 * Each __ex_table entry is a pair of words: address of the instruction
 * that may fault, address of its fixup.  The macro switches back to
 * .text when done.
 */
#define COPY_16_BYTES_EXCODE(n)			\
9 ## n ## 0:					\
	addi	r5,r5,-(16 * n);		\
	b	104f;				\
9 ## n ## 1:					\
	addi	r5,r5,-(16 * n);		\
	b	105f;				\
.section __ex_table,"a";			\
	.align	2;				\
	.long	8 ## n ## 0b,9 ## n ## 0b;	\
	.long	8 ## n ## 1b,9 ## n ## 0b;	\
	.long	8 ## n ## 2b,9 ## n ## 0b;	\
	.long	8 ## n ## 3b,9 ## n ## 0b;	\
	.long	8 ## n ## 4b,9 ## n ## 1b;	\
	.long	8 ## n ## 5b,9 ## n ## 1b;	\
	.long	8 ## n ## 6b,9 ## n ## 1b;	\
	.long	8 ## n ## 7b,9 ## n ## 1b;	\
	.text

/*
 * Start of the code section.  The two .stabs directives emit N_SO
 * records naming the directory and the file; both refer to local
 * label 0, which is defined immediately after them.
 */
	.text
	.stabs	"arch/powerpc/lib/",N_SO,0,0,0f
	.stabs	"copy_32.S",N_SO,0,0,0f
0:

/*
 * Cache line geometry used by the routines below, taken from the
 * L1_CACHE_* constants: line size in bytes, log2 of the line size,
 * and the mask selecting the byte offset within a line.
 */
CACHELINE_BYTES = L1_CACHE_BYTES
LG_CACHELINE_BYTES = L1_CACHE_SHIFT
CACHELINE_MASK = (L1_CACHE_BYTES-1)

/*
 * memset: fill r5 bytes starting at r3 with the low byte of r4.
 * r3 is never modified, so it is returned unchanged.
 *
 * Use dcbz on the complete cache lines in the destination
 * to set them to zero.  This requires that the destination
 * area is cacheable.  -- paulus
 *
 * The dcbz path is only taken when the fill value is zero.
 *
 * Register use:
 *   r4  fill byte replicated into all four bytes of the word
 *   r5  bytes left, counted from the word at r6 (see below)
 *   r6  store pointer, kept 4 below the next word to write
 *   r0, r7, r8, r9, ctr  scratch
 *
 * Accounting convention after the first (possibly unaligned) word has
 * been stored: r6 is the destination rounded down to a word boundary
 * and r5 counts the bytes from r6 to the end of the area, i.e. it still
 * includes the word already written.  That is why the word loop at 2:
 * stores (r5 >> 2) - 1 words.
 */
_GLOBAL(memset)
	rlwimi	r4,r4,8,16,23	/* low halfword = byte repeated twice */
	rlwimi	r4,r4,16,0,15	/* whole word = byte repeated four times */

	addi	r6,r3,-4
	cmplwi	0,r5,4
	blt	7f		/* fewer than 4 bytes: byte loop only */
	stwu	r4,4(r6)	/* first word at r3; r6 = r3 afterwards */
	beqlr			/* exactly 4 bytes (cr0 from cmplwi): done */
	andi.	r0,r6,3		/* r0 = misalignment of the destination */
	add	r5,r0,r5	/* count from the aligned address ... */
	subf	r6,r0,r6	/* ... which is r6 rounded down */
	cmplwi	0,r4,0
	bne	2f	/* Use normal procedure if r4 is not zero */

	clrlwi	r7,r6,32-LG_CACHELINE_BYTES	/* r7 = offset of r6 in its line */
	add	r8,r7,r5	/* bytes from start of that line to the end */
	srwi	r9,r8,LG_CACHELINE_BYTES
	addic.	r9,r9,-1	/* total number of complete cachelines */
	ble	2f
	xori	r0,r7,CACHELINE_MASK & ~3	/* bytes from r6 to last word of line */
	srwi.	r0,r0,2		/* words still to store in the first line */
	beq	3f
	mtctr	r0
4:	stwu	r4,4(r6)
	bdnz	4b
3:	mtctr	r9
	li	r7,4		/* r6 + 4 is the start of the next line */
10:	dcbz	r7,r6
	addi	r6,r6,CACHELINE_BYTES
	bdnz	10b
	clrlwi	r5,r8,32-LG_CACHELINE_BYTES	/* bytes left in the last line */
	addi	r5,r5,4		/* restore the "includes one done word" bias */

2:	srwi	r0,r5,2
	mtctr	r0
	bdz	6f		/* ctr is decremented first: one word is done */
1:	stwu	r4,4(r6)
	bdnz	1b
6:	andi.	r5,r5,3		/* trailing bytes */
7:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
	addi	r6,r6,3		/* so that stbu 1(r6) hits the next byte */
8:	stbu	r4,1(r6)
	bdnz	8b
	blr

/*
 * memmove / memcpy: copy r5 bytes from r4 to r3.  r3 is not modified.
 *
 * This version uses dcbz on the complete cache lines in the
 * destination area to reduce memory traffic.  This requires that
 * the destination area is cacheable.
 * We only use this version if the source and dest don't overlap.
 * -- paulus.
 *
 * During early init, cache might not be active yet, so dcbz cannot be used.
 * We therefore jump to generic_memcpy which doesn't use dcbz. This jump is
 * replaced by a nop once cache is active. This is done in machine_init().
 *
 * Register use in the dcbz path:
 *   r4  source pointer, biased by -4
 *   r6  destination pointer, biased by -4
 *   r5  bytes left
 *   r11 constant 4, so that dcbz r11,r6 addresses the next line
 *   r0, r7-r10, ctr  scratch
 */
_GLOBAL(memmove)
	cmplw	0,r3,r4
	bgt	backwards_memcpy	/* dest above src: copy from the end */
	/* fall through */

_GLOBAL(memcpy)
	b	generic_memcpy		/* the first instruction of memcpy: the jump described above */
	add	r7,r3,r5		/* test if the src & dst overlap */
	add	r8,r4,r5
	cmplw	0,r4,r7			/* cr0.lt = src < dst + len */
	cmplw	1,r3,r8			/* cr1.lt = dst < src + len */
	crand	0,0,4			/* cr0.lt &= cr1.lt */
	blt	generic_memcpy		/* if regions overlap */

	addi	r4,r4,-4
	addi	r6,r3,-4
	neg	r0,r3
	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
	beq	58f			/* dest already line-aligned */

	cmplw	0,r5,r0			/* fewer bytes than needed to reach the line? */
	blt	63f			/* then just do the plain word/byte copy */
	andi.	r8,r0,3			/* get it word-aligned first */
	subf	r5,r0,r5		/* account for the whole alignment prefix */
	mtctr	r8
	beq+	61f
70:	lbz	r9,4(r4)		/* do some bytes */
	addi	r4,r4,1
	addi	r6,r6,1
	stb	r9,3(r6)		/* 3, not 4: r6 was already advanced */
	bdnz	70b
61:	srwi.	r0,r0,2
	mtctr	r0
	beq	58f
72:	lwzu	r9,4(r4)		/* do some words */
	stwu	r9,4(r6)
	bdnz	72b

58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
	clrlwi	r5,r5,32-LG_CACHELINE_BYTES
	li	r11,4
	mtctr	r0
	beq	63f
53:
	dcbz	r11,r6			/* zero the dest line before filling it */
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES
	COPY_16_BYTES
	COPY_16_BYTES
	COPY_16_BYTES
#endif
#endif
#endif
	bdnz	53b

63:	srwi.	r0,r5,2			/* remaining whole words */
	mtctr	r0
	beq	64f
30:	lwzu	r0,4(r4)
	stwu	r0,4(r6)
	bdnz	30b

64:	andi.	r0,r5,3			/* remaining bytes */
	mtctr	r0
	beq+	65f
	addi	r4,r4,3			/* switch to a -1 bias for lbzu/stbu */
	addi	r6,r6,3
40:	lbzu	r0,1(r4)
	stbu	r0,1(r6)
	bdnz	40b
65:	blr

/*
 * generic_memcpy: forward copy of r5 bytes from r4 to r3 without dcbz.
 * r3 is not modified.
 *
 * Copies 8 bytes per iteration once the destination is word aligned,
 * then at most one more word, then the trailing bytes.
 *
 * Register use:
 *   r4  source pointer, biased by -4
 *   r6  destination pointer, biased by -4
 *   r5  bytes left
 *   r0, r7, r8, ctr  scratch
 */
_GLOBAL(generic_memcpy)
	srwi.	r7,r5,3			/* r7 = number of 8-byte chunks */
	addi	r6,r3,-4
	addi	r4,r4,-4
	beq	2f			/* if less than 8 bytes to do */
	andi.	r0,r6,3			/* get dest word aligned */
	mtctr	r7
	bne	5f
1:	lwz	r7,4(r4)
	lwzu	r8,8(r4)
	stw	r7,4(r6)
	stwu	r8,8(r6)
	bdnz	1b
	andi.	r5,r5,7			/* bytes left after the 8-byte loop */
2:	cmplwi	0,r5,4
	blt	3f
	lwzu	r0,4(r4)		/* one more whole word */
	addi	r5,r5,-4
	stwu	r0,4(r6)
3:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
	addi	r4,r4,3			/* switch to a -1 bias for lbzu/stbu */
	addi	r6,r6,3
4:	lbzu	r0,1(r4)
	stbu	r0,1(r6)
	bdnz	4b
	blr
5:	subfic	r0,r0,4			/* r0 = bytes needed to align the dest */
	mtctr	r0
6:	lbz	r7,4(r4)
	addi	r4,r4,1
	stb	r7,4(r6)
	addi	r6,r6,1
	bdnz	6b
	subf	r5,r0,r5		/* account for the alignment bytes */
	rlwinm.	r7,r5,32-3,3,31		/* r7 = r5 >> 3 */
	beq	2b			/* no 8-byte chunk left */
	mtctr	r7
	b	1b

/*
 * backwards_memcpy: copy r5 bytes from r4 to r3, starting at the end of
 * both areas and working down.  memmove branches here when the
 * destination is above the source.  r3 is not modified.
 *
 * Register use:
 *   r4  source pointer, starts one past the last source byte
 *   r6  destination pointer, starts one past the last destination byte
 *   r5  bytes left
 *   r0, r7, r8, ctr  scratch
 */
_GLOBAL(backwards_memcpy)
	rlwinm.	r7,r5,32-3,3,31		/* r7 = r5 >> 3 */
	add	r6,r3,r5
	add	r4,r4,r5
	beq	2f			/* less than 8 bytes to do */
	andi.	r0,r6,3			/* r0 = bytes above a word boundary */
	mtctr	r7
	bne	5f
1:	lwz	r7,-4(r4)
	lwzu	r8,-8(r4)
	stw	r7,-4(r6)
	stwu	r8,-8(r6)
	bdnz	1b
	andi.	r5,r5,7			/* bytes left after the 8-byte loop */
2:	cmplwi	0,r5,4
	blt	3f
	lwzu	r0,-4(r4)		/* one more whole word */
	subi	r5,r5,4
	stwu	r0,-4(r6)
3:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
4:	lbzu	r0,-1(r4)
	stbu	r0,-1(r6)
	bdnz	4b
	blr
5:	mtctr	r0			/* copy r0 bytes to word-align the dest end */
6:	lbzu	r7,-1(r4)
	stbu	r7,-1(r6)
	bdnz	6b
	subf	r5,r0,r5		/* account for the alignment bytes */
	rlwinm.	r7,r5,32-3,3,31		/* r7 = r5 >> 3 */
	beq	2b			/* no 8-byte chunk left */
	mtctr	r7
	b	1b

/*
 * __copy_tofrom_user: copy r5 bytes from r4 to r3, tolerating faults.
 *
 * Returns in r3 the number of bytes that were NOT copied (0 on success).
 * Every load and store that touches the buffers has an __ex_table entry
 * pointing at a fixup below.  After a read fault the copy is retried a
 * byte at a time and whatever still cannot be read is zeroed in the
 * destination; after a write fault the routine returns straight away.
 *
 * Register use in the main path:
 *   r4  source pointer, biased by -4
 *   r6  destination pointer, biased by -4
 *   r5  bytes left outside the loop currently running
 *   r11 constant 4, so that dcbz r11,r6 addresses the next dest line
 *   r3  prefetch offset from r4 for dcbt (reloaded before returning)
 *   r0  in the cacheline loop: lines held back for the final pass
 *   r7-r10, ctr  scratch
 *
 * Numeric labels 111, 112 and 114 are defined twice in this routine
 * (prefetch code and fixup code); each reference binds to the nearest
 * definition in the direction given by its b/f suffix.
 */
_GLOBAL(__copy_tofrom_user)
	addi	r4,r4,-4
	addi	r6,r3,-4
	neg	r0,r3
	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
	beq	58f

	cmplw	0,r5,r0			/* is this more than total to do? */
	blt	63f			/* if not much to do */
	andi.	r8,r0,3			/* get it word-aligned first */
	mtctr	r8
	beq+	61f
70:	lbz	r9,4(r4)		/* do some bytes */
71:	stb	r9,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	70b
61:	subf	r5,r0,r5		/* only now account for the prefix */
	srwi.	r0,r0,2
	mtctr	r0
	beq	58f
72:	lwzu	r9,4(r4)		/* do some words */
73:	stwu	r9,4(r6)
	bdnz	72b

	.section __ex_table,"a"
	.align	2
	.long	70b,100f
	.long	71b,101f
	.long	72b,102f
	.long	73b,103f
	.text

58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
	clrlwi	r5,r5,32-LG_CACHELINE_BYTES
	li	r11,4
	beq	63f

	/* Here we decide how far ahead to prefetch the source */
	li	r3,4
	cmpwi	r0,1
	li	r7,0
	ble	114f
	li	r7,1
#if MAX_COPY_PREFETCH > 1
	/* Heuristically, for large transfers we prefetch
	   MAX_COPY_PREFETCH cachelines ahead.  For small transfers
	   we prefetch 1 cacheline ahead. */
	cmpwi	r0,MAX_COPY_PREFETCH
	ble	112f
	li	r7,MAX_COPY_PREFETCH
112:	mtctr	r7
111:	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
	bdnz	111b
#else
	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
#endif /* MAX_COPY_PREFETCH > 1 */

	/*
	 * r7 lines are held back for a second pass through the loop.
	 * That pass is entered from the bottom with r7 = 0, so afterwards
	 * r0 is 0 and the loop is not re-entered.
	 */
114:	subf	r8,r7,r0
	mr	r0,r7
	mtctr	r8

53:	dcbt	r3,r4
54:	dcbz	r11,r6
	.section __ex_table,"a"
	.align	2
	.long	54b,105f
	.text
/* the main body of the cacheline loop */
	COPY_16_BYTES_WITHEX(0)
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES_WITHEX(1)
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES_WITHEX(2)
	COPY_16_BYTES_WITHEX(3)
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES_WITHEX(4)
	COPY_16_BYTES_WITHEX(5)
	COPY_16_BYTES_WITHEX(6)
	COPY_16_BYTES_WITHEX(7)
#endif
#endif
#endif
	bdnz	53b
	cmpwi	r0,0
	li	r3,4
	li	r7,0
	bne	114b

63:	srwi.	r0,r5,2			/* remaining whole words */
	mtctr	r0
	beq	64f
30:	lwzu	r0,4(r4)
31:	stwu	r0,4(r6)
	bdnz	30b

64:	andi.	r0,r5,3			/* remaining bytes */
	mtctr	r0
	beq+	65f
40:	lbz	r0,4(r4)
41:	stb	r0,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	40b
65:	li	r3,0			/* success: nothing left uncopied */
	blr

/*
 * Fixup code.  Each entry sets r9 (0 = read fault, 1 = write fault),
 * adjusts r5, and loads into r3 the shift that converts the loop
 * counter into bytes, then joins the common code at 99 or 106.
 */
/* read fault, initial single-byte copy */
100:	li	r9,0
	b	90f
/* write fault, initial single-byte copy */
101:	li	r9,1
90:	subf	r5,r8,r5		/* r5 had not been reduced yet (see 61) */
	li	r3,0
	b	99f
/* read fault, initial word copy */
102:	li	r9,0
	b	91f
/* write fault, initial word copy */
103:	li	r9,1
91:	li	r3,2
	b	99f

/*
 * this stuff handles faults in the cacheline loop and branches to either
 * 104f (if in read part) or 105f (if in write part), after updating r5
 */
	COPY_16_BYTES_EXCODE(0)
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES_EXCODE(1)
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES_EXCODE(2)
	COPY_16_BYTES_EXCODE(3)
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES_EXCODE(4)
	COPY_16_BYTES_EXCODE(5)
	COPY_16_BYTES_EXCODE(6)
	COPY_16_BYTES_EXCODE(7)
#endif
#endif
#endif

/* read fault in cacheline loop */
104:	li	r9,0
	b	92f
/* fault on dcbz (effectively a write fault) */
/* or write fault in cacheline loop */
105:	li	r9,1
92:	li	r3,LG_CACHELINE_BYTES
	mfctr	r8
	add	r0,r0,r8		/* lines left = held-back lines + ctr */
	b	106f
/* read fault in final word loop */
108:	li	r9,0
	b	93f
/* write fault in final word loop */
109:	li	r9,1
93:	andi.	r5,r5,3
	li	r3,2
	b	99f
/* read fault in final byte loop */
110:	li	r9,0
	b	94f
/* write fault in final byte loop */
111:	li	r9,1
94:	li	r5,0
	li	r3,0
/*
 * At this stage the number of bytes not copied is
 * r5 + (ctr << r3), and r9 is 0 for read or 1 for write.
 */
99:	mfctr	r0
106:	slw	r3,r0,r3
	add.	r3,r3,r5
	beq	120f			/* shouldn't happen */
	cmpwi	0,r9,0
	bne	120f			/* write fault: return r3 as is */
/* for a read fault, first try to continue the copy one byte at a time */
	mtctr	r3
130:	lbz	r0,4(r4)
131:	stb	r0,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	130b
/* then clear out the destination: r3 bytes starting at 4(r6) */
132:	mfctr	r3			/* bytes still uncopied; also the return value */
	srwi.	r0,r3,2
	li	r9,0
	mtctr	r0
	beq	113f
112:	stwu	r9,4(r6)
	bdnz	112b
113:	andi.	r0,r3,3
	mtctr	r0
	beq	120f
114:	stb	r9,4(r6)
	addi	r6,r6,1
	bdnz	114b
120:	blr

	.section __ex_table,"a"
	.align	2
	.long	30b,108b
	.long	31b,109b
	.long	40b,110b
	.long	41b,111b
	.long	130b,132b
	.long	131b,120b
	.long	112b,120b
	.long	114b,120b
	.text