]> git.ipfire.org Git - thirdparty/glibc.git/blob - sysdeps/powerpc/powerpc64/power4/memset.S
PowerPC64 POWER4 memset
[thirdparty/glibc.git] / sysdeps / powerpc / powerpc64 / power4 / memset.S
1 /* Optimized memset implementation for PowerPC64.
2 Copyright (C) 1997-2013 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <http://www.gnu.org/licenses/>. */
18
19 #include <sysdep.h>
20
21 /* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5]));
22 Returns 's'.
23
24 The memset is done in three sizes: byte (8 bits), word (32 bits),
25 cache line (256 bits). There is a special case for setting cache lines
26 to 0, to take advantage of the dcbz instruction. */
27
28 .machine power4
29 EALIGN (memset, 5, 0)
30 CALL_MCOUNT 3
31
32 #define rTMP r0
33 #define rRTN r3 /* Initial value of 1st argument. */
34 #define rMEMP0 r3 /* Original value of 1st arg. */
35 #define rCHR r4 /* Char to set in each byte. */
36 #define rLEN r5 /* Length of region to set. */
37 #define rMEMP r6 /* Address at which we are storing. */
38 #define rALIGN r7 /* Number of bytes we are setting now (when aligning). */
39 #define rMEMP2 r8
40
41 #define rNEG64 r8 /* Constant -64 for clearing with dcbz. */
42 #define rCLS r8 /* Cache line size obtained from static. */
43 #define rCLM r9 /* Cache line size mask to check for cache alignment. */
44 L(_memset):
45 /* Take care of case for size <= 4. */
46 cmpldi cr1, rLEN, 8
47 andi. rALIGN, rMEMP0, 7
48 mr rMEMP, rMEMP0
49 ble- cr1, L(small)
50
51 /* Align to doubleword boundary. */
52 cmpldi cr5, rLEN, 31
53 insrdi rCHR, rCHR, 8, 48 /* Replicate byte to halfword. */
54 beq+ L(aligned2)
55 mtcrf 0x01, rMEMP0
56 subfic rALIGN, rALIGN, 8
57 cror 28,30,31 /* Detect odd word aligned. */
58 add rMEMP, rMEMP, rALIGN
59 sub rLEN, rLEN, rALIGN
60 insrdi rCHR, rCHR, 16, 32 /* Replicate halfword to word. */
61 bt 29, L(g4)
62 /* Process the even word of doubleword. */
63 bf+ 31, L(g2)
64 stb rCHR, 0(rMEMP0)
65 bt 30, L(g4x)
66 L(g2):
67 sth rCHR, -6(rMEMP)
68 L(g4x):
69 stw rCHR, -4(rMEMP)
70 b L(aligned)
71 /* Process the odd word of doubleword. */
72 L(g4):
73 bf 28, L(g4x) /* If false, word aligned on odd word. */
74 bf+ 31, L(g0)
75 stb rCHR, 0(rMEMP0)
76 bt 30, L(aligned)
77 L(g0):
78 sth rCHR, -2(rMEMP)
79
80 /* Handle the case of size < 31. */
81 L(aligned2):
82 insrdi rCHR, rCHR, 16, 32 /* Replicate halfword to word. */
83 L(aligned):
84 mtcrf 0x01, rLEN
85 ble cr5, L(medium)
86 /* Align to 32-byte boundary. */
87 andi. rALIGN, rMEMP, 0x18
88 subfic rALIGN, rALIGN, 0x20
89 insrdi rCHR, rCHR, 32, 0 /* Replicate word to double word. */
90 beq L(caligned)
91 mtcrf 0x01, rALIGN
92 add rMEMP, rMEMP, rALIGN
93 sub rLEN, rLEN, rALIGN
94 cmplwi cr1, rALIGN, 0x10
95 mr rMEMP2, rMEMP
96 bf 28, L(a1)
97 stdu rCHR, -8(rMEMP2)
98 L(a1): blt cr1, L(a2)
99 std rCHR, -8(rMEMP2)
100 stdu rCHR, -16(rMEMP2)
101 L(a2):
102
103 /* Now aligned to a 32 byte boundary. */
104 L(caligned):
105 cmpldi cr1, rCHR, 0
106 clrrdi. rALIGN, rLEN, 5
107 mtcrf 0x01, rLEN
108 beq cr1, L(zloopstart) /* Special case for clearing memory using dcbz. */
109 L(nondcbz):
110 srdi rTMP, rALIGN, 5
111 mtctr rTMP
112 beq L(medium) /* We may not actually get to do a full line. */
113 clrldi. rLEN, rLEN, 59
114 add rMEMP, rMEMP, rALIGN
115 li rNEG64, -0x40
116 bdz L(cloopdone)
117
118 L(c3): dcbtst rNEG64, rMEMP
119 std rCHR, -8(rMEMP)
120 std rCHR, -16(rMEMP)
121 std rCHR, -24(rMEMP)
122 stdu rCHR, -32(rMEMP)
123 bdnz L(c3)
124 L(cloopdone):
125 std rCHR, -8(rMEMP)
126 std rCHR, -16(rMEMP)
127 cmpldi cr1, rLEN, 16
128 std rCHR, -24(rMEMP)
129 stdu rCHR, -32(rMEMP)
130 beqlr
131 add rMEMP, rMEMP, rALIGN
132 b L(medium_tail2)
133
134 .align 5
135 /* Clear lines of memory in 128-byte chunks. */
136 L(zloopstart):
137 /* If the remaining length is less the 32 bytes, don't bother getting
138 the cache line size. */
139 beq L(medium)
140 li rCLS,128 /* cache line size is 128 */
141
142 /* Now we know the cache line size, and it is not 32-bytes, but
143 we may not yet be aligned to the cache line. May have a partial
144 line to fill, so touch it 1st. */
145 dcbt 0,rMEMP
146 L(getCacheAligned):
147 cmpldi cr1,rLEN,32
148 andi. rTMP,rMEMP,127
149 blt cr1,L(handletail32)
150 beq L(cacheAligned)
151 addi rMEMP,rMEMP,32
152 addi rLEN,rLEN,-32
153 std rCHR,-32(rMEMP)
154 std rCHR,-24(rMEMP)
155 std rCHR,-16(rMEMP)
156 std rCHR,-8(rMEMP)
157 b L(getCacheAligned)
158
159 /* Now we are aligned to the cache line and can use dcbz. */
160 L(cacheAligned):
161 cmpld cr1,rLEN,rCLS
162 blt cr1,L(handletail32)
163 dcbz 0,rMEMP
164 subf rLEN,rCLS,rLEN
165 add rMEMP,rMEMP,rCLS
166 b L(cacheAligned)
167
168 /* We are here because the cache line size was set and was not 32-bytes
169 and the remainder (rLEN) is less than the actual cache line size.
170 So set up the preconditions for L(nondcbz) and go there. */
171 L(handletail32):
172 clrrwi. rALIGN, rLEN, 5
173 b L(nondcbz)
174
175 .align 5
176 L(small):
177 /* Memset of 8 bytes or less. */
178 cmpldi cr6, rLEN, 4
179 cmpldi cr5, rLEN, 1
180 ble cr6,L(le4)
181 subi rLEN, rLEN, 4
182 stb rCHR,0(rMEMP)
183 stb rCHR,1(rMEMP)
184 stb rCHR,2(rMEMP)
185 stb rCHR,3(rMEMP)
186 addi rMEMP,rMEMP, 4
187 cmpldi cr5, rLEN, 1
188 L(le4):
189 cmpldi cr1, rLEN, 3
190 bltlr cr5
191 stb rCHR, 0(rMEMP)
192 beqlr cr5
193 stb rCHR, 1(rMEMP)
194 bltlr cr1
195 stb rCHR, 2(rMEMP)
196 beqlr cr1
197 stb rCHR, 3(rMEMP)
198 blr
199
200 /* Memset of 0-31 bytes. */
201 .align 5
202 L(medium):
203 insrdi rCHR, rCHR, 32, 0 /* Replicate word to double word. */
204 cmpldi cr1, rLEN, 16
205 L(medium_tail2):
206 add rMEMP, rMEMP, rLEN
207 L(medium_tail):
208 bt- 31, L(medium_31t)
209 bt- 30, L(medium_30t)
210 L(medium_30f):
211 bt- 29, L(medium_29t)
212 L(medium_29f):
213 bge- cr1, L(medium_27t)
214 bflr- 28
215 std rCHR, -8(rMEMP)
216 blr
217
218 L(medium_31t):
219 stbu rCHR, -1(rMEMP)
220 bf- 30, L(medium_30f)
221 L(medium_30t):
222 sthu rCHR, -2(rMEMP)
223 bf- 29, L(medium_29f)
224 L(medium_29t):
225 stwu rCHR, -4(rMEMP)
226 blt- cr1, L(medium_27f)
227 L(medium_27t):
228 std rCHR, -8(rMEMP)
229 stdu rCHR, -16(rMEMP)
230 L(medium_27f):
231 bflr- 28
232 L(medium_28t):
233 std rCHR, -8(rMEMP)
234 blr
235 END_GEN_TB (memset,TB_TOCLESS)
236 libc_hidden_builtin_def (memset)
237
238 /* Copied from bzero.S to prevent the linker from inserting a stub
239 between bzero and memset. */
240 ENTRY (__bzero)
241 CALL_MCOUNT 3
242 mr r5,r4
243 li r4,0
244 b L(_memset)
245 END_GEN_TB (__bzero,TB_TOCLESS)
246
247 weak_alias (__bzero, bzero)