/* glibc: sysdeps/powerpc/powerpc64/memset.S -- captured from a gitweb
   blob view.  The leading decimal number on each line below is an
   artifact of that capture, not part of the original source.  */
1 /* Optimized memset implementation for PowerPC64.
2 Copyright (C) 1997-2019 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
18
19 #include <sysdep.h>
20
/* TOC entry for the global __cache_line_size.  The dcbz clear path
   (L(zloopstart) below) loads it at run time; a value of zero means
   the size is unknown and the dcbz optimization is skipped.  */
21 	.section	".toc","aw"
22 .LC0:
23 	.tc __cache_line_size[TC],__cache_line_size
24 	.section	".text"
25 	.align 2
26 
27 /* void * [r3] memset (void *s [r3], int c [r4], size_t n [r5])
28 Returns 's'.
29 
30 The memset is done in three sizes: byte (8 bits), word (32 bits),
31 cache line (256 bits, i.e. 32 bytes; stored as four 64-bit
32 doublewords). There is a special case for setting cache lines
33 to 0, to take advantage of the dcbz instruction. */
34 
/* MEMSET may be predefined by an including wrapper to assemble this
   body under a different symbol name; it defaults to "memset".  */
35 #ifndef MEMSET
36 # define MEMSET memset
37 #endif
37
38 ENTRY (MEMSET, 5)
/* gmon profiling hook; expands to nothing unless profiling is enabled.  */
39 CALL_MCOUNT 3
40 
/* Register roles for the whole function.  Note r8 is triple-purposed
   (rMEMP2 / rNEG64 / rCLS) on disjoint paths.  */
41 #define rTMP r0
42 #define rRTN r3 /* Initial value of 1st argument. */
43 #define rMEMP0 r3 /* Original value of 1st arg. */
44 #define rCHR r4 /* Char to set in each byte. */
45 #define rLEN r5 /* Length of region to set. */
46 #define rMEMP r6 /* Address at which we are storing. */
47 #define rALIGN r7 /* Number of bytes we are setting now (when aligning). */
48 #define rMEMP2 r8
49 
50 #define rNEG64 r8 /* Constant -64 for clearing with dcbz. */
51 #define rCLS r8 /* Cache line size obtained from static. */
52 #define rCLM r9 /* Cache line size mask to check for cache alignment. */
/* Shared entry point: __bzero (below) branches here after setting
   rCHR = 0 and moving its length argument into rLEN.  */
53 L(_memset):
54 /* Take care of case for size <= 8 (handled at L(small)). */
55 cmpldi cr1, rLEN, 8
56 andi. rALIGN, rMEMP0, 7
57 mr rMEMP, rMEMP0
58 ble- cr1, L(small)
59 
60 /* Align to doubleword boundary. */
61 cmpldi cr5, rLEN, 31
62 insrdi rCHR, rCHR, 8, 48 /* Replicate byte to halfword. */
63 beq+ L(aligned2)
/* Copy the low 4 address bits into cr7; CR bits 28..31 then drive the
   bt/bf alignment-store dispatch below (bit 31 = addr & 1, etc.).  */
64 mtcrf 0x01, rMEMP0
65 subfic rALIGN, rALIGN, 8
66 cror 28,30,31 /* Detect odd word aligned. */
67 add rMEMP, rMEMP, rALIGN
68 sub rLEN, rLEN, rALIGN
69 insrdi rCHR, rCHR, 16, 32 /* Replicate halfword to word. */
70 bt 29, L(g4)
71 /* Process the even word of doubleword. */
72 bf+ 31, L(g2)
73 stb rCHR, 0(rMEMP0)
74 bt 30, L(g4x)
75 L(g2):
76 sth rCHR, -6(rMEMP)
77 L(g4x):
78 stw rCHR, -4(rMEMP)
79 b L(aligned)
80 /* Process the odd word of doubleword. */
81 L(g4):
82 bf 28, L(g4x) /* If false, word aligned on odd word. */
83 bf+ 31, L(g0)
84 stb rCHR, 0(rMEMP0)
85 bt 30, L(aligned)
86 L(g0):
87 sth rCHR, -2(rMEMP)
88 
89 /* Handle the case of size < 31. */
90 L(aligned2):
91 insrdi rCHR, rCHR, 16, 32 /* Replicate halfword to word. */
92 L(aligned):
/* Low 4 bits of rLEN into cr7 for the L(medium) tail-store dispatch.  */
93 mtcrf 0x01, rLEN
94 ble cr5, L(medium)
95 /* Align to 32-byte boundary. */
96 andi. rALIGN, rMEMP, 0x18
97 subfic rALIGN, rALIGN, 0x20
98 insrdi rCHR, rCHR, 32, 0 /* Replicate word to double word. */
99 beq L(caligned)
100 mtcrf 0x01, rALIGN
101 add rMEMP, rMEMP, rALIGN
102 sub rLEN, rLEN, rALIGN
103 cmplwi cr1, rALIGN, 0x10
104 mr rMEMP2, rMEMP
105 bf 28, L(a1)
106 stdu rCHR, -8(rMEMP2)
107 L(a1): blt cr1, L(a2)
108 std rCHR, -8(rMEMP2)
109 stdu rCHR, -16(rMEMP2)
110 L(a2):
111 
112 /* Now aligned to a 32 byte boundary. */
113 L(caligned):
114 cmpldi cr1, rCHR, 0
115 clrrdi. rALIGN, rLEN, 5
/* Refresh cr7 with the low 4 bits of rLEN for the medium tail.  */
116 mtcrf 0x01, rLEN
117 beq cr1, L(zloopstart) /* Special case for clearing memory using dcbz. */
118 L(nondcbz):
119 srdi rTMP, rALIGN, 5
120 mtctr rTMP
121 beq L(medium) /* We may not actually get to do a full line. */
122 clrldi. rLEN, rLEN, 59
123 add rMEMP, rMEMP, rALIGN
124 li rNEG64, -0x40
125 bdz L(cloopdone)
126 
/* Main 32-bytes-per-iteration store loop, storing downward via stdu;
   dcbtst prefetches (for write) the line 64 bytes below the current
   store pointer, i.e. the line the loop will reach next.  */
127 L(c3): dcbtst rNEG64, rMEMP
128 std rCHR, -8(rMEMP)
129 std rCHR, -16(rMEMP)
130 std rCHR, -24(rMEMP)
131 stdu rCHR, -32(rMEMP)
132 bdnz L(c3)
133 L(cloopdone):
134 std rCHR, -8(rMEMP)
135 std rCHR, -16(rMEMP)
136 cmpldi cr1, rLEN, 16
137 std rCHR, -24(rMEMP)
138 stdu rCHR, -32(rMEMP)
139 beqlr
140 add rMEMP, rMEMP, rALIGN
141 b L(medium_tail2)
142 
143 .align 5
144 /* Clear lines of memory in 128-byte chunks. */
145 L(zloopstart):
146 /* If the remaining length is less than 32 bytes, don't bother getting
147 the cache line size. */
148 beq L(medium)
149 ld rCLS,.LC0@toc(r2)
150 lwz rCLS,0(rCLS)
151 /* If the cache line size was not set just go to L(nondcbz) which is
152 safe for any cache line size. */
153 cmpldi cr1,rCLS,0
154 beq cr1,L(nondcbz)
155 
156 
157 /* Now we know the cache line size, and it is not 32-bytes, but
158 we may not yet be aligned to the cache line. May have a partial
159 line to fill, so touch it 1st. */
160 dcbt 0,rMEMP
161 addi rCLM,rCLS,-1
/* Store 32-byte chunks until rMEMP reaches a cache-line boundary
   (address & (line-size - 1) == 0) or fewer than 32 bytes remain.  */
162 L(getCacheAligned):
163 cmpldi cr1,rLEN,32
164 and. rTMP,rCLM,rMEMP
165 blt cr1,L(handletail32)
166 beq L(cacheAligned)
167 addi rMEMP,rMEMP,32
168 addi rLEN,rLEN,-32
169 std rCHR,-32(rMEMP)
170 std rCHR,-24(rMEMP)
171 std rCHR,-16(rMEMP)
172 std rCHR,-8(rMEMP)
173 b L(getCacheAligned)
174 
175 /* Now we are aligned to the cache line and can use dcbz. */
176 L(cacheAligned):
177 cmpld cr1,rLEN,rCLS
178 blt cr1,L(handletail32)
179 dcbz 0,rMEMP
180 subf rLEN,rCLS,rLEN
181 add rMEMP,rMEMP,rCLS
182 b L(cacheAligned)
183 
184 /* We are here because the cache line size was set and was not 32-bytes
185 and the remainder (rLEN) is less than the actual cache line size.
186 So set up the preconditions for L(nondcbz) and go there. */
187 L(handletail32):
188 clrrwi. rALIGN, rLEN, 5
189 b L(nondcbz)
190 
191 .align 5
192 L(small):
193 /* Memset of 8 bytes or less. */
194 cmpldi cr6, rLEN, 4
195 cmpldi cr5, rLEN, 1
196 ble cr6,L(le4)
197 subi rLEN, rLEN, 4
198 stb rCHR,0(rMEMP)
199 stb rCHR,1(rMEMP)
200 stb rCHR,2(rMEMP)
201 stb rCHR,3(rMEMP)
202 addi rMEMP,rMEMP, 4
203 cmpldi cr5, rLEN, 1
/* At most 4 bytes remain; cr5 (rLEN vs 1) and cr1 (rLEN vs 3) select
   how many of the four byte stores below execute before returning.  */
204 L(le4):
205 cmpldi cr1, rLEN, 3
206 bltlr cr5
207 stb rCHR, 0(rMEMP)
208 beqlr cr5
209 stb rCHR, 1(rMEMP)
210 bltlr cr1
211 stb rCHR, 2(rMEMP)
212 beqlr cr1
213 stb rCHR, 3(rMEMP)
214 blr
215 
216 /* Memset of 0-31 bytes. */
217 .align 5
218 L(medium):
219 insrdi rCHR, rCHR, 32, 0 /* Replicate word to double word. */
220 cmpldi cr1, rLEN, 16
221 L(medium_tail2):
222 add rMEMP, rMEMP, rLEN
/* Tail stores work downward from the end of the region.  cr7 holds the
   low 4 bits of rLEN (set by mtcrf above): bit 31 -> 1-byte store,
   bit 30 -> 2-byte, bit 29 -> 4-byte, bit 28 -> 8-byte; cr1 (rLEN vs
   16) selects the extra 16-byte doubleword pair at L(medium_27t).  */
223 L(medium_tail):
224 bt- 31, L(medium_31t)
225 bt- 30, L(medium_30t)
226 L(medium_30f):
227 bt- 29, L(medium_29t)
228 L(medium_29f):
229 bge- cr1, L(medium_27t)
230 bflr- 28
231 std rCHR, -8(rMEMP)
232 blr
233 
234 L(medium_31t):
235 stbu rCHR, -1(rMEMP)
236 bf- 30, L(medium_30f)
237 L(medium_30t):
238 sthu rCHR, -2(rMEMP)
239 bf- 29, L(medium_29f)
240 L(medium_29t):
241 stwu rCHR, -4(rMEMP)
242 blt- cr1, L(medium_27f)
243 L(medium_27t):
244 std rCHR, -8(rMEMP)
245 stdu rCHR, -16(rMEMP)
246 L(medium_27f):
247 bflr- 28
248 L(medium_28t):
249 std rCHR, -8(rMEMP)
250 blr
251 END_GEN_TB (MEMSET,TB_TOCLESS)
252 libc_hidden_builtin_def (memset)
253
254 #ifndef NO_BZERO_IMPL
255 /* Copied from bzero.S to prevent the linker from inserting a stub
256 between bzero and memset. */
/* void __bzero (void *s [r3], size_t n [r4])
   Implemented as a tail-branch into memset's body with the fill
   character forced to zero, enabling memset's dcbz fast path.  */
257 ENTRY (__bzero)
258 CALL_MCOUNT 3
/* Move length into memset's 3rd-arg register (rLEN) first -- the next
   instruction overwrites r4.  */
259 mr r5,r4
/* Fill byte = 0 (rCHR).  */
260 li r4,0
/* Tail-branch (not a call): memset's blr returns straight to our
   caller, with s still in r3 as the return value.  */
261 b L(_memset)
262 END_GEN_TB (__bzero,TB_TOCLESS)
263 
264 weak_alias (__bzero, bzero)
265 #endif