]>
Commit | Line | Data |
---|---|---|
04067002 | 1 | /* Optimized memset implementation for PowerPC64. |
bfff8b1b | 2 | Copyright (C) 1997-2017 Free Software Foundation, Inc. |
04067002 UD |
3 | This file is part of the GNU C Library. |
4 | ||
5 | The GNU C Library is free software; you can redistribute it and/or | |
6 | modify it under the terms of the GNU Lesser General Public | |
7 | License as published by the Free Software Foundation; either | |
8 | version 2.1 of the License, or (at your option) any later version. | |
9 | ||
10 | The GNU C Library is distributed in the hope that it will be useful, | |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
13 | Lesser General Public License for more details. | |
14 | ||
15 | You should have received a copy of the GNU Lesser General Public | |
59ba27a6 PE |
16 | License along with the GNU C Library; if not, see |
17 | <http://www.gnu.org/licenses/>. */ | |
04067002 UD |
18 | |
19 | #include <sysdep.h> | |
04067002 UD |
20 | |
21 | /* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5]); | |
22 | Returns 's'. No instruction below L(_memset) writes r3, so it still holds 's' at every return. | |
23 | ||
24 | The memset is done in three sizes: byte (8 bits), word (32 bits), | |
25 | cache line (1024 bits). There is a special case for setting cache lines | |
26 | to 0, to take advantage of the dcbz instruction. The line size is hard-coded to 128 bytes (rCLS). */ | |
27 | ||
a88f47a7 | 28 | .machine power4 |
b5510883 | 29 | EALIGN (memset, 5, 0) |
04067002 UD |
30 | CALL_MCOUNT |
31 | ||
32 | #define rTMP r0 /* Scratch: 32-byte block count, cache line alignment test result. */ | |
33 | #define rRTN r3 /* Initial value of 1st argument. */ | |
34 | #define rMEMP0 r3 /* Original value of 1st arg. */ | |
35 | #define rCHR r4 /* Char to set in each byte. */ | |
36 | #define rLEN r5 /* Length of region to set. */ | |
37 | #define rMEMP r6 /* Address at which we are storing. */ | |
38 | #define rALIGN r7 /* Number of bytes we are setting now (when aligning). */ | |
39 | #define rMEMP2 r8 /* Backward-moving store pointer used while aligning to 32 bytes. */ | |
40 | ||
41 | #define rNEG64 r8 /* Constant -64, used as the dcbtst offset in the L(c3) store loop. */ | |
42 | #define rCLS r8 /* Cache line size (known to be 128). Shares r8 with rMEMP2 and rNEG64. */ | |
43 | #define rCLM r9 /* Cache line size mask to check for cache alignment. Not referenced in the code below. */ | |
44 | L(_memset): | |
45 | /* Take care of case for size <= 4. */ | |
46 | cmplwi cr1, rLEN, 4 | |
47 | andi. rALIGN, rMEMP0, 3 /* cr0.eq is set when s is already word aligned. */ | |
48 | mr rMEMP, rMEMP0 | |
49 | ble- cr1, L(small) | |
50 | ||
51 | /* Align to word boundary. The cr5 compare is consumed later by the ble at L(aligned). */ | |
52 | cmplwi cr5, rLEN, 31 | |
d298c416 | 53 | insrwi rCHR, rCHR, 8, 16 /* Replicate byte to halfword. */ |
04067002 UD |
54 | beq+ L(aligned) /* Tests cr0 from the andi. above. */ |
55 | mtcrf 0x01, rMEMP0 /* cr7 = low four address bits, tested by bf 31 and bt 30 below. */ | |
56 | subfic rALIGN, rALIGN, 4 /* Bytes needed to reach the word boundary. */ | |
57 | add rMEMP, rMEMP, rALIGN | |
58 | sub rLEN, rLEN, rALIGN | |
59 | bf+ 31, L(g0) /* Even address: skip the single byte store. */ | |
60 | stb rCHR, 0(rMEMP0) | |
61 | bt 30, L(aligned) /* Address was 3 mod 4: that one byte was enough. */ | |
62 | L(g0): | |
63 | sth rCHR, -2(rMEMP) /* Halfword just below the now word aligned rMEMP. */ | |
64 | ||
65 | /* Handle the case of size <= 31 (cr5 holds the compare of the original length with 31). */ | |
66 | L(aligned): | |
67 | mtcrf 0x01, rLEN /* cr7 = low four bits of the remaining length, used by the tail code. */ | |
d298c416 | 68 | insrwi rCHR, rCHR, 16, 0 /* Replicate halfword to word. */ |
04067002 UD |
69 | ble cr5, L(medium) |
70 | /* Align to 32-byte boundary. */ | |
71 | andi. rALIGN, rMEMP, 0x1C | |
72 | subfic rALIGN, rALIGN, 0x20 /* Bytes up to the next 32-byte boundary. */ | |
73 | beq L(caligned) /* Tests cr0 from the andi. above: already 32-byte aligned. */ | |
74 | mtcrf 0x01, rALIGN /* cr7 bits 28 and 29 = the 8 and 4 bits of rALIGN. */ | |
75 | add rMEMP, rMEMP, rALIGN | |
76 | sub rLEN, rLEN, rALIGN | |
77 | cmplwi cr1, rALIGN, 0x10 | |
78 | mr rMEMP2, rMEMP | |
79 | bf 28, L(a1) | |
80 | stw rCHR, -4(rMEMP2) | |
81 | stwu rCHR, -8(rMEMP2) /* 8 bytes stored, rMEMP2 moved down by 8. */ | |
82 | L(a1): blt cr1, L(a2) | |
83 | stw rCHR, -4(rMEMP2) | |
84 | stw rCHR, -8(rMEMP2) | |
85 | stw rCHR, -12(rMEMP2) | |
86 | stwu rCHR, -16(rMEMP2) /* 16 bytes stored, rMEMP2 moved down by 16. */ | |
87 | L(a2): bf 29, L(caligned) | |
88 | stw rCHR, -4(rMEMP2) | |
89 | ||
90 | /* Now aligned to a 32 byte boundary. */ | |
91 | L(caligned): | |
92 | cmplwi cr1, rCHR, 0 /* Is the replicated fill word zero? */ | |
93 | clrrwi. rALIGN, rLEN, 5 /* rALIGN = rLEN rounded down to a multiple of 32. cr0.eq means no full block. */ | |
94 | mtcrf 0x01, rLEN /* Refresh cr7 with the low four bits of the remaining length. */ | |
95 | beq cr1, L(zloopstart) /* Special case for clearing memory using dcbz. */ | |
96 | L(nondcbz): | |
97 | srwi rTMP, rALIGN, 5 /* Number of 32-byte blocks. */ | |
98 | mtctr rTMP | |
99 | beq L(medium) /* We may not actually get to do a full line. */ | |
100 | clrlwi. rLEN, rLEN, 27 /* rLEN = tail length (rLEN mod 32). cr0 feeds the beqlr at L(cloopdone). */ | |
101 | add rMEMP, rMEMP, rALIGN /* Point past the last full block. The stores below run backward. */ | |
102 | li rNEG64, -0x40 | |
103 | bdz L(cloopdone) /* Only one block: skip the loop. */ | |
104 | ||
105 | .align 4 | |
106 | L(c3): dcbtst rNEG64, rMEMP /* Touch for store the line at rMEMP - 64. */ | |
107 | stw rCHR, -4(rMEMP) | |
108 | stw rCHR, -8(rMEMP) | |
109 | stw rCHR, -12(rMEMP) | |
110 | stw rCHR, -16(rMEMP) | |
111 | stw rCHR, -20(rMEMP) | |
112 | stw rCHR, -24(rMEMP) | |
113 | stw rCHR, -28(rMEMP) | |
114 | stwu rCHR, -32(rMEMP) | |
115 | bdnz L(c3) | |
116 | L(cloopdone): | |
117 | stw rCHR, -4(rMEMP) | |
118 | stw rCHR, -8(rMEMP) | |
119 | stw rCHR, -12(rMEMP) | |
120 | stw rCHR, -16(rMEMP) | |
121 | cmplwi cr1, rLEN, 16 /* For the tail code: is the tail at least 16 bytes? */ | |
122 | stw rCHR, -20(rMEMP) | |
123 | stw rCHR, -24(rMEMP) | |
124 | stw rCHR, -28(rMEMP) | |
125 | stwu rCHR, -32(rMEMP) | |
126 | beqlr /* No tail bytes (cr0 from the clrlwi. above). */ | |
127 | add rMEMP, rMEMP, rALIGN /* Undo the backward stores: rMEMP is the end of the blocks again. */ | |
128 | b L(medium_tail2) | |
129 | ||
130 | .align 5 | |
131 | /* Clear lines of memory in 128-byte chunks. */ | |
132 | L(zloopstart): | |
133 | /* If the remaining length is less than 32 bytes, don't bother getting | |
134 | the cache line size. */ | |
135 | beq L(medium) /* Tests cr0 from the clrrwi. at L(caligned). */ | |
136 | li rCLS,128 /* cache line size is 128 */ | |
137 | dcbt 0,rMEMP | |
138 | L(getCacheAligned): | |
139 | cmplwi cr1,rLEN,32 | |
140 | andi. rTMP,rMEMP,127 /* cr0.eq is set when rMEMP is on a 128-byte boundary. */ | |
141 | blt cr1,L(handletail32) | |
142 | beq L(cacheAligned) | |
143 | addi rMEMP,rMEMP,32 /* Not on a line boundary yet: zero one 32-byte block and retest. */ | |
144 | addi rLEN,rLEN,-32 | |
145 | stw rCHR,-32(rMEMP) | |
146 | stw rCHR,-28(rMEMP) | |
147 | stw rCHR,-24(rMEMP) | |
148 | stw rCHR,-20(rMEMP) | |
149 | stw rCHR,-16(rMEMP) | |
150 | stw rCHR,-12(rMEMP) | |
151 | stw rCHR,-8(rMEMP) | |
152 | stw rCHR,-4(rMEMP) | |
153 | b L(getCacheAligned) | |
154 | ||
155 | /* Now we are aligned to the cache line and can use dcbz. */ | |
156 | .align 4 | |
157 | L(cacheAligned): | |
158 | cmplw cr1,rLEN,rCLS | |
159 | blt cr1,L(handletail32) /* Less than one full line left. */ | |
160 | dcbz 0,rMEMP | |
161 | subf rLEN,rCLS,rLEN | |
162 | add rMEMP,rMEMP,rCLS | |
163 | b L(cacheAligned) | |
164 | ||
9c84384c | 165 | /* We are here because the cache line size was set and the remainder |
04067002 UD |
166 | (rLEN) is less than the actual cache line size. |
167 | So set up the preconditions for L(nondcbz) and go there. */ | |
168 | L(handletail32): | |
169 | clrrwi. rALIGN, rLEN, 5 /* Recompute the block byte count and the cr0 that L(nondcbz) tests. */ | |
170 | b L(nondcbz) | |
171 | ||
172 | .align 5 | |
173 | L(small): | |
174 | /* Memset of 4 bytes or less. */ | |
175 | cmplwi cr5, rLEN, 1 | |
176 | cmplwi cr1, rLEN, 3 | |
177 | bltlr cr5 /* Length 0: nothing to store. */ | |
178 | stb rCHR, 0(rMEMP) | |
179 | beqlr cr5 /* Length 1. */ | |
180 | stb rCHR, 1(rMEMP) | |
181 | bltlr cr1 /* Length 2. */ | |
182 | stb rCHR, 2(rMEMP) | |
183 | beqlr cr1 /* Length 3. */ | |
184 | stb rCHR, 3(rMEMP) | |
185 | blr | |
186 | ||
187 | /* Memset of 0-31 bytes. cr7 bits 31, 30, 29 and 28 select the 1, 2, 4 and 8 byte stores and cr1 (rLEN versus 16) selects the 16 byte store. */ | |
188 | .align 5 | |
189 | L(medium): | |
190 | cmplwi cr1, rLEN, 16 | |
191 | L(medium_tail2): | |
192 | add rMEMP, rMEMP, rLEN /* rMEMP = end of the region. The tail is stored backward from here. */ | |
193 | L(medium_tail): | |
194 | bt- 31, L(medium_31t) | |
195 | bt- 30, L(medium_30t) | |
196 | L(medium_30f): | |
197 | bt- 29, L(medium_29t) | |
198 | L(medium_29f): | |
199 | bge- cr1, L(medium_27t) | |
200 | bflr- 28 /* Return unless 8 more bytes remain. */ | |
201 | stw rCHR, -4(rMEMP) | |
202 | stw rCHR, -8(rMEMP) | |
203 | blr | |
204 | ||
205 | L(medium_31t): | |
206 | stbu rCHR, -1(rMEMP) | |
207 | bf- 30, L(medium_30f) | |
208 | L(medium_30t): | |
209 | sthu rCHR, -2(rMEMP) | |
210 | bf- 29, L(medium_29f) | |
211 | L(medium_29t): | |
212 | stwu rCHR, -4(rMEMP) | |
213 | blt- cr1, L(medium_27f) | |
214 | L(medium_27t): | |
215 | stw rCHR, -4(rMEMP) | |
216 | stw rCHR, -8(rMEMP) | |
217 | stw rCHR, -12(rMEMP) | |
218 | stwu rCHR, -16(rMEMP) | |
219 | L(medium_27f): | |
220 | bflr- 28 /* Return unless 8 more bytes remain. */ | |
221 | L(medium_28t): | |
222 | stw rCHR, -4(rMEMP) | |
223 | stw rCHR, -8(rMEMP) | |
224 | blr | |
b5510883 | 225 | END (memset) |
04067002 | 226 | libc_hidden_builtin_def (memset) |