/* Optimized strncmp implementation for PowerPC64/POWER9.
   Copyright (C) 2016-2018 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */
#include <sysdep.h>

/* Implements the function

   int [r3] strncmp (const char *s1 [r3], const char *s2 [r4], size_t [r5] n)

   The implementation uses unaligned doubleword access to avoid specialized
   code paths depending of data alignment for first 32 bytes and uses
   vectorised loops after that.  */

/* Allow multiarch builds to rename the entry point.  */
#ifndef STRNCMP
# define STRNCMP strncmp
#endif

/* TODO: Change this to actual instructions when minimum binutils is upgraded
   to 2.27.  Macros are defined below for these newer instructions in order
   to maintain compatibility.  */

/* vctzlsbb RT,VRB — count trailing zero least-significant bits by byte;
   emitted as a raw opcode word because older binutils lack the mnemonic.  */
#define VCTZLSBB(r,v) .long (0x10010602 | ((r)<<(32-11)) | ((v)<<(32-21)))

/* vextubrx RT,RA,VRB — vector extract unsigned byte right-indexed,
   encoded as a raw opcode word for pre-2.27 binutils.  */
#define VEXTUBRX(t,a,b) .long (0x1000070d \
	| ((t)<<(32-11))  \
	| ((a)<<(32-16))  \
	| ((b)<<(32-21)) )

/* vcmpnezb. VRT,VRA,VRB — vector compare not-equal-or-zero byte (records
   CR6), encoded as a raw opcode word for pre-2.27 binutils.  */
#define VCMPNEZB(t,a,b) .long (0x10000507 \
	| ((t)<<(32-11))  \
	| ((a)<<(32-16))  \
	| ((b)<<(32-21)) )

/* Get 16 bytes for unaligned case.
   reg1: Vector to hold next 16 bytes.
   reg2: Address to read from.
   reg3: Permute control vector.
   The first aligned quadword covering reg2 is loaded, then the following
   quadword is loaded only when it is actually needed (no NUL seen in the
   bytes already fetched and more than r11 bytes remain), so the read never
   strays past a terminator into a possibly unmapped page.  */
#define GET16BYTES(reg1, reg2, reg3) \
	lvx	reg1, 0, reg2; \
	vperm	v8, v2, reg1, reg3; \
	vcmpequb.	v8, v0, v8; \
	beq	cr6, 1f; \
	vspltisb	v9, 0; \
	b	2f; \
	.align 4; \
1: \
	cmplw	cr6, r5, r11; \
	ble	cr6, 2f; \
	addi	r6, reg2, 16; \
	lvx	v9, 0, r6; \
2: \
	vperm	reg1, v9, reg1, reg3;

/* TODO: change this to .machine power9 when minimum binutils
   is upgraded to 2.27.  */
	.machine  power7
ENTRY_TOCLESS (STRNCMP, 4)
	/* Check if size is 0.  */
	cmpdi	cr0, r5, 0
	beq	cr0, L(ret0)
	li	r0, 0

	/* Check if [s1]+32 or [s2]+32 will cross a 4K page boundary using
	   the code:

	    (((size_t) s1) % PAGE_SIZE > (PAGE_SIZE - ITER_SIZE))

	   with PAGE_SIZE being 4096 and ITER_SIZE begin 32.  */
	rldicl	r8, r3, 0, 52
	cmpldi	cr7, r8, 4096-32
	bgt	cr7, L(pagecross)
	rldicl	r9, r4, 0, 52
	cmpldi	cr7, r9, 4096-32
	bgt	cr7, L(pagecross)

	/* For short strings up to 32 bytes, load both s1 and s2 using
	   unaligned dwords and compare.  */

	ld	r7, 0(r3)
	ld	r9, 0(r4)
	li	r8, 0
	cmpb	r8, r7, r8	/* Mark NUL bytes of s1 dword.  */
	cmpb	r6, r7, r9	/* Mark bytes equal between s1 and s2.  */
	orc.	r8, r8, r6	/* Non-zero iff a NUL or a mismatch.  */
	bne	cr0, L(different1)

	/* If the strings compared are equal, but size is less or equal
	   to 8, return 0.  */
	cmpldi	cr7, r5, 8
	li	r9, 0
	ble	cr7, L(ret1)
	addi	r5, r5, -8

	ld	r7, 8(r3)
	ld	r9, 8(r4)
	cmpb	r8, r7, r8
	cmpb	r6, r7, r9
	orc.	r8, r8, r6
	bne	cr0, L(different1)
	cmpldi	cr7, r5, 8
	mr	r9, r8
	ble	cr7, L(ret1)
	/* Update pointers and size.  */
	addi	r5, r5, -8
	addi	r3, r3, 16
	addi	r4, r4, 16

	ld	r7, 0(r3)
	ld	r9, 0(r4)
	li	r8, 0
	cmpb	r8, r7, r8
	cmpb	r6, r7, r9
	orc.	r8, r8, r6
	bne	cr0, L(different1)
	cmpldi	cr7, r5, 8
	li	r9, 0
	ble	cr7, L(ret1)
	addi	r5, r5, -8

	ld	r7, 8(r3)
	ld	r9, 8(r4)
	cmpb	r8, r7, r8
	cmpb	r6, r7, r9
	orc.	r8, r8, r6
	bne	cr0, L(different1)
	cmpldi	cr7, r5, 8
	mr	r9, r8
	ble	cr7, L(ret1)

	/* Update pointers and size.  */
	addi	r5, r5, -8
	addi	r3, r3, 16
	addi	r4, r4, 16
L(align):
	/* Now it has checked for first 32 bytes, align source1 to doubleword
	   and adjust source2 address.  */
	vspltisb	v0, 0
	vspltisb	v2, -1
	or	r6, r4, r3
	andi.	r6, r6, 0xF
	beq	cr0, L(aligned)
	lvsr	v6, 0, r4	/* Compute mask.  */
	clrldi	r6, r4, 60
	subfic	r11, r6, 16
	andi.	r6, r3, 0xF
	beq	cr0, L(s1_align)
	/* Both s1 and s2 are unaligned.  */
	GET16BYTES(v5, r4, v6)
	lvsr	v10, 0, r3	/* Compute mask.  */
	clrldi	r6, r3, 60
	subfic	r11, r6, 16
	GET16BYTES(v4, r3, v10)
	VCMPNEZB(v7, v5, v4)
	beq	cr6, L(match)
	b	L(different)

	/* Align s1 to qw and adjust s2 address.  */
	.align	4
L(match):
	cmpldi	cr7, r5, 16
	ble	cr7, L(ret0)
	subf	r5, r11, r5
	add	r3, r3, r11
	add	r4, r4, r11
	andi.	r11, r4, 0xF
	beq	cr0, L(aligned)
	lvsr	v6, 0, r4
	clrldi	r6, r4, 60
	subfic	r11, r6, 16
	/* There are 2 loops depending on the input alignment.
	   Each loop gets 16 bytes from s1 and s2, checks for null
	   and compares them.  Loops until a mismatch or null occurs.  */
L(s1_align):
	lvx	v4, 0, r3
	GET16BYTES(v5, r4, v6)
	VCMPNEZB(v7, v5, v4)
	bne	cr6, L(different)
	cmpldi	cr7, r5, 16
	ble	cr7, L(ret0)
	addi	r5, r5, -16
	addi	r3, r3, 16
	addi	r4, r4, 16

	lvx	v4, 0, r3
	GET16BYTES(v5, r4, v6)
	VCMPNEZB(v7, v5, v4)
	bne	cr6, L(different)
	cmpldi	cr7, r5, 16
	ble	cr7, L(ret0)
	addi	r5, r5, -16
	addi	r3, r3, 16
	addi	r4, r4, 16

	lvx	v4, 0, r3
	GET16BYTES(v5, r4, v6)
	VCMPNEZB(v7, v5, v4)
	bne	cr6, L(different)
	cmpldi	cr7, r5, 16
	ble	cr7, L(ret0)
	addi	r5, r5, -16
	addi	r3, r3, 16
	addi	r4, r4, 16

	lvx	v4, 0, r3
	GET16BYTES(v5, r4, v6)
	VCMPNEZB(v7, v5, v4)
	bne	cr6, L(different)
	cmpldi	cr7, r5, 16
	ble	cr7, L(ret0)
	addi	r5, r5, -16
	addi	r3, r3, 16
	addi	r4, r4, 16
	b	L(s1_align)
	.align	4
L(aligned):
	lvx	v4, 0, r3
	lvx	v5, 0, r4
	VCMPNEZB(v7, v5, v4)
	bne	cr6, L(different)
	cmpldi	cr7, r5, 16
	ble	cr7, L(ret0)
	addi	r5, r5, -16
	addi	r3, r3, 16
	addi	r4, r4, 16

	lvx	v4, 0, r3
	lvx	v5, 0, r4
	VCMPNEZB(v7, v5, v4)
	bne	cr6, L(different)
	cmpldi	cr7, r5, 16
	ble	cr7, L(ret0)
	addi	r5, r5, -16
	addi	r3, r3, 16
	addi	r4, r4, 16

	lvx	v4, 0, r3
	lvx	v5, 0, r4
	VCMPNEZB(v7, v5, v4)
	bne	cr6, L(different)
	cmpldi	cr7, r5, 16
	ble	cr7, L(ret0)
	addi	r5, r5, -16
	addi	r3, r3, 16
	addi	r4, r4, 16

	lvx	v4, 0, r3
	lvx	v5, 0, r4
	VCMPNEZB(v7, v5, v4)
	bne	cr6, L(different)
	cmpldi	cr7, r5, 16
	ble	cr7, L(ret0)
	addi	r5, r5, -16
	addi	r3, r3, 16
	addi	r4, r4, 16
	b	L(aligned)
	/* Calculate and return the difference.  */
L(different):
	VCTZLSBB(r6, v7)	/* r6 = index of first mismatch/NUL byte.  */
	cmplw	cr7, r5, r6
	ble	cr7, L(ret0)	/* Mismatch is beyond the n limit.  */
	VEXTUBRX(r5, r6, v4)
	VEXTUBRX(r4, r6, v5)
	subf	r3, r4, r5
	extsw	r3, r3
	blr

	.align	4
L(ret0):
	li	r9, 0
L(ret1):
	mr	r3, r9
	blr

	/* The code now checks if r8 and r5 are different by issuing a
	   cmpb and shifts the result based on its output:

	  leadzero = (__builtin_ffsl (z1) - 1);
	  leadzero = leadzero > (n-1)*8 ? (n-1)*8 : leadzero;
	  r1 = (r1 >> leadzero) & 0xFFUL;
	  r2 = (r2 >> leadzero) & 0xFFUL;
	  return r1 - r2;  */

	.align	4
L(different1):
	neg	r11, r8
	sldi	r5, r5, 3	/* Remaining size in bits.  */
	and	r8, r11, r8	/* Isolate lowest set bit of the mask.  */
	addi	r5, r5, -8
	cntlzd	r8, r8
	subfic	r8, r8, 63	/* Bit position of first difference.  */
	extsw	r8, r8
	cmpld	cr7, r8, r5
	ble	cr7, L(different2)
	mr	r8, r5	/* Clamp shift to the n limit.  */
L(different2):
	extsw	r8, r8
	srd	r7, r7, r8
	srd	r9, r9, r8
	rldicl	r3, r7, 0, 56
	rldicl	r9, r9, 0, 56
	subf	r9, r9, r3	/* Was garbled as "subf r9, r9, 3": subf
				   requires a register; r3 holds the masked
				   s1 byte from the rldicl above.  */
	extsw	r9, r9
	mr	r3, r9
	blr

	/* If unaligned 16 bytes reads across a 4K page boundary, it uses
	   a simple byte a byte comparison until the page alignment for s1
	   is reached.  */
	.align	4
L(pagecross):
	lbz	r7, 0(r3)
	lbz	r9, 0(r4)
	subfic	r8, r8, 4095	/* Bytes until the page boundary.  */
	cmplw	cr7, r9, r7
	bne	cr7, L(byte_ne_3)
	cmpdi	cr7, r9, 0
	beq	cr7, L(byte_ne_0)
	addi	r5, r5, -1
	subf	r7, r8, r5
	subf	r9, r7, r5
	addi	r9, r9, 1
	mtctr	r9	/* Loop count = min (n, bytes to boundary).  */
	b	L(pagecross_loop1)

	.align	4
L(pagecross_loop0):
	beq	cr7, L(ret0)
	lbz	r9, 0(r3)
	lbz	r8, 0(r4)
	addi	r5, r5, -1
	cmplw	cr7, r9, r8
	cmpdi	cr5, r9, 0
	bne	cr7, L(byte_ne_2)
	beq	cr5, L(byte_ne_0)
L(pagecross_loop1):
	cmpdi	cr7, r5, 0
	addi	r3, r3, 1
	addi	r4, r4, 1
	bdnz	L(pagecross_loop0)
	cmpdi	cr7, r7, 0
	li	r9, 0
	bne+	cr7, L(align)	/* More to compare: fall back to main path.  */
	b	L(ret1)

	.align	4
L(byte_ne_0):
	li	r7, 0
L(byte_ne_1):
	subf	r9, r9, r7
	extsw	r9, r9
	b	L(ret1)

	.align	4
L(byte_ne_2):
	extsw	r7, r9
	mr	r9, r8
	b	L(byte_ne_1)
L(byte_ne_3):
	extsw	r7, r7
	b	L(byte_ne_1)
END(STRNCMP)
libc_hidden_builtin_def(strncmp)